diff --git a/pom.xml b/pom.xml index de5cf91..2309a15 100644 --- a/pom.xml +++ b/pom.xml @@ -88,7 +88,7 @@ us.codecraft xsoup - 0.2.3 + 0.2.4-SNAPSHOT com.alibaba diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AbstractSelectable.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AbstractSelectable.java index 2ac4c70..e2bb552 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AbstractSelectable.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AbstractSelectable.java @@ -11,17 +11,7 @@ import java.util.List; */ public abstract class AbstractSelectable implements Selectable { - protected List strings; - - public AbstractSelectable(String text) { - List results = new ArrayList(); - results.add(text); - this.strings = results; - } - - public AbstractSelectable(List strings) { - this.strings = strings; - } + protected abstract List getSourceTexts(); @Override public Selectable css(String selector) { @@ -55,7 +45,7 @@ public abstract class AbstractSelectable implements Selectable { @Override public List all() { - return strings; + return getSourceTexts(); } @Override @@ -74,30 +64,37 @@ public abstract class AbstractSelectable implements Selectable { @Override public Selectable select(Selector selector) { - return select(selector, strings); + return select(selector, getSourceTexts()); } @Override public Selectable selectList(Selector selector) { - return selectList(selector, strings); + return selectList(selector, getSourceTexts()); } @Override public Selectable regex(String regex) { RegexSelector regexSelector = Selectors.regex(regex); - return selectList(regexSelector, strings); + return selectList(regexSelector, getSourceTexts()); } @Override public Selectable regex(String regex, int group) { RegexSelector regexSelector = Selectors.regex(regex, group); - return selectList(regexSelector, strings); + return selectList(regexSelector, getSourceTexts()); } @Override public Selectable replace(String regex, String replacement) { ReplaceSelector replaceSelector = new ReplaceSelector(regex,replacement); - return select(replaceSelector, strings); + return select(replaceSelector, getSourceTexts()); + } + + public String getFirstSourceText() { + if (getSourceTexts() != null && getSourceTexts().size() > 0) { + return getSourceTexts().get(0); + } + return null; } @Override @@ -107,6 +104,6 @@ public abstract class AbstractSelectable implements Selectable { @Override public boolean match() { - return strings != null && strings.size() > 0; + return getSourceTexts() != null && getSourceTexts().size() > 0; } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java index 3b9b22d..bbc7217 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java @@ -2,7 +2,6 @@ package us.codecraft.webmagic.selector; import org.jsoup.Jsoup; import org.jsoup.nodes.Element; -import org.jsoup.select.Elements; import java.util.ArrayList; import java.util.List; @@ -37,16 +36,18 @@ public abstract class BaseElementSelector implements Selector, ElementSelector { return null; } - public Elements selectElements(String text) { + public List selectElements(String text) { if (text != null) { return selectElements(Jsoup.parse(text)); } else { - return new Elements(); + return new ArrayList(); } } public abstract Element selectElement(Element element); - public abstract Elements selectElements(Element element); + public abstract List selectElements(Element element); + + public abstract boolean hasAttribute(); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java index 095af35..6a638db 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java @@ -57,7 +57,7 @@ public class CssSelector extends BaseElementSelector { @Override public String select(Element element) { - Elements elements = selectElements(element); + List elements = selectElements(element); if (CollectionUtils.isEmpty(elements)) { return null; } @@ -67,7 +67,7 @@ public class CssSelector extends BaseElementSelector { @Override public List selectList(Element doc) { List strings = new ArrayList(); - Elements elements = selectElements(doc); + List elements = selectElements(doc); if (CollectionUtils.isNotEmpty(elements)) { for (Element element : elements) { String value = getValue(element); @@ -89,7 +89,12 @@ public class CssSelector extends BaseElementSelector { } @Override - public Elements selectElements(Element element) { + public List selectElements(Element element) { return element.select(selectorText); } + + @Override + public boolean hasAttribute() { + return attrName != null; + } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java index 9748577..7b593ed 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java @@ -2,10 +2,11 @@ package us.codecraft.webmagic.selector; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.util.ArrayList; +import java.util.Collections; import java.util.List; /** @@ -14,7 +15,7 @@ import java.util.List; * @author code4crafter@gmail.com
* @since 0.1.0 */ -public class Html extends PlainText { +public class Html extends HtmlNode { private Logger logger = LoggerFactory.getLogger(getClass()); @@ -23,130 +24,26 @@ public class Html extends PlainText { */ private Document document; - private boolean needInitCache = true; - - public Html(List strings) { - super(strings); - } - public Html(String text) { - super(text); - } - - public Html(List strings, boolean needInitCache) { - super(strings); - this.needInitCache = needInitCache; - } - - public Html(String text, boolean needInitCache) { - super(text); - this.needInitCache = needInitCache; - } - - /** - * lazy init - */ - private void initDocument() { - if (this.document == null && needInitCache) { - needInitCache = false; - //just init once whether the parsing succeeds or not - try { - this.document = Jsoup.parse(getText()); - } catch (Exception e) { - logger.warn("parse document error ", e); - } + try { + this.document = Jsoup.parse(text); + } catch (Exception e) { + this.document = null; + logger.warn("parse document error ", e); } } public Html(Document document) { - super(document.html()); this.document = document; } - public static Html create(String text) { - return new Html(text); - } - - @Override - protected Selectable select(Selector selector, List strings) { - initDocument(); - List results = new ArrayList(); - for (String string : strings) { - String result = selector.select(string); - if (result != null) { - results.add(result); - } - } - return new Html(results, false); - } - - @Override - protected Selectable selectList(Selector selector, List strings) { - initDocument(); - List results = new ArrayList(); - for (String string : strings) { - List result = selector.selectList(string); - results.addAll(result); - } - return new Html(results, false); - } - - @Override - public Selectable smartContent() { - initDocument(); - SmartContentSelector smartContentSelector = Selectors.smartContent(); - return select(smartContentSelector, strings); - } - - @Override - public Selectable links() { - return xpath("//a/@href"); - } - - @Override - public Selectable xpath(String xpath) { - XpathSelector xpathSelector = Selectors.xpath(xpath); - if (document != null) { - return new Html(xpathSelector.selectList(document), false); - } - return selectList(xpathSelector, strings); - } - - @Override - public Selectable $(String selector) { - CssSelector cssSelector = Selectors.$(selector); - if (document != null) { - return new Html(cssSelector.selectList(document), false); - } - return selectList(cssSelector, strings); - } - - @Override - public Selectable $(String selector, String attrName) { - CssSelector cssSelector = Selectors.$(selector, attrName); - if (document != null) { - return new Html(cssSelector.selectList(document), false); - } - return selectList(cssSelector, strings); - } - public Document getDocument() { - initDocument(); return document; } - public String getText() { - if (strings != null && strings.size() > 0) { - return strings.get(0); - } - return document.html(); - } - @Override - public List nodes() { - ArrayList selectables = new ArrayList(); - selectables.add(this); - return selectables; + protected List getElements() { + return Collections.singletonList(getDocument()); } /** @@ -158,7 +55,7 @@ public class Html extends PlainText { ElementSelector elementSelector = (ElementSelector) selector; return elementSelector.select(getDocument()); } else { - return selector.select(getText()); + return selector.select(getFirstSourceText()); } } @@ -167,7 +64,12 @@ public class Html extends PlainText { ElementSelector elementSelector = (ElementSelector) selector; return elementSelector.selectList(getDocument()); } else { - return selector.selectList(getText()); + return selector.selectList(getFirstSourceText()); } } + + public static Html create(String text) { + return new Html(text); + } + } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlFragment.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlFragment.java deleted file mode 100644 index d427f67..0000000 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlFragment.java +++ /dev/null @@ -1,7 +0,0 @@ -package us.codecraft.webmagic.selector; - -/** - * @author code4crafer@gmail.com - */ -public class HtmlFragment { -} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java new file mode 100644 index 0000000..3ca7e5c --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java @@ -0,0 +1,97 @@ +package us.codecraft.webmagic.selector; + +import org.jsoup.nodes.Element; + +import java.util.ArrayList; +import java.util.List; + +/** + * @author code4crafer@gmail.com + */ +public class HtmlNode extends AbstractSelectable { + + private final List elements; + + public HtmlNode(List elements) { + this.elements = elements; + } + + public HtmlNode() { + elements = null; + } + + protected List getElements() { + return elements; + } + + @Override + public Selectable smartContent() { + SmartContentSelector smartContentSelector = Selectors.smartContent(); + return select(smartContentSelector, getSourceTexts()); + } + + @Override + public Selectable links() { + return xpath("//a/@href"); + } + + @Override + public Selectable xpath(String xpath) { + XpathSelector xpathSelector = Selectors.xpath(xpath); + return selectElements(xpathSelector); + } + + /** + * select elements + * + * @param elementSelector + * @return + */ + protected Selectable selectElements(BaseElementSelector elementSelector) { + if (!elementSelector.hasAttribute()) { + List resultElements = new ArrayList(); + for (Element element : getElements()) { + List selectElements = elementSelector.selectElements(element); + resultElements.addAll(selectElements); + } + return new HtmlNode(resultElements); + } else { + // has attribute, consider as plaintext + List resultStrings = new ArrayList(); + for (Element element : getElements()) { + List selectList = elementSelector.selectList(element); + resultStrings.addAll(selectList); + } + return new PlainText(resultStrings); + + } + } + + @Override + public Selectable $(String selector) { + CssSelector cssSelector = Selectors.$(selector); + return selectElements(cssSelector); + } + + @Override + public Selectable $(String selector, String attrName) { + CssSelector cssSelector = Selectors.$(selector, attrName); + return selectElements(cssSelector); + } + + @Override + public List nodes() { + ArrayList selectables = new ArrayList(); + selectables.add(this); + return selectables; + } + + @Override + protected List getSourceTexts() { + List sourceTexts = new ArrayList(getElements().size()); + for (Element element : getElements()) { + sourceTexts.add(element.toString()); + } + return sourceTexts; + } +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Json.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Json.java index 96d1c2b..4c31eb4 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Json.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Json.java @@ -26,7 +26,7 @@ public class Json extends PlainText { * @return */ public Json removePadding(String padding) { - String text = getText(); + String text = getFirstSourceText(); XTokenQueue tokenQueue = new XTokenQueue(text); tokenQueue.consumeWhitespace(); tokenQueue.consume(padding); @@ -36,29 +36,22 @@ public class Json extends PlainText { } public T toObject(Class clazz) { - if (getText() == null) { + if (getFirstSourceText() == null) { return null; } - return JSON.parseObject(getText(), clazz); + return JSON.parseObject(getFirstSourceText(), clazz); } public List toList(Class clazz) { - if (getText() == null) { + if (getFirstSourceText() == null) { return null; } - return JSON.parseArray(getText(), clazz); - } - - public String getText() { - if (strings != null && strings.size() > 0) { - return strings.get(0); - } - return null; + return JSON.parseArray(getFirstSourceText(), clazz); } @Override public Selectable jsonPath(String jsonPath) { JsonPathSelector jsonPathSelector = new JsonPathSelector(jsonPath); - return selectList(jsonPathSelector,strings); + return selectList(jsonPathSelector,getSourceTexts()); } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java index c1d034a..557763b 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java @@ -12,12 +12,15 @@ import java.util.List; */ public class PlainText extends AbstractSelectable { - public PlainText(List strings) { - super(strings); + protected List sourceTexts; + + public PlainText(List sourceTexts) { + this.sourceTexts = sourceTexts; } public PlainText(String text) { - super(text); + this.sourceTexts = new ArrayList(); + sourceTexts.add(text); } public static PlainText create(String text) { @@ -51,11 +54,15 @@ public class PlainText extends AbstractSelectable { @Override public List nodes() { - List nodes = new ArrayList(strings.size()); - for (String string : strings) { + List nodes = new ArrayList(getSourceTexts().size()); + for (String string : getSourceTexts()) { nodes.add(PlainText.create(string)); } return nodes; } + @Override + protected List getSourceTexts() { + return sourceTexts; + } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java index 4516a3d..8a980a5 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java @@ -2,7 +2,6 @@ package us.codecraft.webmagic.selector; import org.apache.commons.collections.CollectionUtils; import org.jsoup.nodes.Element; -import org.jsoup.select.Elements; import us.codecraft.xsoup.XPathEvaluator; import us.codecraft.xsoup.Xsoup; @@ -34,7 +33,7 @@ public class XpathSelector extends BaseElementSelector { @Override public Element selectElement(Element element) { - Elements elements = selectElements(element); + List elements = selectElements(element); if (CollectionUtils.isNotEmpty(elements)){ return elements.get(0); } @@ -42,7 +41,12 @@ public class XpathSelector extends BaseElementSelector { } @Override - public Elements selectElements(Element element) { + public List selectElements(Element element) { return xPathEvaluator.evaluate(element).getElements(); } + + @Override + public boolean hasAttribute() { + return xPathEvaluator.hasAttribute(); + } } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java index 084a110..352e49c 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java @@ -39,7 +39,7 @@ public class HttpClientDownloaderTest { public void testDownloader() { HttpClientDownloader httpClientDownloader = new HttpClientDownloader(); Html html = httpClientDownloader.download("https://github.com"); - assertTrue(!html.getText().isEmpty()); + assertTrue(!html.getFirstSourceText().isEmpty()); } @Test(expected = IllegalArgumentException.class)