diff --git a/pom.xml b/pom.xml
index de5cf91..2309a15 100644
--- a/pom.xml
+++ b/pom.xml
@@ -88,7 +88,7 @@
us.codecraft
xsoup
- 0.2.3
+ 0.2.4-SNAPSHOT
com.alibaba
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AbstractSelectable.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AbstractSelectable.java
index 2ac4c70..e2bb552 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AbstractSelectable.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AbstractSelectable.java
@@ -11,17 +11,7 @@ import java.util.List;
*/
public abstract class AbstractSelectable implements Selectable {
- protected List strings;
-
- public AbstractSelectable(String text) {
- List results = new ArrayList();
- results.add(text);
- this.strings = results;
- }
-
- public AbstractSelectable(List strings) {
- this.strings = strings;
- }
+ protected abstract List getSourceTexts();
@Override
public Selectable css(String selector) {
@@ -55,7 +45,7 @@ public abstract class AbstractSelectable implements Selectable {
@Override
public List all() {
- return strings;
+ return getSourceTexts();
}
@Override
@@ -74,30 +64,37 @@ public abstract class AbstractSelectable implements Selectable {
@Override
public Selectable select(Selector selector) {
- return select(selector, strings);
+ return select(selector, getSourceTexts());
}
@Override
public Selectable selectList(Selector selector) {
- return selectList(selector, strings);
+ return selectList(selector, getSourceTexts());
}
@Override
public Selectable regex(String regex) {
RegexSelector regexSelector = Selectors.regex(regex);
- return selectList(regexSelector, strings);
+ return selectList(regexSelector, getSourceTexts());
}
@Override
public Selectable regex(String regex, int group) {
RegexSelector regexSelector = Selectors.regex(regex, group);
- return selectList(regexSelector, strings);
+ return selectList(regexSelector, getSourceTexts());
}
@Override
public Selectable replace(String regex, String replacement) {
ReplaceSelector replaceSelector = new ReplaceSelector(regex,replacement);
- return select(replaceSelector, strings);
+ return select(replaceSelector, getSourceTexts());
+ }
+
+ public String getFirstSourceText() {
+ if (getSourceTexts() != null && getSourceTexts().size() > 0) {
+ return getSourceTexts().get(0);
+ }
+ return null;
}
@Override
@@ -107,6 +104,6 @@ public abstract class AbstractSelectable implements Selectable {
@Override
public boolean match() {
- return strings != null && strings.size() > 0;
+ return getSourceTexts() != null && getSourceTexts().size() > 0;
}
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java
index 3b9b22d..bbc7217 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java
@@ -2,7 +2,6 @@ package us.codecraft.webmagic.selector;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
-import org.jsoup.select.Elements;
import java.util.ArrayList;
import java.util.List;
@@ -37,16 +36,18 @@ public abstract class BaseElementSelector implements Selector, ElementSelector {
return null;
}
- public Elements selectElements(String text) {
+ public List selectElements(String text) {
if (text != null) {
return selectElements(Jsoup.parse(text));
} else {
- return new Elements();
+ return new ArrayList();
}
}
public abstract Element selectElement(Element element);
- public abstract Elements selectElements(Element element);
+ public abstract List selectElements(Element element);
+
+ public abstract boolean hasAttribute();
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java
index 095af35..6a638db 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java
@@ -57,7 +57,7 @@ public class CssSelector extends BaseElementSelector {
@Override
public String select(Element element) {
- Elements elements = selectElements(element);
+ List elements = selectElements(element);
if (CollectionUtils.isEmpty(elements)) {
return null;
}
@@ -67,7 +67,7 @@ public class CssSelector extends BaseElementSelector {
@Override
public List selectList(Element doc) {
List strings = new ArrayList();
- Elements elements = selectElements(doc);
+ List elements = selectElements(doc);
if (CollectionUtils.isNotEmpty(elements)) {
for (Element element : elements) {
String value = getValue(element);
@@ -89,7 +89,12 @@ public class CssSelector extends BaseElementSelector {
}
@Override
- public Elements selectElements(Element element) {
+ public List selectElements(Element element) {
return element.select(selectorText);
}
+
+ @Override
+ public boolean hasAttribute() {
+ return attrName != null;
+ }
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java
index 9748577..7b593ed 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java
@@ -2,10 +2,11 @@ package us.codecraft.webmagic.selector;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import java.util.ArrayList;
+import java.util.Collections;
import java.util.List;
/**
@@ -14,7 +15,7 @@ import java.util.List;
* @author code4crafter@gmail.com
* @since 0.1.0
*/
-public class Html extends PlainText {
+public class Html extends HtmlNode {
private Logger logger = LoggerFactory.getLogger(getClass());
@@ -23,130 +24,26 @@ public class Html extends PlainText {
*/
private Document document;
- private boolean needInitCache = true;
-
- public Html(List strings) {
- super(strings);
- }
-
public Html(String text) {
- super(text);
- }
-
- public Html(List strings, boolean needInitCache) {
- super(strings);
- this.needInitCache = needInitCache;
- }
-
- public Html(String text, boolean needInitCache) {
- super(text);
- this.needInitCache = needInitCache;
- }
-
- /**
- * lazy init
- */
- private void initDocument() {
- if (this.document == null && needInitCache) {
- needInitCache = false;
- //just init once whether the parsing succeeds or not
- try {
- this.document = Jsoup.parse(getText());
- } catch (Exception e) {
- logger.warn("parse document error ", e);
- }
+ try {
+ this.document = Jsoup.parse(text);
+ } catch (Exception e) {
+ this.document = null;
+ logger.warn("parse document error ", e);
}
}
public Html(Document document) {
- super(document.html());
this.document = document;
}
- public static Html create(String text) {
- return new Html(text);
- }
-
- @Override
- protected Selectable select(Selector selector, List strings) {
- initDocument();
- List results = new ArrayList();
- for (String string : strings) {
- String result = selector.select(string);
- if (result != null) {
- results.add(result);
- }
- }
- return new Html(results, false);
- }
-
- @Override
- protected Selectable selectList(Selector selector, List strings) {
- initDocument();
- List results = new ArrayList();
- for (String string : strings) {
- List result = selector.selectList(string);
- results.addAll(result);
- }
- return new Html(results, false);
- }
-
- @Override
- public Selectable smartContent() {
- initDocument();
- SmartContentSelector smartContentSelector = Selectors.smartContent();
- return select(smartContentSelector, strings);
- }
-
- @Override
- public Selectable links() {
- return xpath("//a/@href");
- }
-
- @Override
- public Selectable xpath(String xpath) {
- XpathSelector xpathSelector = Selectors.xpath(xpath);
- if (document != null) {
- return new Html(xpathSelector.selectList(document), false);
- }
- return selectList(xpathSelector, strings);
- }
-
- @Override
- public Selectable $(String selector) {
- CssSelector cssSelector = Selectors.$(selector);
- if (document != null) {
- return new Html(cssSelector.selectList(document), false);
- }
- return selectList(cssSelector, strings);
- }
-
- @Override
- public Selectable $(String selector, String attrName) {
- CssSelector cssSelector = Selectors.$(selector, attrName);
- if (document != null) {
- return new Html(cssSelector.selectList(document), false);
- }
- return selectList(cssSelector, strings);
- }
-
public Document getDocument() {
- initDocument();
return document;
}
- public String getText() {
- if (strings != null && strings.size() > 0) {
- return strings.get(0);
- }
- return document.html();
- }
-
@Override
- public List nodes() {
- ArrayList selectables = new ArrayList();
- selectables.add(this);
- return selectables;
+ protected List getElements() {
+ return Collections.singletonList(getDocument());
}
/**
@@ -158,7 +55,7 @@ public class Html extends PlainText {
ElementSelector elementSelector = (ElementSelector) selector;
return elementSelector.select(getDocument());
} else {
- return selector.select(getText());
+ return selector.select(getFirstSourceText());
}
}
@@ -167,7 +64,12 @@ public class Html extends PlainText {
ElementSelector elementSelector = (ElementSelector) selector;
return elementSelector.selectList(getDocument());
} else {
- return selector.selectList(getText());
+ return selector.selectList(getFirstSourceText());
}
}
+
+ public static Html create(String text) {
+ return new Html(text);
+ }
+
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlFragment.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlFragment.java
deleted file mode 100644
index d427f67..0000000
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlFragment.java
+++ /dev/null
@@ -1,7 +0,0 @@
-package us.codecraft.webmagic.selector;
-
-/**
- * @author code4crafer@gmail.com
- */
-public class HtmlFragment {
-}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java
new file mode 100644
index 0000000..3ca7e5c
--- /dev/null
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java
@@ -0,0 +1,97 @@
+package us.codecraft.webmagic.selector;
+
+import org.jsoup.nodes.Element;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * @author code4crafter@gmail.com
+ */
+public class HtmlNode extends AbstractSelectable {
+
+ private final List elements;
+
+ public HtmlNode(List elements) {
+ this.elements = elements;
+ }
+
+ public HtmlNode() {
+ elements = null;
+ }
+
+ protected List getElements() {
+ return elements;
+ }
+
+ @Override
+ public Selectable smartContent() {
+ SmartContentSelector smartContentSelector = Selectors.smartContent();
+ return select(smartContentSelector, getSourceTexts());
+ }
+
+ @Override
+ public Selectable links() {
+ return xpath("//a/@href");
+ }
+
+ @Override
+ public Selectable xpath(String xpath) {
+ XpathSelector xpathSelector = Selectors.xpath(xpath);
+ return selectElements(xpathSelector);
+ }
+
+ /**
+ * select elements
+ *
+ * @param elementSelector
+ * @return
+ */
+ protected Selectable selectElements(BaseElementSelector elementSelector) {
+ if (!elementSelector.hasAttribute()) {
+ List resultElements = new ArrayList();
+ for (Element element : getElements()) {
+ List selectElements = elementSelector.selectElements(element);
+ resultElements.addAll(selectElements);
+ }
+ return new HtmlNode(resultElements);
+ } else {
+ // has attribute, consider as plaintext
+ List resultStrings = new ArrayList();
+ for (Element element : getElements()) {
+ List selectList = elementSelector.selectList(element);
+ resultStrings.addAll(selectList);
+ }
+ return new PlainText(resultStrings);
+
+ }
+ }
+
+ @Override
+ public Selectable $(String selector) {
+ CssSelector cssSelector = Selectors.$(selector);
+ return selectElements(cssSelector);
+ }
+
+ @Override
+ public Selectable $(String selector, String attrName) {
+ CssSelector cssSelector = Selectors.$(selector, attrName);
+ return selectElements(cssSelector);
+ }
+
+ @Override
+ public List nodes() {
+ ArrayList selectables = new ArrayList();
+ selectables.add(this);
+ return selectables;
+ }
+
+ @Override
+ protected List getSourceTexts() {
+ List sourceTexts = new ArrayList(getElements().size());
+ for (Element element : getElements()) {
+ sourceTexts.add(element.toString());
+ }
+ return sourceTexts;
+ }
+}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Json.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Json.java
index 96d1c2b..4c31eb4 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Json.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Json.java
@@ -26,7 +26,7 @@ public class Json extends PlainText {
* @return
*/
public Json removePadding(String padding) {
- String text = getText();
+ String text = getFirstSourceText();
XTokenQueue tokenQueue = new XTokenQueue(text);
tokenQueue.consumeWhitespace();
tokenQueue.consume(padding);
@@ -36,29 +36,22 @@ public class Json extends PlainText {
}
public T toObject(Class clazz) {
- if (getText() == null) {
+ if (getFirstSourceText() == null) {
return null;
}
- return JSON.parseObject(getText(), clazz);
+ return JSON.parseObject(getFirstSourceText(), clazz);
}
public List toList(Class clazz) {
- if (getText() == null) {
+ if (getFirstSourceText() == null) {
return null;
}
- return JSON.parseArray(getText(), clazz);
- }
-
- public String getText() {
- if (strings != null && strings.size() > 0) {
- return strings.get(0);
- }
- return null;
+ return JSON.parseArray(getFirstSourceText(), clazz);
}
@Override
public Selectable jsonPath(String jsonPath) {
JsonPathSelector jsonPathSelector = new JsonPathSelector(jsonPath);
- return selectList(jsonPathSelector,strings);
+ return selectList(jsonPathSelector,getSourceTexts());
}
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java
index c1d034a..557763b 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java
@@ -12,12 +12,15 @@ import java.util.List;
*/
public class PlainText extends AbstractSelectable {
- public PlainText(List strings) {
- super(strings);
+ protected List sourceTexts;
+
+ public PlainText(List sourceTexts) {
+ this.sourceTexts = sourceTexts;
}
public PlainText(String text) {
- super(text);
+ this.sourceTexts = new ArrayList();
+ sourceTexts.add(text);
}
public static PlainText create(String text) {
@@ -51,11 +54,15 @@ public class PlainText extends AbstractSelectable {
@Override
public List nodes() {
- List nodes = new ArrayList(strings.size());
- for (String string : strings) {
+ List nodes = new ArrayList(getSourceTexts().size());
+ for (String string : getSourceTexts()) {
nodes.add(PlainText.create(string));
}
return nodes;
}
+ @Override
+ protected List getSourceTexts() {
+ return sourceTexts;
+ }
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java
index 4516a3d..8a980a5 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java
@@ -2,7 +2,6 @@ package us.codecraft.webmagic.selector;
import org.apache.commons.collections.CollectionUtils;
import org.jsoup.nodes.Element;
-import org.jsoup.select.Elements;
import us.codecraft.xsoup.XPathEvaluator;
import us.codecraft.xsoup.Xsoup;
@@ -34,7 +33,7 @@ public class XpathSelector extends BaseElementSelector {
@Override
public Element selectElement(Element element) {
- Elements elements = selectElements(element);
+ List elements = selectElements(element);
if (CollectionUtils.isNotEmpty(elements)){
return elements.get(0);
}
@@ -42,7 +41,12 @@ public class XpathSelector extends BaseElementSelector {
}
@Override
- public Elements selectElements(Element element) {
+ public List selectElements(Element element) {
return xPathEvaluator.evaluate(element).getElements();
}
+
+ @Override
+ public boolean hasAttribute() {
+ return xPathEvaluator.hasAttribute();
+ }
}
diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java
index 084a110..352e49c 100644
--- a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java
+++ b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java
@@ -39,7 +39,7 @@ public class HttpClientDownloaderTest {
public void testDownloader() {
HttpClientDownloader httpClientDownloader = new HttpClientDownloader();
Html html = httpClientDownloader.download("https://github.com");
- assertTrue(!html.getText().isEmpty());
+ assertTrue(!html.getFirstSourceText().isEmpty());
}
@Test(expected = IllegalArgumentException.class)