diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java
index 93c184d..0821e6d 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java
@@ -1,6 +1,7 @@
 package us.codecraft.webmagic;
 
 import org.apache.commons.lang3.StringUtils;
+import us.codecraft.webmagic.selector.Html;
 import us.codecraft.webmagic.selector.Selectable;
 import us.codecraft.webmagic.utils.UrlUtils;
 
@@ -28,7 +29,7 @@ public class Page {
 
     private ResultItems resultItems = new ResultItems();
 
-    private Selectable html;
+    private Html html;
 
     private Selectable url;
 
@@ -58,11 +59,11 @@ public class Page {
      *
      * @return html
      */
-    public Selectable getHtml() {
+    public Html getHtml() {
         return html;
     }
 
-    public void setHtml(Selectable html) {
+    public void setHtml(Html html) {
         this.html = html;
     }
 
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CacheElement.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CacheElement.java
new file mode 100644
index 0000000..a58eba2
--- /dev/null
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CacheElement.java
@@ -0,0 +1,55 @@
+package us.codecraft.webmagic.selector;
+
+import org.jsoup.nodes.Element;
+
+import java.util.List;
+
+/**
+ * Cache parsed element for extract.
+ * <p>
+ * Carries a text form and a jsoup {@link Element}; each selector is handed the form it
+ * operates on.
+ *
+ * @author code4crafter@gmail.com
+ * @since 0.2.2
+ */
+public class CacheElement {
+
+    /** Text handed to selectors that are not {@link ElementSelector}s. */
+    public String text;
+
+    /** Element handed to {@link ElementSelector}s. */
+    public Element element;
+
+    /**
+     * Extract a single result.
+     *
+     * @param selector selector to apply; an {@link ElementSelector} receives {@link #element},
+     *                 any other selector receives {@link #text}
+     * @return the value returned by the selector
+     */
+    public String select(Selector selector) {
+        if (selector instanceof ElementSelector) {
+            ElementSelector elementSelector = (ElementSelector) selector;
+            return elementSelector.select(element);
+        } else {
+            return selector.select(text);
+        }
+    }
+
+    /**
+     * Extract all results.
+     *
+     * @param selector selector to apply; an {@link ElementSelector} receives {@link #element},
+     *                 any other selector receives {@link #text}
+     * @return the list returned by the selector
+     */
+    public List<String> selectList(Selector selector) {
+        if (selector instanceof ElementSelector) {
+            ElementSelector elementSelector = (ElementSelector) selector;
+            return elementSelector.selectList(element);
+        } else {
+            return selector.selectList(text);
+        }
+    }
+}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java
index 06987d8..74aa976 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java
@@ -97,4 +97,49 @@ public class Html extends PlainText {
         return selectList(cssSelector, strings);
     }
 
+    /**
+     * @return the underlying {@code document}
+     */
+    public Document getDocument() {
+        return document;
+    }
+
+    /**
+     * @return the result of {@code document.html()}
+     */
+    public String getText() {
+        return document.html();
+    }
+
+    /**
+     * Extract a single result.
+     *
+     * @param selector selector to apply; an {@link ElementSelector} receives {@link #getDocument()},
+     *                 any other selector receives {@link #getText()}
+     * @return the value returned by the selector
+     */
+    public String select(Selector selector) {
+        if (selector instanceof ElementSelector) {
+            ElementSelector elementSelector = (ElementSelector) selector;
+            return elementSelector.select(getDocument());
+        } else {
+            return selector.select(getText());
+        }
+    }
+
+    /**
+     * Extract all results.
+     *
+     * @param selector selector to apply; an {@link ElementSelector} receives {@link #getDocument()},
+     *                 any other selector receives {@link #getText()}
+     * @return the list returned by the selector
+     */
+    public List<String> selectList(Selector selector) {
+        if (selector instanceof ElementSelector) {
+            ElementSelector elementSelector = (ElementSelector) selector;
+            return elementSelector.selectList(getDocument());
+        } else {
+            return selector.selectList(getText());
+        }
+    }
 }
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java
index a16c7a1..8849052 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java
@@ -1,6 +1,7 @@
 package us.codecraft.webmagic.model;
 
 import org.apache.commons.lang3.StringUtils;
+import org.jsoup.nodes.Element;
 import us.codecraft.webmagic.Page;
 import us.codecraft.webmagic.model.annotation.*;
 import us.codecraft.webmagic.selector.*;
@@ -34,7 +35,7 @@ class PageModelExtractor {
 
     private List fieldExtractors;
 
-    private Extractor extractor;
+    private Extractor objectExtractor;
 
     public static PageModelExtractor create(Class clazz) {
         PageModelExtractor pageModelExtractor = new PageModelExtractor();
@@ -169,7 +170,7 @@ class PageModelExtractor {
         annotation = clazz.getAnnotation(ExtractBy.class);
         if (annotation != null) {
             ExtractBy extractBy = (ExtractBy) annotation;
-            extractor = new Extractor(new XpathSelector(extractBy.value()), Extractor.Source.Html, extractBy.notNull(), extractBy.multi());
+            objectExtractor = new Extractor(new XpathSelector(extractBy.value()), Extractor.Source.Html, extractBy.notNull(), extractBy.multi());
         }
     }
 
@@ -183,12 +184,12 @@ class PageModelExtractor {
         if (!matched) {
             return null;
         }
-        if (extractor == null) {
+        if (objectExtractor == null) {
             return processSingle(page, page.getHtml().toString());
         } else {
-            if (extractor.multi) {
+            if (objectExtractor.multi) {
                 List os = new ArrayList();
-                List<String> list = extractor.getSelector().selectList(page.getHtml().toString());
+                List<String> list = objectExtractor.getSelector().selectList(page.getHtml().toString());
                 for (String s : list) {
                     Object o = processSingle(page, s);
                     if (o != null) {
@@ -197,13 +198,28 @@ class PageModelExtractor {
                 }
                 return os;
             } else {
-                String select = extractor.getSelector().select(page.getHtml().toString());
+                String select = objectExtractor.getSelector().select(page.getHtml().toString());
                 Object o = processSingle(page, select);
                 return o;
             }
         }
     }
 
+    /**
+     * Extract all results with the given selector, using whichever input form it operates on.
+     *
+     * @param selector selector to apply
+     * @param element  element handed to the selector when it is an {@link ElementSelector}
+     * @param html     html text handed to any other selector
+     * @return the list returned by the selector
+     */
+    private List<String> select(Selector selector, Element element, String html) {
+        if (selector instanceof ElementSelector) {
+            return ((ElementSelector) selector).selectList(element);
+        }
+        return selector.selectList(html);
+    }
+
     private Object processSingle(Page page, String html) {
         Object o = null;
         try {