From 9b2cb43f47a367623279fb20e8ff0de93e7cc56b Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 5 Apr 2014 23:40:10 +0800 Subject: [PATCH] ConfigurablePageProcessor #91 --- .../us/codecraft/webmagic/selector/Html.java | 1 + .../ConfigurablePageProcessor.java | 49 ++++++++ .../webmagic/configurable/ExpressionType.java | 11 ++ .../webmagic/configurable/ExtractRule.java | 113 ++++++++++++++++++ .../webmagic/configurable/Inject.java | 15 --- .../webmagic/configurable/PropertyLoader.java | 18 --- .../ConfigurableBlogPageProcessor.java | 51 -------- .../ConfigurablePageProcessorTest.java | 39 ++++++ 8 files changed, 213 insertions(+), 84 deletions(-) create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ConfigurablePageProcessor.java create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExpressionType.java create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExtractRule.java delete mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/Inject.java delete mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/PropertyLoader.java delete mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/example/ConfigurableBlogPageProcessor.java create mode 100644 webmagic-extension/src/test/java/us/codecraft/webmagic/configurable/ConfigurablePageProcessorTest.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java index 614b111..34386b5 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java @@ -131,6 +131,7 @@ public class Html extends PlainText { } public Document getDocument() { + initDocument(); return document; } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ConfigurablePageProcessor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ConfigurablePageProcessor.java new file mode 100644 index 0000000..36615d8 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ConfigurablePageProcessor.java @@ -0,0 +1,49 @@ +package us.codecraft.webmagic.configurable; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.processor.PageProcessor; + +import java.util.List; + +/** + * @author code4crafter@gmail.com
+ */ +public class ConfigurablePageProcessor implements PageProcessor { + + private Site site; + + private List extractRules; + + public ConfigurablePageProcessor(Site site, List extractRules) { + this.site = site; + this.extractRules = extractRules; + } + + @Override + public void process(Page page) { + for (ExtractRule extractRule : extractRules) { + if (extractRule.isMulti()) { + List results = page.getHtml().selectDocumentForList(extractRule.getSelector()); + if (extractRule.isNotNull() && results.size() == 0) { + page.setSkip(true); + } else { + page.getResultItems().put(extractRule.getFieldName(), results); + } + } else { + String result = page.getHtml().selectDocument(extractRule.getSelector()); + if (extractRule.isNotNull() && result == null) { + page.setSkip(true); + } else { + page.getResultItems().put(extractRule.getFieldName(), result); + } + } + } + } + + @Override + public Site getSite() { + return site; + } + +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExpressionType.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExpressionType.java new file mode 100644 index 0000000..bd84be3 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExpressionType.java @@ -0,0 +1,11 @@ +package us.codecraft.webmagic.configurable; + +/** + * @author code4crafter@gmail.com + * @date 14-4-5 + */ +public enum ExpressionType { + + XPath, Regex, Css, JsonPath; + +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExtractRule.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExtractRule.java new file mode 100644 index 0000000..82337c4 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExtractRule.java @@ -0,0 +1,113 @@ +package us.codecraft.webmagic.configurable; + +import us.codecraft.webmagic.selector.JsonPathSelector; +import us.codecraft.webmagic.selector.Selector; + +import static us.codecraft.webmagic.selector.Selectors.*; + +/** + * @author code4crafter@gmail.com + * @date 14-4-5 + */ +public class ExtractRule { + + private String fieldName; + + private ExpressionType expressionType; + + private String expressionValue; + + private String[] expressionParams; + + private boolean multi = false; + + private volatile Selector selector; + + private boolean notNull = false; + + public String getFieldName() { + return fieldName; + } + + public void setFieldName(String fieldName) { + this.fieldName = fieldName; + } + + public ExpressionType getExpressionType() { + return expressionType; + } + + public void setExpressionType(ExpressionType expressionType) { + this.expressionType = expressionType; + } + + public String getExpressionValue() { + return expressionValue; + } + + public void setExpressionValue(String expressionValue) { + this.expressionValue = expressionValue; + } + + public String[] getExpressionParams() { + return expressionParams; + } + + public void setExpressionParams(String[] expressionParams) { + this.expressionParams = expressionParams; + } + + public boolean isMulti() { + return multi; + } + + public void setMulti(boolean multi) { + this.multi = multi; + } + + public Selector getSelector() { + if (selector == null) { + synchronized (this) { + if (selector == null) { + selector = compileSelector(); + } + } + } + return selector; + } + + private Selector compileSelector() { + switch (expressionType) { + case Css: + if (expressionParams.length >= 1) { + return $(expressionValue, expressionParams[0]); + } else { + return $(expressionValue); + } + case XPath: + return xpath(expressionValue); + case Regex: + if (expressionParams.length >= 1) { + return regex(expressionValue, Integer.parseInt(expressionParams[0])); + } else { + return regex(expressionValue); + } + case JsonPath: + return new JsonPathSelector(expressionValue); + default: + return xpath(expressionValue); + } + } + + public void setSelector(Selector selector) { + this.selector = selector; + } + + public boolean isNotNull() { + return notNull; + } + + public void setNotNull(boolean notNull) { + this.notNull = notNull; + } +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/Inject.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/Inject.java deleted file mode 100644 index c6608ae..0000000 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/Inject.java +++ /dev/null @@ -1,15 +0,0 @@ -package us.codecraft.webmagic.configurable; - -import java.lang.annotation.ElementType; -import java.lang.annotation.Retention; -import java.lang.annotation.Target; - -/** - * @author yihua.huang@dianping.com - */ -@Retention(java.lang.annotation.RetentionPolicy.RUNTIME) -@Target({ElementType.FIELD}) -public @interface Inject { - - String value() default ""; -} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/PropertyLoader.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/PropertyLoader.java deleted file mode 100644 index bffbcf2..0000000 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/PropertyLoader.java +++ /dev/null @@ -1,18 +0,0 @@ -package us.codecraft.webmagic.configurable; - -import us.codecraft.webmagic.processor.PageProcessor; - -import java.util.Map; - -/** - * Inject property to object by {@link Inject} annotation. - * - * @author yihua.huang@dianping.com - */ -public class PropertyLoader { - - public T load(T object, Map properties) { - return object; - } - -} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/ConfigurableBlogPageProcessor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/ConfigurableBlogPageProcessor.java deleted file mode 100644 index 28d3ab0..0000000 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/ConfigurableBlogPageProcessor.java +++ /dev/null @@ -1,51 +0,0 @@ -package us.codecraft.webmagic.example; - -import java.util.List; -import us.codecraft.webmagic.Page; -import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.Spider; -import us.codecraft.webmagic.configurable.Inject; -import us.codecraft.webmagic.processor.PageProcessor; - -/** - * @author code4crafter@gmail.com
- */ -public class ConfigurableBlogPageProcessor implements PageProcessor { - - private Site site = Site.me().setDomain("my.oschina.net"); - - @Inject("linkRegex") - private String linkRegex; - - @Inject("titleXpath") - private String titleXpath; - - @Inject("contentXpath") - private String contentXpath; - - @Inject("tagsXpath") - private String tagsXpath; - - @Override - public void process(Page page) { - List links = page.getHtml().links().regex(linkRegex).all(); - page.addTargetRequests(links); - page.putField("title", page.getHtml().xpath(titleXpath).toString()); - if (page.getResultItems().get("title") == null) { - //skip this page - page.setSkip(true); - } - page.putField("content", page.getHtml().smartContent().toString()); - page.putField("tags", page.getHtml().xpath(tagsXpath).all()); - } - - @Override - public Site getSite() { - return site; - - } - - public static void main(String[] args) { - Spider.create(new ConfigurableBlogPageProcessor()).addUrl("http://my.oschina.net/flashsword/blog").thread(2).run(); - } -} diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/configurable/ConfigurablePageProcessorTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/configurable/ConfigurablePageProcessorTest.java new file mode 100644 index 0000000..a35fffa --- /dev/null +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/configurable/ConfigurablePageProcessorTest.java @@ -0,0 +1,39 @@ +package us.codecraft.webmagic.configurable; + +import org.junit.Test; +import us.codecraft.webmagic.ResultItems; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.downloader.MockGithubDownloader; + +import java.util.ArrayList; +import java.util.List; + +import static org.assertj.core.api.Assertions.assertThat; + +/** + * @author code4crafter@gmail.com + * @date 14-4-5 + */ +public class ConfigurablePageProcessorTest { + + @Test + public void test() throws Exception { + List extractRules = new ArrayList(); + ExtractRule extractRule = new ExtractRule(); + extractRule.setExpressionType(ExpressionType.XPath); + extractRule.setExpressionValue("//title"); + extractRule.setFieldName("title"); + extractRules.add(extractRule); + extractRule = new ExtractRule(); + extractRule.setExpressionType(ExpressionType.XPath); + extractRule.setExpressionValue("//ul[@class='pagehead-actions']/li[1]//a[@class='social-count js-social-count']/text()"); + extractRule.setFieldName("star"); + extractRules.add(extractRule); + ResultItems resultItems = Spider.create(new ConfigurablePageProcessor(Site.me(), extractRules)) + .setDownloader(new MockGithubDownloader()).get("https://github.com/code4craft/webmagic"); + assertThat(resultItems.getAll()).containsEntry("title", "code4craft/webmagic · GitHub"); + assertThat(resultItems.getAll()).containsEntry("star", " 86 "); + + } +}