diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java
index 614b111..34386b5 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java
@@ -131,6 +131,7 @@ public class Html extends PlainText {
}
public Document getDocument() {
+ initDocument();
return document;
}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ConfigurablePageProcessor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ConfigurablePageProcessor.java
new file mode 100644
index 0000000..36615d8
--- /dev/null
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ConfigurablePageProcessor.java
@@ -0,0 +1,64 @@
+package us.codecraft.webmagic.configurable;
+
+import us.codecraft.webmagic.Page;
+import us.codecraft.webmagic.Site;
+import us.codecraft.webmagic.processor.PageProcessor;
+
+import java.util.List;
+
+/**
+ * A {@link PageProcessor} driven by a list of {@link ExtractRule}s instead of hand-written code.
+ * <p>
+ * Each rule is applied to the page's HTML and its result is stored in the page's result items
+ * under the rule's field name. If a rule flagged not-null yields nothing, the page is marked
+ * as skipped and that field is not stored; the remaining rules are still evaluated.
+ *
+ * @author code4crafter@gmail.com
+ */
+public class ConfigurablePageProcessor implements PageProcessor {
+
+    private final Site site;
+
+    private final List<ExtractRule> extractRules;
+
+    /**
+     * @param site         site configuration returned by {@link #getSite()}
+     * @param extractRules rules applied, in order, to every processed page
+     */
+    public ConfigurablePageProcessor(Site site, List<ExtractRule> extractRules) {
+        this.site = site;
+        this.extractRules = extractRules;
+    }
+
+    /**
+     * Applies every extract rule to the page.
+     *
+     * @param page page whose HTML is queried and whose result items receive the fields
+     */
+    @Override
+    public void process(Page page) {
+        for (ExtractRule extractRule : extractRules) {
+            if (extractRule.isMulti()) {
+                List<String> results = page.getHtml().selectDocumentForList(extractRule.getSelector());
+                if (extractRule.isNotNull() && results.isEmpty()) {
+                    page.setSkip(true);
+                } else {
+                    page.getResultItems().put(extractRule.getFieldName(), results);
+                }
+            } else {
+                String result = page.getHtml().selectDocument(extractRule.getSelector());
+                if (extractRule.isNotNull() && result == null) {
+                    page.setSkip(true);
+                } else {
+                    page.getResultItems().put(extractRule.getFieldName(), result);
+                }
+            }
+        }
+    }
+
+    @Override
+    public Site getSite() {
+        return site;
+    }
+
+}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExpressionType.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExpressionType.java
new file mode 100644
index 0000000..bd84be3
--- /dev/null
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExpressionType.java
@@ -0,0 +1,16 @@
+package us.codecraft.webmagic.configurable;
+
+/**
+ * Expression languages that an {@link ExtractRule} can be written in.
+ *
+ * @author code4crafter@gmail.com
+ * @date 14-4-5
+ */
+public enum ExpressionType {
+
+    XPath,
+    Regex,
+    Css,
+    JsonPath
+
+}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExtractRule.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExtractRule.java
new file mode 100644
index 0000000..82337c4
--- /dev/null
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExtractRule.java
@@ -0,0 +1,132 @@
+package us.codecraft.webmagic.configurable;
+
+import us.codecraft.webmagic.selector.JsonPathSelector;
+import us.codecraft.webmagic.selector.Selector;
+
+import static us.codecraft.webmagic.selector.Selectors.*;
+
+/**
+ * Describes how a single field is extracted: the field name, the expression (type, value and
+ * optional params), whether several results are expected, and whether a result is required.
+ * <p>
+ * The {@link Selector} is built lazily on the first {@link #getSelector()} call and then
+ * cached; setters called afterwards do not rebuild it.
+ *
+ * @author code4crafter@gmail.com
+ * @date 14-4-5
+ */
+public class ExtractRule {
+
+    private String fieldName;
+
+    private ExpressionType expressionType;
+
+    private String expressionValue;
+
+    /** Optional extra arguments for the expression; {@code null} unless set by the caller. */
+    private String[] expressionParams;
+
+    private boolean multi = false;
+
+    /** Lazily built selector; volatile because getSelector() reads it outside the lock. */
+    private volatile Selector selector;
+
+    private boolean notNull = false;
+
+    public String getFieldName() {
+        return fieldName;
+    }
+
+    public void setFieldName(String fieldName) {
+        this.fieldName = fieldName;
+    }
+
+    public ExpressionType getExpressionType() {
+        return expressionType;
+    }
+
+    public void setExpressionType(ExpressionType expressionType) {
+        this.expressionType = expressionType;
+    }
+
+    public String getExpressionValue() {
+        return expressionValue;
+    }
+
+    public void setExpressionValue(String expressionValue) {
+        this.expressionValue = expressionValue;
+    }
+
+    public String[] getExpressionParams() {
+        return expressionParams;
+    }
+
+    public void setExpressionParams(String[] expressionParams) {
+        this.expressionParams = expressionParams;
+    }
+
+    public boolean isMulti() {
+        return multi;
+    }
+
+    public void setMulti(boolean multi) {
+        this.multi = multi;
+    }
+
+    /**
+     * Returns the selector for this rule, compiling it from the expression on first use.
+     *
+     * @return the selector set via {@link #setSelector(Selector)} or compiled from the expression
+     */
+    public Selector getSelector() {
+        if (selector == null) {
+            synchronized (this) {
+                if (selector == null) {
+                    selector = compileSelector();
+                }
+            }
+        }
+        return selector;
+    }
+
+    /**
+     * Builds a selector from the expression type, value and optional params.
+     * <p>
+     * Only the first param is used, and only by Css and Regex; Regex parses it as an int,
+     * so a non-numeric value raises {@link NumberFormatException}.
+     */
+    private Selector compileSelector() {
+        // Params are optional and default to null, so guard before reading the length.
+        boolean hasParam = expressionParams != null && expressionParams.length >= 1;
+        switch (expressionType) {
+            case Css:
+                if (hasParam) {
+                    return $(expressionValue, expressionParams[0]);
+                }
+                return $(expressionValue);
+            case XPath:
+                return xpath(expressionValue);
+            case Regex:
+                if (hasParam) {
+                    return regex(expressionValue, Integer.parseInt(expressionParams[0]));
+                }
+                return regex(expressionValue);
+            case JsonPath:
+                return new JsonPathSelector(expressionValue);
+            default:
+                return xpath(expressionValue);
+        }
+    }
+
+    public void setSelector(Selector selector) {
+        this.selector = selector;
+    }
+
+    public boolean isNotNull() {
+        return notNull;
+    }
+
+    public void setNotNull(boolean notNull) {
+        this.notNull = notNull;
+    }
+}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/Inject.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/Inject.java
deleted file mode 100644
index c6608ae..0000000
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/Inject.java
+++ /dev/null
@@ -1,15 +0,0 @@
-package us.codecraft.webmagic.configurable;
-
-import java.lang.annotation.ElementType;
-import java.lang.annotation.Retention;
-import java.lang.annotation.Target;
-
-/**
- * @author yihua.huang@dianping.com
- */
-@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
-@Target({ElementType.FIELD})
-public @interface Inject {
-
- String value() default "";
-}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/PropertyLoader.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/PropertyLoader.java
deleted file mode 100644
index bffbcf2..0000000
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/PropertyLoader.java
+++ /dev/null
@@ -1,18 +0,0 @@
-package us.codecraft.webmagic.configurable;
-
-import us.codecraft.webmagic.processor.PageProcessor;
-
-import java.util.Map;
-
-/**
- * Inject property to object by {@link Inject} annotation.
- *
- * @author yihua.huang@dianping.com
- */
-public class PropertyLoader {
-
- public T load(T object, Map properties) {
- return object;
- }
-
-}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/ConfigurableBlogPageProcessor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/ConfigurableBlogPageProcessor.java
deleted file mode 100644
index 28d3ab0..0000000
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/ConfigurableBlogPageProcessor.java
+++ /dev/null
@@ -1,51 +0,0 @@
-package us.codecraft.webmagic.example;
-
-import java.util.List;
-import us.codecraft.webmagic.Page;
-import us.codecraft.webmagic.Site;
-import us.codecraft.webmagic.Spider;
-import us.codecraft.webmagic.configurable.Inject;
-import us.codecraft.webmagic.processor.PageProcessor;
-
-/**
- * @author code4crafter@gmail.com
- */
-public class ConfigurableBlogPageProcessor implements PageProcessor {
-
- private Site site = Site.me().setDomain("my.oschina.net");
-
- @Inject("linkRegex")
- private String linkRegex;
-
- @Inject("titleXpath")
- private String titleXpath;
-
- @Inject("contentXpath")
- private String contentXpath;
-
- @Inject("tagsXpath")
- private String tagsXpath;
-
- @Override
- public void process(Page page) {
- List links = page.getHtml().links().regex(linkRegex).all();
- page.addTargetRequests(links);
- page.putField("title", page.getHtml().xpath(titleXpath).toString());
- if (page.getResultItems().get("title") == null) {
- //skip this page
- page.setSkip(true);
- }
- page.putField("content", page.getHtml().smartContent().toString());
- page.putField("tags", page.getHtml().xpath(tagsXpath).all());
- }
-
- @Override
- public Site getSite() {
- return site;
-
- }
-
- public static void main(String[] args) {
- Spider.create(new ConfigurableBlogPageProcessor()).addUrl("http://my.oschina.net/flashsword/blog").thread(2).run();
- }
-}
diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/configurable/ConfigurablePageProcessorTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/configurable/ConfigurablePageProcessorTest.java
new file mode 100644
index 0000000..a35fffa
--- /dev/null
+++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/configurable/ConfigurablePageProcessorTest.java
@@ -0,0 +1,43 @@
+package us.codecraft.webmagic.configurable;
+
+import org.junit.Test;
+import us.codecraft.webmagic.ResultItems;
+import us.codecraft.webmagic.Site;
+import us.codecraft.webmagic.Spider;
+import us.codecraft.webmagic.downloader.MockGithubDownloader;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+/**
+ * Runs {@link ConfigurablePageProcessor} with two XPath rules against {@link MockGithubDownloader}
+ * and checks the extracted fields.
+ *
+ * @author code4crafter@gmail.com
+ * @date 14-4-5
+ */
+public class ConfigurablePageProcessorTest {
+
+    @Test
+    public void test() throws Exception {
+        List<ExtractRule> extractRules = new ArrayList<ExtractRule>();
+        extractRules.add(xpathRule("title", "//title"));
+        extractRules.add(xpathRule("star",
+                "//ul[@class='pagehead-actions']/li[1]//a[@class='social-count js-social-count']/text()"));
+        ResultItems resultItems = Spider.create(new ConfigurablePageProcessor(Site.me(), extractRules))
+                .setDownloader(new MockGithubDownloader()).get("https://github.com/code4craft/webmagic");
+        assertThat(resultItems.getAll()).containsEntry("title", "code4craft/webmagic · GitHub");
+        assertThat(resultItems.getAll()).containsEntry("star", " 86 ");
+    }
+
+    /** Builds a single-valued XPath rule that stores its result under {@code fieldName}. */
+    private static ExtractRule xpathRule(String fieldName, String expression) {
+        ExtractRule rule = new ExtractRule();
+        rule.setExpressionType(ExpressionType.XPath);
+        rule.setExpressionValue(expression);
+        rule.setFieldName(fieldName);
+        return rule;
+    }
+}