diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ExtractByUrl.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ExtractByUrl.java
new file mode 100644
index 0000000..3ecb451
--- /dev/null
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ExtractByUrl.java
@@ -0,0 +1,18 @@
+package us.codecraft.webmagic.annotation;
+
+import java.lang.annotation.ElementType;
+import java.lang.annotation.Retention;
+import java.lang.annotation.Target;
+
+/**
+ * Field annotation: value is a regex applied to the page URL (blank is treated as ".*").
+ * @author yihua.huang@dianping.com
+ * @date: 13-8-1, Time: 8:40 PM
+ */
+@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
+@Target({ElementType.FIELD})
+public @interface ExtractByUrl {
+
+ String value() default "";
+
+}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/FieldExtractor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/FieldExtractor.java
index 243ae9f..1827d7a 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/FieldExtractor.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/FieldExtractor.java
@@ -15,9 +15,20 @@ class FieldExtractor {
private final Selector selector;
- FieldExtractor(Field field, Selector selector) {
+ private final Source source;
+
+ static enum Source {Html, Url}
+
+ public FieldExtractor(Field field, Selector selector) {
this.field = field;
this.selector = selector;
+ this.source = Source.Html;
+ }
+
+ public FieldExtractor(Field field, Selector selector, Source source) {
+ this.field = field;
+ this.selector = selector;
+ this.source = source;
}
Field getField() {
@@ -27,4 +38,8 @@ class FieldExtractor {
Selector getSelector() {
return selector;
}
+
+ Source getSource() {
+ return source;
+ }
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ObjectPageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ObjectPageProcessor.java
index 4b54963..ae3131e 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ObjectPageProcessor.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ObjectPageProcessor.java
@@ -47,6 +47,7 @@ public class ObjectPageProcessor implements PageProcessor {
public void process(Page page) {
for (PageModelExtractor pageModelExtractor : pageModelExtractorList) {
Object process = pageModelExtractor.process(page);
+ postProcessPageModel(pageModelExtractor.getClazz(), process);
page.putField(pageModelExtractor.getClazz().getCanonicalName(), process);
}
for (String link : page.getHtml().links().all()) {
@@ -58,6 +59,9 @@ public class ObjectPageProcessor implements PageProcessor {
}
}
+ protected void postProcessPageModel(Class clazz, Object object){
+ }
+
@Override
public Site getSite() {
return site;
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModelExtractor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModelExtractor.java
index 671dd56..14b869d 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModelExtractor.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModelExtractor.java
@@ -38,22 +38,32 @@ class PageModelExtractor {
for (Field field : clazz.getDeclaredFields()) {
field.setAccessible(true);
ExtractBy extractBy = field.getAnnotation(ExtractBy.class);
- String value = extractBy.value();
- Selector selector;
- switch (extractBy.type()) {
- case Css:
- selector = new CssSelector(value);
- break;
- case Regex:
- selector = new RegexSelector(value);
- break;
- case XPath:
- selector = new XpathSelector(value);
- break;
- default:
- selector = new XpathSelector(value);
+ if (extractBy != null) {
+ String value = extractBy.value();
+ Selector selector;
+ switch (extractBy.type()) {
+ case Css:
+ selector = new CssSelector(value);
+ break;
+ case Regex:
+ selector = new RegexSelector(value);
+ break;
+ case XPath:
+ selector = new XpathSelector(value);
+ break;
+ default:
+ selector = new XpathSelector(value);
+ }
+ fieldExtractors.add(new FieldExtractor(field, selector));
+ }
+ ExtractByUrl extractByUrl = field.getAnnotation(ExtractByUrl.class);
+ if (extractByUrl != null) {
+ String regexPattern = extractByUrl.value();
+ if (regexPattern.trim().equals("")) {
+ regexPattern = ".*";
+ }
+ fieldExtractors.add(new FieldExtractor(field, new RegexSelector(regexPattern), FieldExtractor.Source.Url));
}
- fieldExtractors.add(new FieldExtractor(field, selector));
}
}
@@ -65,7 +75,7 @@ class PageModelExtractor {
} else {
String[] value = ((TargetUrl) annotation).value();
for (String s : value) {
- targetUrlPatterns.add(Pattern.compile(s.replace(".","\\.").replace("*","[^\"'#]*")));
+ targetUrlPatterns.add(Pattern.compile(s.replace(".", "\\.").replace("*", "[^\"'#]*")));
}
}
}
@@ -84,7 +94,15 @@ class PageModelExtractor {
try {
o = clazz.newInstance();
for (FieldExtractor fieldExtractor : fieldExtractors) {
- fieldExtractor.getField().set(o, fieldExtractor.getSelector().select(page.getHtml().toString()));
+ switch (fieldExtractor.getSource()) {
+ case Html:
+ fieldExtractor.getField().set(o, fieldExtractor.getSelector().select(page.getHtml().toString()));
+ break;
+ case Url:
+ fieldExtractor.getField().set(o, fieldExtractor.getSelector().select(page.getUrl().toString()));
+ break;
+ }
+
}
} catch (InstantiationException e) {
e.printStackTrace();
diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/annotation/Blog.java b/webmagic-core/src/test/java/us/codecraft/webmagic/annotation/OschinaBlog.java
similarity index 89%
rename from webmagic-core/src/test/java/us/codecraft/webmagic/annotation/Blog.java
rename to webmagic-core/src/test/java/us/codecraft/webmagic/annotation/OschinaBlog.java
index 7139694..0435843 100644
--- a/webmagic-core/src/test/java/us/codecraft/webmagic/annotation/Blog.java
+++ b/webmagic-core/src/test/java/us/codecraft/webmagic/annotation/OschinaBlog.java
@@ -6,7 +6,7 @@ package us.codecraft.webmagic.annotation;
* Time: 下午10:18
*/
@TargetUrl("http://my.oschina.net/flashsword/blog/*")
-public class Blog {
+public class OschinaBlog {
@ExtractBy("//title")
private String title;
@@ -16,7 +16,7 @@ public class Blog {
@Override
public String toString() {
- return "Blog{" +
+ return "OschinaBlog{" +
"title='" + title + '\'' +
", content='" + content + '\'' +
'}';
diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/annotation/TestFetcher.java b/webmagic-core/src/test/java/us/codecraft/webmagic/annotation/TestFetcher.java
index e97b5cf..37a3305 100644
--- a/webmagic-core/src/test/java/us/codecraft/webmagic/annotation/TestFetcher.java
+++ b/webmagic-core/src/test/java/us/codecraft/webmagic/annotation/TestFetcher.java
@@ -15,7 +15,7 @@ public class TestFetcher {
@Ignore("takes long")
@Test
public void test() {
- Spider.create(ObjectPageProcessor.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog/145796"), Blog.class)).run();
+ Spider.create(ObjectPageProcessor.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog/145796"), OschinaBlog.class)).run();
}
diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java
index 30d8a81..6f1c21e 100644
--- a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java
+++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java
@@ -1168,7 +1168,7 @@ public class XpathSelectorTest {
+ " var location = window.location;\n"
+ " source_url = location.protocol + \"//\" + location.host + location.pathname + location.search;\n"
+ " pre.writeAttribute('codeable_id', post_id);\n"
- + " pre.writeAttribute('codeable_type', \"Blog\");\n"
+ + " pre.writeAttribute('codeable_type', \"OschinaBlog\");\n"
+ " pre.writeAttribute('source_url', source_url);\n"
+ " pre.writeAttribute('pre_index', index);\n"
+ " pre.writeAttribute('title', 'jsoup 解析页面商品信息');\n"