diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ExtractByUrl.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ExtractByUrl.java new file mode 100644 index 0000000..3ecb451 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ExtractByUrl.java @@ -0,0 +1,18 @@ +package us.codecraft.webmagic.annotation; + +import java.lang.annotation.ElementType; +import java.lang.annotation.Retention; +import java.lang.annotation.Target; + +/** + * @author yihua.huang@dianping.com
+ * @date: 13-8-1
+ * Time: 下午8:40
+ */ +@Retention(java.lang.annotation.RetentionPolicy.RUNTIME) +@Target({ElementType.FIELD}) +public @interface ExtractByUrl { + + String value() default ""; + +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/FieldExtractor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/FieldExtractor.java index 243ae9f..1827d7a 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/FieldExtractor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/FieldExtractor.java @@ -15,9 +15,20 @@ class FieldExtractor { private final Selector selector; - FieldExtractor(Field field, Selector selector) { + private final Source source; + + static enum Source {Html, Url} + + public FieldExtractor(Field field, Selector selector) { this.field = field; this.selector = selector; + this.source = Source.Html; + } + + public FieldExtractor(Field field, Selector selector, Source source) { + this.field = field; + this.selector = selector; + this.source = source; } Field getField() { @@ -27,4 +38,8 @@ class FieldExtractor { Selector getSelector() { return selector; } + + Source getSource() { + return source; + } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ObjectPageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ObjectPageProcessor.java index 4b54963..ae3131e 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ObjectPageProcessor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ObjectPageProcessor.java @@ -47,6 +47,7 @@ public class ObjectPageProcessor implements PageProcessor { public void process(Page page) { for (PageModelExtractor pageModelExtractor : pageModelExtractorList) { Object process = pageModelExtractor.process(page); + postProcessPageModel(pageModelExtractor.getClazz(), process); page.putField(pageModelExtractor.getClazz().getCanonicalName(), process); } for (String link : page.getHtml().links().all()) { @@ -58,6 +59,9 @@ public class ObjectPageProcessor implements PageProcessor { } } + protected void postProcessPageModel(Class clazz, Object object){ + } + @Override public Site getSite() { return site; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModelExtractor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModelExtractor.java index 671dd56..14b869d 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModelExtractor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModelExtractor.java @@ -38,22 +38,32 @@ class PageModelExtractor { for (Field field : clazz.getDeclaredFields()) { field.setAccessible(true); ExtractBy extractBy = field.getAnnotation(ExtractBy.class); - String value = extractBy.value(); - Selector selector; - switch (extractBy.type()) { - case Css: - selector = new CssSelector(value); - break; - case Regex: - selector = new RegexSelector(value); - break; - case XPath: - selector = new XpathSelector(value); - break; - default: - selector = new XpathSelector(value); + if (extractBy != null) { + String value = extractBy.value(); + Selector selector; + switch (extractBy.type()) { + case Css: + selector = new CssSelector(value); + break; + case Regex: + selector = new RegexSelector(value); + break; + case XPath: + selector = new XpathSelector(value); + break; + default: + selector = new XpathSelector(value); + } + fieldExtractors.add(new FieldExtractor(field, selector)); + } + ExtractByUrl extractByUrl = field.getAnnotation(ExtractByUrl.class); + if (extractByUrl != null) { + String regexPattern = extractByUrl.value(); + if (regexPattern.trim().equals("")) { + regexPattern = ".*"; + } + fieldExtractors.add(new FieldExtractor(field, new RegexSelector(regexPattern), FieldExtractor.Source.Url)); } - fieldExtractors.add(new FieldExtractor(field, selector)); } } @@ -65,7 +75,7 @@ class PageModelExtractor { } else { String[] value = ((TargetUrl) annotation).value(); for (String s : value) { - targetUrlPatterns.add(Pattern.compile(s.replace(".","\\.").replace("*","[^\"'#]*"))); + targetUrlPatterns.add(Pattern.compile(s.replace(".", "\\.").replace("*", "[^\"'#]*"))); } } } @@ -84,7 +94,15 @@ class PageModelExtractor { try { o = clazz.newInstance(); for (FieldExtractor fieldExtractor : fieldExtractors) { - fieldExtractor.getField().set(o, fieldExtractor.getSelector().select(page.getHtml().toString())); + switch (fieldExtractor.getSource()) { + case Html: + fieldExtractor.getField().set(o, fieldExtractor.getSelector().select(page.getHtml().toString())); + break; + case Url: + fieldExtractor.getField().set(o, fieldExtractor.getSelector().select(page.getUrl().toString())); + break; + } + } } catch (InstantiationException e) { e.printStackTrace(); diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/annotation/Blog.java b/webmagic-core/src/test/java/us/codecraft/webmagic/annotation/OschinaBlog.java similarity index 89% rename from webmagic-core/src/test/java/us/codecraft/webmagic/annotation/Blog.java rename to webmagic-core/src/test/java/us/codecraft/webmagic/annotation/OschinaBlog.java index 7139694..0435843 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/annotation/Blog.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/annotation/OschinaBlog.java @@ -6,7 +6,7 @@ package us.codecraft.webmagic.annotation; * Time: 下午10:18
*/ @TargetUrl("http://my.oschina.net/flashsword/blog/*") -public class Blog { +public class OschinaBlog { @ExtractBy("//title") private String title; @@ -16,7 +16,7 @@ public class Blog { @Override public String toString() { - return "Blog{" + + return "OschinaBlog{" + "title='" + title + '\'' + ", content='" + content + '\'' + '}'; diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/annotation/TestFetcher.java b/webmagic-core/src/test/java/us/codecraft/webmagic/annotation/TestFetcher.java index e97b5cf..37a3305 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/annotation/TestFetcher.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/annotation/TestFetcher.java @@ -15,7 +15,7 @@ public class TestFetcher { @Ignore("takes long") @Test public void test() { - Spider.create(ObjectPageProcessor.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog/145796"), Blog.class)).run(); + Spider.create(ObjectPageProcessor.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog/145796"), OschinaBlog.class)).run(); } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java index 30d8a81..6f1c21e 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java @@ -1168,7 +1168,7 @@ public class XpathSelectorTest { + " var location = window.location;\n" + " source_url = location.protocol + \"//\" + location.host + location.pathname + location.search;\n" + " pre.writeAttribute('codeable_id', post_id);\n" - + " pre.writeAttribute('codeable_type', \"Blog\");\n" + + " pre.writeAttribute('codeable_type', \"OschinaBlog\");\n" + " pre.writeAttribute('source_url', source_url);\n" + " pre.writeAttribute('pre_index', index);\n" + " pre.writeAttribute('title', 'jsoup 解析页面商品信息');\n"