From 866ab0a05607ab6bc17f7058e86a19207d550031 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 3 Aug 2013 14:01:18 +0800 Subject: [PATCH] update email --- .../us/codecraft/webmagic/ResultItems.java | 2 +- .../java/us/codecraft/webmagic/Spider.java | 4 --- .../webmagic/annotation/AfterExtractor.java | 15 ++++++++++ .../webmagic/annotation/ExtractBy.java | 2 +- .../webmagic/annotation/ExtractByUrl.java | 2 +- .../webmagic/annotation/FieldExtractor.java | 2 +- .../webmagic/annotation/HelpUrl.java | 2 +- .../webmagic/annotation/OOSpider.java | 29 ++++++++++++++++++ .../annotation/ObjectPageProcessor.java | 2 +- .../webmagic/annotation/ObjectPipeline.java | 30 +++++++++++++++---- .../annotation/PageModelExtractor.java | 14 ++++++++- .../annotation/PageModelPipeline.java | 14 +++++++++ .../webmagic/annotation/TargetUrl.java | 3 +- .../webmagic/downloader/Destroyable.java | 2 +- .../webmagic/annotation/TestFetcher.java | 8 ++--- .../annotation/samples/IteyeBlog.java | 4 +-- .../annotation/samples/OschinaBlog.java | 4 +-- 17 files changed, 110 insertions(+), 29 deletions(-) create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/annotation/AfterExtractor.java create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/annotation/OOSpider.java create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModelPipeline.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java b/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java index 0c1d94c..c91a270 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java @@ -5,7 +5,7 @@ import java.util.Map; /** * 保存抽取结果的类,由PageProcessor处理得到,传递给{@link us.codecraft.webmagic.pipeline.Pipeline}进行持久化。
- * @author yihua.huang@dianping.com
+ * @author code4crafter@gmail.com
* @date: 13-7-25
* Time: 下午12:20
*/ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 2717b66..facfd95 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -90,10 +90,6 @@ public class Spider implements Runnable, Task { return new Spider(pageProcessor); } - public static Spider create(Site site,Class... pageModels) { - return new Spider(ObjectPageProcessor.create(site,pageModels)); - } - /** * 重新设置startUrls,会覆盖Site本身的startUrls。 * diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/AfterExtractor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/AfterExtractor.java new file mode 100644 index 0000000..89d03e9 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/AfterExtractor.java @@ -0,0 +1,15 @@ +package us.codecraft.webmagic.annotation; + +import us.codecraft.webmagic.Page; + +/** + * 实现这个接口即可在抽取后进行后处理。
+ * + * @author code4crafter@gmail.com
+ * @date: 13-8-3
+ * Time: 上午9:42
+ */ +public interface AfterExtractor { + + public void afterProcess(Page page, T t); +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ExtractBy.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ExtractBy.java index 2d08417..115a219 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ExtractBy.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ExtractBy.java @@ -5,7 +5,7 @@ import java.lang.annotation.Retention; import java.lang.annotation.Target; /** - * @author yihua.huang@dianping.com
+ * @author code4crafter@gmail.com
* @date: 13-8-1
* Time: 下午8:40
*/ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ExtractByUrl.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ExtractByUrl.java index 57747f5..c40c9ca 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ExtractByUrl.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ExtractByUrl.java @@ -5,7 +5,7 @@ import java.lang.annotation.Retention; import java.lang.annotation.Target; /** - * @author yihua.huang@dianping.com
+ * @author code4crafter@gmail.com
* @date: 13-8-1
* Time: 下午8:40
*/ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/FieldExtractor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/FieldExtractor.java index f415cb8..4cd09ef 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/FieldExtractor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/FieldExtractor.java @@ -6,7 +6,7 @@ import java.lang.reflect.Field; import java.lang.reflect.Method; /** - * @author yihua.huang@dianping.com
+ * @author code4crafter@gmail.com
* @date: 13-8-1
* Time: 下午9:48
*/ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/HelpUrl.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/HelpUrl.java index 3020817..e5727f0 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/HelpUrl.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/HelpUrl.java @@ -5,7 +5,7 @@ import java.lang.annotation.Retention; import java.lang.annotation.Target; /** - * @author yihua.huang@dianping.com
+ * @author code4crafter@gmail.com
* @date: 13-8-1
* Time: 下午8:40
*/ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/OOSpider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/OOSpider.java new file mode 100644 index 0000000..c6ae2f3 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/OOSpider.java @@ -0,0 +1,29 @@ +package us.codecraft.webmagic.annotation; + +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.processor.PageProcessor; + +/** + * @author code4crafter@gmail.com
+ * @date: 13-8-3
+ * Time: 上午9:51
+ */ +public class OOSpider extends Spider{ + + /** + * 使用已定义的抽取规则新建一个Spider。 + * + * @param pageProcessor 已定义的抽取规则 + */ + public OOSpider(PageProcessor pageProcessor) { + super(pageProcessor); + } + + public static OOSpider create(Site site,Class... pageModels) { + OOSpider ooSpider = new OOSpider(ObjectPageProcessor.create(site, pageModels)); + ooSpider.pipeline(new ObjectPipeline()); + return ooSpider; + } + +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ObjectPageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ObjectPageProcessor.java index ad8297e..063dc81 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ObjectPageProcessor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ObjectPageProcessor.java @@ -12,7 +12,7 @@ import java.util.Set; import java.util.regex.Pattern; /** - * @author yihua.huang@dianping.com
+ * @author code4crafter@gmail.com
* @date: 13-8-1
* Time: 下午8:46
*/ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ObjectPipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ObjectPipeline.java index dd27395..0b3ec4d 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ObjectPipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ObjectPipeline.java @@ -4,18 +4,36 @@ import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.pipeline.Pipeline; +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; + /** - * @author yihua.huang@dianping.com
+ * @author code4crafter@gmail.com
* @date: 13-8-2
* Time: 上午10:47
*/ public class ObjectPipeline implements Pipeline { + + private Map pageModelPipelines = new ConcurrentHashMap(); + + public ObjectPipeline() { + } + + public ObjectPipeline put(Class clazz, PageModelPipeline pageModelPipeline) { + pageModelPipelines.put(clazz, pageModelPipeline); + return this; + } + @Override public void process(ResultItems resultItems, Task task) { - - } - - public T read() { - return null; + if (resultItems.isSkip()) { + return; + } + for (Map.Entry classPageModelPipelineEntry : pageModelPipelines.entrySet()) { + Object o = resultItems.get(classPageModelPipelineEntry.getKey().getCanonicalName()); + if (o != null) { + classPageModelPipelineEntry.getValue().process(o, task); + } + } } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModelExtractor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModelExtractor.java index e610e10..8ed3b6b 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModelExtractor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModelExtractor.java @@ -16,7 +16,7 @@ import java.util.List; import java.util.regex.Pattern; /** - * @author yihua.huang@dianping.com
+ * @author code4crafter@gmail.com
* @date: 13-8-1
* Time: 下午9:33
*/ @@ -30,6 +30,8 @@ class PageModelExtractor { private List fieldExtractors; + private AfterExtractor afterExtractor; + public static PageModelExtractor create(Class clazz) { PageModelExtractor pageModelExtractor = new PageModelExtractor(); pageModelExtractor.init(clazz); @@ -40,6 +42,13 @@ class PageModelExtractor { this.clazz = clazz; initTargetUrlPatterns(); fieldExtractors = new ArrayList(); + if (AfterExtractor.class.isAssignableFrom(clazz)){ + try { + afterExtractor=(AfterExtractor)clazz.newInstance(); + } catch (Exception e) { + throw new IllegalArgumentException(e); + } + } for (Field field : clazz.getDeclaredFields()) { field.setAccessible(true); if (!field.getType().isAssignableFrom(String.class)){ @@ -147,6 +156,9 @@ class PageModelExtractor { } setField(o, fieldExtractor, value); } + if (afterExtractor!=null){ + afterExtractor.afterProcess(page,o); + } } catch (InstantiationException e) { e.printStackTrace(); } catch (IllegalAccessException e) { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModelPipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModelPipeline.java new file mode 100644 index 0000000..afef926 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModelPipeline.java @@ -0,0 +1,14 @@ +package us.codecraft.webmagic.annotation; + +import us.codecraft.webmagic.Task; + +/** + * @author code4crafter@gmail.com
+ * @date: 13-8-3
+ * Time: 上午9:34
+ */ +public interface PageModelPipeline { + + public void process(T t, Task task); + +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/TargetUrl.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/TargetUrl.java index f4f58ed..5303064 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/TargetUrl.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/TargetUrl.java @@ -5,7 +5,7 @@ import java.lang.annotation.Retention; import java.lang.annotation.Target; /** - * @author yihua.huang@dianping.com
+ * @author code4crafter@gmail.com
* @date: 13-8-1
* Time: 下午8:40
*/ @@ -14,4 +14,5 @@ import java.lang.annotation.Target; public @interface TargetUrl { String[] value(); + } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Destroyable.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Destroyable.java index 4f07528..2b040fa 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Destroyable.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Destroyable.java @@ -2,7 +2,7 @@ package us.codecraft.webmagic.downloader; /** * 比较占用资源的服务可以实现该接口,Spider会在结束时调用destroy()释放资源。
- * @author yihua.huang@dianping.com
+ * @author code4crafter@gmail.com
* @date: 13-7-26
* Time: 下午3:10
*/ diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/annotation/TestFetcher.java b/webmagic-core/src/test/java/us/codecraft/webmagic/annotation/TestFetcher.java index 5d70a54..b29d053 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/annotation/TestFetcher.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/annotation/TestFetcher.java @@ -3,7 +3,6 @@ package us.codecraft.webmagic.annotation; import org.junit.Ignore; import org.junit.Test; import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.Spider; /** * @author yihua.huang@dianping.com
@@ -16,12 +15,9 @@ public class TestFetcher { @Test public void test() { ObjectPipeline objectPipeline = new ObjectPipeline(); - Spider.create(ObjectPageProcessor.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog/145796"), OschinaBlog.class)) - .pipeline(objectPipeline).runAsync(); + OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog/145796"), OschinaBlog.class) + .pipeline(objectPipeline); OschinaBlog oschinaBlog = null; - while ((oschinaBlog = objectPipeline.read()) != null) { - System.out.println(oschinaBlog); - } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/annotation/samples/IteyeBlog.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/annotation/samples/IteyeBlog.java index 09a1d5b..002a42c 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/annotation/samples/IteyeBlog.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/annotation/samples/IteyeBlog.java @@ -1,8 +1,8 @@ package us.codecraft.webmagic.annotation.samples; import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.annotation.ExtractBy; +import us.codecraft.webmagic.annotation.OOSpider; import us.codecraft.webmagic.annotation.TargetUrl; /** @@ -28,7 +28,7 @@ public class IteyeBlog implements Blog{ } public static void main(String[] args) { - Spider.create(Site.me().addStartUrl("http://dengminhui.iteye.com/blog"),IteyeBlog.class).run(); + OOSpider.create(Site.me().addStartUrl("http://dengminhui.iteye.com/blog"), IteyeBlog.class).run(); } public String getTitle() { diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/annotation/samples/OschinaBlog.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/annotation/samples/OschinaBlog.java index 817c1aa..a5c44b0 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/annotation/samples/OschinaBlog.java +++ 
b/webmagic-samples/src/main/java/us/codecraft/webmagic/annotation/samples/OschinaBlog.java @@ -1,8 +1,8 @@ package us.codecraft.webmagic.annotation.samples; import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.annotation.ExtractBy; +import us.codecraft.webmagic.annotation.OOSpider; import us.codecraft.webmagic.annotation.TargetUrl; /** @@ -28,7 +28,7 @@ public class OschinaBlog implements Blog{ } public static void main(String[] args) { - Spider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog"),OschinaBlog.class).run(); + OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog"), OschinaBlog.class).run(); } public String getTitle() {