From 50edd22ef6351ce554e0c881512eb014ae8f2732 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Thu, 1 Aug 2013 22:40:57 +0800 Subject: [PATCH 01/84] add annotation --- .../webmagic/annotation/Fetcher.java | 21 ++++ .../webmagic/annotation/FieldFetcher.java | 30 +++++ .../annotation/ObjectPageProcessor.java | 65 +++++++++++ .../webmagic/annotation/PageModelFetcher.java | 104 ++++++++++++++++++ .../webmagic/annotation/TargetUrl.java | 17 +++ .../codecraft/webmagic/selector/Selector.java | 2 +- .../codecraft/webmagic/annotation/Blog.java | 24 ++++ .../webmagic/annotation/TestFetcher.java | 20 ++++ 8 files changed, 282 insertions(+), 1 deletion(-) create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/annotation/Fetcher.java create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/annotation/FieldFetcher.java create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ObjectPageProcessor.java create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModelFetcher.java create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/annotation/TargetUrl.java create mode 100644 webmagic-core/src/test/java/us/codecraft/webmagic/annotation/Blog.java create mode 100644 webmagic-core/src/test/java/us/codecraft/webmagic/annotation/TestFetcher.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/Fetcher.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/Fetcher.java new file mode 100644 index 0000000..86f78db --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/Fetcher.java @@ -0,0 +1,21 @@ +package us.codecraft.webmagic.annotation; + +import java.lang.annotation.ElementType; +import java.lang.annotation.Retention; +import java.lang.annotation.Target; + +/** + * @author yihua.huang@dianping.com
+ * @date: 13-8-1
+ * Time: 下午8:40
+ */ +@Retention(java.lang.annotation.RetentionPolicy.RUNTIME) +@Target({ElementType.FIELD}) +public @interface Fetcher { + + String value(); + + public enum Type {XPath, Regex, Css}; + + Type type() default Type.XPath; +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/FieldFetcher.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/FieldFetcher.java new file mode 100644 index 0000000..ee9962b --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/FieldFetcher.java @@ -0,0 +1,30 @@ +package us.codecraft.webmagic.annotation; + +import us.codecraft.webmagic.selector.Selector; + +import java.lang.reflect.Field; + +/** + * @author yihua.huang@dianping.com
+ * @date: 13-8-1
+ * Time: 下午9:48
+ */ +class FieldFetcher { + + private final Field field; + + private final Selector selector; + + FieldFetcher(Field field, Selector selector) { + this.field = field; + this.selector = selector; + } + + Field getField() { + return field; + } + + Selector getSelector() { + return selector; + } +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ObjectPageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ObjectPageProcessor.java new file mode 100644 index 0000000..98c969e --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ObjectPageProcessor.java @@ -0,0 +1,65 @@ +package us.codecraft.webmagic.annotation; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.processor.PageProcessor; + +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.regex.Pattern; + +/** + * @author yihua.huang@dianping.com
+ * @date: 13-8-1
+ * Time: 下午8:46
+ */ +public class ObjectPageProcessor implements PageProcessor { + + private List pageModelFetcherList; + + private Site site; + + private Set targetUrlPatterns; + + public static ObjectPageProcessor create(Site site, Class... clazzs) { + List pageModelFetcherList = new ArrayList(); + for (Class clazz : clazzs) { + PageModelFetcher pageModelFetcher = PageModelFetcher.create(clazz); + pageModelFetcherList.add(pageModelFetcher); + } + ObjectPageProcessor objectPageProcessor = new ObjectPageProcessor(site, pageModelFetcherList); + return objectPageProcessor; + } + + private ObjectPageProcessor(Site site, List pageModelFetcherList) { + this.site = site; + this.pageModelFetcherList = pageModelFetcherList; + targetUrlPatterns = new HashSet(); + for (PageModelFetcher pageModelFetcher : pageModelFetcherList) { + targetUrlPatterns.addAll(pageModelFetcher.getTargetUrlPatterns()); + } + } + + @Override + public void process(Page page) { + for (PageModelFetcher pageModelFetcher : pageModelFetcherList) { + Object process = pageModelFetcher.process(page); + page.putField(pageModelFetcher.getClazz().getCanonicalName(), process); + } + for (String link : page.getHtml().links().all()) { + for (Pattern targetUrlPattern : targetUrlPatterns) { + if (targetUrlPattern.matcher(link).matches()){ + page.addTargetRequest(new Request(link)); + } + } + } + } + + @Override + public Site getSite() { + return site; + } +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModelFetcher.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModelFetcher.java new file mode 100644 index 0000000..097f1af --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModelFetcher.java @@ -0,0 +1,104 @@ +package us.codecraft.webmagic.annotation; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.selector.CssSelector; +import us.codecraft.webmagic.selector.RegexSelector; +import us.codecraft.webmagic.selector.Selector; 
+import us.codecraft.webmagic.selector.XpathSelector; + +import java.lang.annotation.Annotation; +import java.lang.reflect.Field; +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Pattern; + +/** + * @author yihua.huang@dianping.com
+ * @date: 13-8-1
+ * Time: 下午9:33
+ */ +class PageModelFetcher { + + private List targetUrlPatterns; + + private Class clazz; + + private List fieldFetchers; + + public static PageModelFetcher create(Class clazz) { + PageModelFetcher pageModelFetcher = new PageModelFetcher(); + pageModelFetcher.init(clazz); + return pageModelFetcher; + } + + private void init(Class clazz) { + this.clazz = clazz; + initTargetUrlPatterns(); + fieldFetchers = new ArrayList(); + for (Field field : clazz.getDeclaredFields()) { + field.setAccessible(true); + Fetcher fetcher = field.getAnnotation(Fetcher.class); + String value = fetcher.value(); + Selector selector; + switch (fetcher.type()) { + case Css: + selector = new CssSelector(value); + break; + case Regex: + selector = new RegexSelector(value); + break; + case XPath: + selector = new XpathSelector(value); + break; + default: + selector = new XpathSelector(value); + } + fieldFetchers.add(new FieldFetcher(field, selector)); + } + } + + private void initTargetUrlPatterns() { + targetUrlPatterns = new ArrayList(); + Annotation annotation = clazz.getAnnotation(TargetUrl.class); + if (annotation == null) { + targetUrlPatterns.add(Pattern.compile(".*")); + } else { + String[] value = ((TargetUrl) annotation).value(); + for (String s : value) { + targetUrlPatterns.add(Pattern.compile(s.replace(".","\\.").replace("*","[^\"'#]*"))); + } + } + } + + public Object process(Page page) { + boolean matched = false; + for (Pattern targetPattern : targetUrlPatterns) { + if (targetPattern.matcher(page.getUrl().toString()).matches()) { + matched = true; + } + } + if (!matched) { + return null; + } + Object o = null; + try { + o = clazz.newInstance(); + for (FieldFetcher fieldFetcher : fieldFetchers) { + fieldFetcher.getField().set(o, fieldFetcher.getSelector().select(page.getHtml().toString())); + } + } catch (InstantiationException e) { + e.printStackTrace(); + } catch (IllegalAccessException e) { + e.printStackTrace(); + } + return o; + } + + Class getClazz() { + return clazz; + } 
+ + List getTargetUrlPatterns() { + return targetUrlPatterns; + } +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/TargetUrl.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/TargetUrl.java new file mode 100644 index 0000000..f4f58ed --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/TargetUrl.java @@ -0,0 +1,17 @@ +package us.codecraft.webmagic.annotation; + +import java.lang.annotation.ElementType; +import java.lang.annotation.Retention; +import java.lang.annotation.Target; + +/** + * @author yihua.huang@dianping.com
+ * @date: 13-8-1
+ * Time: 下午8:40
+ */ +@Retention(java.lang.annotation.RetentionPolicy.RUNTIME) +@Target({ElementType.TYPE}) +public @interface TargetUrl { + + String[] value(); +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selector.java index 845c0b6..4af2b44 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selector.java @@ -8,7 +8,7 @@ import java.util.List; * Date: 13-4-20 * Time: 下午8:02 */ -interface Selector { +public interface Selector { public String select(String text); diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/annotation/Blog.java b/webmagic-core/src/test/java/us/codecraft/webmagic/annotation/Blog.java new file mode 100644 index 0000000..6c6e88c --- /dev/null +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/annotation/Blog.java @@ -0,0 +1,24 @@ +package us.codecraft.webmagic.annotation; + +/** + * @author yihua.huang@dianping.com
+ * @date: 13-8-1
+ * Time: 下午10:18
+ */ +@TargetUrl("http://djjchobits.iteye.com/blog/\\d+") +public class Blog { + + @Fetcher("//title") + private String title; + + @Fetcher(value = "div#main",type = Fetcher.Type.Css) + private String content; + + @Override + public String toString() { + return "Blog{" + + "title='" + title + '\'' + + ", content='" + content + '\'' + + '}'; + } +} diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/annotation/TestFetcher.java b/webmagic-core/src/test/java/us/codecraft/webmagic/annotation/TestFetcher.java new file mode 100644 index 0000000..5318703 --- /dev/null +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/annotation/TestFetcher.java @@ -0,0 +1,20 @@ +package us.codecraft.webmagic.annotation; + +import org.junit.Test; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Spider; + +/** + * @author yihua.huang@dianping.com
+ * @date: 13-8-1
+ * Time: 下午8:42
+ */ +public class TestFetcher { + + @Test + public void test() { + Spider.create(ObjectPageProcessor.create(Site.me().addStartUrl("http://djjchobits.iteye.com/blog/569000"), Blog.class)).run(); + + } + +} From c5cf05640a6ebcda8cd88d5b0415415bbcacc056 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Thu, 1 Aug 2013 22:53:44 +0800 Subject: [PATCH 02/84] processor --- .../main/java/us/codecraft/webmagic/selector/CssSelector.java | 2 +- .../src/test/java/us/codecraft/webmagic/annotation/Blog.java | 4 ++-- .../java/us/codecraft/webmagic/annotation/TestFetcher.java | 4 +++- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java index 10dfb62..90a9d1d 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java @@ -27,7 +27,7 @@ public class CssSelector implements Selector { public String select(String text) { Document doc = Jsoup.parse(text); Elements elements = doc.select(selectorText); - if (CollectionUtils.isNotEmpty(elements)) { + if (CollectionUtils.isEmpty(elements)) { return null; } return elements.get(0).outerHtml(); diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/annotation/Blog.java b/webmagic-core/src/test/java/us/codecraft/webmagic/annotation/Blog.java index 6c6e88c..391ce10 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/annotation/Blog.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/annotation/Blog.java @@ -5,13 +5,13 @@ package us.codecraft.webmagic.annotation; * @date: 13-8-1
* Time: 下午10:18
*/ -@TargetUrl("http://djjchobits.iteye.com/blog/\\d+") +@TargetUrl("http://my.oschina.net/flashsword/blog/*") public class Blog { @Fetcher("//title") private String title; - @Fetcher(value = "div#main",type = Fetcher.Type.Css) + @Fetcher(value = "div.BlogContent",type = Fetcher.Type.Css) private String content; @Override diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/annotation/TestFetcher.java b/webmagic-core/src/test/java/us/codecraft/webmagic/annotation/TestFetcher.java index 5318703..e97b5cf 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/annotation/TestFetcher.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/annotation/TestFetcher.java @@ -1,5 +1,6 @@ package us.codecraft.webmagic.annotation; +import org.junit.Ignore; import org.junit.Test; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; @@ -11,9 +12,10 @@ import us.codecraft.webmagic.Spider; */ public class TestFetcher { + @Ignore("takes long") @Test public void test() { - Spider.create(ObjectPageProcessor.create(Site.me().addStartUrl("http://djjchobits.iteye.com/blog/569000"), Blog.class)).run(); + Spider.create(ObjectPageProcessor.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog/145796"), Blog.class)).run(); } From f08ffc34fd6f63faeb6c25693b92f290f9acebfa Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 2 Aug 2013 06:33:48 +0800 Subject: [PATCH 03/84] rename --- .../{Fetcher.java => ExtractBy.java} | 2 +- ...{FieldFetcher.java => FieldExtractor.java} | 4 +-- .../annotation/ObjectPageProcessor.java | 24 ++++++++--------- ...elFetcher.java => PageModelExtractor.java} | 26 +++++++++---------- .../codecraft/webmagic/annotation/Blog.java | 4 +-- 5 files changed, 30 insertions(+), 30 deletions(-) rename webmagic-core/src/main/java/us/codecraft/webmagic/annotation/{Fetcher.java => ExtractBy.java} (93%) rename webmagic-core/src/main/java/us/codecraft/webmagic/annotation/{FieldFetcher.java => FieldExtractor.java} (86%) 
rename webmagic-core/src/main/java/us/codecraft/webmagic/annotation/{PageModelFetcher.java => PageModelExtractor.java} (77%) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/Fetcher.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ExtractBy.java similarity index 93% rename from webmagic-core/src/main/java/us/codecraft/webmagic/annotation/Fetcher.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ExtractBy.java index 86f78db..7c749b3 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/Fetcher.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ExtractBy.java @@ -11,7 +11,7 @@ import java.lang.annotation.Target; */ @Retention(java.lang.annotation.RetentionPolicy.RUNTIME) @Target({ElementType.FIELD}) -public @interface Fetcher { +public @interface ExtractBy { String value(); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/FieldFetcher.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/FieldExtractor.java similarity index 86% rename from webmagic-core/src/main/java/us/codecraft/webmagic/annotation/FieldFetcher.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/annotation/FieldExtractor.java index ee9962b..243ae9f 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/FieldFetcher.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/FieldExtractor.java @@ -9,13 +9,13 @@ import java.lang.reflect.Field; * @date: 13-8-1
* Time: 下午9:48
*/ -class FieldFetcher { +class FieldExtractor { private final Field field; private final Selector selector; - FieldFetcher(Field field, Selector selector) { + FieldExtractor(Field field, Selector selector) { this.field = field; this.selector = selector; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ObjectPageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ObjectPageProcessor.java index 98c969e..4b54963 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ObjectPageProcessor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ObjectPageProcessor.java @@ -18,36 +18,36 @@ import java.util.regex.Pattern; */ public class ObjectPageProcessor implements PageProcessor { - private List pageModelFetcherList; + private List pageModelExtractorList; private Site site; private Set targetUrlPatterns; public static ObjectPageProcessor create(Site site, Class... clazzs) { - List pageModelFetcherList = new ArrayList(); + List pageModelExtractorList = new ArrayList(); for (Class clazz : clazzs) { - PageModelFetcher pageModelFetcher = PageModelFetcher.create(clazz); - pageModelFetcherList.add(pageModelFetcher); + PageModelExtractor pageModelExtractor = PageModelExtractor.create(clazz); + pageModelExtractorList.add(pageModelExtractor); } - ObjectPageProcessor objectPageProcessor = new ObjectPageProcessor(site, pageModelFetcherList); + ObjectPageProcessor objectPageProcessor = new ObjectPageProcessor(site, pageModelExtractorList); return objectPageProcessor; } - private ObjectPageProcessor(Site site, List pageModelFetcherList) { + private ObjectPageProcessor(Site site, List pageModelExtractorList) { this.site = site; - this.pageModelFetcherList = pageModelFetcherList; + this.pageModelExtractorList = pageModelExtractorList; targetUrlPatterns = new HashSet(); - for (PageModelFetcher pageModelFetcher : pageModelFetcherList) { - 
targetUrlPatterns.addAll(pageModelFetcher.getTargetUrlPatterns()); + for (PageModelExtractor pageModelExtractor : pageModelExtractorList) { + targetUrlPatterns.addAll(pageModelExtractor.getTargetUrlPatterns()); } } @Override public void process(Page page) { - for (PageModelFetcher pageModelFetcher : pageModelFetcherList) { - Object process = pageModelFetcher.process(page); - page.putField(pageModelFetcher.getClazz().getCanonicalName(), process); + for (PageModelExtractor pageModelExtractor : pageModelExtractorList) { + Object process = pageModelExtractor.process(page); + page.putField(pageModelExtractor.getClazz().getCanonicalName(), process); } for (String link : page.getHtml().links().all()) { for (Pattern targetUrlPattern : targetUrlPatterns) { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModelFetcher.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModelExtractor.java similarity index 77% rename from webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModelFetcher.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModelExtractor.java index 097f1af..671dd56 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModelFetcher.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModelExtractor.java @@ -17,30 +17,30 @@ import java.util.regex.Pattern; * @date: 13-8-1
* Time: 下午9:33
*/ -class PageModelFetcher { +class PageModelExtractor { private List targetUrlPatterns; private Class clazz; - private List fieldFetchers; + private List fieldExtractors; - public static PageModelFetcher create(Class clazz) { - PageModelFetcher pageModelFetcher = new PageModelFetcher(); - pageModelFetcher.init(clazz); - return pageModelFetcher; + public static PageModelExtractor create(Class clazz) { + PageModelExtractor pageModelExtractor = new PageModelExtractor(); + pageModelExtractor.init(clazz); + return pageModelExtractor; } private void init(Class clazz) { this.clazz = clazz; initTargetUrlPatterns(); - fieldFetchers = new ArrayList(); + fieldExtractors = new ArrayList(); for (Field field : clazz.getDeclaredFields()) { field.setAccessible(true); - Fetcher fetcher = field.getAnnotation(Fetcher.class); - String value = fetcher.value(); + ExtractBy extractBy = field.getAnnotation(ExtractBy.class); + String value = extractBy.value(); Selector selector; - switch (fetcher.type()) { + switch (extractBy.type()) { case Css: selector = new CssSelector(value); break; @@ -53,7 +53,7 @@ class PageModelFetcher { default: selector = new XpathSelector(value); } - fieldFetchers.add(new FieldFetcher(field, selector)); + fieldExtractors.add(new FieldExtractor(field, selector)); } } @@ -83,8 +83,8 @@ class PageModelFetcher { Object o = null; try { o = clazz.newInstance(); - for (FieldFetcher fieldFetcher : fieldFetchers) { - fieldFetcher.getField().set(o, fieldFetcher.getSelector().select(page.getHtml().toString())); + for (FieldExtractor fieldExtractor : fieldExtractors) { + fieldExtractor.getField().set(o, fieldExtractor.getSelector().select(page.getHtml().toString())); } } catch (InstantiationException e) { e.printStackTrace(); diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/annotation/Blog.java b/webmagic-core/src/test/java/us/codecraft/webmagic/annotation/Blog.java index 391ce10..7139694 100644 --- 
a/webmagic-core/src/test/java/us/codecraft/webmagic/annotation/Blog.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/annotation/Blog.java @@ -8,10 +8,10 @@ package us.codecraft.webmagic.annotation; @TargetUrl("http://my.oschina.net/flashsword/blog/*") public class Blog { - @Fetcher("//title") + @ExtractBy("//title") private String title; - @Fetcher(value = "div.BlogContent",type = Fetcher.Type.Css) + @ExtractBy(value = "div.BlogContent",type = ExtractBy.Type.Css) private String content; @Override From abba3b7bff5a386d2a0f77cb82c4b7927bc4534c Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 2 Aug 2013 06:59:25 +0800 Subject: [PATCH 04/84] add extract by url --- .../webmagic/annotation/ExtractByUrl.java | 18 +++++++ .../webmagic/annotation/FieldExtractor.java | 17 +++++- .../annotation/ObjectPageProcessor.java | 4 ++ .../annotation/PageModelExtractor.java | 52 +++++++++++++------ .../{Blog.java => OschinaBlog.java} | 4 +- .../webmagic/annotation/TestFetcher.java | 2 +- .../webmagic/selector/XpathSelectorTest.java | 2 +- 7 files changed, 77 insertions(+), 22 deletions(-) create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ExtractByUrl.java rename webmagic-core/src/test/java/us/codecraft/webmagic/annotation/{Blog.java => OschinaBlog.java} (89%) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ExtractByUrl.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ExtractByUrl.java new file mode 100644 index 0000000..3ecb451 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ExtractByUrl.java @@ -0,0 +1,18 @@ +package us.codecraft.webmagic.annotation; + +import java.lang.annotation.ElementType; +import java.lang.annotation.Retention; +import java.lang.annotation.Target; + +/** + * @author yihua.huang@dianping.com
+ * @date: 13-8-1
+ * Time: 下午8:40
+ */ +@Retention(java.lang.annotation.RetentionPolicy.RUNTIME) +@Target({ElementType.FIELD}) +public @interface ExtractByUrl { + + String value() default ""; + +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/FieldExtractor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/FieldExtractor.java index 243ae9f..1827d7a 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/FieldExtractor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/FieldExtractor.java @@ -15,9 +15,20 @@ class FieldExtractor { private final Selector selector; - FieldExtractor(Field field, Selector selector) { + private final Source source; + + static enum Source {Html, Url} + + public FieldExtractor(Field field, Selector selector) { this.field = field; this.selector = selector; + this.source = Source.Html; + } + + public FieldExtractor(Field field, Selector selector, Source source) { + this.field = field; + this.selector = selector; + this.source = source; } Field getField() { @@ -27,4 +38,8 @@ class FieldExtractor { Selector getSelector() { return selector; } + + Source getSource() { + return source; + } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ObjectPageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ObjectPageProcessor.java index 4b54963..ae3131e 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ObjectPageProcessor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ObjectPageProcessor.java @@ -47,6 +47,7 @@ public class ObjectPageProcessor implements PageProcessor { public void process(Page page) { for (PageModelExtractor pageModelExtractor : pageModelExtractorList) { Object process = pageModelExtractor.process(page); + postProcessPageModel(pageModelExtractor.getClazz(), process); page.putField(pageModelExtractor.getClazz().getCanonicalName(), process); } for (String link : 
page.getHtml().links().all()) { @@ -58,6 +59,9 @@ public class ObjectPageProcessor implements PageProcessor { } } + protected void postProcessPageModel(Class clazz, Object object){ + } + @Override public Site getSite() { return site; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModelExtractor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModelExtractor.java index 671dd56..14b869d 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModelExtractor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModelExtractor.java @@ -38,22 +38,32 @@ class PageModelExtractor { for (Field field : clazz.getDeclaredFields()) { field.setAccessible(true); ExtractBy extractBy = field.getAnnotation(ExtractBy.class); - String value = extractBy.value(); - Selector selector; - switch (extractBy.type()) { - case Css: - selector = new CssSelector(value); - break; - case Regex: - selector = new RegexSelector(value); - break; - case XPath: - selector = new XpathSelector(value); - break; - default: - selector = new XpathSelector(value); + if (extractBy != null) { + String value = extractBy.value(); + Selector selector; + switch (extractBy.type()) { + case Css: + selector = new CssSelector(value); + break; + case Regex: + selector = new RegexSelector(value); + break; + case XPath: + selector = new XpathSelector(value); + break; + default: + selector = new XpathSelector(value); + } + fieldExtractors.add(new FieldExtractor(field, selector)); + } + ExtractByUrl extractByUrl = field.getAnnotation(ExtractByUrl.class); + if (extractByUrl != null) { + String regexPattern = extractByUrl.value(); + if (regexPattern.trim().equals("")) { + regexPattern = ".*"; + } + fieldExtractors.add(new FieldExtractor(field, new RegexSelector(regexPattern), FieldExtractor.Source.Url)); } - fieldExtractors.add(new FieldExtractor(field, selector)); } } @@ -65,7 +75,7 @@ class PageModelExtractor { } else { String[] 
value = ((TargetUrl) annotation).value(); for (String s : value) { - targetUrlPatterns.add(Pattern.compile(s.replace(".","\\.").replace("*","[^\"'#]*"))); + targetUrlPatterns.add(Pattern.compile(s.replace(".", "\\.").replace("*", "[^\"'#]*"))); } } } @@ -84,7 +94,15 @@ class PageModelExtractor { try { o = clazz.newInstance(); for (FieldExtractor fieldExtractor : fieldExtractors) { - fieldExtractor.getField().set(o, fieldExtractor.getSelector().select(page.getHtml().toString())); + switch (fieldExtractor.getSource()) { + case Html: + fieldExtractor.getField().set(o, fieldExtractor.getSelector().select(page.getHtml().toString())); + break; + case Url: + fieldExtractor.getField().set(o, fieldExtractor.getSelector().select(page.getUrl().toString())); + break; + } + } } catch (InstantiationException e) { e.printStackTrace(); diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/annotation/Blog.java b/webmagic-core/src/test/java/us/codecraft/webmagic/annotation/OschinaBlog.java similarity index 89% rename from webmagic-core/src/test/java/us/codecraft/webmagic/annotation/Blog.java rename to webmagic-core/src/test/java/us/codecraft/webmagic/annotation/OschinaBlog.java index 7139694..0435843 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/annotation/Blog.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/annotation/OschinaBlog.java @@ -6,7 +6,7 @@ package us.codecraft.webmagic.annotation; * Time: 下午10:18
*/ @TargetUrl("http://my.oschina.net/flashsword/blog/*") -public class Blog { +public class OschinaBlog { @ExtractBy("//title") private String title; @@ -16,7 +16,7 @@ public class Blog { @Override public String toString() { - return "Blog{" + + return "OschinaBlog{" + "title='" + title + '\'' + ", content='" + content + '\'' + '}'; diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/annotation/TestFetcher.java b/webmagic-core/src/test/java/us/codecraft/webmagic/annotation/TestFetcher.java index e97b5cf..37a3305 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/annotation/TestFetcher.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/annotation/TestFetcher.java @@ -15,7 +15,7 @@ public class TestFetcher { @Ignore("takes long") @Test public void test() { - Spider.create(ObjectPageProcessor.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog/145796"), Blog.class)).run(); + Spider.create(ObjectPageProcessor.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog/145796"), OschinaBlog.class)).run(); } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java index 30d8a81..6f1c21e 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java @@ -1168,7 +1168,7 @@ public class XpathSelectorTest { + " var location = window.location;\n" + " source_url = location.protocol + \"//\" + location.host + location.pathname + location.search;\n" + " pre.writeAttribute('codeable_id', post_id);\n" - + " pre.writeAttribute('codeable_type', \"Blog\");\n" + + " pre.writeAttribute('codeable_type', \"OschinaBlog\");\n" + " pre.writeAttribute('source_url', source_url);\n" + " pre.writeAttribute('pre_index', index);\n" + " pre.writeAttribute('title', 'jsoup 解析页面商品信息');\n" From 
06a39af0f3e82ccef20a4aa8b91668415e2c07bc Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 2 Aug 2013 07:32:37 +0800 Subject: [PATCH 05/84] add setter support --- .../webmagic/annotation/FieldExtractor.java | 11 +++++ .../annotation/PageModelExtractor.java | 46 +++++++++++++++++-- 2 files changed, 52 insertions(+), 5 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/FieldExtractor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/FieldExtractor.java index 1827d7a..d241c8d 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/FieldExtractor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/FieldExtractor.java @@ -3,6 +3,7 @@ package us.codecraft.webmagic.annotation; import us.codecraft.webmagic.selector.Selector; import java.lang.reflect.Field; +import java.lang.reflect.Method; /** * @author yihua.huang@dianping.com
@@ -17,6 +18,8 @@ class FieldExtractor { private final Source source; + private Method setterMethod; + static enum Source {Html, Url} public FieldExtractor(Field field, Selector selector) { @@ -42,4 +45,12 @@ class FieldExtractor { Source getSource() { return source; } + + void setSetterMethod(Method setterMethod) { + this.setterMethod = setterMethod; + } + + Method getSetterMethod() { + return setterMethod; + } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModelExtractor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModelExtractor.java index 14b869d..7d0d4f2 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModelExtractor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModelExtractor.java @@ -1,5 +1,6 @@ package us.codecraft.webmagic.annotation; +import org.apache.commons.lang3.StringUtils; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.selector.CssSelector; import us.codecraft.webmagic.selector.RegexSelector; @@ -8,6 +9,8 @@ import us.codecraft.webmagic.selector.XpathSelector; import java.lang.annotation.Annotation; import java.lang.reflect.Field; +import java.lang.reflect.InvocationTargetException; +import java.lang.reflect.Method; import java.util.ArrayList; import java.util.List; import java.util.regex.Pattern; @@ -54,7 +57,12 @@ class PageModelExtractor { default: selector = new XpathSelector(value); } - fieldExtractors.add(new FieldExtractor(field, selector)); + FieldExtractor fieldExtractor = new FieldExtractor(field, selector); + Method setterMethod = getSetterMethod(clazz, field); + if (setterMethod != null) { + fieldExtractor.setSetterMethod(setterMethod); + } + fieldExtractors.add(fieldExtractor); } ExtractByUrl extractByUrl = field.getAnnotation(ExtractByUrl.class); if (extractByUrl != null) { @@ -62,11 +70,27 @@ class PageModelExtractor { if (regexPattern.trim().equals("")) { regexPattern = ".*"; } - 
fieldExtractors.add(new FieldExtractor(field, new RegexSelector(regexPattern), FieldExtractor.Source.Url)); + FieldExtractor fieldExtractor = new FieldExtractor(field, new RegexSelector(regexPattern), FieldExtractor.Source.Url); + Method setterMethod = getSetterMethod(clazz, field); + if (setterMethod != null) { + fieldExtractor.setSetterMethod(setterMethod); + } + fieldExtractors.add(fieldExtractor); } } } + public static Method getSetterMethod(Class clazz, Field field) { + String name = "set" + StringUtils.capitalize(field.getName()); + try { + Method declaredMethod = clazz.getDeclaredMethod(name, field.getType()); + declaredMethod.setAccessible(true); + return declaredMethod; + } catch (NoSuchMethodException e) { + return null; + } + } + private void initTargetUrlPatterns() { targetUrlPatterns = new ArrayList(); Annotation annotation = clazz.getAnnotation(TargetUrl.class); @@ -94,24 +118,36 @@ class PageModelExtractor { try { o = clazz.newInstance(); for (FieldExtractor fieldExtractor : fieldExtractors) { + String value; switch (fieldExtractor.getSource()) { case Html: - fieldExtractor.getField().set(o, fieldExtractor.getSelector().select(page.getHtml().toString())); + value = fieldExtractor.getSelector().select(page.getHtml().toString()); break; case Url: - fieldExtractor.getField().set(o, fieldExtractor.getSelector().select(page.getUrl().toString())); + value = fieldExtractor.getSelector().select(page.getUrl().toString()); break; + default: + value = fieldExtractor.getSelector().select(page.getHtml().toString()); } - + setField(o,fieldExtractor,value); } } catch (InstantiationException e) { e.printStackTrace(); } catch (IllegalAccessException e) { e.printStackTrace(); + } catch (InvocationTargetException e) { + e.printStackTrace(); } return o; } + private void setField(Object o, FieldExtractor fieldExtractor, String value) throws IllegalAccessException, InvocationTargetException { + if (fieldExtractor.getSetterMethod()!=null){ + 
fieldExtractor.getSetterMethod().invoke(o,value); + } + fieldExtractor.getField().set(o, value); + } + Class getClazz() { return clazz; } From 7a4dbb1f15ecd244d777abb6898b68e939e6fabd Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 2 Aug 2013 08:09:37 +0800 Subject: [PATCH 06/84] invite notnull --- .../java/us/codecraft/webmagic/Spider.java | 5 +++ .../webmagic/annotation/ExtractBy.java | 2 ++ .../webmagic/annotation/ExtractByUrl.java | 4 ++- .../webmagic/annotation/FieldExtractor.java | 15 ++++---- .../webmagic/annotation/HelpUrl.java | 17 ++++++++++ .../annotation/ObjectPageProcessor.java | 4 +++ .../annotation/PageModelExtractor.java | 27 ++++++++++++--- .../annotation/samples/IteyeBlog.java | 34 +++++++++++++++++++ 8 files changed, 95 insertions(+), 13 deletions(-) create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/annotation/HelpUrl.java create mode 100644 webmagic-samples/src/main/java/us/codecraft/webmagic/annotation/samples/IteyeBlog.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index a25fd02..2717b66 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -2,6 +2,7 @@ package us.codecraft.webmagic; import org.apache.commons.collections.CollectionUtils; import org.apache.log4j.Logger; +import us.codecraft.webmagic.annotation.ObjectPageProcessor; import us.codecraft.webmagic.downloader.Destroyable; import us.codecraft.webmagic.downloader.Downloader; import us.codecraft.webmagic.downloader.HttpClientDownloader; @@ -89,6 +90,10 @@ public class Spider implements Runnable, Task { return new Spider(pageProcessor); } + public static Spider create(Site site,Class... 
pageModels) { + return new Spider(ObjectPageProcessor.create(site,pageModels)); + } + /** * 重新设置startUrls,会覆盖Site本身的startUrls。 * diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ExtractBy.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ExtractBy.java index 7c749b3..4c791fd 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ExtractBy.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ExtractBy.java @@ -18,4 +18,6 @@ public @interface ExtractBy { public enum Type {XPath, Regex, Css}; Type type() default Type.XPath; + + boolean notNull() default true; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ExtractByUrl.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ExtractByUrl.java index 3ecb451..57747f5 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ExtractByUrl.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ExtractByUrl.java @@ -11,8 +11,10 @@ import java.lang.annotation.Target; */ @Retention(java.lang.annotation.RetentionPolicy.RUNTIME) @Target({ElementType.FIELD}) -public @interface ExtractByUrl { +public @interface ExtractByUrl{ String value() default ""; + boolean notNull() default true; + } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/FieldExtractor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/FieldExtractor.java index d241c8d..f415cb8 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/FieldExtractor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/FieldExtractor.java @@ -20,18 +20,15 @@ class FieldExtractor { private Method setterMethod; + private final boolean notNull; + static enum Source {Html, Url} - public FieldExtractor(Field field, Selector selector) { - this.field = field; - this.selector = selector; - this.source = Source.Html; - } - - public FieldExtractor(Field field, 
Selector selector, Source source) { + public FieldExtractor(Field field, Selector selector, Source source, boolean notNull) { this.field = field; this.selector = selector; this.source = source; + this.notNull = notNull; } Field getField() { @@ -53,4 +50,8 @@ class FieldExtractor { Method getSetterMethod() { return setterMethod; } + + boolean isNotNull() { + return notNull; + } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/HelpUrl.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/HelpUrl.java new file mode 100644 index 0000000..3020817 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/HelpUrl.java @@ -0,0 +1,17 @@ +package us.codecraft.webmagic.annotation; + +import java.lang.annotation.ElementType; +import java.lang.annotation.Retention; +import java.lang.annotation.Target; + +/** + * @author yihua.huang@dianping.com
+ * @date: 13-8-1
+ * Time: 下午8:40
+ */ +@Retention(java.lang.annotation.RetentionPolicy.RUNTIME) +@Target({ElementType.TYPE}) +public @interface HelpUrl { + + String[] value(); +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ObjectPageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ObjectPageProcessor.java index ae3131e..ad8297e 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ObjectPageProcessor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ObjectPageProcessor.java @@ -40,6 +40,7 @@ public class ObjectPageProcessor implements PageProcessor { targetUrlPatterns = new HashSet(); for (PageModelExtractor pageModelExtractor : pageModelExtractorList) { targetUrlPatterns.addAll(pageModelExtractor.getTargetUrlPatterns()); + targetUrlPatterns.addAll(pageModelExtractor.getHelpUrlPatterns()); } } @@ -47,6 +48,9 @@ public class ObjectPageProcessor implements PageProcessor { public void process(Page page) { for (PageModelExtractor pageModelExtractor : pageModelExtractorList) { Object process = pageModelExtractor.process(page); + if (process==null){ + page.getResultItems().setSkip(true); + } postProcessPageModel(pageModelExtractor.getClazz(), process); page.putField(pageModelExtractor.getClazz().getCanonicalName(), process); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModelExtractor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModelExtractor.java index 7d0d4f2..41f635c 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModelExtractor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModelExtractor.java @@ -24,6 +24,8 @@ class PageModelExtractor { private List targetUrlPatterns; + private List helpUrlPatterns; + private Class clazz; private List fieldExtractors; @@ -57,7 +59,7 @@ class PageModelExtractor { default: selector = new XpathSelector(value); } - FieldExtractor fieldExtractor = new 
FieldExtractor(field, selector); + FieldExtractor fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.Html, extractBy.notNull()); Method setterMethod = getSetterMethod(clazz, field); if (setterMethod != null) { fieldExtractor.setSetterMethod(setterMethod); @@ -70,7 +72,7 @@ class PageModelExtractor { if (regexPattern.trim().equals("")) { regexPattern = ".*"; } - FieldExtractor fieldExtractor = new FieldExtractor(field, new RegexSelector(regexPattern), FieldExtractor.Source.Url); + FieldExtractor fieldExtractor = new FieldExtractor(field, new RegexSelector(regexPattern), FieldExtractor.Source.Url, extractByUrl.notNull()); Method setterMethod = getSetterMethod(clazz, field); if (setterMethod != null) { fieldExtractor.setSetterMethod(setterMethod); @@ -102,6 +104,14 @@ class PageModelExtractor { targetUrlPatterns.add(Pattern.compile(s.replace(".", "\\.").replace("*", "[^\"'#]*"))); } } + helpUrlPatterns = new ArrayList(); + annotation = clazz.getAnnotation(HelpUrl.class); + if (annotation != null) { + String[] value = ((HelpUrl) annotation).value(); + for (String s : value) { + helpUrlPatterns.add(Pattern.compile(s.replace(".", "\\.").replace("*", "[^\"'#]*"))); + } + } } public Object process(Page page) { @@ -129,7 +139,10 @@ class PageModelExtractor { default: value = fieldExtractor.getSelector().select(page.getHtml().toString()); } - setField(o,fieldExtractor,value); + if (value==null&&fieldExtractor.isNotNull()){ + page.getResultItems().setSkip(true); + } + setField(o, fieldExtractor, value); } } catch (InstantiationException e) { e.printStackTrace(); @@ -142,8 +155,8 @@ class PageModelExtractor { } private void setField(Object o, FieldExtractor fieldExtractor, String value) throws IllegalAccessException, InvocationTargetException { - if (fieldExtractor.getSetterMethod()!=null){ - fieldExtractor.getSetterMethod().invoke(o,value); + if (fieldExtractor.getSetterMethod() != null) { + fieldExtractor.getSetterMethod().invoke(o, value); } 
fieldExtractor.getField().set(o, value); } @@ -155,4 +168,8 @@ class PageModelExtractor { List getTargetUrlPatterns() { return targetUrlPatterns; } + + List getHelpUrlPatterns() { + return helpUrlPatterns; + } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/annotation/samples/IteyeBlog.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/annotation/samples/IteyeBlog.java new file mode 100644 index 0000000..8fbf089 --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/annotation/samples/IteyeBlog.java @@ -0,0 +1,34 @@ +package us.codecraft.webmagic.annotation.samples; + +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.annotation.ExtractBy; +import us.codecraft.webmagic.annotation.TargetUrl; + +/** + * @author yihua.huang@dianping.com
+ * @date: 13-8-2
+ * Time: 上午7:52
+ */ +@TargetUrl("http://dengminhui.iteye.com/blog/*") +public class IteyeBlog { + + @ExtractBy("//title") + private String title; + + @ExtractBy(value = "div#blog_content",type = ExtractBy.Type.Css) + private String content; + + @Override + public String toString() { + return "IteyeBlog{" + + "title='" + title + '\'' + + ", content='" + content + '\'' + + '}'; + } + + public static void main(String[] args) { + Spider.create(Site.me().addStartUrl("http://dengminhui.iteye.com/blog"),IteyeBlog.class).run(); + } + +} From 7ee567b8047ed83ea274a1a5dca91f067079fcbd Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 2 Aug 2013 08:13:40 +0800 Subject: [PATCH 07/84] add some samples --- .../webmagic/annotation/samples/Blog.java | 13 ++++++ .../annotation/samples/IteyeBlog.java | 9 +++- .../annotation/samples/OschinaBlog.java | 41 +++++++++++++++++++ 3 files changed, 62 insertions(+), 1 deletion(-) create mode 100644 webmagic-samples/src/main/java/us/codecraft/webmagic/annotation/samples/Blog.java create mode 100644 webmagic-samples/src/main/java/us/codecraft/webmagic/annotation/samples/OschinaBlog.java diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/annotation/samples/Blog.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/annotation/samples/Blog.java new file mode 100644 index 0000000..c0890e1 --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/annotation/samples/Blog.java @@ -0,0 +1,13 @@ +package us.codecraft.webmagic.annotation.samples; + +/** + * @author yihua.huang@dianping.com
+ * @date: 13-8-2
+ * Time: 上午8:10
+ */ +public interface Blog { + + public String getTitle(); + + public String getContent(); +} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/annotation/samples/IteyeBlog.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/annotation/samples/IteyeBlog.java index 8fbf089..09a1d5b 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/annotation/samples/IteyeBlog.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/annotation/samples/IteyeBlog.java @@ -11,7 +11,7 @@ import us.codecraft.webmagic.annotation.TargetUrl; * Time: 上午7:52
*/ @TargetUrl("http://dengminhui.iteye.com/blog/*") -public class IteyeBlog { +public class IteyeBlog implements Blog{ @ExtractBy("//title") private String title; @@ -31,4 +31,11 @@ public class IteyeBlog { Spider.create(Site.me().addStartUrl("http://dengminhui.iteye.com/blog"),IteyeBlog.class).run(); } + public String getTitle() { + return title; + } + + public String getContent() { + return content; + } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/annotation/samples/OschinaBlog.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/annotation/samples/OschinaBlog.java new file mode 100644 index 0000000..817c1aa --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/annotation/samples/OschinaBlog.java @@ -0,0 +1,41 @@ +package us.codecraft.webmagic.annotation.samples; + +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.annotation.ExtractBy; +import us.codecraft.webmagic.annotation.TargetUrl; + +/** + * @author yihua.huang@dianping.com
+ * @date: 13-8-2
+ * Time: 上午7:52
+ */ +@TargetUrl("http://my.oschina.net/flashsword/blog/\\d+") +public class OschinaBlog implements Blog{ + + @ExtractBy("//title") + private String title; + + @ExtractBy(value = "div.BlogContent",type = ExtractBy.Type.Css) + private String content; + + @Override + public String toString() { + return "OschinaBlog{" + + "title='" + title + '\'' + + ", content='" + content + '\'' + + '}'; + } + + public static void main(String[] args) { + Spider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog"),OschinaBlog.class).run(); + } + + public String getTitle() { + return title; + } + + public String getContent() { + return content; + } +} From 516ff3310d3cdf77d79f2508e53b84489f7fac64 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 2 Aug 2013 08:20:55 +0800 Subject: [PATCH 08/84] add failfast --- .../main/java/us/codecraft/webmagic/annotation/ExtractBy.java | 2 ++ .../us/codecraft/webmagic/annotation/PageModelExtractor.java | 3 +++ .../us/codecraft/webmagic/annotation/samples/IteyeBlog.java | 4 ++-- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ExtractBy.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ExtractBy.java index 4c791fd..2d08417 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ExtractBy.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ExtractBy.java @@ -13,6 +13,8 @@ import java.lang.annotation.Target; @Target({ElementType.FIELD}) public @interface ExtractBy { + + //TODO: add list support String value(); public enum Type {XPath, Regex, Css}; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModelExtractor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModelExtractor.java index 41f635c..e610e10 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModelExtractor.java +++ 
b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModelExtractor.java @@ -42,6 +42,9 @@ class PageModelExtractor { fieldExtractors = new ArrayList(); for (Field field : clazz.getDeclaredFields()) { field.setAccessible(true); + if (!field.getType().isAssignableFrom(String.class)){ + throw new IllegalStateException("Field "+field.getName()+" must be string"); + } ExtractBy extractBy = field.getAnnotation(ExtractBy.class); if (extractBy != null) { String value = extractBy.value(); diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/annotation/samples/IteyeBlog.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/annotation/samples/IteyeBlog.java index 09a1d5b..48e4129 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/annotation/samples/IteyeBlog.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/annotation/samples/IteyeBlog.java @@ -14,7 +14,7 @@ import us.codecraft.webmagic.annotation.TargetUrl; public class IteyeBlog implements Blog{ @ExtractBy("//title") - private String title; + private int title; @ExtractBy(value = "div#blog_content",type = ExtractBy.Type.Css) private String content; @@ -32,7 +32,7 @@ public class IteyeBlog implements Blog{ } public String getTitle() { - return title; + return null; } public String getContent() { From 29c6a03f6019265fe40bc63b935a9a6a5c006f05 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 2 Aug 2013 08:22:28 +0800 Subject: [PATCH 09/84] fix --- .../us/codecraft/webmagic/annotation/samples/IteyeBlog.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/annotation/samples/IteyeBlog.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/annotation/samples/IteyeBlog.java index 48e4129..a927054 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/annotation/samples/IteyeBlog.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/annotation/samples/IteyeBlog.java @@ 
-14,7 +14,7 @@ import us.codecraft.webmagic.annotation.TargetUrl; public class IteyeBlog implements Blog{ @ExtractBy("//title") - private int title; + private String title; @ExtractBy(value = "div#blog_content",type = ExtractBy.Type.Css) private String content; From 901d6fde1d2a851eadb13eb323012860a411ec72 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 2 Aug 2013 08:22:59 +0800 Subject: [PATCH 10/84] fix --- .../us/codecraft/webmagic/annotation/samples/IteyeBlog.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/annotation/samples/IteyeBlog.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/annotation/samples/IteyeBlog.java index a927054..09a1d5b 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/annotation/samples/IteyeBlog.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/annotation/samples/IteyeBlog.java @@ -32,7 +32,7 @@ public class IteyeBlog implements Blog{ } public String getTitle() { - return null; + return title; } public String getContent() { From 3fe3d8f04455af431647e1e56f4505f512e446e3 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 2 Aug 2013 13:51:42 +0800 Subject: [PATCH 11/84] update --- .../webmagic/annotation/ObjectPipeline.java | 21 +++++++++++++++++++ .../webmagic/annotation/OschinaBlog.java | 7 ------- .../webmagic/annotation/TestFetcher.java | 8 ++++++- 3 files changed, 28 insertions(+), 8 deletions(-) create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ObjectPipeline.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ObjectPipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ObjectPipeline.java new file mode 100644 index 0000000..dd27395 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ObjectPipeline.java @@ -0,0 +1,21 @@ +package us.codecraft.webmagic.annotation; + +import 
us.codecraft.webmagic.ResultItems; +import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.pipeline.Pipeline; + +/** + * @author yihua.huang@dianping.com
+ * @date: 13-8-2
+ * Time: 上午10:47
+ */ +public class ObjectPipeline implements Pipeline { + @Override + public void process(ResultItems resultItems, Task task) { + + } + + public T read() { + return null; + } +} diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/annotation/OschinaBlog.java b/webmagic-core/src/test/java/us/codecraft/webmagic/annotation/OschinaBlog.java index 0435843..9b83c01 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/annotation/OschinaBlog.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/annotation/OschinaBlog.java @@ -14,11 +14,4 @@ public class OschinaBlog { @ExtractBy(value = "div.BlogContent",type = ExtractBy.Type.Css) private String content; - @Override - public String toString() { - return "OschinaBlog{" + - "title='" + title + '\'' + - ", content='" + content + '\'' + - '}'; - } } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/annotation/TestFetcher.java b/webmagic-core/src/test/java/us/codecraft/webmagic/annotation/TestFetcher.java index 37a3305..5d70a54 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/annotation/TestFetcher.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/annotation/TestFetcher.java @@ -15,7 +15,13 @@ public class TestFetcher { @Ignore("takes long") @Test public void test() { - Spider.create(ObjectPageProcessor.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog/145796"), OschinaBlog.class)).run(); + ObjectPipeline objectPipeline = new ObjectPipeline(); + Spider.create(ObjectPageProcessor.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog/145796"), OschinaBlog.class)) + .pipeline(objectPipeline).runAsync(); + OschinaBlog oschinaBlog = null; + while ((oschinaBlog = objectPipeline.read()) != null) { + System.out.println(oschinaBlog); + } } From d7899e94aeab00d0be2aaed2989c814f08e2cf2b Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 2 Aug 2013 23:39:34 +0800 Subject: [PATCH 12/84] test saxon and invite XPath2.0 support --- pom.xml | 7 
++- webmagic-core/pom.xml | 5 ++ .../webmagic/selector/SaxonTest.java | 45 +++++++++++++ .../webmagic/selector/XpathSelectorTest.java | 63 +++++++++++++++++++ 4 files changed, 119 insertions(+), 1 deletion(-) create mode 100644 webmagic-core/src/test/java/us/codecraft/webmagic/selector/SaxonTest.java diff --git a/pom.xml b/pom.xml index 5974eae..fa369f4 100644 --- a/pom.xml +++ b/pom.xml @@ -27,6 +27,11 @@ httpclient 4.2.4 + + net.sf.saxon + Saxon-HE + 9.5.1-1 + log4j log4j @@ -45,7 +50,7 @@ net.sourceforge.htmlcleaner htmlcleaner - 2.4 + 2.5 org.apache.commons diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 60c37c0..a5fbd75 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -27,6 +27,11 @@ commons-lang3 + + net.sf.saxon + Saxon-HE + + log4j log4j diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/SaxonTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/SaxonTest.java new file mode 100644 index 0000000..509be44 --- /dev/null +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/SaxonTest.java @@ -0,0 +1,45 @@ +package us.codecraft.webmagic.selector; + +import net.sf.saxon.xpath.XPathFactoryImpl; +import org.htmlcleaner.CleanerProperties; +import org.htmlcleaner.DomSerializer; +import org.htmlcleaner.HtmlCleaner; +import org.htmlcleaner.TagNode; +import org.junit.Test; +import org.w3c.dom.Document; +import org.w3c.dom.NodeList; + +import javax.xml.xpath.XPath; +import javax.xml.xpath.XPathConstants; +import javax.xml.xpath.XPathExpression; +import javax.xml.xpath.XPathFactoryConfigurationException; + +/** + * @author yihua.huang@dianping.com
+ * @date: 13-8-2
+ * Time: 下午5:48
+ */ +public class SaxonTest { + + @Test + public void test() throws XPathFactoryConfigurationException { +// System.setProperty("javax.xml.xpath.XPathFactory:" + NamespaceConstant.OBJECT_MODEL_SAXON, "net.sf.saxon.xpath.XPathFactoryImpl"); +// XPathFactory xpf = XPathFactory.newInstance(NamespaceConstant.OBJECT_MODEL_SAXON); + String xml = "#BBB##CCC##DDD#"; + try { + HtmlCleaner htmlCleaner = new HtmlCleaner(); + TagNode tagNode = htmlCleaner.clean(""); + Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode); + + javax.xml.xpath.XPathFactory factory = XPathFactoryImpl.newInstance(); + XPath xpath = factory.newXPath(); + XPathExpression expr = xpath.compile("//a[matches(.,'#...#')]"); + + Object result = expr.evaluate(document, XPathConstants.NODESET); + NodeList nodes = (NodeList) result; + System.out.println(nodes); + } catch (Exception e) { + e.printStackTrace(); + } + } +} diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java index 6f1c21e..c2cc7ec 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java @@ -1,7 +1,24 @@ package us.codecraft.webmagic.selector; +import net.sf.saxon.Configuration; +import net.sf.saxon.lib.NamespaceConstant; +import net.sf.saxon.om.NamespaceResolver; +import net.sf.saxon.pull.NamespaceContextImpl; +import net.sf.saxon.xpath.JAXPXPathStaticContext; +import net.sf.saxon.xpath.XPathEvaluator; +import net.sf.saxon.xpath.XPathFactoryImpl; +import org.htmlcleaner.CleanerProperties; +import org.htmlcleaner.DomSerializer; +import org.htmlcleaner.HtmlCleaner; +import org.htmlcleaner.TagNode; import org.junit.Assert; import org.junit.Test; +import org.w3c.dom.Document; +import org.w3c.dom.NodeList; + +import javax.xml.xpath.*; +import java.util.Collections; +import 
java.util.Iterator; /** * @author code4crafter@gmail.com
Date: 13-4-21 Time: 上午10:06 @@ -1354,4 +1371,50 @@ public class XpathSelectorTest { Assert.assertNotNull(html1.$("a[href]").xpath("//@href").all()); } + @Test + public void testXPath2() { + String text = "

眉山:扎实推进农业农村工作 促农持续增收
\n" + + "2013-07-31 23:29:45   来源:眉山网      责任编辑:张斯炜

"; + XpathSelector xpathSelector = new XpathSelector("//h1/text()"); + System.out.println(xpathSelector.select(text)); + } + + //http://sourceforge.net/mailarchive/forum.php?thread_name=4EA92A8A.6080202%40saxonica.com&forum_name=saxon-help + @Test + public void testSaxon() throws XPathFactoryConfigurationException { + System.setProperty("javax.xml.xpath.XPathFactory:" + NamespaceConstant.OBJECT_MODEL_SAXON, "net.sf.saxon.xpath.XPathFactoryImpl"); + System.setProperty("javax.xml.xpath.XPathFactory:" + NamespaceConstant.FN, "net.sf.saxon.xpath.XPathFactoryImpl"); + XPathFactory xpf = XPathFactory.newInstance(NamespaceConstant.OBJECT_MODEL_SAXON); + String text = "

眉山:扎实推进农业农村工作 促农持续增收
\n" + + "2013-07-31 23:29:45   来源:眉山网      责任编辑:张斯炜

"; + try { + HtmlCleaner htmlCleaner = new HtmlCleaner(); + TagNode tagNode = htmlCleaner.clean(text); + Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode); + javax.xml.xpath.XPathFactory factory = XPathFactoryImpl.newInstance(NamespaceConstant.OBJECT_MODEL_SAXON); + Configuration config = Configuration.newConfiguration(); + XPathEvaluator xPathEvaluator = new XPathEvaluator(config); + JAXPXPathStaticContext context = new JAXPXPathStaticContext(config); + context.setNamespaceContext(new NamespaceContextImpl(new NamespaceResolver() { + + + @Override + public String getURIForPrefix(String s, boolean b) { + return NamespaceConstant.FN; + } + + @Override + public Iterator iteratePrefixes() { + return Collections.singletonList("fn").iterator(); + } + })); + xPathEvaluator.setStaticContext(context); + XPathExpression expr = xPathEvaluator.compile("fn:substring-before(//h1,'\n')"); + Object result = expr.evaluate(document, XPathConstants.STRING); + System.out.println(result); + } catch (Exception e) { + e.printStackTrace(); + } + } + } From 7f27c28d4c43d3d0d285d9c997c676659a55f916 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 2 Aug 2013 23:45:13 +0800 Subject: [PATCH 13/84] simplify api --- .../codecraft/webmagic/selector/XpathSelectorTest.java | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java index c2cc7ec..6544e9e 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java @@ -1384,18 +1384,14 @@ public class XpathSelectorTest { public void testSaxon() throws XPathFactoryConfigurationException { System.setProperty("javax.xml.xpath.XPathFactory:" + NamespaceConstant.OBJECT_MODEL_SAXON, 
"net.sf.saxon.xpath.XPathFactoryImpl"); System.setProperty("javax.xml.xpath.XPathFactory:" + NamespaceConstant.FN, "net.sf.saxon.xpath.XPathFactoryImpl"); - XPathFactory xpf = XPathFactory.newInstance(NamespaceConstant.OBJECT_MODEL_SAXON); String text = "

眉山:扎实推进农业农村工作 促农持续增收
\n" + "2013-07-31 23:29:45   来源:眉山网      责任编辑:张斯炜

"; try { HtmlCleaner htmlCleaner = new HtmlCleaner(); TagNode tagNode = htmlCleaner.clean(text); Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode); - javax.xml.xpath.XPathFactory factory = XPathFactoryImpl.newInstance(NamespaceConstant.OBJECT_MODEL_SAXON); - Configuration config = Configuration.newConfiguration(); - XPathEvaluator xPathEvaluator = new XPathEvaluator(config); - JAXPXPathStaticContext context = new JAXPXPathStaticContext(config); - context.setNamespaceContext(new NamespaceContextImpl(new NamespaceResolver() { + XPathEvaluator xPathEvaluator = new XPathEvaluator(); + xPathEvaluator.setNamespaceContext(new NamespaceContextImpl(new NamespaceResolver() { @Override @@ -1408,7 +1404,6 @@ public class XpathSelectorTest { return Collections.singletonList("fn").iterator(); } })); - xPathEvaluator.setStaticContext(context); XPathExpression expr = xPathEvaluator.compile("fn:substring-before(//h1,'\n')"); Object result = expr.evaluate(document, XPathConstants.STRING); System.out.println(result); From 7c9e9ce8694693082d179c47427b053020f899aa Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 3 Aug 2013 07:28:46 +0800 Subject: [PATCH 14/84] xpath2.0 --- .../webmagic/selector/Xpath2Selector.java | 167 ++++++++++++++++++ .../webmagic/selector/XpathSelectorTest.java | 41 ++++- 2 files changed, 204 insertions(+), 4 deletions(-) create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java new file mode 100644 index 0000000..99112ca --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java @@ -0,0 +1,167 @@ +package us.codecraft.webmagic.selector; + +import net.sf.saxon.lib.NamespaceConstant; +import net.sf.saxon.xpath.XPathEvaluator; +import org.apache.log4j.Logger; +import 
org.htmlcleaner.CleanerProperties; +import org.htmlcleaner.DomSerializer; +import org.htmlcleaner.HtmlCleaner; +import org.htmlcleaner.TagNode; +import org.w3c.dom.Document; +import org.w3c.dom.NodeList; + +import javax.xml.namespace.NamespaceContext; +import javax.xml.transform.OutputKeys; +import javax.xml.transform.Transformer; +import javax.xml.transform.TransformerFactory; +import javax.xml.transform.dom.DOMSource; +import javax.xml.transform.stream.StreamResult; +import javax.xml.xpath.XPathConstants; +import javax.xml.xpath.XPathExpression; +import javax.xml.xpath.XPathExpressionException; +import java.io.StringWriter; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; + +/** + * 支持xpath2.0的选择器。包装了HtmlCleaner和Saxon HE。
+ * + * @author code4crafter@gmail.com
+ * Date: 13-4-21 + * Time: 上午9:39 + */ +public class Xpath2Selector implements Selector { + + private String xpathStr; + + private XPathExpression xPathExpression; + + private Logger logger = Logger.getLogger(getClass()); + + public Xpath2Selector(String xpathStr) { + this.xpathStr = xpathStr; + try { + init(); + } catch (XPathExpressionException e) { + throw new IllegalArgumentException("XPath error!", e); + } + } + + enum XPath2NamespaceContext implements NamespaceContext { + + INSTANCE; + + private final Map prefix2NamespaceMap = new ConcurrentHashMap(); + + private final Map> namespace2PrefixMap = new ConcurrentHashMap>(); + + private void put(String prefix, String namespaceURI) { + prefix2NamespaceMap.put(prefix, namespaceURI); + List prefixes = namespace2PrefixMap.get(namespaceURI); + if (prefixes == null) { + prefixes = new ArrayList(); + namespace2PrefixMap.put(namespaceURI, prefixes); + } + prefixes.add(prefix); + } + + private XPath2NamespaceContext() { + put("fn", NamespaceConstant.FN); + put("xslt",NamespaceConstant.XSLT); + } + + @Override + public String getNamespaceURI(String prefix) { + return prefix2NamespaceMap.get(prefix); + } + + @Override + public String getPrefix(String namespaceURI) { + List prefixes = namespace2PrefixMap.get(namespaceURI); + if (prefixes == null || prefixes.size() < 1) { + return null; + } + return prefixes.get(0); + } + + @Override + public Iterator getPrefixes(String namespaceURI) { + List prefixes = namespace2PrefixMap.get(namespaceURI); + if (prefixes == null || prefixes.size() < 1) { + return null; + } + return prefixes.iterator(); + } + } + + private void init() throws XPathExpressionException { + XPathEvaluator xPathEvaluator = new XPathEvaluator(); + xPathEvaluator.setNamespaceContext(XPath2NamespaceContext.INSTANCE); + xPathExpression = xPathEvaluator.compile(xpathStr); + } + + @Override + public String select(String text) { + try { + HtmlCleaner htmlCleaner = new HtmlCleaner(); + TagNode tagNode = 
htmlCleaner.clean(text); + Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode); + Object result; + try { + result = xPathExpression.evaluate(document, XPathConstants.NODESET); + } catch (XPathExpressionException e) { + result = xPathExpression.evaluate(document, XPathConstants.STRING); + } + if (result instanceof NodeList) { + StreamResult xmlOutput = new StreamResult(new StringWriter()); + Transformer transformer = TransformerFactory.newInstance().newTransformer(); + transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes"); + NodeList nodeList = (NodeList) result; + if (nodeList.getLength() == 0) { + return null; + } + transformer.transform(new DOMSource(nodeList.item(0)), xmlOutput); + return xmlOutput.getWriter().toString(); + } + return result.toString(); + } catch (Exception e) { + logger.error("select text error! " + xpathStr, e); + } + return null; + } + + @Override + public List selectList(String text) { + List results = new ArrayList(); + try { + HtmlCleaner htmlCleaner = new HtmlCleaner(); + TagNode tagNode = htmlCleaner.clean(text); + Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode); + Object result; + try { + result = xPathExpression.evaluate(document, XPathConstants.NODESET); + } catch (XPathExpressionException e) { + result = xPathExpression.evaluate(document, XPathConstants.STRING); + } + if (result instanceof NodeList) { + NodeList nodeList = (NodeList) result; + Transformer transformer = TransformerFactory.newInstance().newTransformer(); + StreamResult xmlOutput = new StreamResult(); + transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes"); + for (int i = 0; i < nodeList.getLength(); i++) { + xmlOutput.setWriter(new StringWriter()); + transformer.transform(new DOMSource(nodeList.item(i)), xmlOutput); + results.add(xmlOutput.getWriter().toString()); + } + } else { + results.add(result.toString()); + } + } catch (Exception e) { + logger.error("select text 
error! " + xpathStr, e); + } + return results; + } +} diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java index 6544e9e..2b8e15d 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java @@ -12,6 +12,7 @@ import org.htmlcleaner.DomSerializer; import org.htmlcleaner.HtmlCleaner; import org.htmlcleaner.TagNode; import org.junit.Assert; +import org.junit.Ignore; import org.junit.Test; import org.w3c.dom.Document; import org.w3c.dom.NodeList; @@ -1381,9 +1382,7 @@ public class XpathSelectorTest { //http://sourceforge.net/mailarchive/forum.php?thread_name=4EA92A8A.6080202%40saxonica.com&forum_name=saxon-help @Test - public void testSaxon() throws XPathFactoryConfigurationException { - System.setProperty("javax.xml.xpath.XPathFactory:" + NamespaceConstant.OBJECT_MODEL_SAXON, "net.sf.saxon.xpath.XPathFactoryImpl"); - System.setProperty("javax.xml.xpath.XPathFactory:" + NamespaceConstant.FN, "net.sf.saxon.xpath.XPathFactoryImpl"); + public void testSaxon() { String text = "

眉山:扎实推进农业农村工作 促农持续增收
\n" + "2013-07-31 23:29:45   来源:眉山网      责任编辑:张斯炜

"; try { @@ -1406,10 +1405,44 @@ public class XpathSelectorTest { })); XPathExpression expr = xPathEvaluator.compile("fn:substring-before(//h1,'\n')"); Object result = expr.evaluate(document, XPathConstants.STRING); - System.out.println(result); + Assert.assertNotNull(result); } catch (Exception e) { e.printStackTrace(); } + Xpath2Selector xpath2Selector = new Xpath2Selector("fn:substring-before(//h1,'\n')"); + String select = xpath2Selector.select(text); + Assert.assertNotNull(select); + Assert.assertNotNull(xpath2Selector.selectList(text)); + + } + + @Test + public void testXpath2Selector() { + Xpath2Selector xpath2Selector = new Xpath2Selector("//a"); + String select = xpath2Selector.select(html); + Assert.assertNotNull(select); + } + + @Ignore("take long time") + @Test + public void performanceTest() { + Xpath2Selector xpath2Selector = new Xpath2Selector("//a"); + long time =System.currentTimeMillis(); + for (int i = 0; i < 1000; i++) { + xpath2Selector.selectList(html); + } + System.out.println(System.currentTimeMillis()-time); + XpathSelector xpathSelector = new XpathSelector("//a"); + time =System.currentTimeMillis(); + for (int i = 0; i < 1000; i++) { + xpathSelector.selectList(html); + } + System.out.println(System.currentTimeMillis()-time); + time =System.currentTimeMillis(); + for (int i = 0; i < 1000; i++) { + xpath2Selector.selectList(html); + } + System.out.println(System.currentTimeMillis()-time); } } From 866ab0a05607ab6bc17f7058e86a19207d550031 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 3 Aug 2013 14:01:18 +0800 Subject: [PATCH 15/84] update email --- .../us/codecraft/webmagic/ResultItems.java | 2 +- .../java/us/codecraft/webmagic/Spider.java | 4 --- .../webmagic/annotation/AfterExtractor.java | 15 ++++++++++ .../webmagic/annotation/ExtractBy.java | 2 +- .../webmagic/annotation/ExtractByUrl.java | 2 +- .../webmagic/annotation/FieldExtractor.java | 2 +- .../webmagic/annotation/HelpUrl.java | 2 +- .../webmagic/annotation/OOSpider.java | 
29 ++++++++++++++++++ .../annotation/ObjectPageProcessor.java | 2 +- .../webmagic/annotation/ObjectPipeline.java | 30 +++++++++++++++---- .../annotation/PageModelExtractor.java | 14 ++++++++- .../annotation/PageModelPipeline.java | 14 +++++++++ .../webmagic/annotation/TargetUrl.java | 3 +- .../webmagic/downloader/Destroyable.java | 2 +- .../webmagic/annotation/TestFetcher.java | 8 ++--- .../annotation/samples/IteyeBlog.java | 4 +-- .../annotation/samples/OschinaBlog.java | 4 +-- 17 files changed, 110 insertions(+), 29 deletions(-) create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/annotation/AfterExtractor.java create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/annotation/OOSpider.java create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModelPipeline.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java b/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java index 0c1d94c..c91a270 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java @@ -5,7 +5,7 @@ import java.util.Map; /** * 保存抽取结果的类,由PageProcessor处理得到,传递给{@link us.codecraft.webmagic.pipeline.Pipeline}进行持久化。
- * @author yihua.huang@dianping.com
+ * @author code4crafter@gmail.com
* @date: 13-7-25
* Time: 下午12:20
*/ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 2717b66..facfd95 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -90,10 +90,6 @@ public class Spider implements Runnable, Task { return new Spider(pageProcessor); } - public static Spider create(Site site,Class... pageModels) { - return new Spider(ObjectPageProcessor.create(site,pageModels)); - } - /** * 重新设置startUrls,会覆盖Site本身的startUrls。 * diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/AfterExtractor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/AfterExtractor.java new file mode 100644 index 0000000..89d03e9 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/AfterExtractor.java @@ -0,0 +1,15 @@ +package us.codecraft.webmagic.annotation; + +import us.codecraft.webmagic.Page; + +/** + * 实现这个接口即可在抽取后进行后处理。
+ * + * @author code4crafter@gmail.com
+ * @date: 13-8-3
+ * Time: 上午9:42
+ */ +public interface AfterExtractor { + + public void afterProcess(Page page, T t); +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ExtractBy.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ExtractBy.java index 2d08417..115a219 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ExtractBy.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ExtractBy.java @@ -5,7 +5,7 @@ import java.lang.annotation.Retention; import java.lang.annotation.Target; /** - * @author yihua.huang@dianping.com
+ * @author code4crafter@gmail.com
* @date: 13-8-1
* Time: 下午8:40
*/ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ExtractByUrl.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ExtractByUrl.java index 57747f5..c40c9ca 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ExtractByUrl.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ExtractByUrl.java @@ -5,7 +5,7 @@ import java.lang.annotation.Retention; import java.lang.annotation.Target; /** - * @author yihua.huang@dianping.com
+ * @author code4crafter@gmail.com
* @date: 13-8-1
* Time: 下午8:40
*/ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/FieldExtractor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/FieldExtractor.java index f415cb8..4cd09ef 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/FieldExtractor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/FieldExtractor.java @@ -6,7 +6,7 @@ import java.lang.reflect.Field; import java.lang.reflect.Method; /** - * @author yihua.huang@dianping.com
+ * @author code4crafter@gmail.com
* @date: 13-8-1
* Time: 下午9:48
*/ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/HelpUrl.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/HelpUrl.java index 3020817..e5727f0 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/HelpUrl.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/HelpUrl.java @@ -5,7 +5,7 @@ import java.lang.annotation.Retention; import java.lang.annotation.Target; /** - * @author yihua.huang@dianping.com
+ * @author code4crafter@gmail.com
* @date: 13-8-1
* Time: 下午8:40
*/ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/OOSpider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/OOSpider.java new file mode 100644 index 0000000..c6ae2f3 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/OOSpider.java @@ -0,0 +1,29 @@ +package us.codecraft.webmagic.annotation; + +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.processor.PageProcessor; + +/** + * @author code4crafter@gmail.com
+ * @date: 13-8-3
+ * Time: 上午9:51
+ */ +public class OOSpider extends Spider{ + + /** + * 使用已定义的抽取规则新建一个Spider。 + * + * @param pageProcessor 已定义的抽取规则 + */ + public OOSpider(PageProcessor pageProcessor) { + super(pageProcessor); + } + + public static OOSpider create(Site site,Class... pageModels) { + OOSpider ooSpider = new OOSpider(ObjectPageProcessor.create(site, pageModels)); + ooSpider.pipeline(new ObjectPipeline()); + return ooSpider; + } + +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ObjectPageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ObjectPageProcessor.java index ad8297e..063dc81 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ObjectPageProcessor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ObjectPageProcessor.java @@ -12,7 +12,7 @@ import java.util.Set; import java.util.regex.Pattern; /** - * @author yihua.huang@dianping.com
+ * @author code4crafter@gmail.com
* @date: 13-8-1
* Time: 下午8:46
*/ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ObjectPipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ObjectPipeline.java index dd27395..0b3ec4d 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ObjectPipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ObjectPipeline.java @@ -4,18 +4,36 @@ import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.pipeline.Pipeline; +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; + /** - * @author yihua.huang@dianping.com
+ * @author code4crafter@gmail.com
* @date: 13-8-2
* Time: 上午10:47
*/ public class ObjectPipeline implements Pipeline { + + private Map pageModelPipelines = new ConcurrentHashMap(); + + public ObjectPipeline() { + } + + public ObjectPipeline put(Class clazz, PageModelPipeline pageModelPipeline) { + pageModelPipelines.put(clazz, pageModelPipeline); + return this; + } + @Override public void process(ResultItems resultItems, Task task) { - - } - - public T read() { - return null; + if (resultItems.isSkip()) { + return; + } + for (Map.Entry classPageModelPipelineEntry : pageModelPipelines.entrySet()) { + Object o = resultItems.get(classPageModelPipelineEntry.getKey().getCanonicalName()); + if (o != null) { + classPageModelPipelineEntry.getValue().process(o, task); + } + } } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModelExtractor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModelExtractor.java index e610e10..8ed3b6b 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModelExtractor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModelExtractor.java @@ -16,7 +16,7 @@ import java.util.List; import java.util.regex.Pattern; /** - * @author yihua.huang@dianping.com
+ * @author code4crafter@gmail.com
* @date: 13-8-1
* Time: 下午9:33
*/ @@ -30,6 +30,8 @@ class PageModelExtractor { private List fieldExtractors; + private AfterExtractor afterExtractor; + public static PageModelExtractor create(Class clazz) { PageModelExtractor pageModelExtractor = new PageModelExtractor(); pageModelExtractor.init(clazz); @@ -40,6 +42,13 @@ class PageModelExtractor { this.clazz = clazz; initTargetUrlPatterns(); fieldExtractors = new ArrayList(); + if (clazz.isAssignableFrom(AfterExtractor.class)){ + try { + afterExtractor=(AfterExtractor)clazz.newInstance(); + } catch (Exception e) { + throw new IllegalArgumentException(e); + } + } for (Field field : clazz.getDeclaredFields()) { field.setAccessible(true); if (!field.getType().isAssignableFrom(String.class)){ @@ -147,6 +156,9 @@ class PageModelExtractor { } setField(o, fieldExtractor, value); } + if (afterExtractor!=null){ + afterExtractor.afterProcess(page,o); + } } catch (InstantiationException e) { e.printStackTrace(); } catch (IllegalAccessException e) { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModelPipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModelPipeline.java new file mode 100644 index 0000000..afef926 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModelPipeline.java @@ -0,0 +1,14 @@ +package us.codecraft.webmagic.annotation; + +import us.codecraft.webmagic.Task; + +/** + * @author code4crafter@gmail.com
+ * @date: 13-8-3
+ * Time: 上午9:34
+ */ +public interface PageModelPipeline { + + public void process(T t, Task task); + +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/TargetUrl.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/TargetUrl.java index f4f58ed..5303064 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/TargetUrl.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/TargetUrl.java @@ -5,7 +5,7 @@ import java.lang.annotation.Retention; import java.lang.annotation.Target; /** - * @author yihua.huang@dianping.com
+ * @author code4crafter@gmail.com
* @date: 13-8-1
* Time: 下午8:40
*/ @@ -14,4 +14,5 @@ import java.lang.annotation.Target; public @interface TargetUrl { String[] value(); + } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Destroyable.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Destroyable.java index 4f07528..2b040fa 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Destroyable.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Destroyable.java @@ -2,7 +2,7 @@ package us.codecraft.webmagic.downloader; /** * 比较占用资源的服务可以实现该接口,Spider会在结束时调用destroy()释放资源。
- * @author yihua.huang@dianping.com
+ * @author code4crafter@gmail.com
* @date: 13-7-26
* Time: 下午3:10
*/ diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/annotation/TestFetcher.java b/webmagic-core/src/test/java/us/codecraft/webmagic/annotation/TestFetcher.java index 5d70a54..b29d053 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/annotation/TestFetcher.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/annotation/TestFetcher.java @@ -3,7 +3,6 @@ package us.codecraft.webmagic.annotation; import org.junit.Ignore; import org.junit.Test; import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.Spider; /** * @author yihua.huang@dianping.com
@@ -16,12 +15,9 @@ public class TestFetcher { @Test public void test() { ObjectPipeline objectPipeline = new ObjectPipeline(); - Spider.create(ObjectPageProcessor.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog/145796"), OschinaBlog.class)) - .pipeline(objectPipeline).runAsync(); + OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog/145796"), OschinaBlog.class) + .pipeline(objectPipeline); OschinaBlog oschinaBlog = null; - while ((oschinaBlog = objectPipeline.read()) != null) { - System.out.println(oschinaBlog); - } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/annotation/samples/IteyeBlog.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/annotation/samples/IteyeBlog.java index 09a1d5b..002a42c 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/annotation/samples/IteyeBlog.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/annotation/samples/IteyeBlog.java @@ -1,8 +1,8 @@ package us.codecraft.webmagic.annotation.samples; import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.annotation.ExtractBy; +import us.codecraft.webmagic.annotation.OOSpider; import us.codecraft.webmagic.annotation.TargetUrl; /** @@ -28,7 +28,7 @@ public class IteyeBlog implements Blog{ } public static void main(String[] args) { - Spider.create(Site.me().addStartUrl("http://dengminhui.iteye.com/blog"),IteyeBlog.class).run(); + OOSpider.create(Site.me().addStartUrl("http://dengminhui.iteye.com/blog"), IteyeBlog.class).run(); } public String getTitle() { diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/annotation/samples/OschinaBlog.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/annotation/samples/OschinaBlog.java index 817c1aa..a5c44b0 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/annotation/samples/OschinaBlog.java +++ 
b/webmagic-samples/src/main/java/us/codecraft/webmagic/annotation/samples/OschinaBlog.java @@ -1,8 +1,8 @@ package us.codecraft.webmagic.annotation.samples; import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.annotation.ExtractBy; +import us.codecraft.webmagic.annotation.OOSpider; import us.codecraft.webmagic.annotation.TargetUrl; /** @@ -28,7 +28,7 @@ public class OschinaBlog implements Blog{ } public static void main(String[] args) { - Spider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog"),OschinaBlog.class).run(); + OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog"), OschinaBlog.class).run(); } public String getTitle() { From f84b53514f172f46965bbac38a05874791496c0e Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 3 Aug 2013 15:55:54 +0800 Subject: [PATCH 16/84] complete objectpipeline --- .../annotation/ConsolePageModelPipeline.java | 16 +++++++ .../webmagic/annotation/OOSpider.java | 47 +++++++++++++++---- .../annotation/ObjectPageProcessor.java | 27 ++++++----- .../annotation/PageModelExtractor.java | 6 +-- .../webmagic/annotation/TestFetcher.java | 7 +-- 5 files changed, 72 insertions(+), 31 deletions(-) create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ConsolePageModelPipeline.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ConsolePageModelPipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ConsolePageModelPipeline.java new file mode 100644 index 0000000..9991b7f --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ConsolePageModelPipeline.java @@ -0,0 +1,16 @@ +package us.codecraft.webmagic.annotation; + +import org.apache.commons.lang3.builder.ToStringBuilder; +import us.codecraft.webmagic.Task; + +/** + * @author yihua.huang@dianping.com
+ * @date: 13-8-3
+ * Time: 下午3:41
+ */ +public class ConsolePageModelPipeline implements PageModelPipeline { + @Override + public void process(Object o, Task task) { + System.out.println(ToStringBuilder.reflectionToString(o)); + } +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/OOSpider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/OOSpider.java index c6ae2f3..e500745 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/OOSpider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/OOSpider.java @@ -2,28 +2,57 @@ package us.codecraft.webmagic.annotation; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; -import us.codecraft.webmagic.processor.PageProcessor; +import us.codecraft.webmagic.pipeline.Pipeline; /** * @author code4crafter@gmail.com
* @date: 13-8-3
* Time: 上午9:51
*/ -public class OOSpider extends Spider{ +public class OOSpider extends Spider { /** - * 使用已定义的抽取规则新建一个Spider。 + * OOSpider只能由ObjectPageProcessor创建。 * * @param pageProcessor 已定义的抽取规则 */ - public OOSpider(PageProcessor pageProcessor) { - super(pageProcessor); + + private ObjectPageProcessor objectPageProcessor; + + private ObjectPipeline objectPipeline; + + protected OOSpider(ObjectPageProcessor objectPageProcessor) { + super(objectPageProcessor); + this.objectPageProcessor = objectPageProcessor; } - public static OOSpider create(Site site,Class... pageModels) { - OOSpider ooSpider = new OOSpider(ObjectPageProcessor.create(site, pageModels)); - ooSpider.pipeline(new ObjectPipeline()); - return ooSpider; + public OOSpider(Site site, PageModelPipeline pageModelPipeline, Class... pageModels) { + this(ObjectPageProcessor.create(site, pageModels)); + this.objectPipeline = new ObjectPipeline(); + super.pipeline(objectPipeline); + for (Class pageModel : pageModels) { + this.objectPipeline.put(pageModel, pageModelPipeline); + } + } + + public static OOSpider create(Site site, Class... pageModels) { + return new OOSpider(site, new ConsolePageModelPipeline(), pageModels); + } + + public static OOSpider create(Site site, PageModelPipeline pageModelPipeline, Class... pageModels) { + return new OOSpider(site, pageModelPipeline, pageModels); + } + + public OOSpider addPageModel(PageModelPipeline pageModelPipeline, Class... 
pageModels) { + for (Class pageModel : pageModels) { + objectPageProcessor.addPageModel(pageModel); + objectPipeline.put(pageModel, pageModelPipeline); + } + return this; + } + + public Spider pipeline(Pipeline pipeline) { + throw new UnsupportedOperationException("Sorry, OOSpider can only use ObjectPipeline"); } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ObjectPageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ObjectPageProcessor.java index 063dc81..f375868 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ObjectPageProcessor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ObjectPageProcessor.java @@ -18,30 +18,31 @@ import java.util.regex.Pattern; */ public class ObjectPageProcessor implements PageProcessor { - private List pageModelExtractorList; + private List pageModelExtractorList = new ArrayList(); private Site site; - private Set targetUrlPatterns; + private Set targetUrlPatterns = new HashSet(); public static ObjectPageProcessor create(Site site, Class... 
clazzs) { - List pageModelExtractorList = new ArrayList(); + ObjectPageProcessor objectPageProcessor = new ObjectPageProcessor(site); for (Class clazz : clazzs) { - PageModelExtractor pageModelExtractor = PageModelExtractor.create(clazz); - pageModelExtractorList.add(pageModelExtractor); + objectPageProcessor.addPageModel(clazz); } - ObjectPageProcessor objectPageProcessor = new ObjectPageProcessor(site, pageModelExtractorList); return objectPageProcessor; } - private ObjectPageProcessor(Site site, List pageModelExtractorList) { + + public ObjectPageProcessor addPageModel(Class clazz){ + PageModelExtractor pageModelExtractor = PageModelExtractor.create(clazz); + targetUrlPatterns.addAll(pageModelExtractor.getTargetUrlPatterns()); + targetUrlPatterns.addAll(pageModelExtractor.getHelpUrlPatterns()); + pageModelExtractorList.add(pageModelExtractor); + return this; + } + + private ObjectPageProcessor(Site site) { this.site = site; - this.pageModelExtractorList = pageModelExtractorList; - targetUrlPatterns = new HashSet(); - for (PageModelExtractor pageModelExtractor : pageModelExtractorList) { - targetUrlPatterns.addAll(pageModelExtractor.getTargetUrlPatterns()); - targetUrlPatterns.addAll(pageModelExtractor.getHelpUrlPatterns()); - } } @Override diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModelExtractor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModelExtractor.java index 8ed3b6b..f91252f 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModelExtractor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModelExtractor.java @@ -22,9 +22,9 @@ import java.util.regex.Pattern; */ class PageModelExtractor { - private List targetUrlPatterns; + private List targetUrlPatterns = new ArrayList(); - private List helpUrlPatterns; + private List helpUrlPatterns = new ArrayList(); private Class clazz; @@ -106,7 +106,6 @@ class PageModelExtractor { } private void 
initTargetUrlPatterns() { - targetUrlPatterns = new ArrayList(); Annotation annotation = clazz.getAnnotation(TargetUrl.class); if (annotation == null) { targetUrlPatterns.add(Pattern.compile(".*")); @@ -116,7 +115,6 @@ class PageModelExtractor { targetUrlPatterns.add(Pattern.compile(s.replace(".", "\\.").replace("*", "[^\"'#]*"))); } } - helpUrlPatterns = new ArrayList(); annotation = clazz.getAnnotation(HelpUrl.class); if (annotation != null) { String[] value = ((HelpUrl) annotation).value(); diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/annotation/TestFetcher.java b/webmagic-core/src/test/java/us/codecraft/webmagic/annotation/TestFetcher.java index b29d053..0026431 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/annotation/TestFetcher.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/annotation/TestFetcher.java @@ -1,6 +1,5 @@ package us.codecraft.webmagic.annotation; -import org.junit.Ignore; import org.junit.Test; import us.codecraft.webmagic.Site; @@ -11,13 +10,11 @@ import us.codecraft.webmagic.Site; */ public class TestFetcher { - @Ignore("takes long") +// @Ignore("takes long") @Test public void test() { - ObjectPipeline objectPipeline = new ObjectPipeline(); OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog/145796"), OschinaBlog.class) - .pipeline(objectPipeline); - OschinaBlog oschinaBlog = null; + .run(); } From d26cd82d59af26452cb412badad37db733fd69cf Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 3 Aug 2013 16:29:50 +0800 Subject: [PATCH 17/84] rename package --- .../src/main/java/us/codecraft/webmagic/Spider.java | 1 - .../webmagic/{annotation => oo}/AfterExtractor.java | 2 +- .../{annotation => oo}/ConsolePageModelPipeline.java | 2 +- .../codecraft/webmagic/{annotation => oo}/ExtractBy.java | 2 +- .../webmagic/{annotation => oo}/ExtractByUrl.java | 2 +- .../webmagic/{annotation => oo}/FieldExtractor.java | 2 +- .../us/codecraft/webmagic/{annotation => oo}/HelpUrl.java | 2 
+- .../codecraft/webmagic/{annotation => oo}/OOSpider.java | 3 +-- .../webmagic/{annotation => oo}/ObjectPageProcessor.java | 2 +- .../webmagic/{annotation => oo}/ObjectPipeline.java | 2 +- .../webmagic/{annotation => oo}/PageModelExtractor.java | 2 +- .../webmagic/{annotation => oo}/PageModelPipeline.java | 2 +- .../codecraft/webmagic/{annotation => oo}/TargetUrl.java | 2 +- .../webmagic/{annotation => oo}/OschinaBlog.java | 2 +- .../webmagic/{annotation => oo}/TestFetcher.java | 2 +- .../webmagic/{annotation => oo}/samples/Blog.java | 2 +- .../webmagic/{annotation => oo}/samples/IteyeBlog.java | 8 ++++---- .../webmagic/{annotation => oo}/samples/OschinaBlog.java | 8 ++++---- 18 files changed, 23 insertions(+), 25 deletions(-) rename webmagic-core/src/main/java/us/codecraft/webmagic/{annotation => oo}/AfterExtractor.java (87%) rename webmagic-core/src/main/java/us/codecraft/webmagic/{annotation => oo}/ConsolePageModelPipeline.java (90%) rename webmagic-core/src/main/java/us/codecraft/webmagic/{annotation => oo}/ExtractBy.java (92%) rename webmagic-core/src/main/java/us/codecraft/webmagic/{annotation => oo}/ExtractByUrl.java (90%) rename webmagic-core/src/main/java/us/codecraft/webmagic/{annotation => oo}/FieldExtractor.java (96%) rename webmagic-core/src/main/java/us/codecraft/webmagic/{annotation => oo}/HelpUrl.java (89%) rename webmagic-core/src/main/java/us/codecraft/webmagic/{annotation => oo}/OOSpider.java (97%) rename webmagic-core/src/main/java/us/codecraft/webmagic/{annotation => oo}/ObjectPageProcessor.java (98%) rename webmagic-core/src/main/java/us/codecraft/webmagic/{annotation => oo}/ObjectPipeline.java (96%) rename webmagic-core/src/main/java/us/codecraft/webmagic/{annotation => oo}/PageModelExtractor.java (99%) rename webmagic-core/src/main/java/us/codecraft/webmagic/{annotation => oo}/PageModelPipeline.java (83%) rename webmagic-core/src/main/java/us/codecraft/webmagic/{annotation => oo}/TargetUrl.java (89%) rename 
webmagic-core/src/test/java/us/codecraft/webmagic/{annotation => oo}/OschinaBlog.java (88%) rename webmagic-core/src/test/java/us/codecraft/webmagic/{annotation => oo}/TestFetcher.java (90%) rename webmagic-samples/src/main/java/us/codecraft/webmagic/{annotation => oo}/samples/Blog.java (78%) rename webmagic-samples/src/main/java/us/codecraft/webmagic/{annotation => oo}/samples/IteyeBlog.java (80%) rename webmagic-samples/src/main/java/us/codecraft/webmagic/{annotation => oo}/samples/OschinaBlog.java (81%) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index facfd95..a25fd02 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -2,7 +2,6 @@ package us.codecraft.webmagic; import org.apache.commons.collections.CollectionUtils; import org.apache.log4j.Logger; -import us.codecraft.webmagic.annotation.ObjectPageProcessor; import us.codecraft.webmagic.downloader.Destroyable; import us.codecraft.webmagic.downloader.Downloader; import us.codecraft.webmagic.downloader.HttpClientDownloader; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/AfterExtractor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/AfterExtractor.java similarity index 87% rename from webmagic-core/src/main/java/us/codecraft/webmagic/annotation/AfterExtractor.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/oo/AfterExtractor.java index 89d03e9..cb9788b 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/AfterExtractor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/AfterExtractor.java @@ -1,4 +1,4 @@ -package us.codecraft.webmagic.annotation; +package us.codecraft.webmagic.oo; import us.codecraft.webmagic.Page; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ConsolePageModelPipeline.java 
b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/ConsolePageModelPipeline.java similarity index 90% rename from webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ConsolePageModelPipeline.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/oo/ConsolePageModelPipeline.java index 9991b7f..16b1ef2 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ConsolePageModelPipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/ConsolePageModelPipeline.java @@ -1,4 +1,4 @@ -package us.codecraft.webmagic.annotation; +package us.codecraft.webmagic.oo; import org.apache.commons.lang3.builder.ToStringBuilder; import us.codecraft.webmagic.Task; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ExtractBy.java b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/ExtractBy.java similarity index 92% rename from webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ExtractBy.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/oo/ExtractBy.java index 115a219..00ff7fb 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ExtractBy.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/ExtractBy.java @@ -1,4 +1,4 @@ -package us.codecraft.webmagic.annotation; +package us.codecraft.webmagic.oo; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ExtractByUrl.java b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/ExtractByUrl.java similarity index 90% rename from webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ExtractByUrl.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/oo/ExtractByUrl.java index c40c9ca..715112c 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ExtractByUrl.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/ExtractByUrl.java @@ -1,4 +1,4 @@ -package 
us.codecraft.webmagic.annotation; +package us.codecraft.webmagic.oo; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/FieldExtractor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/FieldExtractor.java similarity index 96% rename from webmagic-core/src/main/java/us/codecraft/webmagic/annotation/FieldExtractor.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/oo/FieldExtractor.java index 4cd09ef..26c1ec6 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/FieldExtractor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/FieldExtractor.java @@ -1,4 +1,4 @@ -package us.codecraft.webmagic.annotation; +package us.codecraft.webmagic.oo; import us.codecraft.webmagic.selector.Selector; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/HelpUrl.java b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/HelpUrl.java similarity index 89% rename from webmagic-core/src/main/java/us/codecraft/webmagic/annotation/HelpUrl.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/oo/HelpUrl.java index e5727f0..1746048 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/HelpUrl.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/HelpUrl.java @@ -1,4 +1,4 @@ -package us.codecraft.webmagic.annotation; +package us.codecraft.webmagic.oo; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/OOSpider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/OOSpider.java similarity index 97% rename from webmagic-core/src/main/java/us/codecraft/webmagic/annotation/OOSpider.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/oo/OOSpider.java index e500745..d41ee9f 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/OOSpider.java 
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/OOSpider.java @@ -1,4 +1,4 @@ -package us.codecraft.webmagic.annotation; +package us.codecraft.webmagic.oo; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; @@ -16,7 +16,6 @@ public class OOSpider extends Spider { * * @param pageProcessor 已定义的抽取规则 */ - private ObjectPageProcessor objectPageProcessor; private ObjectPipeline objectPipeline; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ObjectPageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/ObjectPageProcessor.java similarity index 98% rename from webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ObjectPageProcessor.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/oo/ObjectPageProcessor.java index f375868..dda96b5 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ObjectPageProcessor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/ObjectPageProcessor.java @@ -1,4 +1,4 @@ -package us.codecraft.webmagic.annotation; +package us.codecraft.webmagic.oo; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ObjectPipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/ObjectPipeline.java similarity index 96% rename from webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ObjectPipeline.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/oo/ObjectPipeline.java index 0b3ec4d..a5f02ed 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ObjectPipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/ObjectPipeline.java @@ -1,4 +1,4 @@ -package us.codecraft.webmagic.annotation; +package us.codecraft.webmagic.oo; import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.Task; diff --git 
a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModelExtractor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/PageModelExtractor.java similarity index 99% rename from webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModelExtractor.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/oo/PageModelExtractor.java index f91252f..dc1ef82 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModelExtractor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/PageModelExtractor.java @@ -1,4 +1,4 @@ -package us.codecraft.webmagic.annotation; +package us.codecraft.webmagic.oo; import org.apache.commons.lang3.StringUtils; import us.codecraft.webmagic.Page; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModelPipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/PageModelPipeline.java similarity index 83% rename from webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModelPipeline.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/oo/PageModelPipeline.java index afef926..7406cde 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModelPipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/PageModelPipeline.java @@ -1,4 +1,4 @@ -package us.codecraft.webmagic.annotation; +package us.codecraft.webmagic.oo; import us.codecraft.webmagic.Task; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/TargetUrl.java b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/TargetUrl.java similarity index 89% rename from webmagic-core/src/main/java/us/codecraft/webmagic/annotation/TargetUrl.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/oo/TargetUrl.java index 5303064..7bbb962 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/TargetUrl.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/TargetUrl.java @@ -1,4 
+1,4 @@ -package us.codecraft.webmagic.annotation; +package us.codecraft.webmagic.oo; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/annotation/OschinaBlog.java b/webmagic-core/src/test/java/us/codecraft/webmagic/oo/OschinaBlog.java similarity index 88% rename from webmagic-core/src/test/java/us/codecraft/webmagic/annotation/OschinaBlog.java rename to webmagic-core/src/test/java/us/codecraft/webmagic/oo/OschinaBlog.java index 9b83c01..937eba1 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/annotation/OschinaBlog.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/oo/OschinaBlog.java @@ -1,4 +1,4 @@ -package us.codecraft.webmagic.annotation; +package us.codecraft.webmagic.oo; /** * @author yihua.huang@dianping.com
diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/annotation/TestFetcher.java b/webmagic-core/src/test/java/us/codecraft/webmagic/oo/TestFetcher.java similarity index 90% rename from webmagic-core/src/test/java/us/codecraft/webmagic/annotation/TestFetcher.java rename to webmagic-core/src/test/java/us/codecraft/webmagic/oo/TestFetcher.java index 0026431..e8e3799 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/annotation/TestFetcher.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/oo/TestFetcher.java @@ -1,4 +1,4 @@ -package us.codecraft.webmagic.annotation; +package us.codecraft.webmagic.oo; import org.junit.Test; import us.codecraft.webmagic.Site; diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/annotation/samples/Blog.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/oo/samples/Blog.java similarity index 78% rename from webmagic-samples/src/main/java/us/codecraft/webmagic/annotation/samples/Blog.java rename to webmagic-samples/src/main/java/us/codecraft/webmagic/oo/samples/Blog.java index c0890e1..565c711 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/annotation/samples/Blog.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/oo/samples/Blog.java @@ -1,4 +1,4 @@ -package us.codecraft.webmagic.annotation.samples; +package us.codecraft.webmagic.oo.samples; /** * @author yihua.huang@dianping.com
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/annotation/samples/IteyeBlog.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/oo/samples/IteyeBlog.java similarity index 80% rename from webmagic-samples/src/main/java/us/codecraft/webmagic/annotation/samples/IteyeBlog.java rename to webmagic-samples/src/main/java/us/codecraft/webmagic/oo/samples/IteyeBlog.java index 002a42c..e289c87 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/annotation/samples/IteyeBlog.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/oo/samples/IteyeBlog.java @@ -1,9 +1,9 @@ -package us.codecraft.webmagic.annotation.samples; +package us.codecraft.webmagic.oo.samples; import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.annotation.ExtractBy; -import us.codecraft.webmagic.annotation.OOSpider; -import us.codecraft.webmagic.annotation.TargetUrl; +import us.codecraft.webmagic.oo.ExtractBy; +import us.codecraft.webmagic.oo.OOSpider; +import us.codecraft.webmagic.oo.TargetUrl; /** * @author yihua.huang@dianping.com
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/annotation/samples/OschinaBlog.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/oo/samples/OschinaBlog.java similarity index 81% rename from webmagic-samples/src/main/java/us/codecraft/webmagic/annotation/samples/OschinaBlog.java rename to webmagic-samples/src/main/java/us/codecraft/webmagic/oo/samples/OschinaBlog.java index a5c44b0..4a52a14 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/annotation/samples/OschinaBlog.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/oo/samples/OschinaBlog.java @@ -1,9 +1,9 @@ -package us.codecraft.webmagic.annotation.samples; +package us.codecraft.webmagic.oo.samples; import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.annotation.ExtractBy; -import us.codecraft.webmagic.annotation.OOSpider; -import us.codecraft.webmagic.annotation.TargetUrl; +import us.codecraft.webmagic.oo.ExtractBy; +import us.codecraft.webmagic.oo.OOSpider; +import us.codecraft.webmagic.oo.TargetUrl; /** * @author yihua.huang@dianping.com
From d4de60a56232a18de823e52c0243dec1f89e6524 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 3 Aug 2013 16:35:12 +0800 Subject: [PATCH 18/84] skip test --- .../src/main/java/us/codecraft/webmagic/oo/package.html | 5 +++++ .../src/test/java/us/codecraft/webmagic/oo/TestFetcher.java | 3 ++- 2 files changed, 7 insertions(+), 1 deletion(-) create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/oo/package.html diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/package.html b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/package.html new file mode 100644 index 0000000..b5f80b1 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/package.html @@ -0,0 +1,5 @@ + + +webmagic对抓取器编写的面向对象方式的封装。基于POJO(称为PageModel)及注解即可实现一个PageProcessor。 + + diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/oo/TestFetcher.java b/webmagic-core/src/test/java/us/codecraft/webmagic/oo/TestFetcher.java index e8e3799..56f5a9a 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/oo/TestFetcher.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/oo/TestFetcher.java @@ -1,5 +1,6 @@ package us.codecraft.webmagic.oo; +import org.junit.Ignore; import org.junit.Test; import us.codecraft.webmagic.Site; @@ -10,7 +11,7 @@ import us.codecraft.webmagic.Site; */ public class TestFetcher { -// @Ignore("takes long") + @Ignore("takes long") @Test public void test() { OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog/145796"), OschinaBlog.class) From 65518f76720553bac0ab966ac164e97b469e85e4 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 3 Aug 2013 17:01:25 +0800 Subject: [PATCH 19/84] add list support --- .../us/codecraft/webmagic/oo/ExtractBy.java | 11 +-- .../codecraft/webmagic/oo/ExtractByUrl.java | 2 + .../us/codecraft/webmagic/oo/Extractor.java | 40 +++++++++ .../codecraft/webmagic/oo/FieldExtractor.java | 16 +--- .../us/codecraft/webmagic/oo/OOSpider.java | 5 -- 
.../webmagic/oo/PageModelExtractor.java | 83 ++++++++++++------- .../us/codecraft/webmagic/oo/OschinaBlog.java | 7 +- .../us/codecraft/webmagic/oo/TestFetcher.java | 3 +- 8 files changed, 112 insertions(+), 55 deletions(-) create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/oo/Extractor.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/ExtractBy.java b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/ExtractBy.java index 00ff7fb..71bdc93 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/ExtractBy.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/ExtractBy.java @@ -10,16 +10,17 @@ import java.lang.annotation.Target; * Time: 下午8:40
*/ @Retention(java.lang.annotation.RetentionPolicy.RUNTIME) -@Target({ElementType.FIELD}) +@Target({ElementType.FIELD,ElementType.TYPE}) public @interface ExtractBy { - - //TODO: add list support String value(); - public enum Type {XPath, Regex, Css}; + public enum Type {XPath2, XPath, Regex, Css} - Type type() default Type.XPath; + Type type() default Type.XPath2; boolean notNull() default true; + + boolean multi() default false; + } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/ExtractByUrl.java b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/ExtractByUrl.java index 715112c..e86f08f 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/ExtractByUrl.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/ExtractByUrl.java @@ -17,4 +17,6 @@ public @interface ExtractByUrl{ boolean notNull() default true; + boolean multi() default false; + } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/Extractor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/Extractor.java new file mode 100644 index 0000000..f0607cf --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/Extractor.java @@ -0,0 +1,40 @@ +package us.codecraft.webmagic.oo; + +import us.codecraft.webmagic.selector.Selector; + +/** + * @author code4crafter@gmail.com
+ * @date: 13-8-1
+ * Time: 下午9:48
+ */ +class Extractor { + + protected final Selector selector; + + protected final Source source; + + protected final boolean notNull; + + protected final boolean multi; + + static enum Source {Html, Url} + + public Extractor(Selector selector, Source source, boolean notNull, boolean multi) { + this.selector = selector; + this.source = source; + this.notNull = notNull; + this.multi = multi; + } + + Selector getSelector() { + return selector; + } + + Source getSource() { + return source; + } + + boolean isNotNull() { + return notNull; + } +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/FieldExtractor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/FieldExtractor.java index 26c1ec6..2a6bcf7 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/FieldExtractor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/FieldExtractor.java @@ -10,25 +10,15 @@ import java.lang.reflect.Method; * @date: 13-8-1
* Time: 下午9:48
*/ -class FieldExtractor { +class FieldExtractor extends Extractor{ private final Field field; - private final Selector selector; - - private final Source source; - private Method setterMethod; - private final boolean notNull; - - static enum Source {Html, Url} - - public FieldExtractor(Field field, Selector selector, Source source, boolean notNull) { + public FieldExtractor(Field field, Selector selector, Source source, boolean notNull,boolean multi) { + super(selector, source, notNull,multi); this.field = field; - this.selector = selector; - this.source = source; - this.notNull = notNull; } Field getField() { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/OOSpider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/OOSpider.java index d41ee9f..5f523ed 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/OOSpider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/OOSpider.java @@ -2,7 +2,6 @@ package us.codecraft.webmagic.oo; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; -import us.codecraft.webmagic.pipeline.Pipeline; /** * @author code4crafter@gmail.com
@@ -50,8 +49,4 @@ public class OOSpider extends Spider { return this; } - public Spider pipeline(Pipeline pipeline) { - throw new UnsupportedOperationException("Sorry, OOSpider can only use ObjectPipeline"); - } - } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/PageModelExtractor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/PageModelExtractor.java index dc1ef82..e743e06 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/PageModelExtractor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/PageModelExtractor.java @@ -2,10 +2,7 @@ package us.codecraft.webmagic.oo; import org.apache.commons.lang3.StringUtils; import us.codecraft.webmagic.Page; -import us.codecraft.webmagic.selector.CssSelector; -import us.codecraft.webmagic.selector.RegexSelector; -import us.codecraft.webmagic.selector.Selector; -import us.codecraft.webmagic.selector.XpathSelector; +import us.codecraft.webmagic.selector.*; import java.lang.annotation.Annotation; import java.lang.reflect.Field; @@ -42,20 +39,22 @@ class PageModelExtractor { this.clazz = clazz; initTargetUrlPatterns(); fieldExtractors = new ArrayList(); - if (clazz.isAssignableFrom(AfterExtractor.class)){ + if (clazz.isAssignableFrom(AfterExtractor.class)) { try { - afterExtractor=(AfterExtractor)clazz.newInstance(); + afterExtractor = (AfterExtractor) clazz.newInstance(); } catch (Exception e) { throw new IllegalArgumentException(e); } } for (Field field : clazz.getDeclaredFields()) { field.setAccessible(true); - if (!field.getType().isAssignableFrom(String.class)){ - throw new IllegalStateException("Field "+field.getName()+" must be string"); - } ExtractBy extractBy = field.getAnnotation(ExtractBy.class); if (extractBy != null) { + if (!extractBy.multi() && !field.getType().isAssignableFrom(String.class)) { + throw new IllegalStateException("Field " + field.getName() + " must be string"); + } else if (extractBy.multi() && !field.getType().isAssignableFrom(List.class)) { + 
throw new IllegalStateException("Field " + field.getName() + " must be list"); + } String value = extractBy.value(); Selector selector; switch (extractBy.type()) { @@ -68,10 +67,13 @@ class PageModelExtractor { case XPath: selector = new XpathSelector(value); break; + case XPath2: + selector = new Xpath2Selector(value); + break; default: - selector = new XpathSelector(value); + selector = new Xpath2Selector(value); } - FieldExtractor fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.Html, extractBy.notNull()); + FieldExtractor fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.Html, extractBy.notNull(), extractBy.multi()); Method setterMethod = getSetterMethod(clazz, field); if (setterMethod != null) { fieldExtractor.setSetterMethod(setterMethod); @@ -80,11 +82,16 @@ class PageModelExtractor { } ExtractByUrl extractByUrl = field.getAnnotation(ExtractByUrl.class); if (extractByUrl != null) { + if (!extractByUrl.multi() && !field.getType().isAssignableFrom(String.class)) { + throw new IllegalStateException("Field " + field.getName() + " must be string"); + } else if (extractByUrl.multi() && !field.getType().isAssignableFrom(List.class)) { + throw new IllegalStateException("Field " + field.getName() + " must be list"); + } String regexPattern = extractByUrl.value(); if (regexPattern.trim().equals("")) { regexPattern = ".*"; } - FieldExtractor fieldExtractor = new FieldExtractor(field, new RegexSelector(regexPattern), FieldExtractor.Source.Url, extractByUrl.notNull()); + FieldExtractor fieldExtractor = new FieldExtractor(field, new RegexSelector(regexPattern), FieldExtractor.Source.Url, extractByUrl.notNull(), extractByUrl.multi()); Method setterMethod = getSetterMethod(clazz, field); if (setterMethod != null) { fieldExtractor.setSetterMethod(setterMethod); @@ -138,24 +145,42 @@ class PageModelExtractor { try { o = clazz.newInstance(); for (FieldExtractor fieldExtractor : fieldExtractors) { - String value; - switch 
(fieldExtractor.getSource()) { - case Html: - value = fieldExtractor.getSelector().select(page.getHtml().toString()); - break; - case Url: - value = fieldExtractor.getSelector().select(page.getUrl().toString()); - break; - default: - value = fieldExtractor.getSelector().select(page.getHtml().toString()); + if (fieldExtractor.multi) { + List value; + switch (fieldExtractor.getSource()) { + case Html: + value = fieldExtractor.getSelector().selectList(page.getHtml().toString()); + break; + case Url: + value = fieldExtractor.getSelector().selectList(page.getUrl().toString()); + break; + default: + value = fieldExtractor.getSelector().selectList(page.getHtml().toString()); + } + if ((value == null || value.size() == 0) && fieldExtractor.isNotNull()) { + page.getResultItems().setSkip(true); + } + setField(o, fieldExtractor, value); + } else { + String value; + switch (fieldExtractor.getSource()) { + case Html: + value = fieldExtractor.getSelector().select(page.getHtml().toString()); + break; + case Url: + value = fieldExtractor.getSelector().select(page.getUrl().toString()); + break; + default: + value = fieldExtractor.getSelector().select(page.getHtml().toString()); + } + if (value == null && fieldExtractor.isNotNull()) { + page.getResultItems().setSkip(true); + } + setField(o, fieldExtractor, value); } - if (value==null&&fieldExtractor.isNotNull()){ - page.getResultItems().setSkip(true); - } - setField(o, fieldExtractor, value); } - if (afterExtractor!=null){ - afterExtractor.afterProcess(page,o); + if (afterExtractor != null) { + afterExtractor.afterProcess(page, o); } } catch (InstantiationException e) { e.printStackTrace(); @@ -167,7 +192,7 @@ class PageModelExtractor { return o; } - private void setField(Object o, FieldExtractor fieldExtractor, String value) throws IllegalAccessException, InvocationTargetException { + private void setField(Object o, FieldExtractor fieldExtractor, Object value) throws IllegalAccessException, InvocationTargetException { if 
(fieldExtractor.getSetterMethod() != null) { fieldExtractor.getSetterMethod().invoke(o, value); } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/oo/OschinaBlog.java b/webmagic-core/src/test/java/us/codecraft/webmagic/oo/OschinaBlog.java index 937eba1..b8c7e4a 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/oo/OschinaBlog.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/oo/OschinaBlog.java @@ -1,5 +1,7 @@ package us.codecraft.webmagic.oo; +import java.util.List; + /** * @author yihua.huang@dianping.com
* @date: 13-8-1
@@ -11,7 +13,10 @@ public class OschinaBlog { @ExtractBy("//title") private String title; - @ExtractBy(value = "div.BlogContent",type = ExtractBy.Type.Css) + @ExtractBy(value = "div.BlogContent", type = ExtractBy.Type.Css) private String content; + @ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true) + private List tags; + } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/oo/TestFetcher.java b/webmagic-core/src/test/java/us/codecraft/webmagic/oo/TestFetcher.java index 56f5a9a..e8e3799 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/oo/TestFetcher.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/oo/TestFetcher.java @@ -1,6 +1,5 @@ package us.codecraft.webmagic.oo; -import org.junit.Ignore; import org.junit.Test; import us.codecraft.webmagic.Site; @@ -11,7 +10,7 @@ import us.codecraft.webmagic.Site; */ public class TestFetcher { - @Ignore("takes long") +// @Ignore("takes long") @Test public void test() { OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog/145796"), OschinaBlog.class) From 69245e8c03e88ea89e94a9c4ec076b843fbc6f93 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 3 Aug 2013 17:17:59 +0800 Subject: [PATCH 20/84] fix Class.assinable bug --- .../us/codecraft/webmagic/oo/PageModelExtractor.java | 10 +++++----- .../java/us/codecraft/webmagic/oo/OschinaBlog.java | 7 ++++++- .../java/us/codecraft/webmagic/oo/TestFetcher.java | 9 ++++++++- 3 files changed, 19 insertions(+), 7 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/PageModelExtractor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/PageModelExtractor.java index e743e06..83a4d31 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/PageModelExtractor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/PageModelExtractor.java @@ -39,7 +39,7 @@ class PageModelExtractor { this.clazz = clazz; initTargetUrlPatterns(); fieldExtractors = new ArrayList(); - 
if (clazz.isAssignableFrom(AfterExtractor.class)) { + if (AfterExtractor.class.isAssignableFrom(clazz)) { try { afterExtractor = (AfterExtractor) clazz.newInstance(); } catch (Exception e) { @@ -50,9 +50,9 @@ class PageModelExtractor { field.setAccessible(true); ExtractBy extractBy = field.getAnnotation(ExtractBy.class); if (extractBy != null) { - if (!extractBy.multi() && !field.getType().isAssignableFrom(String.class)) { + if (!extractBy.multi() && !String.class.isAssignableFrom(field.getType())) { throw new IllegalStateException("Field " + field.getName() + " must be string"); - } else if (extractBy.multi() && !field.getType().isAssignableFrom(List.class)) { + } else if (extractBy.multi() && !List.class.isAssignableFrom(field.getType())) { throw new IllegalStateException("Field " + field.getName() + " must be list"); } String value = extractBy.value(); @@ -82,9 +82,9 @@ class PageModelExtractor { } ExtractByUrl extractByUrl = field.getAnnotation(ExtractByUrl.class); if (extractByUrl != null) { - if (!extractByUrl.multi() && !field.getType().isAssignableFrom(String.class)) { + if (!extractByUrl.multi() && !String.class.isAssignableFrom(field.getType())) { throw new IllegalStateException("Field " + field.getName() + " must be string"); - } else if (extractByUrl.multi() && !field.getType().isAssignableFrom(List.class)) { + } else if (extractByUrl.multi() && !List.class.isAssignableFrom(field.getType())) { throw new IllegalStateException("Field " + field.getName() + " must be list"); } String regexPattern = extractByUrl.value(); diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/oo/OschinaBlog.java b/webmagic-core/src/test/java/us/codecraft/webmagic/oo/OschinaBlog.java index b8c7e4a..c82ef23 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/oo/OschinaBlog.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/oo/OschinaBlog.java @@ -1,5 +1,7 @@ package us.codecraft.webmagic.oo; +import us.codecraft.webmagic.Page; + import 
java.util.List; /** @@ -8,7 +10,7 @@ import java.util.List; * Time: 下午10:18
*/ @TargetUrl("http://my.oschina.net/flashsword/blog/*") -public class OschinaBlog { +public class OschinaBlog implements AfterExtractor{ @ExtractBy("//title") private String title; @@ -19,4 +21,7 @@ public class OschinaBlog { @ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true) private List tags; + @Override + public void afterProcess(Page page, OschinaBlog oschinaBlog) { + } } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/oo/TestFetcher.java b/webmagic-core/src/test/java/us/codecraft/webmagic/oo/TestFetcher.java index e8e3799..f4525f0 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/oo/TestFetcher.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/oo/TestFetcher.java @@ -1,8 +1,12 @@ package us.codecraft.webmagic.oo; +import org.junit.Ignore; import org.junit.Test; import us.codecraft.webmagic.Site; +import java.util.ArrayList; +import java.util.List; + /** * @author yihua.huang@dianping.com
* @date: 13-8-1
@@ -10,12 +14,15 @@ import us.codecraft.webmagic.Site; */ public class TestFetcher { -// @Ignore("takes long") + @Ignore("takes long") @Test public void test() { + System.out.println(List.class.isAssignableFrom(ArrayList.class)); OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog/145796"), OschinaBlog.class) .run(); } + + } From aca165b13257f3fa4116863403efa1ec28b3f468 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 3 Aug 2013 17:38:36 +0800 Subject: [PATCH 21/84] add and or selector --- .../us/codecraft/webmagic/oo/HelpUrl.java | 2 + .../us/codecraft/webmagic/oo/TargetUrl.java | 2 + .../webmagic/selector/AndSelector.java | 53 +++++++++++++++++++ .../webmagic/selector/OrSelector.java | 41 ++++++++++++++ .../us/codecraft/webmagic/oo/OschinaBlog.java | 3 +- .../us/codecraft/webmagic/oo/TestFetcher.java | 4 -- 6 files changed, 100 insertions(+), 5 deletions(-) create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/selector/AndSelector.java create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/selector/OrSelector.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/HelpUrl.java b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/HelpUrl.java index 1746048..a8ed995 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/HelpUrl.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/HelpUrl.java @@ -14,4 +14,6 @@ import java.lang.annotation.Target; public @interface HelpUrl { String[] value(); + + String sourceRegion() default ""; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/TargetUrl.java b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/TargetUrl.java index 7bbb962..77b5a82 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/TargetUrl.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/TargetUrl.java @@ -15,4 +15,6 @@ public @interface TargetUrl { String[] value(); + String sourceRegion() default ""; + } diff --git 
a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AndSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AndSelector.java new file mode 100644 index 0000000..98481ef --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AndSelector.java @@ -0,0 +1,53 @@ +package us.codecraft.webmagic.selector; + +import java.util.ArrayList; +import java.util.List; + +/** + * @author yihua.huang@dianping.com
+ * @date: 13-8-3
+ * Time: 下午5:29
+ */ +public class AndSelector implements Selector { + + private List selectors = new ArrayList(); + + public AndSelector(Selector... selectors) { + for (Selector selector : selectors) { + this.selectors.add(selector); + } + } + + @Override + public String select(String text) { + for (Selector selector : selectors) { + if (text == null) { + return null; + } + text = selector.select(text); + } + return text; + } + + @Override + public List selectList(String text) { + List results = new ArrayList(); + boolean first = true; + for (Selector selector : selectors) { + if (first) { + results = selector.selectList(text); + first = false; + } else { + List resultsTemp = new ArrayList(); + for (String result : results) { + resultsTemp.addAll(selector.selectList(result)); + } + results = resultsTemp; + if (results == null || results.size() == 0) { + return results; + } + } + } + return results; + } +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/OrSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/OrSelector.java new file mode 100644 index 0000000..2cdd870 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/OrSelector.java @@ -0,0 +1,41 @@ +package us.codecraft.webmagic.selector; + +import java.util.ArrayList; +import java.util.List; + +/** + * @author yihua.huang@dianping.com
+ * @date: 13-8-3
+ * Time: 下午5:29
+ */ +public class OrSelector implements Selector { + + private List selectors = new ArrayList(); + + public OrSelector(Selector... selectors) { + for (Selector selector : selectors) { + this.selectors.add(selector); + } + } + + @Override + public String select(String text) { + for (Selector selector : selectors) { + text = selector.select(text); + if (text!=null){ + return text; + } + } + return null; + } + + @Override + public List selectList(String text) { + List results = new ArrayList(); + for (Selector selector : selectors) { + List strings = selector.selectList(text); + results.addAll(strings); + } + return results; + } +} diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/oo/OschinaBlog.java b/webmagic-core/src/test/java/us/codecraft/webmagic/oo/OschinaBlog.java index c82ef23..85d4817 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/oo/OschinaBlog.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/oo/OschinaBlog.java @@ -10,7 +10,7 @@ import java.util.List; * Time: 下午10:18
*/ @TargetUrl("http://my.oschina.net/flashsword/blog/*") -public class OschinaBlog implements AfterExtractor{ +public class OschinaBlog implements AfterExtractor { @ExtractBy("//title") private String title; @@ -23,5 +23,6 @@ public class OschinaBlog implements AfterExtractor{ @Override public void afterProcess(Page page, OschinaBlog oschinaBlog) { + content = null; } } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/oo/TestFetcher.java b/webmagic-core/src/test/java/us/codecraft/webmagic/oo/TestFetcher.java index f4525f0..289cd4f 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/oo/TestFetcher.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/oo/TestFetcher.java @@ -4,9 +4,6 @@ import org.junit.Ignore; import org.junit.Test; import us.codecraft.webmagic.Site; -import java.util.ArrayList; -import java.util.List; - /** * @author yihua.huang@dianping.com
* @date: 13-8-1
@@ -17,7 +14,6 @@ public class TestFetcher { @Ignore("takes long") @Test public void test() { - System.out.println(List.class.isAssignableFrom(ArrayList.class)); OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog/145796"), OschinaBlog.class) .run(); From 145628557d39d162fc801c86f2f237a8d36f08f5 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 3 Aug 2013 18:01:17 +0800 Subject: [PATCH 22/84] update afterextract api --- .../codecraft/webmagic/oo/AfterExtractor.java | 4 +- .../webmagic/oo/ObjectPageProcessor.java | 28 +++++++++---- .../webmagic/oo/PageModelExtractor.java | 41 ++++++++++++------- .../us/codecraft/webmagic/oo/OschinaBlog.java | 6 +-- 4 files changed, 52 insertions(+), 27 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/AfterExtractor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/AfterExtractor.java index cb9788b..79feaaf 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/AfterExtractor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/AfterExtractor.java @@ -9,7 +9,7 @@ import us.codecraft.webmagic.Page; * @date: 13-8-3
* Time: 上午9:42
*/ -public interface AfterExtractor { +public interface AfterExtractor { - public void afterProcess(Page page, T t); + public void afterProcess(Page page); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/ObjectPageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/ObjectPageProcessor.java index dda96b5..c280acd 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/ObjectPageProcessor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/ObjectPageProcessor.java @@ -4,11 +4,13 @@ import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.processor.PageProcessor; +import us.codecraft.webmagic.selector.Selector; import java.util.ArrayList; import java.util.HashSet; import java.util.List; import java.util.Set; +import java.util.regex.Matcher; import java.util.regex.Pattern; /** @@ -33,7 +35,7 @@ public class ObjectPageProcessor implements PageProcessor { } - public ObjectPageProcessor addPageModel(Class clazz){ + public ObjectPageProcessor addPageModel(Class clazz) { PageModelExtractor pageModelExtractor = PageModelExtractor.create(clazz); targetUrlPatterns.addAll(pageModelExtractor.getTargetUrlPatterns()); targetUrlPatterns.addAll(pageModelExtractor.getHelpUrlPatterns()); @@ -49,22 +51,34 @@ public class ObjectPageProcessor implements PageProcessor { public void process(Page page) { for (PageModelExtractor pageModelExtractor : pageModelExtractorList) { Object process = pageModelExtractor.process(page); - if (process==null){ + if (process == null) { page.getResultItems().setSkip(true); } postProcessPageModel(pageModelExtractor.getClazz(), process); page.putField(pageModelExtractor.getClazz().getCanonicalName(), process); + extractLinks(page, pageModelExtractor.getHelpUrlRegionSelector(), pageModelExtractor.getHelpUrlPatterns()); + extractLinks(page, pageModelExtractor.getTargetUrlRegionSelector(), 
pageModelExtractor.getTargetUrlPatterns()); } - for (String link : page.getHtml().links().all()) { - for (Pattern targetUrlPattern : targetUrlPatterns) { - if (targetUrlPattern.matcher(link).matches()){ - page.addTargetRequest(new Request(link)); + } + + private void extractLinks(Page page, Selector urlRegionSelector, List urlPatterns) { + List links; + if (urlRegionSelector == null) { + links = page.getHtml().links().all(); + } else { + links = urlRegionSelector.selectList(page.getHtml().toString()); + } + for (String link : links) { + for (Pattern targetUrlPattern : urlPatterns) { + Matcher matcher = targetUrlPattern.matcher(link); + if (matcher.find()) { + page.addTargetRequest(new Request(matcher.group(1))); } } } } - protected void postProcessPageModel(Class clazz, Object object){ + protected void postProcessPageModel(Class clazz, Object object) { } @Override diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/PageModelExtractor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/PageModelExtractor.java index 83a4d31..8a0d81b 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/PageModelExtractor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/PageModelExtractor.java @@ -21,14 +21,16 @@ class PageModelExtractor { private List targetUrlPatterns = new ArrayList(); + private Selector targetUrlRegionSelector; + private List helpUrlPatterns = new ArrayList(); + private Selector helpUrlRegionSelector; + private Class clazz; private List fieldExtractors; - private AfterExtractor afterExtractor; - public static PageModelExtractor create(Class clazz) { PageModelExtractor pageModelExtractor = new PageModelExtractor(); pageModelExtractor.init(clazz); @@ -39,13 +41,6 @@ class PageModelExtractor { this.clazz = clazz; initTargetUrlPatterns(); fieldExtractors = new ArrayList(); - if (AfterExtractor.class.isAssignableFrom(clazz)) { - try { - afterExtractor = (AfterExtractor) clazz.newInstance(); - } catch (Exception e) { - 
throw new IllegalArgumentException(e); - } - } for (Field field : clazz.getDeclaredFields()) { field.setAccessible(true); ExtractBy extractBy = field.getAnnotation(ExtractBy.class); @@ -117,16 +112,24 @@ class PageModelExtractor { if (annotation == null) { targetUrlPatterns.add(Pattern.compile(".*")); } else { - String[] value = ((TargetUrl) annotation).value(); + TargetUrl targetUrl = (TargetUrl) annotation; + String[] value = targetUrl.value(); for (String s : value) { - targetUrlPatterns.add(Pattern.compile(s.replace(".", "\\.").replace("*", "[^\"'#]*"))); + targetUrlPatterns.add(Pattern.compile("("+s.replace(".", "\\.").replace("*", "[^\"'#]*")+")")); + } + if (!targetUrl.sourceRegion().equals("")){ + targetUrlRegionSelector = new Xpath2Selector(targetUrl.sourceRegion()); } } annotation = clazz.getAnnotation(HelpUrl.class); if (annotation != null) { - String[] value = ((HelpUrl) annotation).value(); + HelpUrl helpUrl = (HelpUrl) annotation; + String[] value = helpUrl.value(); for (String s : value) { - helpUrlPatterns.add(Pattern.compile(s.replace(".", "\\.").replace("*", "[^\"'#]*"))); + helpUrlPatterns.add(Pattern.compile("("+s.replace(".", "\\.").replace("*", "[^\"'#]*")+")")); + } + if (!helpUrl.sourceRegion().equals("")){ + helpUrlRegionSelector = new Xpath2Selector(helpUrl.sourceRegion()); } } } @@ -179,8 +182,8 @@ class PageModelExtractor { setField(o, fieldExtractor, value); } } - if (afterExtractor != null) { - afterExtractor.afterProcess(page, o); + if (AfterExtractor.class.isAssignableFrom(clazz)) { + ((AfterExtractor)o).afterProcess(page); } } catch (InstantiationException e) { e.printStackTrace(); @@ -210,4 +213,12 @@ class PageModelExtractor { List getHelpUrlPatterns() { return helpUrlPatterns; } + + Selector getTargetUrlRegionSelector() { + return targetUrlRegionSelector; + } + + Selector getHelpUrlRegionSelector() { + return helpUrlRegionSelector; + } } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/oo/OschinaBlog.java 
b/webmagic-core/src/test/java/us/codecraft/webmagic/oo/OschinaBlog.java index 85d4817..0f64aef 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/oo/OschinaBlog.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/oo/OschinaBlog.java @@ -9,8 +9,8 @@ import java.util.List; * @date: 13-8-1
* Time: 下午10:18
*/ -@TargetUrl("http://my.oschina.net/flashsword/blog/*") -public class OschinaBlog implements AfterExtractor { +@TargetUrl(value="http://my.oschina.net/flashsword/blog/*",sourceRegion = "//div[@class='BlogLinks']") +public class OschinaBlog implements AfterExtractor { @ExtractBy("//title") private String title; @@ -22,7 +22,7 @@ public class OschinaBlog implements AfterExtractor { private List tags; @Override - public void afterProcess(Page page, OschinaBlog oschinaBlog) { + public void afterProcess(Page page) { content = null; } } From bfadac756a6db834b04c6033ca44281b9c85c921 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 3 Aug 2013 18:36:03 +0800 Subject: [PATCH 23/84] fix an attribute bug --- .../webmagic/selector/Xpath2Selector.java | 29 +++++++++++++------ .../us/codecraft/webmagic/oo/OschinaBlog.java | 14 +++++++-- .../webmagic/selector/XpathSelectorTest.java | 2 +- 3 files changed, 32 insertions(+), 13 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java index 99112ca..98b1efe 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java @@ -8,6 +8,7 @@ import org.htmlcleaner.DomSerializer; import org.htmlcleaner.HtmlCleaner; import org.htmlcleaner.TagNode; import org.w3c.dom.Document; +import org.w3c.dom.Node; import org.w3c.dom.NodeList; import javax.xml.namespace.NamespaceContext; @@ -70,7 +71,7 @@ public class Xpath2Selector implements Selector { private XPath2NamespaceContext() { put("fn", NamespaceConstant.FN); - put("xslt",NamespaceConstant.XSLT); + put("xslt", NamespaceConstant.XSLT); } @Override @@ -116,15 +117,20 @@ public class Xpath2Selector implements Selector { result = xPathExpression.evaluate(document, XPathConstants.STRING); } if (result instanceof NodeList) { - StreamResult xmlOutput = 
new StreamResult(new StringWriter()); - Transformer transformer = TransformerFactory.newInstance().newTransformer(); - transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes"); NodeList nodeList = (NodeList) result; if (nodeList.getLength() == 0) { return null; } - transformer.transform(new DOMSource(nodeList.item(0)), xmlOutput); - return xmlOutput.getWriter().toString(); + Node item = nodeList.item(0); + if (item.getNodeType() == Node.ATTRIBUTE_NODE || item.getNodeType() == Node.TEXT_NODE) { + return item.getTextContent(); + } else { + StreamResult xmlOutput = new StreamResult(new StringWriter()); + Transformer transformer = TransformerFactory.newInstance().newTransformer(); + transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes"); + transformer.transform(new DOMSource(item), xmlOutput); + return xmlOutput.getWriter().toString(); + } } return result.toString(); } catch (Exception e) { @@ -152,9 +158,14 @@ public class Xpath2Selector implements Selector { StreamResult xmlOutput = new StreamResult(); transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes"); for (int i = 0; i < nodeList.getLength(); i++) { - xmlOutput.setWriter(new StringWriter()); - transformer.transform(new DOMSource(nodeList.item(i)), xmlOutput); - results.add(xmlOutput.getWriter().toString()); + Node item = nodeList.item(i); + if (item.getNodeType() == Node.ATTRIBUTE_NODE || item.getNodeType() == Node.TEXT_NODE) { + results.add(item.getTextContent()); + } else { + xmlOutput.setWriter(new StringWriter()); + transformer.transform(new DOMSource(item), xmlOutput); + results.add(xmlOutput.getWriter().toString()); + } } } else { results.add(result.toString()); diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/oo/OschinaBlog.java b/webmagic-core/src/test/java/us/codecraft/webmagic/oo/OschinaBlog.java index 0f64aef..98543b0 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/oo/OschinaBlog.java +++ 
b/webmagic-core/src/test/java/us/codecraft/webmagic/oo/OschinaBlog.java @@ -1,6 +1,7 @@ package us.codecraft.webmagic.oo; import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; import java.util.List; @@ -9,7 +10,7 @@ import java.util.List; * @date: 13-8-1
* Time: 下午10:18
*/ -@TargetUrl(value="http://my.oschina.net/flashsword/blog/*",sourceRegion = "//div[@class='BlogLinks']") +@TargetUrl(value="http://my.oschina.net/flashsword/blog/*",sourceRegion = "//div[@class='BlogLinks']//a/@href") public class OschinaBlog implements AfterExtractor { @ExtractBy("//title") @@ -23,6 +24,13 @@ public class OschinaBlog implements AfterExtractor { @Override public void afterProcess(Page page) { - content = null; + System.out.println("title:\t"+title); + System.out.println("content:\t"+content); + System.out.println("tags:\t" + tags); } -} + + public static void main(String[] args) { + OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog/145796"), OschinaBlog.class) + .run(); + } +} \ No newline at end of file diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java index 2b8e15d..2f663c9 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java @@ -1418,7 +1418,7 @@ public class XpathSelectorTest { @Test public void testXpath2Selector() { - Xpath2Selector xpath2Selector = new Xpath2Selector("//a"); + Xpath2Selector xpath2Selector = new Xpath2Selector("//a/@href"); String select = xpath2Selector.select(html); Assert.assertNotNull(select); } From b393e3832099f09372500cd7164119616f13a455 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 3 Aug 2013 20:42:29 +0800 Subject: [PATCH 24/84] add multi entity extract --- .../webmagic/oo/ObjectPageProcessor.java | 2 +- .../codecraft/webmagic/oo/ObjectPipeline.java | 13 ++++- .../webmagic/oo/PageModelExtractor.java | 55 ++++++++++++++----- .../webmagic/oo/samples/OschinaAnswer.java | 31 +++++++++++ 4 files changed, 86 insertions(+), 15 deletions(-) create mode 100644 
webmagic-samples/src/main/java/us/codecraft/webmagic/oo/samples/OschinaAnswer.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/ObjectPageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/ObjectPageProcessor.java index c280acd..a02e446 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/ObjectPageProcessor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/ObjectPageProcessor.java @@ -51,7 +51,7 @@ public class ObjectPageProcessor implements PageProcessor { public void process(Page page) { for (PageModelExtractor pageModelExtractor : pageModelExtractorList) { Object process = pageModelExtractor.process(page); - if (process == null) { + if (process == null || (process instanceof List && ((List) process).size() == 0)) { page.getResultItems().setSkip(true); } postProcessPageModel(pageModelExtractor.getClazz(), process); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/ObjectPipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/ObjectPipeline.java index a5f02ed..54ae2ef 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/ObjectPipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/ObjectPipeline.java @@ -4,6 +4,8 @@ import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.pipeline.Pipeline; +import java.lang.annotation.Annotation; +import java.util.List; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; @@ -32,7 +34,16 @@ public class ObjectPipeline implements Pipeline { for (Map.Entry classPageModelPipelineEntry : pageModelPipelines.entrySet()) { Object o = resultItems.get(classPageModelPipelineEntry.getKey().getCanonicalName()); if (o != null) { - classPageModelPipelineEntry.getValue().process(o, task); + Annotation annotation = classPageModelPipelineEntry.getKey().getAnnotation(ExtractBy.class); + ExtractBy extractBy = (ExtractBy) annotation; + if (extractBy.multi()) { 
+ List list = (List) o; + for (Object o1 : list) { + classPageModelPipelineEntry.getValue().process(o1, task); + } + } else { + classPageModelPipelineEntry.getValue().process(o, task); + } } } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/PageModelExtractor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/PageModelExtractor.java index 8a0d81b..d3d5335 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/PageModelExtractor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/PageModelExtractor.java @@ -31,6 +31,8 @@ class PageModelExtractor { private List fieldExtractors; + private Extractor extractor; + public static PageModelExtractor create(Class clazz) { PageModelExtractor pageModelExtractor = new PageModelExtractor(); pageModelExtractor.init(clazz); @@ -39,7 +41,7 @@ class PageModelExtractor { private void init(Class clazz) { this.clazz = clazz; - initTargetUrlPatterns(); + initClassExtractors(); fieldExtractors = new ArrayList(); for (Field field : clazz.getDeclaredFields()) { field.setAccessible(true); @@ -107,7 +109,7 @@ class PageModelExtractor { } } - private void initTargetUrlPatterns() { + private void initClassExtractors() { Annotation annotation = clazz.getAnnotation(TargetUrl.class); if (annotation == null) { targetUrlPatterns.add(Pattern.compile(".*")); @@ -115,9 +117,9 @@ class PageModelExtractor { TargetUrl targetUrl = (TargetUrl) annotation; String[] value = targetUrl.value(); for (String s : value) { - targetUrlPatterns.add(Pattern.compile("("+s.replace(".", "\\.").replace("*", "[^\"'#]*")+")")); + targetUrlPatterns.add(Pattern.compile("(" + s.replace(".", "\\.").replace("*", "[^\"'#]*") + ")")); } - if (!targetUrl.sourceRegion().equals("")){ + if (!targetUrl.sourceRegion().equals("")) { targetUrlRegionSelector = new Xpath2Selector(targetUrl.sourceRegion()); } } @@ -126,12 +128,17 @@ class PageModelExtractor { HelpUrl helpUrl = (HelpUrl) annotation; String[] value = helpUrl.value(); for 
(String s : value) { - helpUrlPatterns.add(Pattern.compile("("+s.replace(".", "\\.").replace("*", "[^\"'#]*")+")")); + helpUrlPatterns.add(Pattern.compile("(" + s.replace(".", "\\.").replace("*", "[^\"'#]*") + ")")); } - if (!helpUrl.sourceRegion().equals("")){ + if (!helpUrl.sourceRegion().equals("")) { helpUrlRegionSelector = new Xpath2Selector(helpUrl.sourceRegion()); } } + annotation = clazz.getAnnotation(ExtractBy.class); + if (annotation != null) { + ExtractBy extractBy = (ExtractBy) annotation; + extractor = new Extractor(new Xpath2Selector(extractBy.value()), Extractor.Source.Html, extractBy.notNull(), extractBy.multi()); + } } public Object process(Page page) { @@ -144,6 +151,28 @@ class PageModelExtractor { if (!matched) { return null; } + if (extractor == null) { + return processSingle(page,page.getHtml().toString()); + } else { + if (extractor.multi){ + List os = new ArrayList(); + List list = extractor.getSelector().selectList(page.getHtml().toString()); + for (String s : list) { + Object o = processSingle(page, s); + if (o!=null){ + os.add(o); + } + } + return os; + }else { + String select = extractor.getSelector().select(page.getHtml().toString()); + Object o = processSingle(page, select); + return o; + } + } + } + + private Object processSingle(Page page,String html) { Object o = null; try { o = clazz.newInstance(); @@ -152,38 +181,38 @@ class PageModelExtractor { List value; switch (fieldExtractor.getSource()) { case Html: - value = fieldExtractor.getSelector().selectList(page.getHtml().toString()); + value = fieldExtractor.getSelector().selectList(html); break; case Url: value = fieldExtractor.getSelector().selectList(page.getUrl().toString()); break; default: - value = fieldExtractor.getSelector().selectList(page.getHtml().toString()); + value = fieldExtractor.getSelector().selectList(html); } if ((value == null || value.size() == 0) && fieldExtractor.isNotNull()) { - page.getResultItems().setSkip(true); + return null; } setField(o, 
fieldExtractor, value); } else { String value; switch (fieldExtractor.getSource()) { case Html: - value = fieldExtractor.getSelector().select(page.getHtml().toString()); + value = fieldExtractor.getSelector().select(html); break; case Url: value = fieldExtractor.getSelector().select(page.getUrl().toString()); break; default: - value = fieldExtractor.getSelector().select(page.getHtml().toString()); + value = fieldExtractor.getSelector().select(html); } if (value == null && fieldExtractor.isNotNull()) { - page.getResultItems().setSkip(true); + return null; } setField(o, fieldExtractor, value); } } if (AfterExtractor.class.isAssignableFrom(clazz)) { - ((AfterExtractor)o).afterProcess(page); + ((AfterExtractor) o).afterProcess(page); } } catch (InstantiationException e) { e.printStackTrace(); diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/oo/samples/OschinaAnswer.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/oo/samples/OschinaAnswer.java new file mode 100644 index 0000000..fd04b1d --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/oo/samples/OschinaAnswer.java @@ -0,0 +1,31 @@ +package us.codecraft.webmagic.oo.samples; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.oo.*; + +/** + * @author yihua.huang@dianping.com
+ * @date: 13-8-3
+ * Time: 下午8:25
+ */ +@TargetUrl("http://www.oschina.net/question/\\d+_\\d+*") +@HelpUrl("http://www.oschina.net/question/*") +@ExtractBy(value = "//ul[@class='list']/li[@class='Answer']", multi = true) +public class OschinaAnswer implements AfterExtractor{ + + @ExtractBy("//img/@title") + private String user; + + @ExtractBy(value="//div[@class='detail']",notNull = false) + private String content; + + public static void main(String[] args) { + OOSpider.create(Site.me().addStartUrl("http://www.oschina.net/question/567527_120597"), OschinaAnswer.class).run(); + } + + @Override + public void afterProcess(Page page) { + + } +} From cfb8990453a09fdfb79265f07840a8be8a47b60e Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sun, 4 Aug 2013 03:04:30 +0800 Subject: [PATCH 25/84] update author --- .../us/codecraft/webmagic/oo/ConsolePageModelPipeline.java | 2 +- .../main/java/us/codecraft/webmagic/selector/AndSelector.java | 2 +- .../main/java/us/codecraft/webmagic/selector/OrSelector.java | 2 +- .../src/test/java/us/codecraft/webmagic/oo/OschinaBlog.java | 2 +- .../src/test/java/us/codecraft/webmagic/oo/TestFetcher.java | 2 +- .../test/java/us/codecraft/webmagic/selector/SaxonTest.java | 2 +- .../java/us/codecraft/webmagic/scheduler/RedisScheduler.java | 2 +- .../us/codecraft/webmagic/scheduler/RedisSchedulerTest.java | 2 +- .../webmagic/selenium/downloader/SeleniumDownloader.java | 2 +- .../codecraft/webmagic/selenium/downloader/WebDriverPool.java | 2 +- .../java/us/codecraft/webmagic/selenium/SeleniumTest.java | 2 +- .../webmagic/selenium/downloader/SeleniumDownloaderTest.java | 2 +- .../webmagic/selenium/downloader/WebDriverPoolTest.java | 2 +- .../src/main/java/us/codecraft/webmagic/oo/samples/Blog.java | 2 +- .../main/java/us/codecraft/webmagic/oo/samples/IteyeBlog.java | 2 +- .../java/us/codecraft/webmagic/oo/samples/OschinaAnswer.java | 4 ++-- .../java/us/codecraft/webmagic/oo/samples/OschinaBlog.java | 2 +- .../java/us/codecraft/webmagic/samples/GlobalProcessor.java | 2 +- 
.../java/us/codecraft/webmagic/samples/GuoxueProcessor.java | 2 +- .../java/us/codecraft/webmagic/samples/HuabanProcessor.java | 2 +- .../us/codecraft/webmagic/samples/IteyeBlogProcessor.java | 2 +- 21 files changed, 22 insertions(+), 22 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/ConsolePageModelPipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/ConsolePageModelPipeline.java index 16b1ef2..e17f210 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/ConsolePageModelPipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/ConsolePageModelPipeline.java @@ -4,7 +4,7 @@ import org.apache.commons.lang3.builder.ToStringBuilder; import us.codecraft.webmagic.Task; /** - * @author yihua.huang@dianping.com
+ * @author code4crafter@gmail.com
* @date: 13-8-3
* Time: 下午3:41
*/ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AndSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AndSelector.java index 98481ef..e6bbbb8 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AndSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AndSelector.java @@ -4,7 +4,7 @@ import java.util.ArrayList; import java.util.List; /** - * @author yihua.huang@dianping.com
+ * @author code4crafter@gmail.com
* @date: 13-8-3
* Time: 下午5:29
*/ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/OrSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/OrSelector.java index 2cdd870..dca1b34 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/OrSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/OrSelector.java @@ -4,7 +4,7 @@ import java.util.ArrayList; import java.util.List; /** - * @author yihua.huang@dianping.com
+ * @author code4crafter@gmail.com
* @date: 13-8-3
* Time: 下午5:29
*/ diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/oo/OschinaBlog.java b/webmagic-core/src/test/java/us/codecraft/webmagic/oo/OschinaBlog.java index 98543b0..728f143 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/oo/OschinaBlog.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/oo/OschinaBlog.java @@ -6,7 +6,7 @@ import us.codecraft.webmagic.Site; import java.util.List; /** - * @author yihua.huang@dianping.com
+ * @author code4crafter@gmail.com
* @date: 13-8-1
* Time: 下午10:18
*/ diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/oo/TestFetcher.java b/webmagic-core/src/test/java/us/codecraft/webmagic/oo/TestFetcher.java index 289cd4f..b7f2d29 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/oo/TestFetcher.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/oo/TestFetcher.java @@ -5,7 +5,7 @@ import org.junit.Test; import us.codecraft.webmagic.Site; /** - * @author yihua.huang@dianping.com
+ * @author code4crafter@gmail.com
* @date: 13-8-1
* Time: 下午8:42
*/ diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/SaxonTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/SaxonTest.java index 509be44..05a8906 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/SaxonTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/SaxonTest.java @@ -15,7 +15,7 @@ import javax.xml.xpath.XPathExpression; import javax.xml.xpath.XPathFactoryConfigurationException; /** - * @author yihua.huang@dianping.com
+ * @author code4crafter@gmail.com
* @date: 13-8-2
* Time: 下午5:48
*/ diff --git a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java index 382642b..094295c 100644 --- a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java +++ b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java @@ -10,7 +10,7 @@ import us.codecraft.webmagic.schedular.Scheduler; /** * 使用redis管理url,构建一个分布式的爬虫。
* - * @author yihua.huang@dianping.com
+ * @author code4crafter@gmail.com
* @date: 13-7-25
* Time: 上午7:07
*/ diff --git a/webmagic-plugin/webmagic-misc/src/test/java/us/codecraft/webmagic/scheduler/RedisSchedulerTest.java b/webmagic-plugin/webmagic-misc/src/test/java/us/codecraft/webmagic/scheduler/RedisSchedulerTest.java index 6db21a8..3d59671 100644 --- a/webmagic-plugin/webmagic-misc/src/test/java/us/codecraft/webmagic/scheduler/RedisSchedulerTest.java +++ b/webmagic-plugin/webmagic-misc/src/test/java/us/codecraft/webmagic/scheduler/RedisSchedulerTest.java @@ -8,7 +8,7 @@ import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Task; /** - * @author yihua.huang@dianping.com
+ * @author code4crafter@gmail.com
* @date: 13-7-25
* Time: 上午7:51
*/ diff --git a/webmagic-plugin/webmagic-selenium/src/main/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloader.java b/webmagic-plugin/webmagic-selenium/src/main/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloader.java index 1b689d4..002dcc9 100644 --- a/webmagic-plugin/webmagic-selenium/src/main/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloader.java +++ b/webmagic-plugin/webmagic-selenium/src/main/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloader.java @@ -21,7 +21,7 @@ import java.util.Map; * 使用Selenium调用浏览器进行渲染。目前仅支持chrome。
* 需要下载Selenium driver支持。
* - * @author yihua.huang@dianping.com
+ * @author code4crafter@gmail.com
* @date: 13-7-26
* Time: 下午1:37
*/ diff --git a/webmagic-plugin/webmagic-selenium/src/main/java/us/codecraft/webmagic/selenium/downloader/WebDriverPool.java b/webmagic-plugin/webmagic-selenium/src/main/java/us/codecraft/webmagic/selenium/downloader/WebDriverPool.java index faed8d6..fdd978d 100644 --- a/webmagic-plugin/webmagic-selenium/src/main/java/us/codecraft/webmagic/selenium/downloader/WebDriverPool.java +++ b/webmagic-plugin/webmagic-selenium/src/main/java/us/codecraft/webmagic/selenium/downloader/WebDriverPool.java @@ -11,7 +11,7 @@ import java.util.concurrent.LinkedBlockingDeque; import java.util.concurrent.atomic.AtomicInteger; /** - * @author yihua.huang@dianping.com
+ * @author code4crafter@gmail.com
* @date: 13-7-26
* Time: 下午1:41
*/ diff --git a/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/SeleniumTest.java b/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/SeleniumTest.java index a403b91..a6de847 100644 --- a/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/SeleniumTest.java +++ b/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/SeleniumTest.java @@ -13,7 +13,7 @@ import java.util.HashMap; import java.util.Map; /** - * @author yihua.huang@dianping.com
+ * @author code4crafter@gmail.com
* @date: 13-7-26
* Time: 下午12:27
*/ diff --git a/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloaderTest.java b/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloaderTest.java index 9683083..23711fa 100644 --- a/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloaderTest.java +++ b/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloaderTest.java @@ -8,7 +8,7 @@ import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Task; /** - * @author yihua.huang@dianping.com
+ * @author code4crafter@gmail.com
* @date: 13-7-26
* Time: 下午2:46
*/ diff --git a/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/downloader/WebDriverPoolTest.java b/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/downloader/WebDriverPoolTest.java index 1efc69b..cbf3860 100644 --- a/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/downloader/WebDriverPoolTest.java +++ b/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/downloader/WebDriverPoolTest.java @@ -5,7 +5,7 @@ import org.junit.Test; import org.openqa.selenium.WebDriver; /** - * @author yihua.huang@dianping.com
+ * @author code4crafter@gmail.com
* @date: 13-7-26
* Time: 下午2:12
*/ diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/oo/samples/Blog.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/oo/samples/Blog.java index 565c711..e3e5364 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/oo/samples/Blog.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/oo/samples/Blog.java @@ -1,7 +1,7 @@ package us.codecraft.webmagic.oo.samples; /** - * @author yihua.huang@dianping.com
+ * @author code4crafter@gmail.com
* @date: 13-8-2
* Time: 上午8:10
*/ diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/oo/samples/IteyeBlog.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/oo/samples/IteyeBlog.java index e289c87..39597af 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/oo/samples/IteyeBlog.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/oo/samples/IteyeBlog.java @@ -6,7 +6,7 @@ import us.codecraft.webmagic.oo.OOSpider; import us.codecraft.webmagic.oo.TargetUrl; /** - * @author yihua.huang@dianping.com
+ * @author code4crafter@gmail.com
* @date: 13-8-2
* Time: 上午7:52
*/ diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/oo/samples/OschinaAnswer.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/oo/samples/OschinaAnswer.java index fd04b1d..0a59b7d 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/oo/samples/OschinaAnswer.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/oo/samples/OschinaAnswer.java @@ -5,7 +5,7 @@ import us.codecraft.webmagic.Site; import us.codecraft.webmagic.oo.*; /** - * @author yihua.huang@dianping.com
+ * @author code4crafter@gmail.com
* @date: 13-8-3
* Time: 下午8:25
*/ @@ -17,7 +17,7 @@ public class OschinaAnswer implements AfterExtractor{ @ExtractBy("//img/@title") private String user; - @ExtractBy(value="//div[@class='detail']",notNull = false) + @ExtractBy("//div[@class='detail']") private String content; public static void main(String[] args) { diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/oo/samples/OschinaBlog.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/oo/samples/OschinaBlog.java index 4a52a14..5224c85 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/oo/samples/OschinaBlog.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/oo/samples/OschinaBlog.java @@ -6,7 +6,7 @@ import us.codecraft.webmagic.oo.OOSpider; import us.codecraft.webmagic.oo.TargetUrl; /** - * @author yihua.huang@dianping.com
+ * @author code4crafter@gmail.com
* @date: 13-8-2
* Time: 上午7:52
*/ diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GlobalProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GlobalProcessor.java index 2bdf342..0448683 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GlobalProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GlobalProcessor.java @@ -10,7 +10,7 @@ import us.codecraft.webmagic.scheduler.RedisScheduler; import java.util.List; /** - * Author yihua.huang@dianping.com + * Author code4crafter@gmail.com * Date: 13-6-24 * Time: 下午2:12 */ diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GuoxueProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GuoxueProcessor.java index 54d995e..db00c79 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GuoxueProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GuoxueProcessor.java @@ -6,7 +6,7 @@ import us.codecraft.webmagic.processor.SimplePageProcessor; import us.codecraft.webmagic.schedular.FileCacheQueueScheduler; /** - * @author yihua.huang@dianping.com
+ * @author code4crafter@gmail.com
* @date: 13-7-14
* Time: 上午8:33
*/ diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuabanProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuabanProcessor.java index d8c5f05..eef2b2f 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuabanProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuabanProcessor.java @@ -11,7 +11,7 @@ import us.codecraft.webmagic.selenium.downloader.SeleniumDownloader; /** * 花瓣网抽取器。
* 使用Selenium做页面动态渲染。
- * @author yihua.huang@dianping.com
+ * @author code4crafter@gmail.com
* @date: 13-7-26
* Time: 下午4:08
*/ diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java index 76f9cc3..da846e8 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java @@ -7,7 +7,7 @@ import us.codecraft.webmagic.pipeline.FilePipeline; import us.codecraft.webmagic.processor.PageProcessor; /** - * @author yihua.huang@dianping.com
+ * @author code4crafter@gmail.com
* @date: 13-7-26
* Time: 上午7:31
*/ From 21cae2ff2ea8dec0a24b03039e571a9fd37e38b9 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sun, 4 Aug 2013 07:53:28 +0800 Subject: [PATCH 26/84] update package --- .../main/java/us/codecraft/webmagic/Page.java | 19 +++++++++++++++++-- .../{oo => model}/AfterExtractor.java | 2 +- .../ConsolePageModelPipeline.java | 2 +- .../webmagic/{oo => model}/ExtractBy.java | 2 +- .../webmagic/{oo => model}/ExtractByUrl.java | 2 +- .../webmagic/{oo => model}/Extractor.java | 2 +- .../{oo => model}/FieldExtractor.java | 2 +- .../webmagic/{oo => model}/HelpUrl.java | 2 +- .../webmagic/{oo => model}/OOSpider.java | 2 +- .../{oo => model}/ObjectPageProcessor.java | 2 +- .../{oo => model}/ObjectPipeline.java | 2 +- .../{oo => model}/PageModelExtractor.java | 2 +- .../{oo => model}/PageModelPipeline.java | 2 +- .../webmagic/{oo => model}/TargetUrl.java | 2 +- .../webmagic/{oo => model}/package.html | 0 .../webmagic/{oo => model}/OschinaBlog.java | 5 +++-- .../webmagic/{oo => model}/TestFetcher.java | 2 +- .../webmagic/{oo => model}/samples/Blog.java | 2 +- .../{oo => model}/samples/IteyeBlog.java | 8 ++++---- .../{oo => model}/samples/OschinaAnswer.java | 4 ++-- .../{oo => model}/samples/OschinaBlog.java | 8 ++++---- 21 files changed, 45 insertions(+), 29 deletions(-) rename webmagic-core/src/main/java/us/codecraft/webmagic/{oo => model}/AfterExtractor.java (88%) rename webmagic-core/src/main/java/us/codecraft/webmagic/{oo => model}/ConsolePageModelPipeline.java (91%) rename webmagic-core/src/main/java/us/codecraft/webmagic/{oo => model}/ExtractBy.java (93%) rename webmagic-core/src/main/java/us/codecraft/webmagic/{oo => model}/ExtractByUrl.java (92%) rename webmagic-core/src/main/java/us/codecraft/webmagic/{oo => model}/Extractor.java (95%) rename webmagic-core/src/main/java/us/codecraft/webmagic/{oo => model}/FieldExtractor.java (96%) rename webmagic-core/src/main/java/us/codecraft/webmagic/{oo => model}/HelpUrl.java (91%) rename 
webmagic-core/src/main/java/us/codecraft/webmagic/{oo => model}/OOSpider.java (97%) rename webmagic-core/src/main/java/us/codecraft/webmagic/{oo => model}/ObjectPageProcessor.java (98%) rename webmagic-core/src/main/java/us/codecraft/webmagic/{oo => model}/ObjectPipeline.java (97%) rename webmagic-core/src/main/java/us/codecraft/webmagic/{oo => model}/PageModelExtractor.java (99%) rename webmagic-core/src/main/java/us/codecraft/webmagic/{oo => model}/PageModelPipeline.java (85%) rename webmagic-core/src/main/java/us/codecraft/webmagic/{oo => model}/TargetUrl.java (91%) rename webmagic-core/src/main/java/us/codecraft/webmagic/{oo => model}/package.html (100%) rename webmagic-core/src/test/java/us/codecraft/webmagic/{oo => model}/OschinaBlog.java (85%) rename webmagic-core/src/test/java/us/codecraft/webmagic/{oo => model}/TestFetcher.java (91%) rename webmagic-samples/src/main/java/us/codecraft/webmagic/{oo => model}/samples/Blog.java (80%) rename webmagic-samples/src/main/java/us/codecraft/webmagic/{oo => model}/samples/IteyeBlog.java (82%) rename webmagic-samples/src/main/java/us/codecraft/webmagic/{oo => model}/samples/OschinaAnswer.java (90%) rename webmagic-samples/src/main/java/us/codecraft/webmagic/{oo => model}/samples/OschinaBlog.java (82%) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java index 40f17f0..eb2c132 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java @@ -9,7 +9,7 @@ import java.util.List; /** *
- *Page保存了上一次抓取的结果,并可定义待抓取的链接内容。
+ * Page保存了上一次抓取的结果,并可定义待抓取的链接内容。
  *
  *     主要方法:
  *     {@link #getUrl()} 获取页面的Url
@@ -19,6 +19,7 @@ import java.util.List;
  *     {@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} 添加待抓取的链接
  *
  * 
+ * * @author code4crafter@gmail.com
*/ public class Page { @@ -36,9 +37,16 @@ public class Page { public Page() { } + public Page setSkip(boolean skip) { + resultItems.setSkip(skip); + return this; + + } + /** * 保存抽取的结果 - * @param key 结果的key + * + * @param key 结果的key * @param field 结果的value */ public void putField(String key, Object field) { @@ -47,6 +55,7 @@ public class Page { /** * 获取页面的html内容 + * * @return html 页面的html内容 */ public Selectable getHtml() { @@ -63,6 +72,7 @@ public class Page { /** * 添加待抓取的链接 + * * @param requests 待抓取的链接 */ public void addTargetRequests(List requests) { @@ -79,6 +89,7 @@ public class Page { /** * 添加待抓取的链接 + * * @param requestString 待抓取的链接 */ public void addTargetRequest(String requestString) { @@ -93,6 +104,7 @@ public class Page { /** * 添加待抓取的页面,在需要传递附加信息时使用 + * * @param request 待抓取的页面 */ public void addTargetRequest(Request request) { @@ -103,6 +115,7 @@ public class Page { /** * 获取页面的Url + * * @return url 当前页面的url,可用于抽取 */ public Selectable getUrl() { @@ -111,6 +124,7 @@ public class Page { /** * 设置url + * * @param url */ public void setUrl(Selectable url) { @@ -119,6 +133,7 @@ public class Page { /** * 获取抓取请求 + * * @return request 抓取请求 */ public Request getRequest() { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/AfterExtractor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/model/AfterExtractor.java similarity index 88% rename from webmagic-core/src/main/java/us/codecraft/webmagic/oo/AfterExtractor.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/model/AfterExtractor.java index 79feaaf..0117081 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/AfterExtractor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/model/AfterExtractor.java @@ -1,4 +1,4 @@ -package us.codecraft.webmagic.oo; +package us.codecraft.webmagic.model; import us.codecraft.webmagic.Page; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/ConsolePageModelPipeline.java 
b/webmagic-core/src/main/java/us/codecraft/webmagic/model/ConsolePageModelPipeline.java similarity index 91% rename from webmagic-core/src/main/java/us/codecraft/webmagic/oo/ConsolePageModelPipeline.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/model/ConsolePageModelPipeline.java index e17f210..e5485a1 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/ConsolePageModelPipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/model/ConsolePageModelPipeline.java @@ -1,4 +1,4 @@ -package us.codecraft.webmagic.oo; +package us.codecraft.webmagic.model; import org.apache.commons.lang3.builder.ToStringBuilder; import us.codecraft.webmagic.Task; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/ExtractBy.java b/webmagic-core/src/main/java/us/codecraft/webmagic/model/ExtractBy.java similarity index 93% rename from webmagic-core/src/main/java/us/codecraft/webmagic/oo/ExtractBy.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/model/ExtractBy.java index 71bdc93..4c37c9b 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/ExtractBy.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/model/ExtractBy.java @@ -1,4 +1,4 @@ -package us.codecraft.webmagic.oo; +package us.codecraft.webmagic.model; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/ExtractByUrl.java b/webmagic-core/src/main/java/us/codecraft/webmagic/model/ExtractByUrl.java similarity index 92% rename from webmagic-core/src/main/java/us/codecraft/webmagic/oo/ExtractByUrl.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/model/ExtractByUrl.java index e86f08f..9f77676 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/ExtractByUrl.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/model/ExtractByUrl.java @@ -1,4 +1,4 @@ -package us.codecraft.webmagic.oo; +package 
us.codecraft.webmagic.model; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/Extractor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/model/Extractor.java similarity index 95% rename from webmagic-core/src/main/java/us/codecraft/webmagic/oo/Extractor.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/model/Extractor.java index f0607cf..c8feef4 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/Extractor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/model/Extractor.java @@ -1,4 +1,4 @@ -package us.codecraft.webmagic.oo; +package us.codecraft.webmagic.model; import us.codecraft.webmagic.selector.Selector; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/FieldExtractor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java similarity index 96% rename from webmagic-core/src/main/java/us/codecraft/webmagic/oo/FieldExtractor.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java index 2a6bcf7..17a55c8 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/FieldExtractor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java @@ -1,4 +1,4 @@ -package us.codecraft.webmagic.oo; +package us.codecraft.webmagic.model; import us.codecraft.webmagic.selector.Selector; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/HelpUrl.java b/webmagic-core/src/main/java/us/codecraft/webmagic/model/HelpUrl.java similarity index 91% rename from webmagic-core/src/main/java/us/codecraft/webmagic/oo/HelpUrl.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/model/HelpUrl.java index a8ed995..9dee05b 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/HelpUrl.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/model/HelpUrl.java @@ -1,4 +1,4 @@ -package 
us.codecraft.webmagic.oo; +package us.codecraft.webmagic.model; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/OOSpider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/model/OOSpider.java similarity index 97% rename from webmagic-core/src/main/java/us/codecraft/webmagic/oo/OOSpider.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/model/OOSpider.java index 5f523ed..e008bfe 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/OOSpider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/model/OOSpider.java @@ -1,4 +1,4 @@ -package us.codecraft.webmagic.oo; +package us.codecraft.webmagic.model; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/ObjectPageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/model/ObjectPageProcessor.java similarity index 98% rename from webmagic-core/src/main/java/us/codecraft/webmagic/oo/ObjectPageProcessor.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/model/ObjectPageProcessor.java index a02e446..5a707bc 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/ObjectPageProcessor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/model/ObjectPageProcessor.java @@ -1,4 +1,4 @@ -package us.codecraft.webmagic.oo; +package us.codecraft.webmagic.model; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/ObjectPipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/model/ObjectPipeline.java similarity index 97% rename from webmagic-core/src/main/java/us/codecraft/webmagic/oo/ObjectPipeline.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/model/ObjectPipeline.java index 54ae2ef..f590384 100644 --- 
a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/ObjectPipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/model/ObjectPipeline.java @@ -1,4 +1,4 @@ -package us.codecraft.webmagic.oo; +package us.codecraft.webmagic.model; import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.Task; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/PageModelExtractor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java similarity index 99% rename from webmagic-core/src/main/java/us/codecraft/webmagic/oo/PageModelExtractor.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java index d3d5335..6ba2c5e 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/PageModelExtractor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java @@ -1,4 +1,4 @@ -package us.codecraft.webmagic.oo; +package us.codecraft.webmagic.model; import org.apache.commons.lang3.StringUtils; import us.codecraft.webmagic.Page; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/PageModelPipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/model/PageModelPipeline.java similarity index 85% rename from webmagic-core/src/main/java/us/codecraft/webmagic/oo/PageModelPipeline.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/model/PageModelPipeline.java index 7406cde..bd3aa95 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/PageModelPipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/model/PageModelPipeline.java @@ -1,4 +1,4 @@ -package us.codecraft.webmagic.oo; +package us.codecraft.webmagic.model; import us.codecraft.webmagic.Task; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/TargetUrl.java b/webmagic-core/src/main/java/us/codecraft/webmagic/model/TargetUrl.java similarity index 91% rename from 
webmagic-core/src/main/java/us/codecraft/webmagic/oo/TargetUrl.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/model/TargetUrl.java index 77b5a82..96ca864 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/TargetUrl.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/model/TargetUrl.java @@ -1,4 +1,4 @@ -package us.codecraft.webmagic.oo; +package us.codecraft.webmagic.model; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/package.html b/webmagic-core/src/main/java/us/codecraft/webmagic/model/package.html similarity index 100% rename from webmagic-core/src/main/java/us/codecraft/webmagic/oo/package.html rename to webmagic-core/src/main/java/us/codecraft/webmagic/model/package.html diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/oo/OschinaBlog.java b/webmagic-core/src/test/java/us/codecraft/webmagic/model/OschinaBlog.java similarity index 85% rename from webmagic-core/src/test/java/us/codecraft/webmagic/oo/OschinaBlog.java rename to webmagic-core/src/test/java/us/codecraft/webmagic/model/OschinaBlog.java index 728f143..1bb219f 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/oo/OschinaBlog.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/model/OschinaBlog.java @@ -1,4 +1,4 @@ -package us.codecraft.webmagic.oo; +package us.codecraft.webmagic.model; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; @@ -10,7 +10,7 @@ import java.util.List; * @date: 13-8-1
* Time: 下午10:18
*/ -@TargetUrl(value="http://my.oschina.net/flashsword/blog/*",sourceRegion = "//div[@class='BlogLinks']//a/@href") +@TargetUrl("http://my.oschina.net/flashsword/blog/*") public class OschinaBlog implements AfterExtractor { @ExtractBy("//title") @@ -27,6 +27,7 @@ public class OschinaBlog implements AfterExtractor { System.out.println("title:\t"+title); System.out.println("content:\t"+content); System.out.println("tags:\t" + tags); + page.setSkip(true); } public static void main(String[] args) { diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/oo/TestFetcher.java b/webmagic-core/src/test/java/us/codecraft/webmagic/model/TestFetcher.java similarity index 91% rename from webmagic-core/src/test/java/us/codecraft/webmagic/oo/TestFetcher.java rename to webmagic-core/src/test/java/us/codecraft/webmagic/model/TestFetcher.java index b7f2d29..009d53a 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/oo/TestFetcher.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/model/TestFetcher.java @@ -1,4 +1,4 @@ -package us.codecraft.webmagic.oo; +package us.codecraft.webmagic.model; import org.junit.Ignore; import org.junit.Test; diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/oo/samples/Blog.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Blog.java similarity index 80% rename from webmagic-samples/src/main/java/us/codecraft/webmagic/oo/samples/Blog.java rename to webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Blog.java index e3e5364..484861b 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/oo/samples/Blog.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Blog.java @@ -1,4 +1,4 @@ -package us.codecraft.webmagic.oo.samples; +package us.codecraft.webmagic.model.samples; /** * @author code4crafter@gmail.com
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/oo/samples/IteyeBlog.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/IteyeBlog.java similarity index 82% rename from webmagic-samples/src/main/java/us/codecraft/webmagic/oo/samples/IteyeBlog.java rename to webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/IteyeBlog.java index 39597af..4d01902 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/oo/samples/IteyeBlog.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/IteyeBlog.java @@ -1,9 +1,9 @@ -package us.codecraft.webmagic.oo.samples; +package us.codecraft.webmagic.model.samples; import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.oo.ExtractBy; -import us.codecraft.webmagic.oo.OOSpider; -import us.codecraft.webmagic.oo.TargetUrl; +import us.codecraft.webmagic.model.ExtractBy; +import us.codecraft.webmagic.model.OOSpider; +import us.codecraft.webmagic.model.TargetUrl; /** * @author code4crafter@gmail.com
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/oo/samples/OschinaAnswer.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaAnswer.java similarity index 90% rename from webmagic-samples/src/main/java/us/codecraft/webmagic/oo/samples/OschinaAnswer.java rename to webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaAnswer.java index 0a59b7d..df23873 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/oo/samples/OschinaAnswer.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaAnswer.java @@ -1,8 +1,8 @@ -package us.codecraft.webmagic.oo.samples; +package us.codecraft.webmagic.model.samples; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.oo.*; +import us.codecraft.webmagic.model.*; /** * @author code4crafter@gmail.com
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/oo/samples/OschinaBlog.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java similarity index 82% rename from webmagic-samples/src/main/java/us/codecraft/webmagic/oo/samples/OschinaBlog.java rename to webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java index 5224c85..9f11d0e 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/oo/samples/OschinaBlog.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java @@ -1,9 +1,9 @@ -package us.codecraft.webmagic.oo.samples; +package us.codecraft.webmagic.model.samples; import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.oo.ExtractBy; -import us.codecraft.webmagic.oo.OOSpider; -import us.codecraft.webmagic.oo.TargetUrl; +import us.codecraft.webmagic.model.ExtractBy; +import us.codecraft.webmagic.model.OOSpider; +import us.codecraft.webmagic.model.TargetUrl; /** * @author code4crafter@gmail.com
From 04a7fa037ad47afc13410d7825a17c4925fdf733 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sun, 4 Aug 2013 09:53:01 +0800 Subject: [PATCH 27/84] update pipeline --- .../main/java/us/codecraft/webmagic/model/OOSpider.java | 8 +++++--- .../java/us/codecraft/webmagic/model/ObjectPipeline.java | 7 +++---- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/model/OOSpider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/model/OOSpider.java index e008bfe..900c9b4 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/model/OOSpider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/model/OOSpider.java @@ -28,13 +28,15 @@ public class OOSpider extends Spider { this(ObjectPageProcessor.create(site, pageModels)); this.objectPipeline = new ObjectPipeline(); super.pipeline(objectPipeline); - for (Class pageModel : pageModels) { - this.objectPipeline.put(pageModel, pageModelPipeline); + if (pageModelPipeline!=null){ + for (Class pageModel : pageModels) { + this.objectPipeline.put(pageModel, pageModelPipeline); + } } } public static OOSpider create(Site site, Class... pageModels) { - return new OOSpider(site, new ConsolePageModelPipeline(), pageModels); + return new OOSpider(site, null, pageModels); } public static OOSpider create(Site site, PageModelPipeline pageModelPipeline, Class... 
pageModels) { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/model/ObjectPipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/model/ObjectPipeline.java index f590384..41296f5 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/model/ObjectPipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/model/ObjectPipeline.java @@ -35,14 +35,13 @@ public class ObjectPipeline implements Pipeline { Object o = resultItems.get(classPageModelPipelineEntry.getKey().getCanonicalName()); if (o != null) { Annotation annotation = classPageModelPipelineEntry.getKey().getAnnotation(ExtractBy.class); - ExtractBy extractBy = (ExtractBy) annotation; - if (extractBy.multi()) { + if (annotation == null || !((ExtractBy) annotation).multi()) { + classPageModelPipelineEntry.getValue().process(o, task); + } else { List list = (List) o; for (Object o1 : list) { classPageModelPipelineEntry.getValue().process(o1, task); } - } else { - classPageModelPipelineEntry.getValue().process(o, task); } } } From a3a868f58466455e59618ff17c6dbd5e9dad156b Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sun, 4 Aug 2013 09:55:50 +0800 Subject: [PATCH 28/84] rename --- ...Processor.java => ModelPageProcessor.java} | 14 ++++++------ ...ObjectPipeline.java => ModelPipeline.java} | 6 ++--- .../us/codecraft/webmagic/model/OOSpider.java | 22 +++++++++---------- .../us/codecraft/webmagic/model/package.html | 2 +- 4 files changed, 22 insertions(+), 22 deletions(-) rename webmagic-core/src/main/java/us/codecraft/webmagic/model/{ObjectPageProcessor.java => ModelPageProcessor.java} (86%) rename webmagic-core/src/main/java/us/codecraft/webmagic/model/{ObjectPipeline.java => ModelPipeline.java} (90%) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/model/ObjectPageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/model/ModelPageProcessor.java similarity index 86% rename from 
webmagic-core/src/main/java/us/codecraft/webmagic/model/ObjectPageProcessor.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/model/ModelPageProcessor.java index 5a707bc..1fd8c10 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/model/ObjectPageProcessor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/model/ModelPageProcessor.java @@ -18,7 +18,7 @@ import java.util.regex.Pattern; * @date: 13-8-1
* Time: 下午8:46
*/ -public class ObjectPageProcessor implements PageProcessor { +public class ModelPageProcessor implements PageProcessor { private List pageModelExtractorList = new ArrayList(); @@ -26,16 +26,16 @@ public class ObjectPageProcessor implements PageProcessor { private Set targetUrlPatterns = new HashSet(); - public static ObjectPageProcessor create(Site site, Class... clazzs) { - ObjectPageProcessor objectPageProcessor = new ObjectPageProcessor(site); + public static ModelPageProcessor create(Site site, Class... clazzs) { + ModelPageProcessor modelPageProcessor = new ModelPageProcessor(site); for (Class clazz : clazzs) { - objectPageProcessor.addPageModel(clazz); + modelPageProcessor.addPageModel(clazz); } - return objectPageProcessor; + return modelPageProcessor; } - public ObjectPageProcessor addPageModel(Class clazz) { + public ModelPageProcessor addPageModel(Class clazz) { PageModelExtractor pageModelExtractor = PageModelExtractor.create(clazz); targetUrlPatterns.addAll(pageModelExtractor.getTargetUrlPatterns()); targetUrlPatterns.addAll(pageModelExtractor.getHelpUrlPatterns()); @@ -43,7 +43,7 @@ public class ObjectPageProcessor implements PageProcessor { return this; } - private ObjectPageProcessor(Site site) { + private ModelPageProcessor(Site site) { this.site = site; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/model/ObjectPipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/model/ModelPipeline.java similarity index 90% rename from webmagic-core/src/main/java/us/codecraft/webmagic/model/ObjectPipeline.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/model/ModelPipeline.java index 41296f5..439a629 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/model/ObjectPipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/model/ModelPipeline.java @@ -14,14 +14,14 @@ import java.util.concurrent.ConcurrentHashMap; * @date: 13-8-2
* Time: 上午10:47
*/ -public class ObjectPipeline implements Pipeline { +public class ModelPipeline implements Pipeline { private Map pageModelPipelines = new ConcurrentHashMap(); - public ObjectPipeline() { + public ModelPipeline() { } - public ObjectPipeline put(Class clazz, PageModelPipeline pageModelPipeline) { + public ModelPipeline put(Class clazz, PageModelPipeline pageModelPipeline) { pageModelPipelines.put(clazz, pageModelPipeline); return this; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/model/OOSpider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/model/OOSpider.java index 900c9b4..a76144a 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/model/OOSpider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/model/OOSpider.java @@ -15,22 +15,22 @@ public class OOSpider extends Spider { * * @param pageProcessor 已定义的抽取规则 */ - private ObjectPageProcessor objectPageProcessor; + private ModelPageProcessor modelPageProcessor; - private ObjectPipeline objectPipeline; + private ModelPipeline modelPipeline; - protected OOSpider(ObjectPageProcessor objectPageProcessor) { - super(objectPageProcessor); - this.objectPageProcessor = objectPageProcessor; + protected OOSpider(ModelPageProcessor modelPageProcessor) { + super(modelPageProcessor); + this.modelPageProcessor = modelPageProcessor; } public OOSpider(Site site, PageModelPipeline pageModelPipeline, Class... 
pageModels) { - this(ObjectPageProcessor.create(site, pageModels)); - this.objectPipeline = new ObjectPipeline(); - super.pipeline(objectPipeline); + this(ModelPageProcessor.create(site, pageModels)); + this.modelPipeline = new ModelPipeline(); + super.pipeline(modelPipeline); if (pageModelPipeline!=null){ for (Class pageModel : pageModels) { - this.objectPipeline.put(pageModel, pageModelPipeline); + this.modelPipeline.put(pageModel, pageModelPipeline); } } } @@ -45,8 +45,8 @@ public class OOSpider extends Spider { public OOSpider addPageModel(PageModelPipeline pageModelPipeline, Class... pageModels) { for (Class pageModel : pageModels) { - objectPageProcessor.addPageModel(pageModel); - objectPipeline.put(pageModel, pageModelPipeline); + modelPageProcessor.addPageModel(pageModel); + modelPipeline.put(pageModel, pageModelPipeline); } return this; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/model/package.html b/webmagic-core/src/main/java/us/codecraft/webmagic/model/package.html index b5f80b1..d62cc00 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/model/package.html +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/model/package.html @@ -1,5 +1,5 @@ -webmagic对抓取器编写的面向对象方式的封装。基于POJO(称为PageModel)及注解即可实现一个PageProcessor。 +webmagic对抓取器编写的面向模型(称为PageModel)的封装。基于POJO及注解即可实现一个PageProcessor。 From 1a50c64e331ea7c0a7a078cdf172528382209918 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sun, 4 Aug 2013 10:05:03 +0800 Subject: [PATCH 29/84] update name --- .../codecraft/webmagic/model/ModelPageProcessor.java | 3 ++- .../us/codecraft/webmagic/model/ModelPipeline.java | 3 ++- .../java/us/codecraft/webmagic/model/OOSpider.java | 12 +++++++----- .../codecraft/webmagic/model/PageModelExtractor.java | 1 + 4 files changed, 12 insertions(+), 7 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/model/ModelPageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/model/ModelPageProcessor.java index 
1fd8c10..12e85d0 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/model/ModelPageProcessor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/model/ModelPageProcessor.java @@ -14,11 +14,12 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; /** + * 基于PageProcessor的扩展点。
* @author code4crafter@gmail.com
* @date: 13-8-1
* Time: 下午8:46
*/ -public class ModelPageProcessor implements PageProcessor { +class ModelPageProcessor implements PageProcessor { private List pageModelExtractorList = new ArrayList(); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/model/ModelPipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/model/ModelPipeline.java index 439a629..f9b0015 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/model/ModelPipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/model/ModelPipeline.java @@ -10,11 +10,12 @@ import java.util.Map; import java.util.concurrent.ConcurrentHashMap; /** + * 基于Pipeline的扩展点。
* @author code4crafter@gmail.com
* @date: 13-8-2
* Time: 上午10:47
*/ -public class ModelPipeline implements Pipeline { +class ModelPipeline implements Pipeline { private Map pageModelPipelines = new ConcurrentHashMap(); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/model/OOSpider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/model/OOSpider.java index a76144a..8a3739d 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/model/OOSpider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/model/OOSpider.java @@ -4,17 +4,13 @@ import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; /** + * 基于Model的Spider,封装后的入口类。
* @author code4crafter@gmail.com
* @date: 13-8-3
* Time: 上午9:51
*/ public class OOSpider extends Spider { - /** - * OOSpider只能由ObjectPageProcessor创建。 - * - * @param pageProcessor 已定义的抽取规则 - */ private ModelPageProcessor modelPageProcessor; private ModelPipeline modelPipeline; @@ -24,6 +20,12 @@ public class OOSpider extends Spider { this.modelPageProcessor = modelPageProcessor; } + /** + * 创建一个爬虫。
+ * @param site + * @param pageModelPipeline + * @param pageModels + */ public OOSpider(Site site, PageModelPipeline pageModelPipeline, Class... pageModels) { this(ModelPageProcessor.create(site, pageModels)); this.modelPipeline = new ModelPipeline(); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java index 6ba2c5e..cf0eeac 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java @@ -13,6 +13,7 @@ import java.util.List; import java.util.regex.Pattern; /** + * Model主要逻辑类。将一个带注解的POJO转换为一个PageModelExtractor。
* @author code4crafter@gmail.com
* @date: 13-8-1
* Time: 下午9:33
From a5c85c3c8b85f2348e9431484a6d5c83231deeaa Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sun, 4 Aug 2013 15:12:06 +0800 Subject: [PATCH 30/84] add annotation ExtractByRaw --- .../webmagic/model/ExtractByRaw.java | 27 ++++ .../codecraft/webmagic/model/Extractor.java | 2 +- .../webmagic/model/PageModelExtractor.java | 146 ++++++++++++------ .../codecraft/webmagic/model/OschinaBlog.java | 3 + .../webmagic/model/OschinaBlogComment.java | 13 ++ 5 files changed, 142 insertions(+), 49 deletions(-) create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/model/ExtractByRaw.java create mode 100644 webmagic-core/src/test/java/us/codecraft/webmagic/model/OschinaBlogComment.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/model/ExtractByRaw.java b/webmagic-core/src/main/java/us/codecraft/webmagic/model/ExtractByRaw.java new file mode 100644 index 0000000..5dca8e1 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/model/ExtractByRaw.java @@ -0,0 +1,27 @@ +package us.codecraft.webmagic.model; + +import java.lang.annotation.ElementType; +import java.lang.annotation.Retention; +import java.lang.annotation.Target; + +/** + * 对于在Class级别就使用过ExtractBy的类,在字段中想抽取全部内容可使用此方法。
+ * @author code4crafter@gmail.com
+ * @date: 13-8-1
+ * Time: 下午8:40
+ */ +@Retention(java.lang.annotation.RetentionPolicy.RUNTIME) +@Target({ElementType.FIELD,ElementType.TYPE}) +public @interface ExtractByRaw { + + String value(); + + public enum Type {XPath2, XPath, Regex, Css} + + Type type() default Type.XPath2; + + boolean notNull() default true; + + boolean multi() default false; + +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/model/Extractor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/model/Extractor.java index c8feef4..498aba9 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/model/Extractor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/model/Extractor.java @@ -17,7 +17,7 @@ class Extractor { protected final boolean multi; - static enum Source {Html, Url} + static enum Source {Html, Url, RawHtml} public Extractor(Selector selector, Source source, boolean notNull, boolean multi) { this.selector = selector; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java index cf0eeac..9694c4e 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java @@ -46,56 +46,100 @@ class PageModelExtractor { fieldExtractors = new ArrayList(); for (Field field : clazz.getDeclaredFields()) { field.setAccessible(true); - ExtractBy extractBy = field.getAnnotation(ExtractBy.class); - if (extractBy != null) { - if (!extractBy.multi() && !String.class.isAssignableFrom(field.getType())) { - throw new IllegalStateException("Field " + field.getName() + " must be string"); - } else if (extractBy.multi() && !List.class.isAssignableFrom(field.getType())) { - throw new IllegalStateException("Field " + field.getName() + " must be list"); - } - String value = extractBy.value(); - Selector selector; - switch (extractBy.type()) { - case Css: - selector = new 
CssSelector(value); - break; - case Regex: - selector = new RegexSelector(value); - break; - case XPath: - selector = new XpathSelector(value); - break; - case XPath2: - selector = new Xpath2Selector(value); - break; - default: - selector = new Xpath2Selector(value); - } - FieldExtractor fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.Html, extractBy.notNull(), extractBy.multi()); - Method setterMethod = getSetterMethod(clazz, field); - if (setterMethod != null) { - fieldExtractor.setSetterMethod(setterMethod); - } - fieldExtractors.add(fieldExtractor); + getAnnotationExtractBy(clazz, field); + getAnnotationExtractByRaw(clazz,field); + getAnnotationExtractByUrl(clazz, field); + } + } + + private void getAnnotationExtractByUrl(Class clazz, Field field) { + ExtractByUrl extractByUrl = field.getAnnotation(ExtractByUrl.class); + if (extractByUrl != null) { + if (!extractByUrl.multi() && !String.class.isAssignableFrom(field.getType())) { + throw new IllegalStateException("Field " + field.getName() + " must be string"); + } else if (extractByUrl.multi() && !List.class.isAssignableFrom(field.getType())) { + throw new IllegalStateException("Field " + field.getName() + " must be list"); } - ExtractByUrl extractByUrl = field.getAnnotation(ExtractByUrl.class); - if (extractByUrl != null) { - if (!extractByUrl.multi() && !String.class.isAssignableFrom(field.getType())) { - throw new IllegalStateException("Field " + field.getName() + " must be string"); - } else if (extractByUrl.multi() && !List.class.isAssignableFrom(field.getType())) { - throw new IllegalStateException("Field " + field.getName() + " must be list"); - } - String regexPattern = extractByUrl.value(); - if (regexPattern.trim().equals("")) { - regexPattern = ".*"; - } - FieldExtractor fieldExtractor = new FieldExtractor(field, new RegexSelector(regexPattern), FieldExtractor.Source.Url, extractByUrl.notNull(), extractByUrl.multi()); - Method setterMethod = getSetterMethod(clazz, field); - 
if (setterMethod != null) { - fieldExtractor.setSetterMethod(setterMethod); - } - fieldExtractors.add(fieldExtractor); + String regexPattern = extractByUrl.value(); + if (regexPattern.trim().equals("")) { + regexPattern = ".*"; } + FieldExtractor fieldExtractor = new FieldExtractor(field, new RegexSelector(regexPattern), FieldExtractor.Source.Url, extractByUrl.notNull(), extractByUrl.multi()); + Method setterMethod = getSetterMethod(clazz, field); + if (setterMethod != null) { + fieldExtractor.setSetterMethod(setterMethod); + } + fieldExtractors.add(fieldExtractor); + } + } + + private void getAnnotationExtractBy(Class clazz, Field field) { + ExtractBy extractBy = field.getAnnotation(ExtractBy.class); + if (extractBy != null) { + if (!extractBy.multi() && !String.class.isAssignableFrom(field.getType())) { + throw new IllegalStateException("Field " + field.getName() + " must be string"); + } else if (extractBy.multi() && !List.class.isAssignableFrom(field.getType())) { + throw new IllegalStateException("Field " + field.getName() + " must be list"); + } + String value = extractBy.value(); + Selector selector; + switch (extractBy.type()) { + case Css: + selector = new CssSelector(value); + break; + case Regex: + selector = new RegexSelector(value); + break; + case XPath: + selector = new XpathSelector(value); + break; + case XPath2: + selector = new Xpath2Selector(value); + break; + default: + selector = new Xpath2Selector(value); + } + FieldExtractor fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.Html, extractBy.notNull(), extractBy.multi()); + Method setterMethod = getSetterMethod(clazz, field); + if (setterMethod != null) { + fieldExtractor.setSetterMethod(setterMethod); + } + fieldExtractors.add(fieldExtractor); + } + } + + private void getAnnotationExtractByRaw(Class clazz, Field field) { + ExtractByRaw extractByRaw = field.getAnnotation(ExtractByRaw.class); + if (extractByRaw != null) { + if (!extractByRaw.multi() && 
!String.class.isAssignableFrom(field.getType())) { + throw new IllegalStateException("Field " + field.getName() + " must be string"); + } else if (extractByRaw.multi() && !List.class.isAssignableFrom(field.getType())) { + throw new IllegalStateException("Field " + field.getName() + " must be list"); + } + String value = extractByRaw.value(); + Selector selector; + switch (extractByRaw.type()) { + case Css: + selector = new CssSelector(value); + break; + case Regex: + selector = new RegexSelector(value); + break; + case XPath: + selector = new XpathSelector(value); + break; + case XPath2: + selector = new Xpath2Selector(value); + break; + default: + selector = new Xpath2Selector(value); + } + FieldExtractor fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.RawHtml, extractByRaw.notNull(), extractByRaw.multi()); + Method setterMethod = getSetterMethod(clazz, field); + if (setterMethod != null) { + fieldExtractor.setSetterMethod(setterMethod); + } + fieldExtractors.add(fieldExtractor); } } @@ -181,6 +225,9 @@ class PageModelExtractor { if (fieldExtractor.multi) { List value; switch (fieldExtractor.getSource()) { + case RawHtml: + value = fieldExtractor.getSelector().selectList(page.getHtml().toString()); + break; case Html: value = fieldExtractor.getSelector().selectList(html); break; @@ -197,6 +244,9 @@ class PageModelExtractor { } else { String value; switch (fieldExtractor.getSource()) { + case RawHtml: + value = fieldExtractor.getSelector().select(page.getHtml().toString()); + break; case Html: value = fieldExtractor.getSelector().select(html); break; diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/model/OschinaBlog.java b/webmagic-core/src/test/java/us/codecraft/webmagic/model/OschinaBlog.java index 1bb219f..2552104 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/model/OschinaBlog.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/model/OschinaBlog.java @@ -22,6 +22,9 @@ public class OschinaBlog 
implements AfterExtractor { @ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true) private List tags; + @ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true) + private List comments; + @Override public void afterProcess(Page page) { System.out.println("title:\t"+title); diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/model/OschinaBlogComment.java b/webmagic-core/src/test/java/us/codecraft/webmagic/model/OschinaBlogComment.java new file mode 100644 index 0000000..a1e5843 --- /dev/null +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/model/OschinaBlogComment.java @@ -0,0 +1,13 @@ +package us.codecraft.webmagic.model; + +/** + * @author code4crafter@gmail.com
+ * @date: 13-8-1
+ * Time: 下午10:18
+ */ +@TargetUrl("http://my.oschina.net/flashsword/blog/*") +public class OschinaBlogComment { + + + +} \ No newline at end of file From 619a12b3034b4038d1a2e730b967cc316213d834 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sun, 4 Aug 2013 21:22:15 +0800 Subject: [PATCH 31/84] add paged support --- .../java/us/codecraft/webmagic/Spider.java | 5 + .../webmagic/pipeline/ConsolePipeline.java | 1 - .../webmagic/utils/DoubleKeyMap.java | 111 ++++++++++++++++++ .../webmagic/utils/MultiKeyMapBase.java | 42 +++++++ .../webmagic/model/OschinaBlogComment.java | 13 -- .../us/codecraft/webmagic/PagedModel.java | 20 ++++ .../webmagic/pipeline/PagedPipeline.java | 78 ++++++++++++ .../webmagic/model/samples/News163.java | 81 +++++++++++++ 8 files changed, 337 insertions(+), 14 deletions(-) create mode 100755 webmagic-core/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java create mode 100755 webmagic-core/src/main/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java delete mode 100644 webmagic-core/src/test/java/us/codecraft/webmagic/model/OschinaBlogComment.java create mode 100644 webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/PagedModel.java create mode 100644 webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/pipeline/PagedPipeline.java create mode 100644 webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index a25fd02..414315c 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -283,6 +283,11 @@ public class Spider implements Runnable, Task { return this; } + public Spider clearPipeline(){ + pipelines=new ArrayList(); + return this; + } + @Override public String getUUID() { if (uuid != null) { diff --git 
a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java index 97470e0..8f29474 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java @@ -29,7 +29,6 @@ public class ConsolePipeline implements Pipeline{ } else { System.out.println(entry.getKey() + ":\t" + entry.getValue()); } - System.out.println(entry.getKey()+":\t"+entry.getValue()); } } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java new file mode 100755 index 0000000..500573a --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java @@ -0,0 +1,111 @@ +package us.codecraft.webmagic.utils; + +import java.util.Map; + +/** + * @author yihua.huang@dianping.com + * @date Dec 14, 2012 + */ +public class DoubleKeyMap extends MultiKeyMapBase { + private Map> map; + + public DoubleKeyMap() { + init(); + } + + public DoubleKeyMap(Map> map) { + this(map,DEFAULT_CLAZZ); + } + + public DoubleKeyMap(Class protoMapClass) { + super(protoMapClass); + init(); + } + + private void init() { + if (map == null) { + map = this.>newMap(); + } + } + + /** + * init map with protoMapClass + * + * @param protoMapClass + */ + @SuppressWarnings("rawtypes") + public DoubleKeyMap(Map> map, Class protoMapClass) { + super(protoMapClass); + this.map = map; + init(); + } + + /** + * @param key + * @return + */ + public Map get(K1 key) { + return map.get(key); + } + + /** + * @param key1 + * @param key2 + * @return + */ + public V get(K1 key1, K2 key2) { + if (get(key1) == null) { + return null; + } + return get(key1).get(key2); + } + + + /** + * @param key1 + * @param submap + * @return + */ + public V put(K1 key1, Map submap) { + return put(key1, submap); + } + + /** + 
* @param key1 + * @param key2 + * @param value + * @return + */ + public V put(K1 key1, K2 key2, V value) { + if (map.get(key1) == null) { + map.put(key1, this.newMap()); + } + return get(key1).put(key2, value); + } + + /** + * @param key1 + * @param key2 + * @return + */ + public V remove(K1 key1, K2 key2) { + if (get(key1) == null) { + return null; + } + V remove = get(key1).remove(key2); + // 如果上一级map为空,把它也回收掉 + if (get(key1).size() == 0) { + remove(key1); + } + return remove; + } + + /** + * @param key1 + * @return + */ + public Map remove(K1 key1) { + Map remove = map.remove(key1); + return remove; + } +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java new file mode 100755 index 0000000..e0b5c64 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java @@ -0,0 +1,42 @@ +package us.codecraft.webmagic.utils; + +/** + * @author yihua.huang@dianping.com + * @date Dec 14, 2012 + */ + +import java.util.HashMap; +import java.util.Map; + +/** + * multikey map, some basic objects * + * + * @author yihua.huang + */ +public abstract class MultiKeyMapBase { + + protected static final Class DEFAULT_CLAZZ = HashMap.class; + @SuppressWarnings("rawtypes") + private Class protoMapClass = DEFAULT_CLAZZ; + + public MultiKeyMapBase() { + } + + @SuppressWarnings("rawtypes") + public MultiKeyMapBase(Class protoMapClass) { + this.protoMapClass = protoMapClass; + } + + @SuppressWarnings("unchecked") + protected Map newMap() { + try { + return (Map) protoMapClass.newInstance(); + } catch (InstantiationException e) { + throw new IllegalArgumentException("wrong proto type map " + + protoMapClass); + } catch (IllegalAccessException e) { + throw new IllegalArgumentException("wrong proto type map " + + protoMapClass); + } + } +} \ No newline at end of file diff --git 
a/webmagic-core/src/test/java/us/codecraft/webmagic/model/OschinaBlogComment.java b/webmagic-core/src/test/java/us/codecraft/webmagic/model/OschinaBlogComment.java deleted file mode 100644 index a1e5843..0000000 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/model/OschinaBlogComment.java +++ /dev/null @@ -1,13 +0,0 @@ -package us.codecraft.webmagic.model; - -/** - * @author code4crafter@gmail.com
- * @date: 13-8-1
- * Time: 下午10:18
- */ -@TargetUrl("http://my.oschina.net/flashsword/blog/*") -public class OschinaBlogComment { - - - -} \ No newline at end of file diff --git a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/PagedModel.java b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/PagedModel.java new file mode 100644 index 0000000..f18426a --- /dev/null +++ b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/PagedModel.java @@ -0,0 +1,20 @@ +package us.codecraft.webmagic; + +import java.util.Collection; + +/** + * @author yihua.huang@dianping.com
+ * @date: 13-8-4
+ * Time: 下午5:18
+ */ +public interface PagedModel { + + public String getPageKey(); + + public Collection getOtherPages(); + + public String getPage(); + + public PagedModel combine(PagedModel pagedModel); + +} diff --git a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/pipeline/PagedPipeline.java b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/pipeline/PagedPipeline.java new file mode 100644 index 0000000..cc71e5c --- /dev/null +++ b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/pipeline/PagedPipeline.java @@ -0,0 +1,78 @@ +package us.codecraft.webmagic.pipeline; + +import us.codecraft.webmagic.PagedModel; +import us.codecraft.webmagic.ResultItems; +import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.utils.DoubleKeyMap; + +import java.util.*; +import java.util.concurrent.ConcurrentHashMap; + +/** + * @author yihua.huang@dianping.com
+ * @date: 13-8-4
+ * Time: 下午5:15
+ */ +public class PagedPipeline implements Pipeline { + + private DoubleKeyMap pageMap = new DoubleKeyMap(ConcurrentHashMap.class); + + private DoubleKeyMap objectMap = new DoubleKeyMap(ConcurrentHashMap.class); + + @Override + public void process(ResultItems resultItems, Task task) { + Map resultItemsAll = resultItems.getAll(); + Iterator> iterator = resultItemsAll.entrySet().iterator(); + while (iterator.hasNext()) { + handleObject(iterator); + } + } + + private void handleObject(Iterator> iterator) { + Map.Entry objectEntry = iterator.next(); + Object o = objectEntry.getValue(); + if (o instanceof PagedModel) { + PagedModel pagedModel = (PagedModel) o; + for (String otherPage : pagedModel.getOtherPages()) { + Boolean aBoolean = pageMap.get(pagedModel.getPageKey(), otherPage); + if (aBoolean == null) { + pageMap.put(pagedModel.getPageKey(), otherPage, Boolean.FALSE); + } + } + //check if all pages are processed + Map booleanMap = pageMap.get(pagedModel.getPageKey()); + objectMap.put(pagedModel.getPageKey(), pagedModel.getPage(), pagedModel); + if (booleanMap == null) { + return; + } + for (Map.Entry stringBooleanEntry : booleanMap.entrySet()) { + if (!stringBooleanEntry.getValue()) { + iterator.remove(); + return; + } + } + List> entryList = new ArrayList>(); + entryList.addAll(objectMap.get(pagedModel.getPageKey()).entrySet()); + if (entryList.size() != 0) { + Collections.sort(entryList, new Comparator>() { + @Override + public int compare(Map.Entry o1, Map.Entry o2) { + try { + int i1 = Integer.parseInt(o1.getKey()); + int i2 = Integer.parseInt(o2.getKey()); + return i1 - i2; + } catch (NumberFormatException e) { + return o1.getKey().compareTo(o2.getKey()); + } + } + }); + PagedModel value = entryList.get(0).getValue(); + for (int i=1;i + * @date: 13-8-4
+ * Time: 下午8:17
+ */ +@TargetUrl("http://news.163.com/\\d+/\\d+/\\d+/\\w+*.html") +public class News163 implements PagedModel, AfterExtractor { + + @ExtractByUrl("http://news\\.163\\.com/\\d+/\\d+/\\d+/(\\w+)*\\.html") + private String pageKey; + + @ExtractByUrl(value = "http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html", notNull = false) + private String page; + + private List otherPage; + + @ExtractBy("//h1[@id=\"h1title\"]/text()") + private String title; + + @ExtractBy("//div[@id=\"epContentLeft\"]") + private String content; + + @Override + public String getPageKey() { + return pageKey; + } + + @Override + public Collection getOtherPages() { + return otherPage; + } + + @Override + public String getPage() { + if (page == null) { + return "0"; + } + return page; + } + + @Override + public PagedModel combine(PagedModel pagedModel) { + News163 news163 = new News163(); + News163 pagedModel1 = (News163) pagedModel; + news163.content = this.content + pagedModel1.content; + return news163; + } + + @Override + public String toString() { + return "News163{" + + "content='" + content + '\'' + + ", title='" + title + '\'' + + ", otherPage=" + otherPage + + '}'; + } + + public static void main(String[] args) { + OOSpider.create(Site.me().addStartUrl("http://news.163.com/13/0802/05/958I1E330001124J_2.html"), News163.class) + .clearPipeline().pipeline(new PagedPipeline()).pipeline(new ConsolePipeline()).run(); + } + + @Override + public void afterProcess(Page page) { + Selectable xpath = page.getHtml().xpath("//div[@class=\"ep-pages\"]//a/@href"); + otherPage = xpath.regex("http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html").all(); + } +} From 971e7b6ce2c60efce42eb45a58e684a1bdce37ea Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Mon, 5 Aug 2013 13:53:13 +0800 Subject: [PATCH 32/84] add core --- .../src/main/java/us/codecraft/webmagic/model/ExtractBy.java | 1 + .../src/main/java/us/codecraft/webmagic/model/ExtractByUrl.java | 1 + 
.../src/main/java/us/codecraft/webmagic/model/HelpUrl.java | 1 + .../src/main/java/us/codecraft/webmagic/model/TargetUrl.java | 1 + .../src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java | 2 +- .../main/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java | 2 +- .../main/java/us/codecraft/webmagic/model/samples/News163.java | 2 +- 7 files changed, 7 insertions(+), 3 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/model/ExtractBy.java b/webmagic-core/src/main/java/us/codecraft/webmagic/model/ExtractBy.java index 4c37c9b..661fd67 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/model/ExtractBy.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/model/ExtractBy.java @@ -5,6 +5,7 @@ import java.lang.annotation.Retention; import java.lang.annotation.Target; /** + * 定义类或者字段的抽取规则。
* @author code4crafter@gmail.com
* @date: 13-8-1
* Time: 下午8:40
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/model/ExtractByUrl.java b/webmagic-core/src/main/java/us/codecraft/webmagic/model/ExtractByUrl.java index 9f77676..f443c0e 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/model/ExtractByUrl.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/model/ExtractByUrl.java @@ -5,6 +5,7 @@ import java.lang.annotation.Retention; import java.lang.annotation.Target; /** + * 定义类或者字段的抽取规则(从url中抽取,只支持正则表达式)。
* @author code4crafter@gmail.com
* @date: 13-8-1
* Time: 下午8:40
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/model/HelpUrl.java b/webmagic-core/src/main/java/us/codecraft/webmagic/model/HelpUrl.java index 9dee05b..808d58a 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/model/HelpUrl.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/model/HelpUrl.java @@ -5,6 +5,7 @@ import java.lang.annotation.Retention; import java.lang.annotation.Target; /** + * 定义辅助爬取的url。
* @author code4crafter@gmail.com
* @date: 13-8-1
* Time: 下午8:40
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/model/TargetUrl.java b/webmagic-core/src/main/java/us/codecraft/webmagic/model/TargetUrl.java index 96ca864..3622f55 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/model/TargetUrl.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/model/TargetUrl.java @@ -5,6 +5,7 @@ import java.lang.annotation.Retention; import java.lang.annotation.Target; /** + * 定义某个类抽取的范围和来源,sourceRegion可以用xpath语法限定抽取区域。
* @author code4crafter@gmail.com
* @date: 13-8-1
* Time: 下午8:40
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java index 500573a..a834528 100755 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java @@ -3,7 +3,7 @@ package us.codecraft.webmagic.utils; import java.util.Map; /** - * @author yihua.huang@dianping.com + * @author code4crafter@gmail.com * @date Dec 14, 2012 */ public class DoubleKeyMap extends MultiKeyMapBase { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java index e0b5c64..256097a 100755 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java @@ -1,7 +1,7 @@ package us.codecraft.webmagic.utils; /** - * @author yihua.huang@dianping.com + * @author code4crafter@gmail.com * @date Dec 14, 2012 */ diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java index bf26930..07b1e8e 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java @@ -46,7 +46,7 @@ public class News163 implements PagedModel, AfterExtractor { @Override public String getPage() { if (page == null) { - return "0"; + return "1"; } return page; } From d56c681be1945379e117b8c61be8baec3cb4fcf6 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Mon, 5 Aug 2013 18:08:28 +0800 Subject: [PATCH 33/84] add priority to request --- .../java/us/codecraft/webmagic/Request.java | 27 ++++++++ webmagic-plugin/pom.xml | 1 + webmagic-plugin/webmagic-lucene/pom.xml | 28 ++++++++ 
.../webmagic/pipeline/LucenePipeline.java | 64 +++++++++++++++++++ 4 files changed, 120 insertions(+) create mode 100644 webmagic-plugin/webmagic-lucene/pom.xml create mode 100644 webmagic-plugin/webmagic-lucene/src/main/java/us/codecraft/webmagic/pipeline/LucenePipeline.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java index 42dd079..1f6657c 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java @@ -28,6 +28,8 @@ public class Request { private Object[] extra; + private double priority; + /** * 构建一个request对象 * @param url 必须参数,待抓取的url @@ -38,6 +40,15 @@ public class Request { this.extra = extra; } + public double getPriority() { + return priority; + } + + public Request setPriority(double priority) { + this.priority = priority; + return this; + } + /** * 获取预存的对象 * @return object[] 预存的对象数组 @@ -54,4 +65,20 @@ public class Request { return url; } + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + Request request = (Request) o; + + if (!url.equals(request.url)) return false; + + return true; + } + + @Override + public int hashCode() { + return url.hashCode(); + } } diff --git a/webmagic-plugin/pom.xml b/webmagic-plugin/pom.xml index 2225722..54c69ec 100644 --- a/webmagic-plugin/pom.xml +++ b/webmagic-plugin/pom.xml @@ -12,6 +12,7 @@ webmagic-misc webmagic-selenium + webmagic-lucene webmagic-plugin diff --git a/webmagic-plugin/webmagic-lucene/pom.xml b/webmagic-plugin/webmagic-lucene/pom.xml new file mode 100644 index 0000000..b072472 --- /dev/null +++ b/webmagic-plugin/webmagic-lucene/pom.xml @@ -0,0 +1,28 @@ + + + + webmagic-plugin + us.codecraft + 0.1.0 + + 4.0.0 + + webmagic-lucene + + + + org.apache.lucene + lucene-analyzers-common + 4.4.0 + + + org.apache.lucene + lucene-queryparser + 4.4.0 + 
+ + + + \ No newline at end of file diff --git a/webmagic-plugin/webmagic-lucene/src/main/java/us/codecraft/webmagic/pipeline/LucenePipeline.java b/webmagic-plugin/webmagic-lucene/src/main/java/us/codecraft/webmagic/pipeline/LucenePipeline.java new file mode 100644 index 0000000..2e7191c --- /dev/null +++ b/webmagic-plugin/webmagic-lucene/src/main/java/us/codecraft/webmagic/pipeline/LucenePipeline.java @@ -0,0 +1,64 @@ +package us.codecraft.webmagic.pipeline; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.queryparser.classic.QueryParser; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FSDirectory; +import org.apache.lucene.util.Version; +import us.codecraft.webmagic.ResultItems; +import us.codecraft.webmagic.Task; + +import java.io.File; + +/** + * @author yihua.huang@dianping.com
+ * @date: 13-8-5
+ * Time: 下午2:11
+ */ +public class LucenePipeline implements Pipeline { + @Override + public void process(ResultItems resultItems, Task task) { + try { + + } catch (Exception e) { + + } + } + + public static void main(String[] args) throws Exception { + Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44); +// Directory directory = new RAMDirectory(); + // To store an index on disk, use this instead: + Directory directory = FSDirectory.open(new File("/data/webmagic/www.guoxue123.cn/")); + IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_44, analyzer); + IndexWriter iwriter = new IndexWriter(directory, config); + Document doc = new Document(); +// String text = "This is the text to be indexed."; +// doc.add(new Field("fieldname", text, TextField.TYPE_STORED)); +// iwriter.addDocument(doc); + iwriter.close(); + + // Now search the index: + DirectoryReader ireader = DirectoryReader.open(directory); + IndexSearcher isearcher = new IndexSearcher(ireader); + // Parse a simple query that searches for "text": + QueryParser parser = new QueryParser(Version.LUCENE_44, "fieldname", analyzer); + Query query = parser.parse("经典"); + ScoreDoc[] hits = isearcher.search(query, null, 1000).scoreDocs; + // Iterate through the results: + for (int i = 0; i < hits.length; i++) { + Document hitDoc = isearcher.doc(hits[i].doc); + System.out.println(hitDoc); + } + ireader.close(); + directory.close(); + } +} From dc9f574e27b6cfb9c5cfa70809852b92579dab6d Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Mon, 5 Aug 2013 18:17:52 +0800 Subject: [PATCH 34/84] update request --- .../java/us/codecraft/webmagic/Request.java | 34 ++++++++++++------- .../webmagic/scheduler/RedisScheduler.java | 2 +- 2 files changed, 22 insertions(+), 14 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java index 1f6657c..9b9740d 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java +++ 
b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java @@ -1,5 +1,8 @@ package us.codecraft.webmagic; +import java.util.HashMap; +import java.util.Map; + /** * Request对象封装了待抓取的url信息。
* 在PageProcessor中,Request对象可以通过{@link us.codecraft.webmagic.Page#getRequest()} 获取。
@@ -18,26 +21,29 @@ package us.codecraft.webmagic; * String linktext = (String)page.getRequest().getExtra()[0]; * } * + * * @author code4crafter@gmail.com
- * Date: 13-4-21 - * Time: 上午11:37 + * Date: 13-4-21 + * Time: 上午11:37 */ public class Request { private String url; - private Object[] extra; + /** + * 额外参数,可以保存一些需要的上下文信息 + */ + private Map extras = new HashMap(); private double priority; /** * 构建一个request对象 - * @param url 必须参数,待抓取的url - * @param extra 额外参数,可以保存一些需要的上下文信息 + * + * @param url 必须参数,待抓取的url */ - public Request(String url, Object... extra) { + public Request(String url) { this.url = url; - this.extra = extra; } public double getPriority() { @@ -49,16 +55,18 @@ public class Request { return this; } - /** - * 获取预存的对象 - * @return object[] 预存的对象数组 - */ - public Object[] getExtra() { - return extra; + public Object getExtra(String key) { + return extras.get(key); + } + + public Request putExtra(String key,Object value) { + extras.put(key,value); + return this; } /** * 获取待抓取的url + * * @return url 待抓取的url */ public String getUrl() { diff --git a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java index 094295c..8109ad1 100644 --- a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java +++ b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java @@ -33,7 +33,7 @@ public class RedisScheduler implements Scheduler { if (jedis.zrank(SET_PREFIX + task.getUUID(), request.getUrl()) == null) { //使用List保存队列 jedis.rpush(QUEUE_PREFIX + task.getUUID(), request.getUrl()); - jedis.zadd(SET_PREFIX + task.getUUID(), System.currentTimeMillis(), request.getUrl()); + jedis.zadd(SET_PREFIX + task.getUUID(), request.getPriority(), request.getUrl()); } pool.returnResource(jedis); } From 27ce3fc17660dc5c2bf3f8c32cf6e4fc55525ab2 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Mon, 5 Aug 2013 19:36:49 +0800 Subject: [PATCH 35/84] lazy init --- .../src/main/java/us/codecraft/webmagic/Request.java | 8 +++++++- 
1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java index 9b9740d..77db2c1 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java @@ -33,7 +33,7 @@ public class Request { /** * 额外参数,可以保存一些需要的上下文信息 */ - private Map extras = new HashMap(); + private Map extras; private double priority; @@ -56,10 +56,16 @@ public class Request { } public Object getExtra(String key) { + if (extras==null){ + return null; + } return extras.get(key); } public Request putExtra(String key,Object value) { + if (extras==null){ + extras = new HashMap(); + } extras.put(key,value); return this; } From 629f8ac2d11925016142bbd25af6eef573f30c82 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Mon, 5 Aug 2013 20:45:34 +0800 Subject: [PATCH 36/84] add extractors chain --- .../codecraft/webmagic/model/ExtractBy2.java | 23 ++++ .../codecraft/webmagic/model/ExtractBy3.java | 23 ++++ .../codecraft/webmagic/model/Extractor.java | 10 +- .../webmagic/model/PageModelExtractor.java | 126 +++++++++++++----- 4 files changed, 148 insertions(+), 34 deletions(-) create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/model/ExtractBy2.java create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/model/ExtractBy3.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/model/ExtractBy2.java b/webmagic-core/src/main/java/us/codecraft/webmagic/model/ExtractBy2.java new file mode 100644 index 0000000..55d5dfa --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/model/ExtractBy2.java @@ -0,0 +1,23 @@ +package us.codecraft.webmagic.model; + +import java.lang.annotation.ElementType; +import java.lang.annotation.Retention; +import java.lang.annotation.Target; + +/** + * 定义类或者字段的抽取规则。
+ * @author code4crafter@gmail.com
+ * @date: 13-8-1
+ * Time: 下午8:40
+ */ +@Retention(java.lang.annotation.RetentionPolicy.RUNTIME) +@Target({ElementType.FIELD}) +public @interface ExtractBy2 { + + String value(); + + public enum Type {XPath2, XPath, Regex, Css} + + Type type() default Type.XPath2; + +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/model/ExtractBy3.java b/webmagic-core/src/main/java/us/codecraft/webmagic/model/ExtractBy3.java new file mode 100644 index 0000000..10f6a9f --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/model/ExtractBy3.java @@ -0,0 +1,23 @@ +package us.codecraft.webmagic.model; + +import java.lang.annotation.ElementType; +import java.lang.annotation.Retention; +import java.lang.annotation.Target; + +/** + * 定义类或者字段的抽取规则。
+ * @author code4crafter@gmail.com
+ * @date: 13-8-1
+ * Time: 下午8:40
+ */ +@Retention(java.lang.annotation.RetentionPolicy.RUNTIME) +@Target({ElementType.FIELD}) +public @interface ExtractBy3 { + + String value(); + + public enum Type {XPath2, XPath, Regex, Css} + + Type type() default Type.XPath2; + +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/model/Extractor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/model/Extractor.java index 498aba9..82c7dbb 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/model/Extractor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/model/Extractor.java @@ -9,7 +9,7 @@ import us.codecraft.webmagic.selector.Selector; */ class Extractor { - protected final Selector selector; + protected Selector selector; protected final Source source; @@ -37,4 +37,12 @@ class Extractor { boolean isNotNull() { return notNull; } + + boolean isMulti() { + return multi; + } + + void setSelector(Selector selector) { + this.selector = selector; + } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java index 9694c4e..b2c2bb0 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java @@ -14,6 +14,7 @@ import java.util.regex.Pattern; /** * Model主要逻辑类。将一个带注解的POJO转换为一个PageModelExtractor。
+ * * @author code4crafter@gmail.com
* @date: 13-8-1
* Time: 下午9:33
@@ -46,41 +47,54 @@ class PageModelExtractor { fieldExtractors = new ArrayList(); for (Field field : clazz.getDeclaredFields()) { field.setAccessible(true); - getAnnotationExtractBy(clazz, field); - getAnnotationExtractByRaw(clazz,field); - getAnnotationExtractByUrl(clazz, field); + FieldExtractor fieldExtractor = getAnnotationExtractBy(clazz, field); + FieldExtractor fieldExtractorTmp = getAnnotationExtractByRaw(clazz, field); + if (fieldExtractor != null && fieldExtractorTmp != null) { + throw new IllegalStateException("Only one of 'ExtractBy ExtractByRaw ExtractByUrl' can be added to a field!"); + } else if (fieldExtractor == null && fieldExtractorTmp != null) { + fieldExtractor = fieldExtractorTmp; + } + // ExtractBy2 & ExtractBy3 + addAnnotationExtractBy2(clazz, fieldExtractor); + addAnnotationExtractBy3(clazz, fieldExtractor); + fieldExtractorTmp = getAnnotationExtractByUrl(clazz, field); + if (fieldExtractor != null && fieldExtractorTmp != null) { + throw new IllegalStateException("Only one of 'ExtractBy ExtractByRaw ExtractByUrl' can be added to a field!"); + } else if (fieldExtractor == null && fieldExtractorTmp != null) { + fieldExtractor = fieldExtractorTmp; + } + if (fieldExtractor != null) { + if (!fieldExtractor.isMulti() && !String.class.isAssignableFrom(field.getType())) { + throw new IllegalStateException("Field " + field.getName() + " must be string"); + } else if (fieldExtractor.isMulti() && !List.class.isAssignableFrom(field.getType())) { + throw new IllegalStateException("Field " + field.getName() + " must be list"); + } + } + } } - private void getAnnotationExtractByUrl(Class clazz, Field field) { + private FieldExtractor getAnnotationExtractByUrl(Class clazz, Field field) { + FieldExtractor fieldExtractor = null; ExtractByUrl extractByUrl = field.getAnnotation(ExtractByUrl.class); if (extractByUrl != null) { - if (!extractByUrl.multi() && !String.class.isAssignableFrom(field.getType())) { - throw new IllegalStateException("Field " + 
field.getName() + " must be string"); - } else if (extractByUrl.multi() && !List.class.isAssignableFrom(field.getType())) { - throw new IllegalStateException("Field " + field.getName() + " must be list"); - } String regexPattern = extractByUrl.value(); if (regexPattern.trim().equals("")) { regexPattern = ".*"; } - FieldExtractor fieldExtractor = new FieldExtractor(field, new RegexSelector(regexPattern), FieldExtractor.Source.Url, extractByUrl.notNull(), extractByUrl.multi()); + fieldExtractor = new FieldExtractor(field, new RegexSelector(regexPattern), FieldExtractor.Source.Url, extractByUrl.notNull(), extractByUrl.multi()); Method setterMethod = getSetterMethod(clazz, field); if (setterMethod != null) { fieldExtractor.setSetterMethod(setterMethod); } - fieldExtractors.add(fieldExtractor); } + return fieldExtractor; } - private void getAnnotationExtractBy(Class clazz, Field field) { + private FieldExtractor getAnnotationExtractBy(Class clazz, Field field) { + FieldExtractor fieldExtractor = null; ExtractBy extractBy = field.getAnnotation(ExtractBy.class); if (extractBy != null) { - if (!extractBy.multi() && !String.class.isAssignableFrom(field.getType())) { - throw new IllegalStateException("Field " + field.getName() + " must be string"); - } else if (extractBy.multi() && !List.class.isAssignableFrom(field.getType())) { - throw new IllegalStateException("Field " + field.getName() + " must be list"); - } String value = extractBy.value(); Selector selector; switch (extractBy.type()) { @@ -99,23 +113,69 @@ class PageModelExtractor { default: selector = new Xpath2Selector(value); } - FieldExtractor fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.Html, extractBy.notNull(), extractBy.multi()); + fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.Html, extractBy.notNull(), extractBy.multi()); Method setterMethod = getSetterMethod(clazz, field); if (setterMethod != null) { fieldExtractor.setSetterMethod(setterMethod); } - 
fieldExtractors.add(fieldExtractor); + } + return fieldExtractor; + } + + private void addAnnotationExtractBy2(Class clazz, FieldExtractor fieldExtractor) { + ExtractBy2 extractBy = fieldExtractor.getField().getAnnotation(ExtractBy2.class); + if (extractBy != null) { + String value = extractBy.value(); + Selector selector; + switch (extractBy.type()) { + case Css: + selector = new CssSelector(value); + break; + case Regex: + selector = new RegexSelector(value); + break; + case XPath: + selector = new XpathSelector(value); + break; + case XPath2: + selector = new Xpath2Selector(value); + break; + default: + selector = new Xpath2Selector(value); + } + fieldExtractor.setSelector(new AndSelector(fieldExtractor.getSelector(), selector)); } } - private void getAnnotationExtractByRaw(Class clazz, Field field) { + private void addAnnotationExtractBy3(Class clazz, FieldExtractor fieldExtractor) { + ExtractBy3 extractBy = fieldExtractor.getField().getAnnotation(ExtractBy3.class); + if (extractBy != null) { + String value = extractBy.value(); + Selector selector; + switch (extractBy.type()) { + case Css: + selector = new CssSelector(value); + break; + case Regex: + selector = new RegexSelector(value); + break; + case XPath: + selector = new XpathSelector(value); + break; + case XPath2: + selector = new Xpath2Selector(value); + break; + default: + selector = new Xpath2Selector(value); + } + fieldExtractor.setSelector(new AndSelector(fieldExtractor.getSelector(), selector)); + } + } + + private FieldExtractor getAnnotationExtractByRaw(Class clazz, Field field) { + FieldExtractor fieldExtractor = null; ExtractByRaw extractByRaw = field.getAnnotation(ExtractByRaw.class); if (extractByRaw != null) { - if (!extractByRaw.multi() && !String.class.isAssignableFrom(field.getType())) { - throw new IllegalStateException("Field " + field.getName() + " must be string"); - } else if (extractByRaw.multi() && !List.class.isAssignableFrom(field.getType())) { - throw new 
IllegalStateException("Field " + field.getName() + " must be list"); - } String value = extractByRaw.value(); Selector selector; switch (extractByRaw.type()) { @@ -134,13 +194,13 @@ class PageModelExtractor { default: selector = new Xpath2Selector(value); } - FieldExtractor fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.RawHtml, extractByRaw.notNull(), extractByRaw.multi()); + fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.RawHtml, extractByRaw.notNull(), extractByRaw.multi()); Method setterMethod = getSetterMethod(clazz, field); if (setterMethod != null) { fieldExtractor.setSetterMethod(setterMethod); } - fieldExtractors.add(fieldExtractor); } + return fieldExtractor; } public static Method getSetterMethod(Class clazz, Field field) { @@ -197,19 +257,19 @@ class PageModelExtractor { return null; } if (extractor == null) { - return processSingle(page,page.getHtml().toString()); + return processSingle(page, page.getHtml().toString()); } else { - if (extractor.multi){ + if (extractor.multi) { List os = new ArrayList(); List list = extractor.getSelector().selectList(page.getHtml().toString()); for (String s : list) { Object o = processSingle(page, s); - if (o!=null){ + if (o != null) { os.add(o); } } return os; - }else { + } else { String select = extractor.getSelector().select(page.getHtml().toString()); Object o = processSingle(page, select); return o; @@ -217,12 +277,12 @@ class PageModelExtractor { } } - private Object processSingle(Page page,String html) { + private Object processSingle(Page page, String html) { Object o = null; try { o = clazz.newInstance(); for (FieldExtractor fieldExtractor : fieldExtractors) { - if (fieldExtractor.multi) { + if (fieldExtractor.isMulti()) { List value; switch (fieldExtractor.getSource()) { case RawHtml: From f3a29d931520f893a43836364768f3fb3abd1926 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Mon, 5 Aug 2013 21:03:47 +0800 Subject: [PATCH 37/84] fix pagedmodel bug 
--- .../webmagic/model/PageModelExtractor.java | 12 +++++++----- .../codecraft/webmagic/pipeline/PagedPipeline.java | 11 +++++++---- .../codecraft/webmagic/model/samples/News163.java | 14 +++++--------- 3 files changed, 19 insertions(+), 18 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java index b2c2bb0..0207b7a 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java @@ -55,8 +55,10 @@ class PageModelExtractor { fieldExtractor = fieldExtractorTmp; } // ExtractBy2 & ExtractBy3 - addAnnotationExtractBy2(clazz, fieldExtractor); - addAnnotationExtractBy3(clazz, fieldExtractor); + if (fieldExtractor!=null){ + addAnnotationExtractBy2(fieldExtractor); + addAnnotationExtractBy3(fieldExtractor); + } fieldExtractorTmp = getAnnotationExtractByUrl(clazz, field); if (fieldExtractor != null && fieldExtractorTmp != null) { throw new IllegalStateException("Only one of 'ExtractBy ExtractByRaw ExtractByUrl' can be added to a field!"); @@ -69,8 +71,8 @@ class PageModelExtractor { } else if (fieldExtractor.isMulti() && !List.class.isAssignableFrom(field.getType())) { throw new IllegalStateException("Field " + field.getName() + " must be list"); } + fieldExtractors.add(fieldExtractor); } - } } @@ -122,7 +124,7 @@ class PageModelExtractor { return fieldExtractor; } - private void addAnnotationExtractBy2(Class clazz, FieldExtractor fieldExtractor) { + private void addAnnotationExtractBy2(FieldExtractor fieldExtractor) { ExtractBy2 extractBy = fieldExtractor.getField().getAnnotation(ExtractBy2.class); if (extractBy != null) { String value = extractBy.value(); @@ -147,7 +149,7 @@ class PageModelExtractor { } } - private void addAnnotationExtractBy3(Class clazz, FieldExtractor fieldExtractor) { + private void 
addAnnotationExtractBy3(FieldExtractor fieldExtractor) { ExtractBy3 extractBy = fieldExtractor.getField().getAnnotation(ExtractBy3.class); if (extractBy != null) { String value = extractBy.value(); diff --git a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/pipeline/PagedPipeline.java b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/pipeline/PagedPipeline.java index cc71e5c..282545f 100644 --- a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/pipeline/PagedPipeline.java +++ b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/pipeline/PagedPipeline.java @@ -33,10 +33,13 @@ public class PagedPipeline implements Pipeline { Object o = objectEntry.getValue(); if (o instanceof PagedModel) { PagedModel pagedModel = (PagedModel) o; - for (String otherPage : pagedModel.getOtherPages()) { - Boolean aBoolean = pageMap.get(pagedModel.getPageKey(), otherPage); - if (aBoolean == null) { - pageMap.put(pagedModel.getPageKey(), otherPage, Boolean.FALSE); + pageMap.put(pagedModel.getPageKey(), pagedModel.getPage(), Boolean.TRUE); + if (pagedModel.getOtherPages()!=null){ + for (String otherPage : pagedModel.getOtherPages()) { + Boolean aBoolean = pageMap.get(pagedModel.getPageKey(), otherPage); + if (aBoolean == null) { + pageMap.put(pagedModel.getPageKey(), otherPage, Boolean.FALSE); + } } } //check if all pages are processed diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java index 07b1e8e..52abe88 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java @@ -1,12 +1,10 @@ package us.codecraft.webmagic.model.samples; -import us.codecraft.webmagic.Page; import us.codecraft.webmagic.PagedModel; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.model.*; import 
us.codecraft.webmagic.pipeline.ConsolePipeline; import us.codecraft.webmagic.pipeline.PagedPipeline; -import us.codecraft.webmagic.selector.Selectable; import java.util.Collection; import java.util.List; @@ -17,14 +15,16 @@ import java.util.List; * Time: 下午8:17
*/ @TargetUrl("http://news.163.com/\\d+/\\d+/\\d+/\\w+*.html") -public class News163 implements PagedModel, AfterExtractor { +public class News163 implements PagedModel { - @ExtractByUrl("http://news\\.163\\.com/\\d+/\\d+/\\d+/(\\w+)*\\.html") + @ExtractByUrl("http://news\\.163\\.com/\\d+/\\d+/\\d+/([^_]*).*\\.html") private String pageKey; @ExtractByUrl(value = "http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html", notNull = false) private String page; + @ExtractBy(value = "//div[@class=\"ep-pages\"]//a/@href", multi = true) + @ExtractBy2(value = "http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html", type = ExtractBy2.Type.Regex) private List otherPage; @ExtractBy("//h1[@id=\"h1title\"]/text()") @@ -54,6 +54,7 @@ public class News163 implements PagedModel, AfterExtractor { @Override public PagedModel combine(PagedModel pagedModel) { News163 news163 = new News163(); + news163.title = this.title; News163 pagedModel1 = (News163) pagedModel; news163.content = this.content + pagedModel1.content; return news163; @@ -73,9 +74,4 @@ public class News163 implements PagedModel, AfterExtractor { .clearPipeline().pipeline(new PagedPipeline()).pipeline(new ConsolePipeline()).run(); } - @Override - public void afterProcess(Page page) { - Selectable xpath = page.getHtml().xpath("//div[@class=\"ep-pages\"]//a/@href"); - otherPage = xpath.regex("http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html").all(); - } } From b0af45f4bbdc075a6dbb124ccfe37d06575510e9 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Mon, 5 Aug 2013 21:44:29 +0800 Subject: [PATCH 38/84] complete redis support --- .../java/us/codecraft/webmagic/Request.java | 19 +++++++---- webmagic-plugin/webmagic-misc/pom.xml | 5 +++ .../webmagic/scheduler/HessianSerializer.java | 33 +++++++++++++++++++ .../webmagic/scheduler/RedisScheduler.java | 24 +++++++++++++- 4 files changed, 74 insertions(+), 7 deletions(-) create mode 100644 
webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler/HessianSerializer.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java index 77db2c1..6cf344f 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java @@ -1,5 +1,6 @@ package us.codecraft.webmagic; +import java.io.Serializable; import java.util.HashMap; import java.util.Map; @@ -26,7 +27,9 @@ import java.util.Map; * Date: 13-4-21 * Time: 上午11:37 */ -public class Request { +public class Request implements Serializable { + + private static final long serialVersionUID = 2062192774891352043L; private String url; @@ -40,7 +43,7 @@ public class Request { /** * 构建一个request对象 * - * @param url 必须参数,待抓取的url + * @param url 必须参数,待抓取的url */ public Request(String url) { this.url = url; @@ -56,17 +59,17 @@ public class Request { } public Object getExtra(String key) { - if (extras==null){ + if (extras == null) { return null; } return extras.get(key); } - public Request putExtra(String key,Object value) { - if (extras==null){ + public Request putExtra(String key, Object value) { + if (extras == null) { extras = new HashMap(); } - extras.put(key,value); + extras.put(key, value); return this; } @@ -91,6 +94,10 @@ public class Request { return true; } + public Map getExtras() { + return extras; + } + @Override public int hashCode() { return url.hashCode(); diff --git a/webmagic-plugin/webmagic-misc/pom.xml b/webmagic-plugin/webmagic-misc/pom.xml index c545615..4d8776c 100644 --- a/webmagic-plugin/webmagic-misc/pom.xml +++ b/webmagic-plugin/webmagic-misc/pom.xml @@ -17,6 +17,11 @@ freemarker 2.3.15 + + org.resthub + hessian + 4.0.8 + redis.clients jedis diff --git a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler/HessianSerializer.java 
b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler/HessianSerializer.java new file mode 100644 index 0000000..68cb5bb --- /dev/null +++ b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler/HessianSerializer.java @@ -0,0 +1,33 @@ +package us.codecraft.webmagic.scheduler; + +import com.caucho.hessian.io.Hessian2Input; +import com.caucho.hessian.io.Hessian2Output; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; + +/** + * @author yihua.huang@dianping.com
+ * @date: 13-7-14
+ * Time: 下午9:20
+ */ +public enum HessianSerializer { + INSTANCE; + public byte[] serialize(T v) throws IOException { + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + Hessian2Output hessian2Output = new Hessian2Output(baos); + hessian2Output.writeObject(v); + hessian2Output.close(); + return baos.toByteArray(); + } + + @SuppressWarnings("unchecked") + public T deSerialize(byte[] bytes) throws IOException { + ByteArrayInputStream bais = new ByteArrayInputStream(bytes); + Hessian2Input hessian2Input = new Hessian2Input(bais); + T t = (T) hessian2Input.readObject(); + hessian2Input.close(); + return t; + } +} diff --git a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java index 8109ad1..c00c12f 100644 --- a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java +++ b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java @@ -1,5 +1,6 @@ package us.codecraft.webmagic.scheduler; +import org.apache.commons.codec.digest.DigestUtils; import redis.clients.jedis.Jedis; import redis.clients.jedis.JedisPool; import redis.clients.jedis.JedisPoolConfig; @@ -7,6 +8,8 @@ import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.schedular.Scheduler; +import java.io.IOException; + /** * 使用redis管理url,构建一个分布式的爬虫。
* @@ -22,6 +25,8 @@ public class RedisScheduler implements Scheduler { private static final String SET_PREFIX = "set_"; + private static final String ITEM_PREFIX = "item_"; + public RedisScheduler(String host) { pool = new JedisPool(new JedisPoolConfig(), host); } @@ -34,6 +39,15 @@ public class RedisScheduler implements Scheduler { //使用List保存队列 jedis.rpush(QUEUE_PREFIX + task.getUUID(), request.getUrl()); jedis.zadd(SET_PREFIX + task.getUUID(), request.getPriority(), request.getUrl()); + if (request.getExtras() != null) { + String key = ITEM_PREFIX + DigestUtils.shaHex(request.getUrl()); + try { + byte[] serialize = HessianSerializer.INSTANCE.serialize(request); + jedis.set(key.getBytes(), serialize); + } catch (IOException e) { + e.printStackTrace(); + } + } } pool.returnResource(jedis); } @@ -42,8 +56,16 @@ public class RedisScheduler implements Scheduler { public synchronized Request poll(Task task) { Jedis jedis = pool.getResource(); String url = jedis.lpop(QUEUE_PREFIX + task.getUUID()); + String key = ITEM_PREFIX + DigestUtils.shaHex(url); + byte[] bytes = jedis.get(key.getBytes()); + try { + Object o = HessianSerializer.INSTANCE.deSerialize(bytes); + return (Request)o; + } catch (Exception e) { + e.printStackTrace(); + } pool.returnResource(jedis); - if (url==null){ + if (url == null) { return null; } return new Request(url); From 4eb3d6008352658e19587f04bf3aae06dbd1e85f Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Mon, 5 Aug 2013 22:06:39 +0800 Subject: [PATCH 39/84] fix nullpointer exception --- .../webmagic/model/ModelPageProcessor.java | 4 ++-- .../webmagic/scheduler/RedisScheduler.java | 20 ++++++++++--------- .../webmagic/model/samples/News163.java | 3 ++- 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/model/ModelPageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/model/ModelPageProcessor.java index 12e85d0..84563ce 100644 --- 
a/webmagic-core/src/main/java/us/codecraft/webmagic/model/ModelPageProcessor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/model/ModelPageProcessor.java @@ -51,14 +51,14 @@ class ModelPageProcessor implements PageProcessor { @Override public void process(Page page) { for (PageModelExtractor pageModelExtractor : pageModelExtractorList) { + extractLinks(page, pageModelExtractor.getHelpUrlRegionSelector(), pageModelExtractor.getHelpUrlPatterns()); + extractLinks(page, pageModelExtractor.getTargetUrlRegionSelector(), pageModelExtractor.getTargetUrlPatterns()); Object process = pageModelExtractor.process(page); if (process == null || (process instanceof List && ((List) process).size() == 0)) { page.getResultItems().setSkip(true); } postProcessPageModel(pageModelExtractor.getClazz(), process); page.putField(pageModelExtractor.getClazz().getCanonicalName(), process); - extractLinks(page, pageModelExtractor.getHelpUrlRegionSelector(), pageModelExtractor.getHelpUrlPatterns()); - extractLinks(page, pageModelExtractor.getTargetUrlRegionSelector(), pageModelExtractor.getTargetUrlPatterns()); } } diff --git a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java index c00c12f..fb82a69 100644 --- a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java +++ b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java @@ -56,18 +56,20 @@ public class RedisScheduler implements Scheduler { public synchronized Request poll(Task task) { Jedis jedis = pool.getResource(); String url = jedis.lpop(QUEUE_PREFIX + task.getUUID()); - String key = ITEM_PREFIX + DigestUtils.shaHex(url); - byte[] bytes = jedis.get(key.getBytes()); - try { - Object o = HessianSerializer.INSTANCE.deSerialize(bytes); - return (Request)o; - } catch (Exception e) { - e.printStackTrace(); 
- } - pool.returnResource(jedis); if (url == null) { return null; } + String key = ITEM_PREFIX + DigestUtils.shaHex(url); + byte[] bytes = jedis.get(key.getBytes()); + if (bytes!=null){ + try { + Object o = HessianSerializer.INSTANCE.deSerialize(bytes); + return (Request)o; + } catch (Exception e) { + e.printStackTrace(); + } + } + pool.returnResource(jedis); return new Request(url); } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java index 52abe88..2aa9073 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java @@ -5,6 +5,7 @@ import us.codecraft.webmagic.Site; import us.codecraft.webmagic.model.*; import us.codecraft.webmagic.pipeline.ConsolePipeline; import us.codecraft.webmagic.pipeline.PagedPipeline; +import us.codecraft.webmagic.scheduler.RedisScheduler; import java.util.Collection; import java.util.List; @@ -71,7 +72,7 @@ public class News163 implements PagedModel { public static void main(String[] args) { OOSpider.create(Site.me().addStartUrl("http://news.163.com/13/0802/05/958I1E330001124J_2.html"), News163.class) - .clearPipeline().pipeline(new PagedPipeline()).pipeline(new ConsolePipeline()).run(); + .scheduler(new RedisScheduler("localhost")).clearPipeline().pipeline(new PagedPipeline()).pipeline(new ConsolePipeline()).run(); } } From b40cca112235289023bb8150de60f1ab3bfd787b Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Tue, 6 Aug 2013 20:41:35 +0800 Subject: [PATCH 40/84] move model package to plugin --- .../src/main/java/us/codecraft/webmagic/model/AfterExtractor.java | 0 .../us/codecraft/webmagic/model/ConsolePageModelPipeline.java | 0 .../src/main/java/us/codecraft/webmagic/model/ExtractBy.java | 0 .../src/main/java/us/codecraft/webmagic/model/ExtractBy2.java | 0 
.../src/main/java/us/codecraft/webmagic/model/ExtractBy3.java | 0 .../src/main/java/us/codecraft/webmagic/model/ExtractByRaw.java | 0 .../src/main/java/us/codecraft/webmagic/model/ExtractByUrl.java | 0 .../src/main/java/us/codecraft/webmagic/model/Extractor.java | 0 .../src/main/java/us/codecraft/webmagic/model/FieldExtractor.java | 0 .../src/main/java/us/codecraft/webmagic/model/HelpUrl.java | 0 .../main/java/us/codecraft/webmagic/model/ModelPageProcessor.java | 0 .../src/main/java/us/codecraft/webmagic/model/ModelPipeline.java | 0 .../src/main/java/us/codecraft/webmagic/model/OOSpider.java | 0 .../main/java/us/codecraft/webmagic/model/PageModelExtractor.java | 0 .../main/java/us/codecraft/webmagic/model/PageModelPipeline.java | 0 .../src/main/java/us/codecraft/webmagic/model/TargetUrl.java | 0 .../src/main/java/us/codecraft/webmagic/model/package.html | 0 .../src/main/java/us/codecraft/webmagic/model/samples/Blog.java | 0 .../main/java/us/codecraft/webmagic/model/samples/IteyeBlog.java | 0 .../main/java/us/codecraft/webmagic/model/samples/News163.java | 0 .../java/us/codecraft/webmagic/model/samples/OschinaAnswer.java | 0 .../java/us/codecraft/webmagic/model/samples/OschinaBlog.java | 0 .../src/test/java/us/codecraft/webmagic/model/OschinaBlog.java | 0 .../src/test/java/us/codecraft/webmagic/model/TestFetcher.java | 0 24 files changed, 0 insertions(+), 0 deletions(-) rename {webmagic-core => webmagic-plugin/webmagic-misc}/src/main/java/us/codecraft/webmagic/model/AfterExtractor.java (100%) rename {webmagic-core => webmagic-plugin/webmagic-misc}/src/main/java/us/codecraft/webmagic/model/ConsolePageModelPipeline.java (100%) rename {webmagic-core => webmagic-plugin/webmagic-misc}/src/main/java/us/codecraft/webmagic/model/ExtractBy.java (100%) rename {webmagic-core => webmagic-plugin/webmagic-misc}/src/main/java/us/codecraft/webmagic/model/ExtractBy2.java (100%) rename {webmagic-core => 
webmagic-plugin/webmagic-misc}/src/main/java/us/codecraft/webmagic/model/ExtractBy3.java (100%) rename {webmagic-core => webmagic-plugin/webmagic-misc}/src/main/java/us/codecraft/webmagic/model/ExtractByRaw.java (100%) rename {webmagic-core => webmagic-plugin/webmagic-misc}/src/main/java/us/codecraft/webmagic/model/ExtractByUrl.java (100%) rename {webmagic-core => webmagic-plugin/webmagic-misc}/src/main/java/us/codecraft/webmagic/model/Extractor.java (100%) rename {webmagic-core => webmagic-plugin/webmagic-misc}/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java (100%) rename {webmagic-core => webmagic-plugin/webmagic-misc}/src/main/java/us/codecraft/webmagic/model/HelpUrl.java (100%) rename {webmagic-core => webmagic-plugin/webmagic-misc}/src/main/java/us/codecraft/webmagic/model/ModelPageProcessor.java (100%) rename {webmagic-core => webmagic-plugin/webmagic-misc}/src/main/java/us/codecraft/webmagic/model/ModelPipeline.java (100%) rename {webmagic-core => webmagic-plugin/webmagic-misc}/src/main/java/us/codecraft/webmagic/model/OOSpider.java (100%) rename {webmagic-core => webmagic-plugin/webmagic-misc}/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java (100%) rename {webmagic-core => webmagic-plugin/webmagic-misc}/src/main/java/us/codecraft/webmagic/model/PageModelPipeline.java (100%) rename {webmagic-core => webmagic-plugin/webmagic-misc}/src/main/java/us/codecraft/webmagic/model/TargetUrl.java (100%) rename {webmagic-core => webmagic-plugin/webmagic-misc}/src/main/java/us/codecraft/webmagic/model/package.html (100%) rename {webmagic-samples => webmagic-plugin/webmagic-misc}/src/main/java/us/codecraft/webmagic/model/samples/Blog.java (100%) rename {webmagic-samples => webmagic-plugin/webmagic-misc}/src/main/java/us/codecraft/webmagic/model/samples/IteyeBlog.java (100%) rename {webmagic-samples => webmagic-plugin/webmagic-misc}/src/main/java/us/codecraft/webmagic/model/samples/News163.java (100%) rename {webmagic-samples => 
webmagic-plugin/webmagic-misc}/src/main/java/us/codecraft/webmagic/model/samples/OschinaAnswer.java (100%) rename {webmagic-samples => webmagic-plugin/webmagic-misc}/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java (100%) rename {webmagic-core => webmagic-plugin/webmagic-misc}/src/test/java/us/codecraft/webmagic/model/OschinaBlog.java (100%) rename {webmagic-core => webmagic-plugin/webmagic-misc}/src/test/java/us/codecraft/webmagic/model/TestFetcher.java (100%) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/model/AfterExtractor.java b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/AfterExtractor.java similarity index 100% rename from webmagic-core/src/main/java/us/codecraft/webmagic/model/AfterExtractor.java rename to webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/AfterExtractor.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/model/ConsolePageModelPipeline.java b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/ConsolePageModelPipeline.java similarity index 100% rename from webmagic-core/src/main/java/us/codecraft/webmagic/model/ConsolePageModelPipeline.java rename to webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/ConsolePageModelPipeline.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/model/ExtractBy.java b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/ExtractBy.java similarity index 100% rename from webmagic-core/src/main/java/us/codecraft/webmagic/model/ExtractBy.java rename to webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/ExtractBy.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/model/ExtractBy2.java b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/ExtractBy2.java similarity index 100% rename from webmagic-core/src/main/java/us/codecraft/webmagic/model/ExtractBy2.java rename to 
webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/ExtractBy2.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/model/ExtractBy3.java b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/ExtractBy3.java similarity index 100% rename from webmagic-core/src/main/java/us/codecraft/webmagic/model/ExtractBy3.java rename to webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/ExtractBy3.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/model/ExtractByRaw.java b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/ExtractByRaw.java similarity index 100% rename from webmagic-core/src/main/java/us/codecraft/webmagic/model/ExtractByRaw.java rename to webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/ExtractByRaw.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/model/ExtractByUrl.java b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/ExtractByUrl.java similarity index 100% rename from webmagic-core/src/main/java/us/codecraft/webmagic/model/ExtractByUrl.java rename to webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/ExtractByUrl.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/model/Extractor.java b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/Extractor.java similarity index 100% rename from webmagic-core/src/main/java/us/codecraft/webmagic/model/Extractor.java rename to webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/Extractor.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java similarity index 100% rename from webmagic-core/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java rename to 
webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/model/HelpUrl.java b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/HelpUrl.java similarity index 100% rename from webmagic-core/src/main/java/us/codecraft/webmagic/model/HelpUrl.java rename to webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/HelpUrl.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/model/ModelPageProcessor.java b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/ModelPageProcessor.java similarity index 100% rename from webmagic-core/src/main/java/us/codecraft/webmagic/model/ModelPageProcessor.java rename to webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/ModelPageProcessor.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/model/ModelPipeline.java b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/ModelPipeline.java similarity index 100% rename from webmagic-core/src/main/java/us/codecraft/webmagic/model/ModelPipeline.java rename to webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/ModelPipeline.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/model/OOSpider.java b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/OOSpider.java similarity index 100% rename from webmagic-core/src/main/java/us/codecraft/webmagic/model/OOSpider.java rename to webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/OOSpider.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java similarity index 100% rename from webmagic-core/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java rename to 
webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/model/PageModelPipeline.java b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/PageModelPipeline.java similarity index 100% rename from webmagic-core/src/main/java/us/codecraft/webmagic/model/PageModelPipeline.java rename to webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/PageModelPipeline.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/model/TargetUrl.java b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/TargetUrl.java similarity index 100% rename from webmagic-core/src/main/java/us/codecraft/webmagic/model/TargetUrl.java rename to webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/TargetUrl.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/model/package.html b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/package.html similarity index 100% rename from webmagic-core/src/main/java/us/codecraft/webmagic/model/package.html rename to webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/package.html diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Blog.java b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/samples/Blog.java similarity index 100% rename from webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Blog.java rename to webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/samples/Blog.java diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/IteyeBlog.java b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/samples/IteyeBlog.java similarity index 100% rename from webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/IteyeBlog.java rename to 
webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/samples/IteyeBlog.java diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/samples/News163.java similarity index 100% rename from webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java rename to webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/samples/News163.java diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaAnswer.java b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/samples/OschinaAnswer.java similarity index 100% rename from webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaAnswer.java rename to webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/samples/OschinaAnswer.java diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java similarity index 100% rename from webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java rename to webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/model/OschinaBlog.java b/webmagic-plugin/webmagic-misc/src/test/java/us/codecraft/webmagic/model/OschinaBlog.java similarity index 100% rename from webmagic-core/src/test/java/us/codecraft/webmagic/model/OschinaBlog.java rename to webmagic-plugin/webmagic-misc/src/test/java/us/codecraft/webmagic/model/OschinaBlog.java diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/model/TestFetcher.java b/webmagic-plugin/webmagic-misc/src/test/java/us/codecraft/webmagic/model/TestFetcher.java similarity index 100% rename from 
webmagic-core/src/test/java/us/codecraft/webmagic/model/TestFetcher.java rename to webmagic-plugin/webmagic-misc/src/test/java/us/codecraft/webmagic/model/TestFetcher.java From 789beccfb3ef0100e1ea0585e605dc097f57a997 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Tue, 6 Aug 2013 21:23:29 +0800 Subject: [PATCH 41/84] update package --- .../src/main/java/us/codecraft/webmagic/model/samples/Blog.java | 0 .../main/java/us/codecraft/webmagic/model/samples/IteyeBlog.java | 0 .../main/java/us/codecraft/webmagic/model/samples/News163.java | 0 .../java/us/codecraft/webmagic/model/samples/OschinaAnswer.java | 0 .../java/us/codecraft/webmagic/model/samples/OschinaBlog.java | 0 5 files changed, 0 insertions(+), 0 deletions(-) rename {webmagic-plugin/webmagic-misc => webmagic-samples}/src/main/java/us/codecraft/webmagic/model/samples/Blog.java (100%) rename {webmagic-plugin/webmagic-misc => webmagic-samples}/src/main/java/us/codecraft/webmagic/model/samples/IteyeBlog.java (100%) rename {webmagic-plugin/webmagic-misc => webmagic-samples}/src/main/java/us/codecraft/webmagic/model/samples/News163.java (100%) rename {webmagic-plugin/webmagic-misc => webmagic-samples}/src/main/java/us/codecraft/webmagic/model/samples/OschinaAnswer.java (100%) rename {webmagic-plugin/webmagic-misc => webmagic-samples}/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java (100%) diff --git a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/samples/Blog.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Blog.java similarity index 100% rename from webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/samples/Blog.java rename to webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Blog.java diff --git a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/samples/IteyeBlog.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/IteyeBlog.java similarity index 100% rename 
from webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/samples/IteyeBlog.java rename to webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/IteyeBlog.java diff --git a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/samples/News163.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java similarity index 100% rename from webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/samples/News163.java rename to webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java diff --git a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/samples/OschinaAnswer.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaAnswer.java similarity index 100% rename from webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/samples/OschinaAnswer.java rename to webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaAnswer.java diff --git a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java similarity index 100% rename from webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java rename to webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java From 29f8cd2ec6b70c4a168d71429918e77a7c303465 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Tue, 6 Aug 2013 21:26:03 +0800 Subject: [PATCH 42/84] change freemarker dep to samples --- webmagic-plugin/webmagic-misc/pom.xml | 5 ---- .../src/main/resources/ftl/wordpress.ftl | 23 ------------------- .../webmagic/FreemarkerPipelineTest.java | 19 --------------- webmagic-samples/pom.xml | 5 ++++ .../webmagic/pipeline/FreemarkerPipeline.java | 0 5 files changed, 5 insertions(+), 47 deletions(-) delete mode 100644 
webmagic-plugin/webmagic-misc/src/main/resources/ftl/wordpress.ftl delete mode 100644 webmagic-plugin/webmagic-misc/src/test/java/us/codecraft/webmagic/FreemarkerPipelineTest.java rename {webmagic-plugin/webmagic-misc => webmagic-samples}/src/main/java/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java (100%) diff --git a/webmagic-plugin/webmagic-misc/pom.xml b/webmagic-plugin/webmagic-misc/pom.xml index 4d8776c..98b7c77 100644 --- a/webmagic-plugin/webmagic-misc/pom.xml +++ b/webmagic-plugin/webmagic-misc/pom.xml @@ -12,11 +12,6 @@ webmagic-misc - - org.freemarker - freemarker - 2.3.15 - org.resthub hessian diff --git a/webmagic-plugin/webmagic-misc/src/main/resources/ftl/wordpress.ftl b/webmagic-plugin/webmagic-misc/src/main/resources/ftl/wordpress.ftl deleted file mode 100644 index 61820b7..0000000 --- a/webmagic-plugin/webmagic-misc/src/main/resources/ftl/wordpress.ftl +++ /dev/null @@ -1,23 +0,0 @@ - - $it.Title - http://127.0.0.1/wordpress/?p=$it.Id - ${date} - admin - http://127.0.0.1/wordpress/?p=$it.Id - - - - <#--$it.Id--> - ${date} - ${date} - open - open - ${title} - publish - 0 - 0 - post - - 0 - $tags - \ No newline at end of file diff --git a/webmagic-plugin/webmagic-misc/src/test/java/us/codecraft/webmagic/FreemarkerPipelineTest.java b/webmagic-plugin/webmagic-misc/src/test/java/us/codecraft/webmagic/FreemarkerPipelineTest.java deleted file mode 100644 index 8ceb99f..0000000 --- a/webmagic-plugin/webmagic-misc/src/test/java/us/codecraft/webmagic/FreemarkerPipelineTest.java +++ /dev/null @@ -1,19 +0,0 @@ -package us.codecraft.webmagic; - -import org.junit.Test; -import us.codecraft.webmagic.pipeline.FreemarkerPipeline; - -import java.io.IOException; - -/** - * @author code4crafter@gmail.com
- * Date: 13-6-9 - * Time: 上午7:14 - */ -public class FreemarkerPipelineTest { - - @Test - public void testTemplateLoad() throws IOException { - new FreemarkerPipeline("wordpress.ftl"); - } -} diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index 8a7e00c..a38e872 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -27,6 +27,11 @@ webmagic-selenium ${project.version}
+ + org.freemarker + freemarker + 2.3.15 + junit junit diff --git a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java similarity index 100% rename from webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java rename to webmagic-samples/src/main/java/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java From 7d277e84d46e3f4ac43841aa3e080cf14b6db4fb Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Tue, 6 Aug 2013 21:47:44 +0800 Subject: [PATCH 43/84] update lucene pipeline --- .../java/us/codecraft/webmagic/Spider.java | 6 +- .../webmagic/pipeline/ConsolePipeline.java | 3 - .../webmagic/pipeline/FilePipeline.java | 3 - .../webmagic/pipeline/LucenePipeline.java | 77 +++++++++++++------ .../webmagic/model/ModelPipeline.java | 3 - .../webmagic/pipeline/FreemarkerPipeline.java | 3 - 6 files changed, 57 insertions(+), 38 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 414315c..878c63e 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -228,8 +228,10 @@ public class Spider implements Runnable, Task { } pageProcessor.process(page); addRequest(page); - for (Pipeline pipeline : pipelines) { - pipeline.process(page.getResultItems(), this); + if (!page.getResultItems().isSkip()){ + for (Pipeline pipeline : pipelines) { + pipeline.process(page.getResultItems(), this); + } } sleep(site.getSleepTime()); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java index 8f29474..2ff99c8 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java +++ 
b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java @@ -15,9 +15,6 @@ public class ConsolePipeline implements Pipeline{ @Override public void process(ResultItems resultItems,Task task) { - if (resultItems.isSkip()){ - return; - } System.out.println("get page: "+resultItems.getRequest().getUrl()); for (Map.Entry entry : resultItems.getAll().entrySet()) { if (entry.getValue() instanceof Iterable) { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java index 01f8d8b..39248d2 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java @@ -47,9 +47,6 @@ public class FilePipeline implements Pipeline { if (!file.exists()) { file.mkdirs(); } - if (resultItems.isSkip()) { - return; - } try { PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".html")); printWriter.println("url:\t" + resultItems.getRequest().getUrl()); diff --git a/webmagic-plugin/webmagic-lucene/src/main/java/us/codecraft/webmagic/pipeline/LucenePipeline.java b/webmagic-plugin/webmagic-lucene/src/main/java/us/codecraft/webmagic/pipeline/LucenePipeline.java index 2e7191c..aca6501 100644 --- a/webmagic-plugin/webmagic-lucene/src/main/java/us/codecraft/webmagic/pipeline/LucenePipeline.java +++ b/webmagic-plugin/webmagic-lucene/src/main/java/us/codecraft/webmagic/pipeline/LucenePipeline.java @@ -3,20 +3,26 @@ package us.codecraft.webmagic.pipeline; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.TextField; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexWriter; import 
org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.queryparser.classic.ParseException; import org.apache.lucene.queryparser.classic.QueryParser; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.store.Directory; -import org.apache.lucene.store.FSDirectory; +import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.util.Version; import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.Task; -import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; /** * @author yihua.huang@dianping.com
@@ -24,41 +30,64 @@ import java.io.File; * Time: 下午2:11
*/ public class LucenePipeline implements Pipeline { - @Override - public void process(ResultItems resultItems, Task task) { + + private Directory directory; + + private IndexWriter indexWriter; + + private Analyzer analyzer; + + private void init() throws IOException { + analyzer = new StandardAnalyzer(Version.LUCENE_44); + directory = new RAMDirectory(); + IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_44, analyzer); + indexWriter = new IndexWriter(directory, config); + indexWriter.close(); + } + + public LucenePipeline() { try { - - } catch (Exception e) { - + init(); + } catch (IOException e) { + e.printStackTrace(); } } - public static void main(String[] args) throws Exception { - Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44); -// Directory directory = new RAMDirectory(); - // To store an index on disk, use this instead: - Directory directory = FSDirectory.open(new File("/data/webmagic/www.guoxue123.cn/")); - IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_44, analyzer); - IndexWriter iwriter = new IndexWriter(directory, config); - Document doc = new Document(); -// String text = "This is the text to be indexed."; -// doc.add(new Field("fieldname", text, TextField.TYPE_STORED)); -// iwriter.addDocument(doc); - iwriter.close(); - - // Now search the index: + public List search(String fieldName, String value) throws IOException, ParseException { + List documents = new ArrayList(); DirectoryReader ireader = DirectoryReader.open(directory); IndexSearcher isearcher = new IndexSearcher(ireader); // Parse a simple query that searches for "text": - QueryParser parser = new QueryParser(Version.LUCENE_44, "fieldname", analyzer); - Query query = parser.parse("经典"); + QueryParser parser = new QueryParser(Version.LUCENE_44, fieldName, analyzer); + Query query = parser.parse(value); ScoreDoc[] hits = isearcher.search(query, null, 1000).scoreDocs; // Iterate through the results: for (int i = 0; i < hits.length; i++) { Document 
hitDoc = isearcher.doc(hits[i].doc); - System.out.println(hitDoc); + documents.add(hitDoc); } ireader.close(); directory.close(); + return documents; + } + + @Override + public void process(ResultItems resultItems, Task task) { + if (resultItems.isSkip()){ + return; + } + Document doc = new Document(); + Map all = resultItems.getAll(); + if (all==null){ + return; + } + for (Map.Entry objectEntry : all.entrySet()) { + doc.add(new Field(objectEntry.getKey(), objectEntry.getValue().toString(), TextField.TYPE_STORED)); + } + try { + indexWriter.addDocument(doc); + } catch (IOException e) { + e.printStackTrace(); + } } } diff --git a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/ModelPipeline.java b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/ModelPipeline.java index f9b0015..c9f67dc 100644 --- a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/ModelPipeline.java +++ b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/ModelPipeline.java @@ -29,9 +29,6 @@ class ModelPipeline implements Pipeline { @Override public void process(ResultItems resultItems, Task task) { - if (resultItems.isSkip()) { - return; - } for (Map.Entry classPageModelPipelineEntry : pageModelPipelines.entrySet()) { Object o = resultItems.get(classPageModelPipelineEntry.getKey().getCanonicalName()); if (o != null) { diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java index 9a045ef..3742062 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java @@ -40,9 +40,6 @@ public class FreemarkerPipeline implements Pipeline { @Override public void process(ResultItems resultItems, Task task) { - if (resultItems.isSkip()) { - return; - } String path = 
this.path + "" + task.getUUID() + "/"; File file = new File(path); if (!file.exists()) { From 5436ecbb7b42b4528358cf23594a9788c602aa91 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Tue, 6 Aug 2013 22:12:52 +0800 Subject: [PATCH 44/84] update email --- .../java/us/codecraft/webmagic/pipeline/LucenePipeline.java | 2 +- .../src/main/java/us/codecraft/webmagic/PagedModel.java | 2 +- .../main/java/us/codecraft/webmagic/pipeline/PagedPipeline.java | 2 +- .../java/us/codecraft/webmagic/scheduler/HessianSerializer.java | 2 +- .../main/java/us/codecraft/webmagic/model/samples/News163.java | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/webmagic-plugin/webmagic-lucene/src/main/java/us/codecraft/webmagic/pipeline/LucenePipeline.java b/webmagic-plugin/webmagic-lucene/src/main/java/us/codecraft/webmagic/pipeline/LucenePipeline.java index aca6501..4c048ef 100644 --- a/webmagic-plugin/webmagic-lucene/src/main/java/us/codecraft/webmagic/pipeline/LucenePipeline.java +++ b/webmagic-plugin/webmagic-lucene/src/main/java/us/codecraft/webmagic/pipeline/LucenePipeline.java @@ -25,7 +25,7 @@ import java.util.List; import java.util.Map; /** - * @author yihua.huang@dianping.com
+ * @author code4crafter@gmail.com
* @date: 13-8-5
* Time: 下午2:11
*/ diff --git a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/PagedModel.java b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/PagedModel.java index f18426a..95e1a83 100644 --- a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/PagedModel.java +++ b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/PagedModel.java @@ -3,7 +3,7 @@ package us.codecraft.webmagic; import java.util.Collection; /** - * @author yihua.huang@dianping.com
+ * @author code4crafter@gmail.com
* @date: 13-8-4
* Time: 下午5:18
*/ diff --git a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/pipeline/PagedPipeline.java b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/pipeline/PagedPipeline.java index 282545f..b692da2 100644 --- a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/pipeline/PagedPipeline.java +++ b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/pipeline/PagedPipeline.java @@ -9,7 +9,7 @@ import java.util.*; import java.util.concurrent.ConcurrentHashMap; /** - * @author yihua.huang@dianping.com
+ * @author code4crafter@gmail.com
* @date: 13-8-4
* Time: 下午5:15
*/ diff --git a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler/HessianSerializer.java b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler/HessianSerializer.java index 68cb5bb..c137bfb 100644 --- a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler/HessianSerializer.java +++ b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler/HessianSerializer.java @@ -8,7 +8,7 @@ import java.io.ByteArrayOutputStream; import java.io.IOException; /** - * @author yihua.huang@dianping.com
+ * @author code4crafter@gmail.com
* @date: 13-7-14
* Time: 下午9:20
*/ diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java index 2aa9073..83f8388 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java @@ -11,7 +11,7 @@ import java.util.Collection; import java.util.List; /** - * @author yihua.huang@dianping.com
+ * @author code4crafter@gmail.com
* @date: 13-8-4
* Time: 下午8:17
*/ From a2d830f7b068d6f4ea2c0def34ddab2c71b7e049 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Tue, 6 Aug 2013 22:25:27 +0800 Subject: [PATCH 45/84] change serilizer to fastjson --- webmagic-plugin/webmagic-misc/pom.xml | 6 +- .../webmagic/pipeline/JsonFilePipeline.java | 58 +++++++++++++++++++ .../webmagic/scheduler/HessianSerializer.java | 33 ----------- .../webmagic/scheduler/RedisScheduler.java | 21 ++----- 4 files changed, 67 insertions(+), 51 deletions(-) create mode 100644 webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePipeline.java delete mode 100644 webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler/HessianSerializer.java diff --git a/webmagic-plugin/webmagic-misc/pom.xml b/webmagic-plugin/webmagic-misc/pom.xml index 98b7c77..5e24173 100644 --- a/webmagic-plugin/webmagic-misc/pom.xml +++ b/webmagic-plugin/webmagic-misc/pom.xml @@ -13,9 +13,9 @@ - org.resthub - hessian - 4.0.8 + com.alibaba + fastjson + 1.1.35 redis.clients diff --git a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePipeline.java b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePipeline.java new file mode 100644 index 0000000..1500409 --- /dev/null +++ b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePipeline.java @@ -0,0 +1,58 @@ +package us.codecraft.webmagic.pipeline; + +import com.alibaba.fastjson.JSON; +import org.apache.commons.codec.digest.DigestUtils; +import org.apache.log4j.Logger; +import us.codecraft.webmagic.ResultItems; +import us.codecraft.webmagic.Task; + +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.io.PrintWriter; + +/** + * 持久化到文件的接口。 + * + * @author code4crafter@gmail.com
+ * Date: 13-4-21 + * Time: 下午6:28 + */ +public class JsonFilePipeline implements Pipeline { + + private String path = "/data/temp/webmagic/"; + + private Logger logger = Logger.getLogger(getClass()); + + /** + * 新建一个FilePipeline,使用默认保存路径"/data/temp/webmagic/" + */ + public JsonFilePipeline() { + + } + + /** + * 新建一个FilePipeline + * + * @param path 文件保存路径 + */ + public JsonFilePipeline(String path) { + this.path = path; + } + + @Override + public void process(ResultItems resultItems, Task task) { + String path = this.path + "/" + task.getUUID() + "/"; + File file = new File(path); + if (!file.exists()) { + file.mkdirs(); + } + try { + PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".json")); + printWriter.write(JSON.toJSONString(resultItems.getAll())); + printWriter.close(); + } catch (IOException e) { + logger.warn("write file error", e); + } + } +} diff --git a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler/HessianSerializer.java b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler/HessianSerializer.java deleted file mode 100644 index c137bfb..0000000 --- a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler/HessianSerializer.java +++ /dev/null @@ -1,33 +0,0 @@ -package us.codecraft.webmagic.scheduler; - -import com.caucho.hessian.io.Hessian2Input; -import com.caucho.hessian.io.Hessian2Output; - -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; -import java.io.IOException; - -/** - * @author code4crafter@gmail.com
- * @date: 13-7-14
- * Time: 下午9:20
- */ -public enum HessianSerializer { - INSTANCE; - public byte[] serialize(T v) throws IOException { - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - Hessian2Output hessian2Output = new Hessian2Output(baos); - hessian2Output.writeObject(v); - hessian2Output.close(); - return baos.toByteArray(); - } - - @SuppressWarnings("unchecked") - public T deSerialize(byte[] bytes) throws IOException { - ByteArrayInputStream bais = new ByteArrayInputStream(bytes); - Hessian2Input hessian2Input = new Hessian2Input(bais); - T t = (T) hessian2Input.readObject(); - hessian2Input.close(); - return t; - } -} diff --git a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java index fb82a69..8233698 100644 --- a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java +++ b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java @@ -1,5 +1,6 @@ package us.codecraft.webmagic.scheduler; +import com.alibaba.fastjson.JSON; import org.apache.commons.codec.digest.DigestUtils; import redis.clients.jedis.Jedis; import redis.clients.jedis.JedisPool; @@ -8,8 +9,6 @@ import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.schedular.Scheduler; -import java.io.IOException; - /** * 使用redis管理url,构建一个分布式的爬虫。
* @@ -41,12 +40,8 @@ public class RedisScheduler implements Scheduler { jedis.zadd(SET_PREFIX + task.getUUID(), request.getPriority(), request.getUrl()); if (request.getExtras() != null) { String key = ITEM_PREFIX + DigestUtils.shaHex(request.getUrl()); - try { - byte[] serialize = HessianSerializer.INSTANCE.serialize(request); - jedis.set(key.getBytes(), serialize); - } catch (IOException e) { - e.printStackTrace(); - } + byte[] serialize = JSON.toJSONBytes(request); + jedis.set(key.getBytes(), serialize); } } pool.returnResource(jedis); @@ -61,13 +56,9 @@ public class RedisScheduler implements Scheduler { } String key = ITEM_PREFIX + DigestUtils.shaHex(url); byte[] bytes = jedis.get(key.getBytes()); - if (bytes!=null){ - try { - Object o = HessianSerializer.INSTANCE.deSerialize(bytes); - return (Request)o; - } catch (Exception e) { - e.printStackTrace(); - } + if (bytes != null) { + Object o = JSON.parse(bytes); + return (Request) o; } pool.returnResource(jedis); return new Request(url); From e5f4b3916f5ee3b50f4b62bf2727c18ffa174084 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Tue, 6 Aug 2013 22:26:39 +0800 Subject: [PATCH 46/84] change file dir --- .../java/us/codecraft/webmagic/pipeline/FilePipeline.java | 4 ++-- .../java/us/codecraft/webmagic/pipeline/JsonFilePipeline.java | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java index 39248d2..72ae1aa 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java @@ -20,12 +20,12 @@ import java.util.Map; */ public class FilePipeline implements Pipeline { - private String path = "/data/temp/webmagic/"; + private String path = "/data/webmagic/"; private Logger logger = Logger.getLogger(getClass()); /** - * 
新建一个FilePipeline,使用默认保存路径"/data/temp/webmagic/" + * 新建一个FilePipeline,使用默认保存路径"/data/webmagic/" */ public FilePipeline() { diff --git a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePipeline.java b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePipeline.java index 1500409..08f3e87 100644 --- a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePipeline.java +++ b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePipeline.java @@ -20,12 +20,12 @@ import java.io.PrintWriter; */ public class JsonFilePipeline implements Pipeline { - private String path = "/data/temp/webmagic/"; + private String path = "/data/webmagic/"; private Logger logger = Logger.getLogger(getClass()); /** - * 新建一个FilePipeline,使用默认保存路径"/data/temp/webmagic/" + * 新建一个FilePipeline,使用默认保存路径"/data/webmagic/" */ public JsonFilePipeline() { From c7005a0227e26047030a89929e83a2c734b9dec9 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Tue, 6 Aug 2013 22:36:37 +0800 Subject: [PATCH 47/84] json fix --- .../src/main/java/us/codecraft/webmagic/Request.java | 11 +++++++++++ .../codecraft/webmagic/scheduler/RedisScheduler.java | 8 ++++---- .../webmagic/scheduler/RedisSchedulerTest.java | 6 ++++-- 3 files changed, 19 insertions(+), 6 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java index 6cf344f..905dbe5 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java @@ -40,6 +40,9 @@ public class Request implements Serializable { private double priority; + public Request() { + } + /** * 构建一个request对象 * @@ -102,4 +105,12 @@ public class Request implements Serializable { public int hashCode() { return url.hashCode(); } + + public void setExtras(Map extras) { + this.extras = extras; + } + + 
public void setUrl(String url) { + this.url = url; + } } diff --git a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java index 8233698..c9992db 100644 --- a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java +++ b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java @@ -40,8 +40,8 @@ public class RedisScheduler implements Scheduler { jedis.zadd(SET_PREFIX + task.getUUID(), request.getPriority(), request.getUrl()); if (request.getExtras() != null) { String key = ITEM_PREFIX + DigestUtils.shaHex(request.getUrl()); - byte[] serialize = JSON.toJSONBytes(request); - jedis.set(key.getBytes(), serialize); + byte[] bytes = JSON.toJSONString(request).getBytes(); + jedis.set(key.getBytes(), bytes); } } pool.returnResource(jedis); @@ -57,8 +57,8 @@ public class RedisScheduler implements Scheduler { String key = ITEM_PREFIX + DigestUtils.shaHex(url); byte[] bytes = jedis.get(key.getBytes()); if (bytes != null) { - Object o = JSON.parse(bytes); - return (Request) o; + Request o = JSON.parseObject(new String(bytes),Request.class); + return o; } pool.returnResource(jedis); return new Request(url); diff --git a/webmagic-plugin/webmagic-misc/src/test/java/us/codecraft/webmagic/scheduler/RedisSchedulerTest.java b/webmagic-plugin/webmagic-misc/src/test/java/us/codecraft/webmagic/scheduler/RedisSchedulerTest.java index 3d59671..144dba5 100644 --- a/webmagic-plugin/webmagic-misc/src/test/java/us/codecraft/webmagic/scheduler/RedisSchedulerTest.java +++ b/webmagic-plugin/webmagic-misc/src/test/java/us/codecraft/webmagic/scheduler/RedisSchedulerTest.java @@ -1,7 +1,6 @@ package us.codecraft.webmagic.scheduler; import org.junit.Before; -import org.junit.Ignore; import org.junit.Test; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; @@ 
-35,8 +34,11 @@ public class RedisSchedulerTest { return null; } }; - redisScheduler.push(new Request("http://www.ibm.com/developerworks/cn/java/j-javadev2-22/"), task); + Request request = new Request("http://www.ibm.com/developerworks/cn/java/j-javadev2-22/"); + request.putExtra("1","2"); + redisScheduler.push(request, task); Request poll = redisScheduler.poll(task); + System.out.println(poll); } } From f4134504036eab888b0b39e134ae33744eb42f5d Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Tue, 6 Aug 2013 22:38:12 +0800 Subject: [PATCH 48/84] remove duplicated class --- .../codecraft/webmagic/model/OschinaBlog.java | 40 ------------------- .../codecraft/webmagic/model/TestFetcher.java | 24 ----------- .../scheduler/RedisSchedulerTest.java | 1 + 3 files changed, 1 insertion(+), 64 deletions(-) delete mode 100644 webmagic-plugin/webmagic-misc/src/test/java/us/codecraft/webmagic/model/OschinaBlog.java delete mode 100644 webmagic-plugin/webmagic-misc/src/test/java/us/codecraft/webmagic/model/TestFetcher.java diff --git a/webmagic-plugin/webmagic-misc/src/test/java/us/codecraft/webmagic/model/OschinaBlog.java b/webmagic-plugin/webmagic-misc/src/test/java/us/codecraft/webmagic/model/OschinaBlog.java deleted file mode 100644 index 2552104..0000000 --- a/webmagic-plugin/webmagic-misc/src/test/java/us/codecraft/webmagic/model/OschinaBlog.java +++ /dev/null @@ -1,40 +0,0 @@ -package us.codecraft.webmagic.model; - -import us.codecraft.webmagic.Page; -import us.codecraft.webmagic.Site; - -import java.util.List; - -/** - * @author code4crafter@gmail.com
- * @date: 13-8-1
- * Time: 下午10:18
- */ -@TargetUrl("http://my.oschina.net/flashsword/blog/*") -public class OschinaBlog implements AfterExtractor { - - @ExtractBy("//title") - private String title; - - @ExtractBy(value = "div.BlogContent", type = ExtractBy.Type.Css) - private String content; - - @ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true) - private List tags; - - @ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true) - private List comments; - - @Override - public void afterProcess(Page page) { - System.out.println("title:\t"+title); - System.out.println("content:\t"+content); - System.out.println("tags:\t" + tags); - page.setSkip(true); - } - - public static void main(String[] args) { - OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog/145796"), OschinaBlog.class) - .run(); - } -} \ No newline at end of file diff --git a/webmagic-plugin/webmagic-misc/src/test/java/us/codecraft/webmagic/model/TestFetcher.java b/webmagic-plugin/webmagic-misc/src/test/java/us/codecraft/webmagic/model/TestFetcher.java deleted file mode 100644 index 009d53a..0000000 --- a/webmagic-plugin/webmagic-misc/src/test/java/us/codecraft/webmagic/model/TestFetcher.java +++ /dev/null @@ -1,24 +0,0 @@ -package us.codecraft.webmagic.model; - -import org.junit.Ignore; -import org.junit.Test; -import us.codecraft.webmagic.Site; - -/** - * @author code4crafter@gmail.com
- * @date: 13-8-1
- * Time: 下午8:42
- */ -public class TestFetcher { - - @Ignore("takes long") - @Test - public void test() { - OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog/145796"), OschinaBlog.class) - .run(); - - } - - - -} diff --git a/webmagic-plugin/webmagic-misc/src/test/java/us/codecraft/webmagic/scheduler/RedisSchedulerTest.java b/webmagic-plugin/webmagic-misc/src/test/java/us/codecraft/webmagic/scheduler/RedisSchedulerTest.java index 144dba5..f0cbb3d 100644 --- a/webmagic-plugin/webmagic-misc/src/test/java/us/codecraft/webmagic/scheduler/RedisSchedulerTest.java +++ b/webmagic-plugin/webmagic-misc/src/test/java/us/codecraft/webmagic/scheduler/RedisSchedulerTest.java @@ -1,6 +1,7 @@ package us.codecraft.webmagic.scheduler; import org.junit.Before; +import org.junit.Ignore; import org.junit.Test; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; From 5c96407a3d7793e2fa1347c2b6d0d62a0476fd4e Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Tue, 6 Aug 2013 22:43:31 +0800 Subject: [PATCH 49/84] fix a null domain error --- .../src/main/java/us/codecraft/webmagic/Site.java | 14 +++++++++++++- .../webmagic/model/samples/OschinaBlog.java | 4 +++- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java index 2c6118c..9ab97fe 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java @@ -1,5 +1,7 @@ package us.codecraft.webmagic; +import us.codecraft.webmagic.utils.UrlUtils; + import java.util.*; /** @@ -90,6 +92,11 @@ public class Site { * @return 已设置的domain */ public String getDomain() { + if (domain == null) { + if (startUrls.size() > 0) { + domain = UrlUtils.getDomain(startUrls.get(0)); + } + } return domain; } @@ -150,6 +157,7 @@ public class Site { /** * 获取初始页面的地址列表 + * * @return 初始页面的地址列表 */ public List getStartUrls() { @@ -158,6 
+166,7 @@ public class Site { /** * 增加初始页面的地址,可反复调用此方法增加多个初始地址。 + * * @param startUrl 初始页面的地址 * @return this */ @@ -179,6 +188,7 @@ public class Site { /** * 获取两次抓取之间的间隔 + * * @return 两次抓取之间的间隔,单位毫秒 */ public int getSleepTime() { @@ -187,6 +197,7 @@ public class Site { /** * 获取重新下载的次数,默认为0 + * * @return 重新下载的次数 */ public int getRetryTimes() { @@ -195,6 +206,7 @@ public class Site { /** * 设置获取重新下载的次数,默认为0 + * * @return this */ public Site setRetryTimes(int retryTimes) { @@ -219,7 +231,7 @@ public class Site { return true; } - public Task toTask(){ + public Task toTask() { return new Task() { @Override public String getUUID() { diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java index 9f11d0e..38cb41f 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java @@ -4,6 +4,8 @@ import us.codecraft.webmagic.Site; import us.codecraft.webmagic.model.ExtractBy; import us.codecraft.webmagic.model.OOSpider; import us.codecraft.webmagic.model.TargetUrl; +import us.codecraft.webmagic.pipeline.ConsolePipeline; +import us.codecraft.webmagic.pipeline.JsonFilePipeline; /** * @author code4crafter@gmail.com
@@ -28,7 +30,7 @@ public class OschinaBlog implements Blog{ } public static void main(String[] args) { - OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog"), OschinaBlog.class).run(); + OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog"), OschinaBlog.class).pipeline(new ConsolePipeline()).pipeline(new JsonFilePipeline()).run(); } public String getTitle() { From 36494bcfa52d58157117a25c504e73de6b15e1da Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Tue, 6 Aug 2013 23:01:43 +0800 Subject: [PATCH 50/84] add xpath2.0 api --- .../webmagic/pipeline/ConsolePipeline.java | 21 +++++++------------ .../us/codecraft/webmagic/selector/Html.java | 6 ++++++ .../webmagic/selector/PlainText.java | 5 +++++ .../webmagic/selector/Selectable.java | 8 +++++++ .../webmagic/selector/SelectorFactory.java | 4 ++++ .../webmagic/downloader/FileDownloader.java | 0 .../scheduler}/FileCacheQueueScheduler.java | 3 ++- .../webmagic/utils/DoubleKeyMap.java | 0 .../webmagic/utils/MultiKeyMapBase.java | 0 .../webmagic/model/samples/OschinaBlog.java | 3 +-- .../webmagic/samples/GuoxueProcessor.java | 2 +- .../us/codecraft/webmagic/SpiderTest.java | 2 +- .../processor/DiandianProcessorTest.java | 2 +- .../processor/DiaoyuwengProcessorTest.java | 2 +- .../processor/SinablogProcessorTest.java | 2 +- 15 files changed, 38 insertions(+), 22 deletions(-) rename {webmagic-core => webmagic-plugin/webmagic-misc}/src/main/java/us/codecraft/webmagic/downloader/FileDownloader.java (100%) rename {webmagic-core/src/main/java/us/codecraft/webmagic/schedular => webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler}/FileCacheQueueScheduler.java (97%) rename {webmagic-core => webmagic-plugin/webmagic-misc}/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java (100%) rename {webmagic-core => webmagic-plugin/webmagic-misc}/src/main/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java (100%) diff --git 
a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java index 2ff99c8..e1648fe 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java @@ -7,25 +7,18 @@ import java.util.Map; /** * 命令行输出抽取结果。可用于测试。
+ * * @author code4crafter@gmail.com
- * Date: 13-4-21 - * Time: 下午1:45 + * Date: 13-4-21 + * Time: 下午1:45 */ -public class ConsolePipeline implements Pipeline{ +public class ConsolePipeline implements Pipeline { @Override - public void process(ResultItems resultItems,Task task) { - System.out.println("get page: "+resultItems.getRequest().getUrl()); + public void process(ResultItems resultItems, Task task) { + System.out.println("get page: " + resultItems.getRequest().getUrl()); for (Map.Entry entry : resultItems.getAll().entrySet()) { - if (entry.getValue() instanceof Iterable) { - Iterable value = (Iterable) entry.getValue(); - System.out.println(entry.getKey() + ":"); - for (Object o : value) { - System.out.println(o); - } - } else { - System.out.println(entry.getKey() + ":\t" + entry.getValue()); - } + System.out.println(entry.getKey() + ":\t" + entry.getValue()); } } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java index 114eef9..79d62a0 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java @@ -63,6 +63,12 @@ public class Html extends PlainText { return selectList(xpathSelector, strings); } + @Override + public Selectable xpath2(String xpath) { + Xpath2Selector xpathSelector = SelectorFactory.getInstatnce().newXpath2Selector(xpath); + return selectList(xpathSelector, strings); + } + @Override public Selectable $(String selector) { CssSelector cssSelector = new CssSelector(selector); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java index d06a531..4fff6da 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java @@ -34,6 +34,11 @@ public class PlainText implements Selectable 
{ throw new UnsupportedOperationException(); } + @Override + public Selectable xpath2(String xpath) { + throw new UnsupportedOperationException(); + } + @Override public Selectable $(String selector) { throw new UnsupportedOperationException(); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java index 42f3d10..cea501d 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java @@ -18,6 +18,14 @@ public interface Selectable { */ public Selectable xpath(String xpath); + /** + * select list with xpath 2.0 syntax + * + * @param xpath + * @return new Selectable after extract + */ + public Selectable xpath2(String xpath); + /** * select list with css selector * diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SelectorFactory.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SelectorFactory.java index 1dd56e0..9abb1ce 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SelectorFactory.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SelectorFactory.java @@ -34,6 +34,10 @@ public class SelectorFactory { return newSelector(XpathSelector.class, xpath); } + public Xpath2Selector newXpath2Selector(String xpath) { + return newSelector(Xpath2Selector.class, xpath); + } + public SmartContentSelector newSmartContentSelector(){ return newSelector(SmartContentSelector.class); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/FileDownloader.java b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/downloader/FileDownloader.java similarity index 100% rename from webmagic-core/src/main/java/us/codecraft/webmagic/downloader/FileDownloader.java rename to webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/downloader/FileDownloader.java 
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueScheduler.java b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java similarity index 97% rename from webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueScheduler.java rename to webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java index f5393a3..d4a3987 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueScheduler.java +++ b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java @@ -1,9 +1,10 @@ -package us.codecraft.webmagic.schedular; +package us.codecraft.webmagic.scheduler; import org.apache.commons.lang3.math.NumberUtils; import org.apache.log4j.Logger; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.schedular.Scheduler; import java.io.*; import java.util.LinkedHashSet; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java similarity index 100% rename from webmagic-core/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java rename to webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java similarity index 100% rename from webmagic-core/src/main/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java rename to webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java 
b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java index 38cb41f..817ba44 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java @@ -4,7 +4,6 @@ import us.codecraft.webmagic.Site; import us.codecraft.webmagic.model.ExtractBy; import us.codecraft.webmagic.model.OOSpider; import us.codecraft.webmagic.model.TargetUrl; -import us.codecraft.webmagic.pipeline.ConsolePipeline; import us.codecraft.webmagic.pipeline.JsonFilePipeline; /** @@ -30,7 +29,7 @@ public class OschinaBlog implements Blog{ } public static void main(String[] args) { - OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog"), OschinaBlog.class).pipeline(new ConsolePipeline()).pipeline(new JsonFilePipeline()).run(); + OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog"), OschinaBlog.class).pipeline(new JsonFilePipeline()).run(); } public String getTitle() { diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GuoxueProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GuoxueProcessor.java index db00c79..5d7d355 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GuoxueProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GuoxueProcessor.java @@ -3,7 +3,7 @@ package us.codecraft.webmagic.samples; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.pipeline.FilePipeline; import us.codecraft.webmagic.processor.SimplePageProcessor; -import us.codecraft.webmagic.schedular.FileCacheQueueScheduler; +import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler; /** * @author code4crafter@gmail.com
diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java index 76a423f..dbfa815 100644 --- a/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java +++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java @@ -5,7 +5,7 @@ import org.junit.Test; import us.codecraft.webmagic.pipeline.FilePipeline; import us.codecraft.webmagic.processor.SimplePageProcessor; import us.codecraft.webmagic.samples.HuxiuProcessor; -import us.codecraft.webmagic.schedular.FileCacheQueueScheduler; +import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler; /** * @author code4crafter@gmail.com
diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiandianProcessorTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiandianProcessorTest.java index 13910b5..cf587f1 100644 --- a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiandianProcessorTest.java +++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiandianProcessorTest.java @@ -6,7 +6,7 @@ import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.pipeline.ConsolePipeline; import us.codecraft.webmagic.pipeline.FreemarkerPipeline; import us.codecraft.webmagic.samples.DiandianBlogProcessor; -import us.codecraft.webmagic.schedular.FileCacheQueueScheduler; +import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler; import java.io.IOException; diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java index 33bcf9c..69a535c 100644 --- a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java +++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java @@ -6,7 +6,7 @@ import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.pipeline.FilePipeline; import us.codecraft.webmagic.pipeline.FreemarkerPipeline; import us.codecraft.webmagic.samples.DiaoyuwengProcessor; -import us.codecraft.webmagic.schedular.FileCacheQueueScheduler; +import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler; import java.io.IOException; diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/SinablogProcessorTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/SinablogProcessorTest.java index a0160e1..a44fe35 100644 --- a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/SinablogProcessorTest.java +++ 
b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/SinablogProcessorTest.java @@ -6,7 +6,7 @@ import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.pipeline.FilePipeline; import us.codecraft.webmagic.pipeline.FreemarkerPipeline; import us.codecraft.webmagic.samples.SinaBlogProcesser; -import us.codecraft.webmagic.schedular.FileCacheQueueScheduler; +import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler; import java.io.IOException; From ac917039a3056b28aeaf586502205ce0408444ad Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Tue, 6 Aug 2013 23:05:00 +0800 Subject: [PATCH 51/84] orgnize dep --- webmagic-plugin/webmagic-misc/pom.xml | 5 ++ .../selenium}/SeleniumDownloader.java | 2 +- .../downloader/selenium}/WebDriverPool.java | 2 +- webmagic-plugin/webmagic-selenium/README.md | 3 - webmagic-plugin/webmagic-selenium/pom.xml | 23 ------- .../webmagic/selenium/SeleniumTest.java | 41 ------------- .../downloader/SeleniumDownloaderTest.java | 61 ------------------- .../downloader/WebDriverPoolTest.java | 31 ---------- webmagic-samples/pom.xml | 5 -- 9 files changed, 7 insertions(+), 166 deletions(-) rename webmagic-plugin/{webmagic-selenium/src/main/java/us/codecraft/webmagic/selenium/downloader => webmagic-misc/src/main/java/us/codecraft/webmagic/downloader/selenium}/SeleniumDownloader.java (98%) rename webmagic-plugin/{webmagic-selenium/src/main/java/us/codecraft/webmagic/selenium/downloader => webmagic-misc/src/main/java/us/codecraft/webmagic/downloader/selenium}/WebDriverPool.java (97%) delete mode 100644 webmagic-plugin/webmagic-selenium/README.md delete mode 100644 webmagic-plugin/webmagic-selenium/pom.xml delete mode 100644 webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/SeleniumTest.java delete mode 100644 webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloaderTest.java delete mode 100644 
webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/downloader/WebDriverPoolTest.java diff --git a/webmagic-plugin/webmagic-misc/pom.xml b/webmagic-plugin/webmagic-misc/pom.xml index 5e24173..130e400 100644 --- a/webmagic-plugin/webmagic-misc/pom.xml +++ b/webmagic-plugin/webmagic-misc/pom.xml @@ -22,6 +22,11 @@ jedis 2.0.0
+ + org.seleniumhq.selenium + selenium-java + 2.33.0 +
\ No newline at end of file diff --git a/webmagic-plugin/webmagic-selenium/src/main/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloader.java b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java similarity index 98% rename from webmagic-plugin/webmagic-selenium/src/main/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloader.java rename to webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java index 002dcc9..e95f27c 100644 --- a/webmagic-plugin/webmagic-selenium/src/main/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloader.java +++ b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java @@ -1,4 +1,4 @@ -package us.codecraft.webmagic.selenium.downloader; +package us.codecraft.webmagic.downloader.selenium; import org.apache.log4j.Logger; import org.openqa.selenium.By; diff --git a/webmagic-plugin/webmagic-selenium/src/main/java/us/codecraft/webmagic/selenium/downloader/WebDriverPool.java b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java similarity index 97% rename from webmagic-plugin/webmagic-selenium/src/main/java/us/codecraft/webmagic/selenium/downloader/WebDriverPool.java rename to webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java index fdd978d..c763a99 100644 --- a/webmagic-plugin/webmagic-selenium/src/main/java/us/codecraft/webmagic/selenium/downloader/WebDriverPool.java +++ b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java @@ -1,4 +1,4 @@ -package us.codecraft.webmagic.selenium.downloader; +package us.codecraft.webmagic.downloader.selenium; import org.openqa.selenium.WebDriver; import org.openqa.selenium.chrome.ChromeDriver; diff --git a/webmagic-plugin/webmagic-selenium/README.md 
b/webmagic-plugin/webmagic-selenium/README.md deleted file mode 100644 index 5e5ce82..0000000 --- a/webmagic-plugin/webmagic-selenium/README.md +++ /dev/null @@ -1,3 +0,0 @@ -webmagic-selenium -------- -尝试使用selenium来进行页面动态渲染,开发中。 \ No newline at end of file diff --git a/webmagic-plugin/webmagic-selenium/pom.xml b/webmagic-plugin/webmagic-selenium/pom.xml deleted file mode 100644 index 0da4504..0000000 --- a/webmagic-plugin/webmagic-selenium/pom.xml +++ /dev/null @@ -1,23 +0,0 @@ - - - - - us.codecraft - webmagic-plugin - 0.1.0 - - 4.0.0 - webmagic-selenium - - - - org.seleniumhq.selenium - selenium-java - 2.33.0 - - - - - \ No newline at end of file diff --git a/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/SeleniumTest.java b/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/SeleniumTest.java deleted file mode 100644 index a6de847..0000000 --- a/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/SeleniumTest.java +++ /dev/null @@ -1,41 +0,0 @@ -package us.codecraft.webmagic.selenium; - -import org.junit.Ignore; -import org.junit.Test; -import org.openqa.selenium.By; -import org.openqa.selenium.WebDriver; -import org.openqa.selenium.WebElement; -import org.openqa.selenium.chrome.ChromeDriver; -import org.openqa.selenium.remote.DesiredCapabilities; - -import java.util.Arrays; -import java.util.HashMap; -import java.util.Map; - -/** - * @author code4crafter@gmail.com
- * @date: 13-7-26
- * Time: 下午12:27
- */ -public class SeleniumTest { - - @Ignore("need chrome driver") - @Test - public void testSelenium() { - System.getProperties().setProperty("webdriver.chrome.driver", "/Users/yihua/Downloads/chromedriver"); - Map contentSettings = new HashMap(); - contentSettings.put("images", 2); - - Map preferences = new HashMap(); - preferences.put("profile.default_content_settings", contentSettings); - - DesiredCapabilities caps = DesiredCapabilities.chrome(); - caps.setCapability("chrome.prefs", preferences); - caps.setCapability("chrome.switches", Arrays.asList("--user-data-dir=/Users/yihua/temp/chrome")); - WebDriver webDriver = new ChromeDriver(caps); - webDriver.get("http://huaban.com/"); - WebElement webElement = webDriver.findElement(By.xpath("/html")); - System.out.println(webElement.getAttribute("outerHTML")); - webDriver.close(); - } -} diff --git a/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloaderTest.java b/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloaderTest.java deleted file mode 100644 index 23711fa..0000000 --- a/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloaderTest.java +++ /dev/null @@ -1,61 +0,0 @@ -package us.codecraft.webmagic.selenium.downloader; - -import org.junit.Ignore; -import org.junit.Test; -import us.codecraft.webmagic.Page; -import us.codecraft.webmagic.Request; -import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.Task; - -/** - * @author code4crafter@gmail.com
- * @date: 13-7-26
- * Time: 下午2:46
- */ -public class SeleniumDownloaderTest { - - private String chromeDriverPath = "/Users/yihua/Downloads/chromedriver"; - - @Ignore("need chrome driver") - @Test - public void test() { - SeleniumDownloader seleniumDownloader = new SeleniumDownloader(chromeDriverPath); - long time1 = System.currentTimeMillis(); - for (int i = 0; i < 100; i++) { - Page page = seleniumDownloader.download(new Request("http://huaban.com/"), new Task() { - @Override - public String getUUID() { - return "huaban.com"; - } - - @Override - public Site getSite() { - return Site.me(); - } - }); - System.out.println(page.getHtml().$("#waterfall").links().regex(".*pins.*").all()); - } - System.out.println(System.currentTimeMillis() - time1); - } - - @Ignore - @Test - public void testBaiduWenku() { - SeleniumDownloader seleniumDownloader = new SeleniumDownloader(chromeDriverPath); - seleniumDownloader.setSleepTime(10000); - long time1 = System.currentTimeMillis(); - Page page = seleniumDownloader.download(new Request("http://wenku.baidu.com/view/462933ff04a1b0717fd5ddc2.html"), new Task() { - @Override - public String getUUID() { - return "huaban.com"; - } - - @Override - public Site getSite() { - return Site.me(); - } - }); - System.out.println(page.getHtml().$("div.inner").replace("<[^<>]+>","").replace("&nsbp;","").all()); - } - -} diff --git a/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/downloader/WebDriverPoolTest.java b/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/downloader/WebDriverPoolTest.java deleted file mode 100644 index cbf3860..0000000 --- a/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/downloader/WebDriverPoolTest.java +++ /dev/null @@ -1,31 +0,0 @@ -package us.codecraft.webmagic.selenium.downloader; - -import org.junit.Ignore; -import org.junit.Test; -import org.openqa.selenium.WebDriver; - -/** - * @author code4crafter@gmail.com
- * @date: 13-7-26
- * Time: 下午2:12
- */ -public class WebDriverPoolTest { - - private String chromeDriverPath = "/Users/yihua/Downloads/chromedriver"; - - @Ignore("need chrome driver") - @Test - public void test() { - System.getProperties().setProperty("webdriver.chrome.driver", chromeDriverPath); - WebDriverPool webDriverPool = new WebDriverPool(5); - for (int i = 0; i < 5; i++) { - try { - WebDriver webDriver = webDriverPool.get(); - System.out.println(i); - } catch (InterruptedException e) { - e.printStackTrace(); - } - } - webDriverPool.closeAll(); - } -} diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index a38e872..ff37e21 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -22,11 +22,6 @@ webmagic-misc ${project.version}
- - us.codecraft - webmagic-selenium - ${project.version} - org.freemarker freemarker From 8b7e6a350b79d8e5a9c5fc1962b83540f85833a1 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Tue, 6 Aug 2013 23:11:00 +0800 Subject: [PATCH 52/84] add test case back --- .../webmagic/downloader/SeleniumTest.java | 41 +++++++++++++ .../selenium/SeleniumDownloaderTest.java | 61 +++++++++++++++++++ .../selenium/WebDriverPoolTest.java | 31 ++++++++++ .../webmagic/samples/HuabanProcessor.java | 2 +- 4 files changed, 134 insertions(+), 1 deletion(-) create mode 100644 webmagic-plugin/webmagic-misc/src/test/java/us/codecraft/webmagic/downloader/SeleniumTest.java create mode 100644 webmagic-plugin/webmagic-misc/src/test/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloaderTest.java create mode 100644 webmagic-plugin/webmagic-misc/src/test/java/us/codecraft/webmagic/downloader/selenium/WebDriverPoolTest.java diff --git a/webmagic-plugin/webmagic-misc/src/test/java/us/codecraft/webmagic/downloader/SeleniumTest.java b/webmagic-plugin/webmagic-misc/src/test/java/us/codecraft/webmagic/downloader/SeleniumTest.java new file mode 100644 index 0000000..2c19033 --- /dev/null +++ b/webmagic-plugin/webmagic-misc/src/test/java/us/codecraft/webmagic/downloader/SeleniumTest.java @@ -0,0 +1,41 @@ +package us.codecraft.webmagic.downloader; + +import org.junit.Ignore; +import org.junit.Test; +import org.openqa.selenium.By; +import org.openqa.selenium.WebDriver; +import org.openqa.selenium.WebElement; +import org.openqa.selenium.chrome.ChromeDriver; +import org.openqa.selenium.remote.DesiredCapabilities; + +import java.util.Arrays; +import java.util.HashMap; +import java.util.Map; + +/** + * @author code4crafter@gmail.com
+ * @date: 13-7-26
+ * Time: 下午12:27
+ */ +public class SeleniumTest { + + @Ignore("need chrome driver") + @Test + public void testSelenium() { + System.getProperties().setProperty("webdriver.chrome.driver", "/Users/yihua/Downloads/chromedriver"); + Map contentSettings = new HashMap(); + contentSettings.put("images", 2); + + Map preferences = new HashMap(); + preferences.put("profile.default_content_settings", contentSettings); + + DesiredCapabilities caps = DesiredCapabilities.chrome(); + caps.setCapability("chrome.prefs", preferences); + caps.setCapability("chrome.switches", Arrays.asList("--user-data-dir=/Users/yihua/temp/chrome")); + WebDriver webDriver = new ChromeDriver(caps); + webDriver.get("http://huaban.com/"); + WebElement webElement = webDriver.findElement(By.xpath("/html")); + System.out.println(webElement.getAttribute("outerHTML")); + webDriver.close(); + } +} diff --git a/webmagic-plugin/webmagic-misc/src/test/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloaderTest.java b/webmagic-plugin/webmagic-misc/src/test/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloaderTest.java new file mode 100644 index 0000000..fe98e8f --- /dev/null +++ b/webmagic-plugin/webmagic-misc/src/test/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloaderTest.java @@ -0,0 +1,61 @@ +package us.codecraft.webmagic.downloader.selenium; + +import org.junit.Ignore; +import org.junit.Test; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Task; + +/** + * @author code4crafter@gmail.com
+ * @date: 13-7-26
+ * Time: 下午2:46
+ */ +public class SeleniumDownloaderTest { + + private String chromeDriverPath = "/Users/yihua/Downloads/chromedriver"; + + @Ignore("need chrome driver") + @Test + public void test() { + SeleniumDownloader seleniumDownloader = new SeleniumDownloader(chromeDriverPath); + long time1 = System.currentTimeMillis(); + for (int i = 0; i < 100; i++) { + Page page = seleniumDownloader.download(new Request("http://huaban.com/"), new Task() { + @Override + public String getUUID() { + return "huaban.com"; + } + + @Override + public Site getSite() { + return Site.me(); + } + }); + System.out.println(page.getHtml().$("#waterfall").links().regex(".*pins.*").all()); + } + System.out.println(System.currentTimeMillis() - time1); + } + + @Ignore + @Test + public void testBaiduWenku() { + SeleniumDownloader seleniumDownloader = new SeleniumDownloader(chromeDriverPath); + seleniumDownloader.setSleepTime(10000); + long time1 = System.currentTimeMillis(); + Page page = seleniumDownloader.download(new Request("http://wenku.baidu.com/view/462933ff04a1b0717fd5ddc2.html"), new Task() { + @Override + public String getUUID() { + return "huaban.com"; + } + + @Override + public Site getSite() { + return Site.me(); + } + }); + System.out.println(page.getHtml().$("div.inner").replace("<[^<>]+>","").replace("&nsbp;","").all()); + } + +} diff --git a/webmagic-plugin/webmagic-misc/src/test/java/us/codecraft/webmagic/downloader/selenium/WebDriverPoolTest.java b/webmagic-plugin/webmagic-misc/src/test/java/us/codecraft/webmagic/downloader/selenium/WebDriverPoolTest.java new file mode 100644 index 0000000..4d5d275 --- /dev/null +++ b/webmagic-plugin/webmagic-misc/src/test/java/us/codecraft/webmagic/downloader/selenium/WebDriverPoolTest.java @@ -0,0 +1,31 @@ +package us.codecraft.webmagic.downloader.selenium; + +import org.junit.Ignore; +import org.junit.Test; +import org.openqa.selenium.WebDriver; + +/** + * @author code4crafter@gmail.com
+ * @date: 13-7-26
+ * Time: 下午2:12
+ */ +public class WebDriverPoolTest { + + private String chromeDriverPath = "/Users/yihua/Downloads/chromedriver"; + + @Ignore("need chrome driver") + @Test + public void test() { + System.getProperties().setProperty("webdriver.chrome.driver", chromeDriverPath); + WebDriverPool webDriverPool = new WebDriverPool(5); + for (int i = 0; i < 5; i++) { + try { + WebDriver webDriver = webDriverPool.get(); + System.out.println(i); + } catch (InterruptedException e) { + e.printStackTrace(); + } + } + webDriverPool.closeAll(); + } +} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuabanProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuabanProcessor.java index eef2b2f..4763c07 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuabanProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuabanProcessor.java @@ -6,7 +6,7 @@ import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.pipeline.FilePipeline; import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.scheduler.RedisScheduler; -import us.codecraft.webmagic.selenium.downloader.SeleniumDownloader; +import us.codecraft.webmagic.downloader.downloader.SeleniumDownloader; /** * 花瓣网抽取器。
From e4c53c4400b840b081bd2a3e3a67ffde9ad9b81b Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Tue, 6 Aug 2013 23:11:43 +0800 Subject: [PATCH 53/84] fix compile error --- .../java/us/codecraft/webmagic/samples/HuabanProcessor.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuabanProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuabanProcessor.java index 4763c07..44173fb 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuabanProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuabanProcessor.java @@ -3,10 +3,10 @@ package us.codecraft.webmagic.samples; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.downloader.selenium.SeleniumDownloader; import us.codecraft.webmagic.pipeline.FilePipeline; import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.scheduler.RedisScheduler; -import us.codecraft.webmagic.downloader.downloader.SeleniumDownloader; /** * 花瓣网抽取器。
From 0c8599e3b296df30e29880ee418bdc94e6438e4d Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Tue, 6 Aug 2013 23:17:07 +0800 Subject: [PATCH 54/84] update packages structure --- pom.xml | 2 +- .../pom.xml | 13 ++++++-- .../us/codecraft/webmagic/PagedModel.java | 0 .../webmagic/downloader/FileDownloader.java | 0 .../selenium/SeleniumDownloader.java | 0 .../downloader/selenium/WebDriverPool.java | 0 .../webmagic/model/AfterExtractor.java | 0 .../model/ConsolePageModelPipeline.java | 0 .../codecraft/webmagic/model/ExtractBy.java | 0 .../codecraft/webmagic/model/ExtractBy2.java | 0 .../codecraft/webmagic/model/ExtractBy3.java | 0 .../webmagic/model/ExtractByRaw.java | 0 .../webmagic/model/ExtractByUrl.java | 0 .../codecraft/webmagic/model/Extractor.java | 0 .../webmagic/model/FieldExtractor.java | 0 .../us/codecraft/webmagic/model/HelpUrl.java | 0 .../webmagic/model/ModelPageProcessor.java | 0 .../webmagic/model/ModelPipeline.java | 0 .../us/codecraft/webmagic/model/OOSpider.java | 0 .../webmagic/model/PageModelExtractor.java | 0 .../webmagic/model/PageModelPipeline.java | 0 .../codecraft/webmagic/model/TargetUrl.java | 0 .../us/codecraft/webmagic/model/package.html | 0 .../webmagic/pipeline/JsonFilePipeline.java | 0 .../webmagic/pipeline/PagedPipeline.java | 0 .../scheduler/FileCacheQueueScheduler.java | 0 .../webmagic/scheduler/RedisScheduler.java | 0 .../webmagic/utils/DoubleKeyMap.java | 0 .../webmagic/utils/MultiKeyMapBase.java | 0 .../webmagic/downloader/SeleniumTest.java | 0 .../selenium/SeleniumDownloaderTest.java | 0 .../selenium/WebDriverPoolTest.java | 0 .../scheduler/RedisSchedulerTest.java | 0 .../pom.xml | 11 ++++++- .../webmagic/pipeline/LucenePipeline.java | 0 webmagic-plugin/README.md | 6 ---- webmagic-plugin/pom.xml | 32 ------------------- webmagic-samples/pom.xml | 2 +- 38 files changed, 23 insertions(+), 43 deletions(-) rename {webmagic-plugin/webmagic-misc => webmagic-extension}/pom.xml (70%) rename {webmagic-plugin/webmagic-misc => 
webmagic-extension}/src/main/java/us/codecraft/webmagic/PagedModel.java (100%) rename {webmagic-plugin/webmagic-misc => webmagic-extension}/src/main/java/us/codecraft/webmagic/downloader/FileDownloader.java (100%) rename {webmagic-plugin/webmagic-misc => webmagic-extension}/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java (100%) rename {webmagic-plugin/webmagic-misc => webmagic-extension}/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java (100%) rename {webmagic-plugin/webmagic-misc => webmagic-extension}/src/main/java/us/codecraft/webmagic/model/AfterExtractor.java (100%) rename {webmagic-plugin/webmagic-misc => webmagic-extension}/src/main/java/us/codecraft/webmagic/model/ConsolePageModelPipeline.java (100%) rename {webmagic-plugin/webmagic-misc => webmagic-extension}/src/main/java/us/codecraft/webmagic/model/ExtractBy.java (100%) rename {webmagic-plugin/webmagic-misc => webmagic-extension}/src/main/java/us/codecraft/webmagic/model/ExtractBy2.java (100%) rename {webmagic-plugin/webmagic-misc => webmagic-extension}/src/main/java/us/codecraft/webmagic/model/ExtractBy3.java (100%) rename {webmagic-plugin/webmagic-misc => webmagic-extension}/src/main/java/us/codecraft/webmagic/model/ExtractByRaw.java (100%) rename {webmagic-plugin/webmagic-misc => webmagic-extension}/src/main/java/us/codecraft/webmagic/model/ExtractByUrl.java (100%) rename {webmagic-plugin/webmagic-misc => webmagic-extension}/src/main/java/us/codecraft/webmagic/model/Extractor.java (100%) rename {webmagic-plugin/webmagic-misc => webmagic-extension}/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java (100%) rename {webmagic-plugin/webmagic-misc => webmagic-extension}/src/main/java/us/codecraft/webmagic/model/HelpUrl.java (100%) rename {webmagic-plugin/webmagic-misc => webmagic-extension}/src/main/java/us/codecraft/webmagic/model/ModelPageProcessor.java (100%) rename {webmagic-plugin/webmagic-misc => 
webmagic-extension}/src/main/java/us/codecraft/webmagic/model/ModelPipeline.java (100%) rename {webmagic-plugin/webmagic-misc => webmagic-extension}/src/main/java/us/codecraft/webmagic/model/OOSpider.java (100%) rename {webmagic-plugin/webmagic-misc => webmagic-extension}/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java (100%) rename {webmagic-plugin/webmagic-misc => webmagic-extension}/src/main/java/us/codecraft/webmagic/model/PageModelPipeline.java (100%) rename {webmagic-plugin/webmagic-misc => webmagic-extension}/src/main/java/us/codecraft/webmagic/model/TargetUrl.java (100%) rename {webmagic-plugin/webmagic-misc => webmagic-extension}/src/main/java/us/codecraft/webmagic/model/package.html (100%) rename {webmagic-plugin/webmagic-misc => webmagic-extension}/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePipeline.java (100%) rename {webmagic-plugin/webmagic-misc => webmagic-extension}/src/main/java/us/codecraft/webmagic/pipeline/PagedPipeline.java (100%) rename {webmagic-plugin/webmagic-misc => webmagic-extension}/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java (100%) rename {webmagic-plugin/webmagic-misc => webmagic-extension}/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java (100%) rename {webmagic-plugin/webmagic-misc => webmagic-extension}/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java (100%) rename {webmagic-plugin/webmagic-misc => webmagic-extension}/src/main/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java (100%) rename {webmagic-plugin/webmagic-misc => webmagic-extension}/src/test/java/us/codecraft/webmagic/downloader/SeleniumTest.java (100%) rename {webmagic-plugin/webmagic-misc => webmagic-extension}/src/test/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloaderTest.java (100%) rename {webmagic-plugin/webmagic-misc => webmagic-extension}/src/test/java/us/codecraft/webmagic/downloader/selenium/WebDriverPoolTest.java (100%) rename {webmagic-plugin/webmagic-misc 
=> webmagic-extension}/src/test/java/us/codecraft/webmagic/scheduler/RedisSchedulerTest.java (100%) rename {webmagic-plugin/webmagic-lucene => webmagic-lucene}/pom.xml (70%) rename {webmagic-plugin/webmagic-lucene => webmagic-lucene}/src/main/java/us/codecraft/webmagic/pipeline/LucenePipeline.java (100%) delete mode 100644 webmagic-plugin/README.md delete mode 100644 webmagic-plugin/pom.xml diff --git a/pom.xml b/pom.xml index fa369f4..b269ade 100644 --- a/pom.xml +++ b/pom.xml @@ -10,7 +10,7 @@ webmagic-core - webmagic-plugin/ + webmagic-extension/ webmagic-samples/ diff --git a/webmagic-plugin/webmagic-misc/pom.xml b/webmagic-extension/pom.xml similarity index 70% rename from webmagic-plugin/webmagic-misc/pom.xml rename to webmagic-extension/pom.xml index 130e400..cbff0b8 100644 --- a/webmagic-plugin/webmagic-misc/pom.xml +++ b/webmagic-extension/pom.xml @@ -4,12 +4,12 @@ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> us.codecraft - webmagic-plugin + webmagic 0.1.0 4.0.0 - webmagic-misc + webmagic-extension @@ -27,6 +27,15 @@ selenium-java 2.33.0 + + us.codecraft + webmagic-core + ${project.version} + + + junit + junit + \ No newline at end of file diff --git a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/PagedModel.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/PagedModel.java similarity index 100% rename from webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/PagedModel.java rename to webmagic-extension/src/main/java/us/codecraft/webmagic/PagedModel.java diff --git a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/downloader/FileDownloader.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/FileDownloader.java similarity index 100% rename from webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/downloader/FileDownloader.java rename to 
webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/FileDownloader.java diff --git a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java similarity index 100% rename from webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java rename to webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java diff --git a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java similarity index 100% rename from webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java rename to webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java diff --git a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/AfterExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/AfterExtractor.java similarity index 100% rename from webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/AfterExtractor.java rename to webmagic-extension/src/main/java/us/codecraft/webmagic/model/AfterExtractor.java diff --git a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/ConsolePageModelPipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ConsolePageModelPipeline.java similarity index 100% rename from webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/ConsolePageModelPipeline.java rename to webmagic-extension/src/main/java/us/codecraft/webmagic/model/ConsolePageModelPipeline.java diff --git a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/ExtractBy.java 
b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ExtractBy.java similarity index 100% rename from webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/ExtractBy.java rename to webmagic-extension/src/main/java/us/codecraft/webmagic/model/ExtractBy.java diff --git a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/ExtractBy2.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ExtractBy2.java similarity index 100% rename from webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/ExtractBy2.java rename to webmagic-extension/src/main/java/us/codecraft/webmagic/model/ExtractBy2.java diff --git a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/ExtractBy3.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ExtractBy3.java similarity index 100% rename from webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/ExtractBy3.java rename to webmagic-extension/src/main/java/us/codecraft/webmagic/model/ExtractBy3.java diff --git a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/ExtractByRaw.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ExtractByRaw.java similarity index 100% rename from webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/ExtractByRaw.java rename to webmagic-extension/src/main/java/us/codecraft/webmagic/model/ExtractByRaw.java diff --git a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/ExtractByUrl.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ExtractByUrl.java similarity index 100% rename from webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/ExtractByUrl.java rename to webmagic-extension/src/main/java/us/codecraft/webmagic/model/ExtractByUrl.java diff --git a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/Extractor.java 
b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/Extractor.java similarity index 100% rename from webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/Extractor.java rename to webmagic-extension/src/main/java/us/codecraft/webmagic/model/Extractor.java diff --git a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java similarity index 100% rename from webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java rename to webmagic-extension/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java diff --git a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/HelpUrl.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/HelpUrl.java similarity index 100% rename from webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/HelpUrl.java rename to webmagic-extension/src/main/java/us/codecraft/webmagic/model/HelpUrl.java diff --git a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/ModelPageProcessor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPageProcessor.java similarity index 100% rename from webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/ModelPageProcessor.java rename to webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPageProcessor.java diff --git a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/ModelPipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPipeline.java similarity index 100% rename from webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/ModelPipeline.java rename to webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPipeline.java diff --git a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/OOSpider.java 
b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/OOSpider.java similarity index 100% rename from webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/OOSpider.java rename to webmagic-extension/src/main/java/us/codecraft/webmagic/model/OOSpider.java diff --git a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java similarity index 100% rename from webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java rename to webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java diff --git a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/PageModelPipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelPipeline.java similarity index 100% rename from webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/PageModelPipeline.java rename to webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelPipeline.java diff --git a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/TargetUrl.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/TargetUrl.java similarity index 100% rename from webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/TargetUrl.java rename to webmagic-extension/src/main/java/us/codecraft/webmagic/model/TargetUrl.java diff --git a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/package.html b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/package.html similarity index 100% rename from webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/package.html rename to webmagic-extension/src/main/java/us/codecraft/webmagic/model/package.html diff --git a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePipeline.java 
b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePipeline.java similarity index 100% rename from webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePipeline.java rename to webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePipeline.java diff --git a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/pipeline/PagedPipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/PagedPipeline.java similarity index 100% rename from webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/pipeline/PagedPipeline.java rename to webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/PagedPipeline.java diff --git a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java similarity index 100% rename from webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java rename to webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java diff --git a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java similarity index 100% rename from webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java rename to webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java diff --git a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java similarity index 100% rename from webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java rename to 
webmagic-extension/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java diff --git a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java similarity index 100% rename from webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java rename to webmagic-extension/src/main/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java diff --git a/webmagic-plugin/webmagic-misc/src/test/java/us/codecraft/webmagic/downloader/SeleniumTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/SeleniumTest.java similarity index 100% rename from webmagic-plugin/webmagic-misc/src/test/java/us/codecraft/webmagic/downloader/SeleniumTest.java rename to webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/SeleniumTest.java diff --git a/webmagic-plugin/webmagic-misc/src/test/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloaderTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloaderTest.java similarity index 100% rename from webmagic-plugin/webmagic-misc/src/test/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloaderTest.java rename to webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloaderTest.java diff --git a/webmagic-plugin/webmagic-misc/src/test/java/us/codecraft/webmagic/downloader/selenium/WebDriverPoolTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/selenium/WebDriverPoolTest.java similarity index 100% rename from webmagic-plugin/webmagic-misc/src/test/java/us/codecraft/webmagic/downloader/selenium/WebDriverPoolTest.java rename to webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/selenium/WebDriverPoolTest.java diff --git a/webmagic-plugin/webmagic-misc/src/test/java/us/codecraft/webmagic/scheduler/RedisSchedulerTest.java 
b/webmagic-extension/src/test/java/us/codecraft/webmagic/scheduler/RedisSchedulerTest.java similarity index 100% rename from webmagic-plugin/webmagic-misc/src/test/java/us/codecraft/webmagic/scheduler/RedisSchedulerTest.java rename to webmagic-extension/src/test/java/us/codecraft/webmagic/scheduler/RedisSchedulerTest.java diff --git a/webmagic-plugin/webmagic-lucene/pom.xml b/webmagic-lucene/pom.xml similarity index 70% rename from webmagic-plugin/webmagic-lucene/pom.xml rename to webmagic-lucene/pom.xml index b072472..512d189 100644 --- a/webmagic-plugin/webmagic-lucene/pom.xml +++ b/webmagic-lucene/pom.xml @@ -3,7 +3,7 @@ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> - webmagic-plugin + webmagic us.codecraft 0.1.0 @@ -22,6 +22,15 @@ lucene-queryparser 4.4.0
+ + us.codecraft + webmagic-core + ${project.version} + + + junit + junit +
diff --git a/webmagic-plugin/webmagic-lucene/src/main/java/us/codecraft/webmagic/pipeline/LucenePipeline.java b/webmagic-lucene/src/main/java/us/codecraft/webmagic/pipeline/LucenePipeline.java similarity index 100% rename from webmagic-plugin/webmagic-lucene/src/main/java/us/codecraft/webmagic/pipeline/LucenePipeline.java rename to webmagic-lucene/src/main/java/us/codecraft/webmagic/pipeline/LucenePipeline.java diff --git a/webmagic-plugin/README.md b/webmagic-plugin/README.md deleted file mode 100644 index 536d596..0000000 --- a/webmagic-plugin/README.md +++ /dev/null @@ -1,6 +0,0 @@ -webmagic-plugin -------- -webmagic的插件模块。 -目前仅实现了freemarker模板渲染,和redis实现分布式爬虫。 - -另外有一个使用Selenium来动态渲染页面的模块在开发中。 \ No newline at end of file diff --git a/webmagic-plugin/pom.xml b/webmagic-plugin/pom.xml deleted file mode 100644 index 54c69ec..0000000 --- a/webmagic-plugin/pom.xml +++ /dev/null @@ -1,32 +0,0 @@ - - - - us.codecraft - webmagic - 0.1.0 - - pom - 4.0.0 - - webmagic-misc - webmagic-selenium - webmagic-lucene - - - webmagic-plugin - - - - us.codecraft - webmagic-core - ${project.version} - - - junit - junit - - - - \ No newline at end of file diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index ff37e21..de3b4aa 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -19,7 +19,7 @@
us.codecraft - webmagic-misc + webmagic-extension ${project.version} From 9ef6de01e46182a26c03bdbf1f4f69f48b68d338 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Tue, 6 Aug 2013 23:23:23 +0800 Subject: [PATCH 55/84] update readme --- webmagic-lucene/README.md | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 webmagic-lucene/README.md diff --git a/webmagic-lucene/README.md b/webmagic-lucene/README.md new file mode 100644 index 0000000..77050ab --- /dev/null +++ b/webmagic-lucene/README.md @@ -0,0 +1,3 @@ +webmagic-lucene +-------- +尝试将webmagic与lucene结合,打造一个搜索引擎。开发中,不作为webmagic主要模块。 \ No newline at end of file From e5cf2882b0b2e41068fb9ba473d1a6cf226f74b7 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Tue, 6 Aug 2013 23:32:02 +0800 Subject: [PATCH 56/84] fix a lucene bug --- webmagic-lucene/pom.xml | 2 +- .../webmagic/pipeline/LucenePipeline.java | 11 ++-- .../webmagic/lucene/OschinaBlog.java | 61 +++++++++++++++++++ 3 files changed, 67 insertions(+), 7 deletions(-) create mode 100644 webmagic-lucene/src/main/test/java/us/codecraft/webmagic/lucene/OschinaBlog.java diff --git a/webmagic-lucene/pom.xml b/webmagic-lucene/pom.xml index 512d189..2d0ceeb 100644 --- a/webmagic-lucene/pom.xml +++ b/webmagic-lucene/pom.xml @@ -24,7 +24,7 @@ us.codecraft - webmagic-core + webmagic-extension ${project.version} diff --git a/webmagic-lucene/src/main/java/us/codecraft/webmagic/pipeline/LucenePipeline.java b/webmagic-lucene/src/main/java/us/codecraft/webmagic/pipeline/LucenePipeline.java index 4c048ef..724ac7e 100644 --- a/webmagic-lucene/src/main/java/us/codecraft/webmagic/pipeline/LucenePipeline.java +++ b/webmagic-lucene/src/main/java/us/codecraft/webmagic/pipeline/LucenePipeline.java @@ -33,16 +33,14 @@ public class LucenePipeline implements Pipeline { private Directory directory; - private IndexWriter indexWriter; - private Analyzer analyzer; + private IndexWriterConfig config; + private void init() throws IOException { analyzer = new 
StandardAnalyzer(Version.LUCENE_44); directory = new RAMDirectory(); - IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_44, analyzer); - indexWriter = new IndexWriter(directory, config); - indexWriter.close(); + config = new IndexWriterConfig(Version.LUCENE_44, analyzer); } public LucenePipeline() { @@ -67,7 +65,6 @@ public class LucenePipeline implements Pipeline { documents.add(hitDoc); } ireader.close(); - directory.close(); return documents; } @@ -85,7 +82,9 @@ public class LucenePipeline implements Pipeline { doc.add(new Field(objectEntry.getKey(), objectEntry.getValue().toString(), TextField.TYPE_STORED)); } try { + IndexWriter indexWriter = new IndexWriter(directory, config); indexWriter.addDocument(doc); + indexWriter.close(); } catch (IOException e) { e.printStackTrace(); } diff --git a/webmagic-lucene/src/main/test/java/us/codecraft/webmagic/lucene/OschinaBlog.java b/webmagic-lucene/src/main/test/java/us/codecraft/webmagic/lucene/OschinaBlog.java new file mode 100644 index 0000000..d444275 --- /dev/null +++ b/webmagic-lucene/src/main/test/java/us/codecraft/webmagic/lucene/OschinaBlog.java @@ -0,0 +1,61 @@ +package us.codecraft.webmagic.lucene; + +import org.apache.lucene.document.Document; +import org.apache.lucene.queryparser.classic.ParseException; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.model.ExtractBy; +import us.codecraft.webmagic.model.OOSpider; +import us.codecraft.webmagic.model.TargetUrl; +import us.codecraft.webmagic.pipeline.LucenePipeline; + +import java.io.IOException; +import java.util.List; + +/** + * @author code4crafter@gmail.com
+ * @date: 13-8-2
+ * Time: 上午7:52
+ */ +@TargetUrl("http://my.oschina.net/flashsword/blog/\\d+") +public class OschinaBlog { + + @ExtractBy("//title") + private String title; + + @ExtractBy(value = "div.BlogContent", type = ExtractBy.Type.Css) + private String content; + + @Override + public String toString() { + return "OschinaBlog{" + + "title='" + title + '\'' + + ", content='" + content + '\'' + + '}'; + } + + public static void main(String[] args) { + LucenePipeline pipeline = new LucenePipeline(); + OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog"), OschinaBlog.class).pipeline(pipeline).runAsync(); + while (true) { + try { + List search = pipeline.search("title", "webmagic"); + System.out.println(search); + Thread.sleep(3000); + } catch (IOException e) { + e.printStackTrace(); + } catch (ParseException e) { + e.printStackTrace(); + } catch (InterruptedException e) { + e.printStackTrace(); + } + } + } + + public String getTitle() { + return title; + } + + public String getContent() { + return content; + } +} From 0954aa9a3ca71b2105073fcbdaf4708e963279ec Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 7 Aug 2013 08:05:14 +0800 Subject: [PATCH 57/84] update maven dep --- pom.xml | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index b269ade..706d9fd 100644 --- a/pom.xml +++ b/pom.xml @@ -80,6 +80,7 @@ org.apache.maven.plugins maven-dependency-plugin + 2.8 copy-dependencies @@ -99,6 +100,7 @@ org.apache.maven.plugins maven-resources-plugin + 2.6 UTF-8 @@ -106,6 +108,7 @@ org.apache.maven.plugins maven-source-plugin + 2.2.1 attach-sources @@ -118,6 +121,10 @@ org.apache.maven.plugins maven-javadoc-plugin + 2.9.1 + + UTF-8 + attach-javadocs @@ -130,7 +137,7 @@ org.apache.maven.plugins maven-release-plugin - 2.0-beta-7 + 2.4.1 From 570533cce51d41f1213077fe1c87f4db4466c39d Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 7 Aug 2013 09:45:00 +0800 Subject: [PATCH 58/84] update readme --- webmagic-core/README.md | 2 
+- webmagic-extension/README.md | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) create mode 100644 webmagic-extension/README.md diff --git a/webmagic-core/README.md b/webmagic-core/README.md index 4964e16..90a6f0a 100644 --- a/webmagic-core/README.md +++ b/webmagic-core/README.md @@ -1,3 +1,3 @@ webmagic-core ------- -webmagic核心部分。 \ No newline at end of file +webmagic核心部分。只包含爬虫基本模块和基本抽取器。webmagic-core的目标是成为网页爬虫的一个教科书般的实现。 \ No newline at end of file diff --git a/webmagic-extension/README.md b/webmagic-extension/README.md new file mode 100644 index 0000000..71d3c48 --- /dev/null +++ b/webmagic-extension/README.md @@ -0,0 +1,3 @@ +webmagic-extension +------- +webmagic的扩展模块。包括注解格式定义爬虫、JSON、分布式等支持。 \ No newline at end of file From 3f7257957e751b452fa65f78f42e2124790c3658 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 7 Aug 2013 09:47:28 +0800 Subject: [PATCH 59/84] add readme --- webmagic-samples/README.md | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 webmagic-samples/README.md diff --git a/webmagic-samples/README.md b/webmagic-samples/README.md new file mode 100644 index 0000000..7cdad18 --- /dev/null +++ b/webmagic-samples/README.md @@ -0,0 +1,3 @@ +webmagic-samples +------- +webmagic的一些示例。包括抓取常见博客、信息类网站等。 \ No newline at end of file From 5ef231a768008686c0c0c2c73921d468c5465b42 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 7 Aug 2013 12:48:32 +0800 Subject: [PATCH 60/84] update version --- pom.xml | 2 +- webmagic-core/pom.xml | 2 +- webmagic-extension/pom.xml | 2 +- webmagic-lucene/pom.xml | 2 +- webmagic-samples/pom.xml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pom.xml b/pom.xml index 706d9fd..5033977 100644 --- a/pom.xml +++ b/pom.xml @@ -3,7 +3,7 @@ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> us.codecraft - 0.1.0 + 0.2.0 4.0.0 pom webmagic diff --git a/webmagic-core/pom.xml 
b/webmagic-core/pom.xml index a5fbd75..d2c48b2 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -5,7 +5,7 @@ us.codecraft webmagic - 0.1.0 + 0.2.0 4.0.0 diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index cbff0b8..2a3590d 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -5,7 +5,7 @@ us.codecraft webmagic - 0.1.0 + 0.2.0 4.0.0 diff --git a/webmagic-lucene/pom.xml b/webmagic-lucene/pom.xml index 2d0ceeb..d7b4665 100644 --- a/webmagic-lucene/pom.xml +++ b/webmagic-lucene/pom.xml @@ -5,7 +5,7 @@ webmagic us.codecraft - 0.1.0 + 0.2.0 4.0.0 diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index de3b4aa..b85e708 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -5,7 +5,7 @@ us.codecraft webmagic - 0.1.0 + 0.2.0 4.0.0 From 36384246b558f9492ae99229084c6b5458f5cad1 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 7 Aug 2013 12:51:21 +0800 Subject: [PATCH 61/84] update package structure --- .../main/java/us/codecraft/webmagic/model/ModelPipeline.java | 1 + .../java/us/codecraft/webmagic/model/PageModelExtractor.java | 1 + .../codecraft/webmagic/model/{ => annotation}/ExtractBy.java | 2 +- .../codecraft/webmagic/model/{ => annotation}/ExtractBy2.java | 2 +- .../codecraft/webmagic/model/{ => annotation}/ExtractBy3.java | 2 +- .../webmagic/model/{ => annotation}/ExtractByRaw.java | 2 +- .../webmagic/model/{ => annotation}/ExtractByUrl.java | 2 +- .../us/codecraft/webmagic/model/{ => annotation}/HelpUrl.java | 2 +- .../codecraft/webmagic/model/{ => annotation}/TargetUrl.java | 2 +- .../test/java/us/codecraft/webmagic/lucene/OschinaBlog.java | 4 ++-- .../java/us/codecraft/webmagic/model/samples/IteyeBlog.java | 4 ++-- .../java/us/codecraft/webmagic/model/samples/News163.java | 4 ++++ .../us/codecraft/webmagic/model/samples/OschinaAnswer.java | 3 +++ .../java/us/codecraft/webmagic/model/samples/OschinaBlog.java | 4 ++-- 14 files changed, 22 insertions(+), 13 deletions(-) 
rename webmagic-extension/src/main/java/us/codecraft/webmagic/model/{ => annotation}/ExtractBy.java (92%) rename webmagic-extension/src/main/java/us/codecraft/webmagic/model/{ => annotation}/ExtractBy2.java (91%) rename webmagic-extension/src/main/java/us/codecraft/webmagic/model/{ => annotation}/ExtractBy3.java (91%) rename webmagic-extension/src/main/java/us/codecraft/webmagic/model/{ => annotation}/ExtractByRaw.java (93%) rename webmagic-extension/src/main/java/us/codecraft/webmagic/model/{ => annotation}/ExtractByUrl.java (91%) rename webmagic-extension/src/main/java/us/codecraft/webmagic/model/{ => annotation}/HelpUrl.java (89%) rename webmagic-extension/src/main/java/us/codecraft/webmagic/model/{ => annotation}/TargetUrl.java (91%) diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPipeline.java index c9f67dc..84db455 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPipeline.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPipeline.java @@ -2,6 +2,7 @@ package us.codecraft.webmagic.model; import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.model.annotation.ExtractBy; import us.codecraft.webmagic.pipeline.Pipeline; import java.lang.annotation.Annotation; diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java index 0207b7a..445bdd9 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java @@ -2,6 +2,7 @@ package us.codecraft.webmagic.model; import org.apache.commons.lang3.StringUtils; import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.model.annotation.*; import 
us.codecraft.webmagic.selector.*; import java.lang.annotation.Annotation; diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ExtractBy.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy.java similarity index 92% rename from webmagic-extension/src/main/java/us/codecraft/webmagic/model/ExtractBy.java rename to webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy.java index 661fd67..af8946a 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ExtractBy.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy.java @@ -1,4 +1,4 @@ -package us.codecraft.webmagic.model; +package us.codecraft.webmagic.model.annotation; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ExtractBy2.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy2.java similarity index 91% rename from webmagic-extension/src/main/java/us/codecraft/webmagic/model/ExtractBy2.java rename to webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy2.java index 55d5dfa..f68b7d6 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ExtractBy2.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy2.java @@ -1,4 +1,4 @@ -package us.codecraft.webmagic.model; +package us.codecraft.webmagic.model.annotation; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ExtractBy3.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy3.java similarity index 91% rename from webmagic-extension/src/main/java/us/codecraft/webmagic/model/ExtractBy3.java rename to 
webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy3.java index 10f6a9f..f3212a6 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ExtractBy3.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy3.java @@ -1,4 +1,4 @@ -package us.codecraft.webmagic.model; +package us.codecraft.webmagic.model.annotation; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ExtractByRaw.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByRaw.java similarity index 93% rename from webmagic-extension/src/main/java/us/codecraft/webmagic/model/ExtractByRaw.java rename to webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByRaw.java index 5dca8e1..9692732 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ExtractByRaw.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByRaw.java @@ -1,4 +1,4 @@ -package us.codecraft.webmagic.model; +package us.codecraft.webmagic.model.annotation; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ExtractByUrl.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByUrl.java similarity index 91% rename from webmagic-extension/src/main/java/us/codecraft/webmagic/model/ExtractByUrl.java rename to webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByUrl.java index f443c0e..52f4a57 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ExtractByUrl.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByUrl.java @@ -1,4 +1,4 @@ -package us.codecraft.webmagic.model; +package us.codecraft.webmagic.model.annotation; import 
java.lang.annotation.ElementType; import java.lang.annotation.Retention; diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/HelpUrl.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/HelpUrl.java similarity index 89% rename from webmagic-extension/src/main/java/us/codecraft/webmagic/model/HelpUrl.java rename to webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/HelpUrl.java index 808d58a..3437e44 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/HelpUrl.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/HelpUrl.java @@ -1,4 +1,4 @@ -package us.codecraft.webmagic.model; +package us.codecraft.webmagic.model.annotation; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/TargetUrl.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/TargetUrl.java similarity index 91% rename from webmagic-extension/src/main/java/us/codecraft/webmagic/model/TargetUrl.java rename to webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/TargetUrl.java index 3622f55..c747da6 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/TargetUrl.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/TargetUrl.java @@ -1,4 +1,4 @@ -package us.codecraft.webmagic.model; +package us.codecraft.webmagic.model.annotation; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; diff --git a/webmagic-lucene/src/main/test/java/us/codecraft/webmagic/lucene/OschinaBlog.java b/webmagic-lucene/src/main/test/java/us/codecraft/webmagic/lucene/OschinaBlog.java index d444275..4480f0b 100644 --- a/webmagic-lucene/src/main/test/java/us/codecraft/webmagic/lucene/OschinaBlog.java +++ b/webmagic-lucene/src/main/test/java/us/codecraft/webmagic/lucene/OschinaBlog.java @@ -3,9 +3,9 
@@ package us.codecraft.webmagic.lucene; import org.apache.lucene.document.Document; import org.apache.lucene.queryparser.classic.ParseException; import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.model.ExtractBy; +import us.codecraft.webmagic.model.annotation.ExtractBy; import us.codecraft.webmagic.model.OOSpider; -import us.codecraft.webmagic.model.TargetUrl; +import us.codecraft.webmagic.model.annotation.TargetUrl; import us.codecraft.webmagic.pipeline.LucenePipeline; import java.io.IOException; diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/IteyeBlog.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/IteyeBlog.java index 4d01902..4be2ede 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/IteyeBlog.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/IteyeBlog.java @@ -1,9 +1,9 @@ package us.codecraft.webmagic.model.samples; import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.model.ExtractBy; +import us.codecraft.webmagic.model.annotation.ExtractBy; import us.codecraft.webmagic.model.OOSpider; -import us.codecraft.webmagic.model.TargetUrl; +import us.codecraft.webmagic.model.annotation.TargetUrl; /** * @author code4crafter@gmail.com
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java index 83f8388..848800d 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java @@ -3,6 +3,10 @@ package us.codecraft.webmagic.model.samples; import us.codecraft.webmagic.PagedModel; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.model.*; +import us.codecraft.webmagic.model.annotation.ExtractBy; +import us.codecraft.webmagic.model.annotation.ExtractBy2; +import us.codecraft.webmagic.model.annotation.ExtractByUrl; +import us.codecraft.webmagic.model.annotation.TargetUrl; import us.codecraft.webmagic.pipeline.ConsolePipeline; import us.codecraft.webmagic.pipeline.PagedPipeline; import us.codecraft.webmagic.scheduler.RedisScheduler; diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaAnswer.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaAnswer.java index df23873..02b8a9c 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaAnswer.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaAnswer.java @@ -3,6 +3,9 @@ package us.codecraft.webmagic.model.samples; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.model.*; +import us.codecraft.webmagic.model.annotation.ExtractBy; +import us.codecraft.webmagic.model.annotation.HelpUrl; +import us.codecraft.webmagic.model.annotation.TargetUrl; /** * @author code4crafter@gmail.com
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java index 817ba44..6409492 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java @@ -1,9 +1,9 @@ package us.codecraft.webmagic.model.samples; import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.model.ExtractBy; +import us.codecraft.webmagic.model.annotation.ExtractBy; import us.codecraft.webmagic.model.OOSpider; -import us.codecraft.webmagic.model.TargetUrl; +import us.codecraft.webmagic.model.annotation.TargetUrl; import us.codecraft.webmagic.pipeline.JsonFilePipeline; /** From cff943f6989251bc223e55eb6fb533ee7bae382b Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 7 Aug 2013 13:05:12 +0800 Subject: [PATCH 62/84] fix path format error --- .../main/java/us/codecraft/webmagic/pipeline/FilePipeline.java | 3 +++ .../java/us/codecraft/webmagic/downloader/FileDownloader.java | 3 +++ .../java/us/codecraft/webmagic/pipeline/JsonFilePipeline.java | 3 +++ .../codecraft/webmagic/scheduler/FileCacheQueueScheduler.java | 3 +++ 4 files changed, 12 insertions(+) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java index 72ae1aa..252ccd5 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java @@ -37,6 +37,9 @@ public class FilePipeline implements Pipeline { * @param path 文件保存路径 */ public FilePipeline(String path) { + if (!path.endsWith("/")&&!path.endsWith("\\")){ + path+="/"; + } this.path = path; } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/FileDownloader.java 
b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/FileDownloader.java index 722a2eb..cca5b20 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/FileDownloader.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/FileDownloader.java @@ -34,6 +34,9 @@ public class FileDownloader implements Downloader { } public FileDownloader(String path, Downloader downloaderWhenFileMiss) { + if (!path.endsWith("/")&&!path.endsWith("\\")){ + path+="/"; + } this.path = path; this.downloaderWhenFileMiss = downloaderWhenFileMiss; } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePipeline.java index 08f3e87..dbe1a00 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePipeline.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePipeline.java @@ -37,6 +37,9 @@ public class JsonFilePipeline implements Pipeline { * @param path 文件保存路径 */ public JsonFilePipeline(String path) { + if (!path.endsWith("/")&&!path.endsWith("\\")){ + path+="/"; + } this.path = path; } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java index d4a3987..c294f09 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java @@ -47,6 +47,9 @@ public class FileCacheQueueScheduler implements Scheduler { private Set urls; public FileCacheQueueScheduler(String filePath) { + if (!filePath.endsWith("/")&&!filePath.endsWith("\\")){ + filePath+="/"; + } this.filePath = filePath; } From 46c65f19c773aac1f11e7a689b4b28486166dcbe Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 7 
Aug 2013 14:06:49 +0800 Subject: [PATCH 63/84] add maven jar plugin --- webmagic-samples/assembly.xml | 21 +++++++++++++++++++++ webmagic-samples/pom.xml | 18 ++++++++++++++++++ 2 files changed, 39 insertions(+) create mode 100644 webmagic-samples/assembly.xml diff --git a/webmagic-samples/assembly.xml b/webmagic-samples/assembly.xml new file mode 100644 index 0000000..83370a2 --- /dev/null +++ b/webmagic-samples/assembly.xml @@ -0,0 +1,21 @@ + + jar-with-dependencies + + jar + + false + + + ${project.basedir}/target/classes + / + + + + + false + + + \ No newline at end of file diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index b85e708..1a4b702 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -33,4 +33,22 @@
+ + + + org.apache.maven.plugins + maven-jar-plugin + 2.4 + + + + true + ./lib/ + + + + + + + \ No newline at end of file From bd1384a5134c08ec690650e1ee10cefe1d961170 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 7 Aug 2013 14:18:58 +0800 Subject: [PATCH 64/84] remove samples --- .../webmagic/samples/{ => selenium}/HuabanProcessor.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename webmagic-samples/src/main/java/us/codecraft/webmagic/samples/{ => selenium}/HuabanProcessor.java (96%) diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuabanProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/selenium/HuabanProcessor.java similarity index 96% rename from webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuabanProcessor.java rename to webmagic-samples/src/main/java/us/codecraft/webmagic/samples/selenium/HuabanProcessor.java index 44173fb..dfff0c6 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuabanProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/selenium/HuabanProcessor.java @@ -1,4 +1,4 @@ -package us.codecraft.webmagic.samples; +package us.codecraft.webmagic.samples.selenium; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; From f1573b40a2601a1ebf05b952e4398ff4e2e9f4a7 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 7 Aug 2013 14:44:52 +0800 Subject: [PATCH 65/84] set selenium dep to seperate package --- pom.xml | 11 ++++--- webmagic-extension/pom.xml | 5 --- webmagic-samples/assembly.xml | 21 ------------- webmagic-selenium/README.md | 3 ++ webmagic-selenium/pom.xml | 31 +++++++++++++++++++ .../selenium/SeleniumDownloader.java | 0 .../downloader/selenium/WebDriverPool.java | 0 .../webmagic/downloader/SeleniumTest.java | 0 .../selenium/SeleniumDownloaderTest.java | 0 .../selenium/WebDriverPoolTest.java | 0 .../webmagic/samples}/HuabanProcessor.java | 4 +-- 11 files changed, 41 insertions(+), 34 
deletions(-) delete mode 100644 webmagic-samples/assembly.xml create mode 100644 webmagic-selenium/README.md create mode 100644 webmagic-selenium/pom.xml rename {webmagic-extension => webmagic-selenium}/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java (100%) rename {webmagic-extension => webmagic-selenium}/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java (100%) rename {webmagic-extension => webmagic-selenium}/src/test/java/us/codecraft/webmagic/downloader/SeleniumTest.java (100%) rename {webmagic-extension => webmagic-selenium}/src/test/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloaderTest.java (100%) rename {webmagic-extension => webmagic-selenium}/src/test/java/us/codecraft/webmagic/downloader/selenium/WebDriverPoolTest.java (100%) rename {webmagic-samples/src/main/java/us/codecraft/webmagic/samples/selenium => webmagic-selenium/src/test/java/us/codecraft/webmagic/samples}/HuabanProcessor.java (89%) diff --git a/pom.xml b/pom.xml index 5033977..1497bb0 100644 --- a/pom.xml +++ b/pom.xml @@ -8,10 +8,12 @@ pom webmagic - - webmagic-core - webmagic-extension/ - webmagic-samples/ + + webmagic-core + webmagic-extension/ + webmagic-samples/ + webmagic-selenium/ + webmagic-lucene/ @@ -143,5 +145,4 @@ - diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index 2a3590d..63034f2 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -22,11 +22,6 @@ jedis 2.0.0 - - org.seleniumhq.selenium - selenium-java - 2.33.0 - us.codecraft webmagic-core diff --git a/webmagic-samples/assembly.xml b/webmagic-samples/assembly.xml deleted file mode 100644 index 83370a2..0000000 --- a/webmagic-samples/assembly.xml +++ /dev/null @@ -1,21 +0,0 @@ - - jar-with-dependencies - - jar - - false - - - ${project.basedir}/target/classes - / - - - - - false - - - \ No newline at end of file diff --git a/webmagic-selenium/README.md b/webmagic-selenium/README.md new file mode 100644 index 
0000000..71d3c48 --- /dev/null +++ b/webmagic-selenium/README.md @@ -0,0 +1,3 @@ +webmagic-extension +------- +webmagic的扩展模块。包括注解格式定义爬虫、JSON、分布式等支持。 \ No newline at end of file diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml new file mode 100644 index 0000000..814b7b3 --- /dev/null +++ b/webmagic-selenium/pom.xml @@ -0,0 +1,31 @@ + + + + us.codecraft + webmagic + 0.2.0 + + 4.0.0 + + webmagic-selenium + + + + org.seleniumhq.selenium + selenium-java + 2.33.0 + + + us.codecraft + webmagic-core + ${project.version} + + + junit + junit + + + + \ No newline at end of file diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java similarity index 100% rename from webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java rename to webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java similarity index 100% rename from webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java rename to webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/SeleniumTest.java b/webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/SeleniumTest.java similarity index 100% rename from webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/SeleniumTest.java rename to webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/SeleniumTest.java diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloaderTest.java 
b/webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloaderTest.java similarity index 100% rename from webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloaderTest.java rename to webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloaderTest.java diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/selenium/WebDriverPoolTest.java b/webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/selenium/WebDriverPoolTest.java similarity index 100% rename from webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/selenium/WebDriverPoolTest.java rename to webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/selenium/WebDriverPoolTest.java diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/selenium/HuabanProcessor.java b/webmagic-selenium/src/test/java/us/codecraft/webmagic/samples/HuabanProcessor.java similarity index 89% rename from webmagic-samples/src/main/java/us/codecraft/webmagic/samples/selenium/HuabanProcessor.java rename to webmagic-selenium/src/test/java/us/codecraft/webmagic/samples/HuabanProcessor.java index dfff0c6..6b3d8a0 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/selenium/HuabanProcessor.java +++ b/webmagic-selenium/src/test/java/us/codecraft/webmagic/samples/HuabanProcessor.java @@ -1,4 +1,4 @@ -package us.codecraft.webmagic.samples.selenium; +package us.codecraft.webmagic.samples; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; @@ -6,7 +6,6 @@ import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.downloader.selenium.SeleniumDownloader; import us.codecraft.webmagic.pipeline.FilePipeline; import us.codecraft.webmagic.processor.PageProcessor; -import us.codecraft.webmagic.scheduler.RedisScheduler; /** * 花瓣网抽取器。
@@ -39,7 +38,6 @@ public class HuabanProcessor implements PageProcessor { public static void main(String[] args) { Spider.create(new HuabanProcessor()).thread(5) - .scheduler(new RedisScheduler("localhost")) .pipeline(new FilePipeline("/data/webmagic/test/")) .downloader(new SeleniumDownloader("/Users/yihua/Downloads/chromedriver")) .runAsync(); From 7555ea0afc51560ade7ab9baeb592eb6efcfd3db Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 7 Aug 2013 14:45:44 +0800 Subject: [PATCH 66/84] update readme --- webmagic-selenium/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webmagic-selenium/README.md b/webmagic-selenium/README.md index 71d3c48..c8583c3 100644 --- a/webmagic-selenium/README.md +++ b/webmagic-selenium/README.md @@ -1,3 +1,3 @@ webmagic-extension ------- -webmagic的扩展模块。包括注解格式定义爬虫、JSON、分布式等支持。 \ No newline at end of file +webmagic与selenium的集成,用于爬取ajax页面。selenium太重,所以单独抽出成一个包了。 \ No newline at end of file From afec9d31b858dfea054892beaf06c5e7ee9da847 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 7 Aug 2013 21:51:26 +0800 Subject: [PATCH 67/84] add starter --- .../codecraft/webmagic/main/QuickStarter.java | 57 +++++++++++++++++++ .../webmagic/model/samples/News163.java | 2 +- .../webmagic/samples/GlobalProcessor.java | 49 ---------------- .../webmagic/samples/GuoxueProcessor.java | 20 ------- 4 files changed, 58 insertions(+), 70 deletions(-) create mode 100644 webmagic-samples/src/main/java/us/codecraft/webmagic/main/QuickStarter.java delete mode 100644 webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GlobalProcessor.java delete mode 100644 webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GuoxueProcessor.java diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/main/QuickStarter.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/main/QuickStarter.java new file mode 100644 index 0000000..2046fcb --- /dev/null +++ 
b/webmagic-samples/src/main/java/us/codecraft/webmagic/main/QuickStarter.java @@ -0,0 +1,57 @@ +package us.codecraft.webmagic.main; + +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.model.OOSpider; +import us.codecraft.webmagic.model.samples.IteyeBlog; +import us.codecraft.webmagic.model.samples.News163; +import us.codecraft.webmagic.model.samples.OschinaBlog; +import us.codecraft.webmagic.pipeline.ConsolePipeline; + +import java.util.LinkedHashMap; +import java.util.Map; +import java.util.Scanner; + +/** + * @author code4crafter@gmail.com
+ * @date: 13-8-7
+ * Time: 下午9:24
+ */ +public class QuickStarter { + + public static void main(String[] args) { + Map clazzMap = new LinkedHashMap(); + clazzMap.put("1", OschinaBlog.class); + clazzMap.put("2", IteyeBlog.class); + clazzMap.put("3", News163.class); + Map urlMap = new LinkedHashMap(); + urlMap.put("1", "http://my.oschina.net/flashsword/blog"); + urlMap.put("2", "http://flashsword20.iteye.com/"); + urlMap.put("3", "http://news.163.com/"); + Scanner stdin = new Scanner(System.in); + String key = null; + System.out.println("Choose a Spider demo:"); + for (Map.Entry classEntry : clazzMap.entrySet()) { + System.out.println(classEntry.getKey()+"\t" + classEntry.getValue() + "\t" + urlMap.get(classEntry.getKey())); + } + while (key == null) { + key = new String(stdin.nextLine()); + if (clazzMap.get(key) == null) { + System.out.println("Invalid choice!"); + key = null; + } + } + System.out.println("The demo started and will last 60 seconds..."); + + //Start spider + OOSpider.create(Site.me().addStartUrl(urlMap.get(key)), clazzMap.get(key)).pipeline(new ConsolePipeline()).runAsync(); + + + try { + Thread.sleep(60000); + } catch (InterruptedException e) { + e.printStackTrace(); + } + System.out.println("The demo stopped!"); + System.exit(0); + } +} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java index 848800d..6baa8ae 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java @@ -28,7 +28,7 @@ public class News163 implements PagedModel { @ExtractByUrl(value = "http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html", notNull = false) private String page; - @ExtractBy(value = "//div[@class=\"ep-pages\"]//a/@href", multi = true) + @ExtractBy(value = "//div[@class=\"ep-pages\"]//a/@href", multi = true,notNull = false) @ExtractBy2(value = 
"http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html", type = ExtractBy2.Type.Regex) private List otherPage; diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GlobalProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GlobalProcessor.java deleted file mode 100644 index 0448683..0000000 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GlobalProcessor.java +++ /dev/null @@ -1,49 +0,0 @@ -package us.codecraft.webmagic.samples; - -import us.codecraft.webmagic.Page; -import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.Spider; -import us.codecraft.webmagic.pipeline.FilePipeline; -import us.codecraft.webmagic.processor.PageProcessor; -import us.codecraft.webmagic.scheduler.RedisScheduler; - -import java.util.List; - -/** - * Author code4crafter@gmail.com - * Date: 13-6-24 - * Time: 下午2:12 - */ -public class GlobalProcessor implements PageProcessor { - - private Site site; - - @Override - public void process(Page page) { - final List requests = page.getHtml().links().all(); - page.addTargetRequests(requests); - - } - - @Override - public Site getSite() { - if (site == null) { - site = Site.me().setDomain("www.2345.com").setSleepTime(0) - .addStartUrl("http://www.2345.com/").addStartUrl("http://hao.360.cn/") - .addStartUrl("http://www.baidu.com/s?wd=%E7%BD%91%E7%AB%99%E5%AF%BC%E8%88%AA&rsv_spt=1&issp=1&rsv_bp=0&ie=utf-8&tn=80039098_oem_dg&rsv_n=2&rsv_sug3=6&rsv_sug4=698&rsv_sug=0&rsv_sug1=3") - .setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); - } - return site; - } - - public static void main(String[] args) { - Spider.create(new GlobalProcessor()).thread(10) - .scheduler(new RedisScheduler("localhost")) - .pipeline(new FilePipeline("/data/webmagic/test/")) - .runAsync(); - Spider.create(new GlobalProcessor()).thread(10) - .scheduler(new RedisScheduler("localhost")) - .pipeline(new 
FilePipeline("/data/webmagic/test/")) - .run(); - } -} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GuoxueProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GuoxueProcessor.java deleted file mode 100644 index 5d7d355..0000000 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GuoxueProcessor.java +++ /dev/null @@ -1,20 +0,0 @@ -package us.codecraft.webmagic.samples; - -import us.codecraft.webmagic.Spider; -import us.codecraft.webmagic.pipeline.FilePipeline; -import us.codecraft.webmagic.processor.SimplePageProcessor; -import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler; - -/** - * @author code4crafter@gmail.com
- * @date: 13-7-14
- * Time: 上午8:33
- */ -public class GuoxueProcessor { - - public static void main(String[] args) { - SimplePageProcessor simplePageProcessor = new SimplePageProcessor("http://www.guoxue123.cn/", "http://www.guoxue123.cn/*"); - simplePageProcessor.getSite().setCharset("GBK").setSleepTime(500); - Spider.create(simplePageProcessor).pipeline(new FilePipeline("/data/webmagic/")).scheduler(new FileCacheQueueScheduler("/data/webmagic/")).run(); - } -} From 76dcbe605a08904ccf7f4b93bc34745b3668e49d Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 7 Aug 2013 21:57:05 +0800 Subject: [PATCH 68/84] update main class --- webmagic-samples/pom.xml | 1 + .../java/us/codecraft/webmagic/main/QuickStarter.java | 8 +++++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index 1a4b702..d8963ff 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -44,6 +44,7 @@ true ./lib/ + us.codecraft.webmagic.main.QuickStarter diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/main/QuickStarter.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/main/QuickStarter.java index 2046fcb..e40b371 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/main/QuickStarter.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/main/QuickStarter.java @@ -6,6 +6,7 @@ import us.codecraft.webmagic.model.samples.IteyeBlog; import us.codecraft.webmagic.model.samples.News163; import us.codecraft.webmagic.model.samples.OschinaBlog; import us.codecraft.webmagic.pipeline.ConsolePipeline; +import us.codecraft.webmagic.pipeline.PagedPipeline; import java.util.LinkedHashMap; import java.util.Map; @@ -40,18 +41,19 @@ public class QuickStarter { key = null; } } - System.out.println("The demo started and will last 60 seconds..."); + System.out.println("The demo started and will last 20 seconds..."); //Start spider - OOSpider.create(Site.me().addStartUrl(urlMap.get(key)), clazzMap.get(key)).pipeline(new 
ConsolePipeline()).runAsync(); + OOSpider.create(Site.me().addStartUrl(urlMap.get(key)), clazzMap.get(key)).pipeline(new PagedPipeline()).pipeline(new ConsolePipeline()).runAsync(); try { - Thread.sleep(60000); + Thread.sleep(20000); } catch (InterruptedException e) { e.printStackTrace(); } System.out.println("The demo stopped!"); + System.out.println("To more usage, try to customize your own Spider!"); System.exit(0); } } From 194152ab1306a300af7abcf1c8e6a67ec2cab5a6 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 7 Aug 2013 22:03:10 +0800 Subject: [PATCH 69/84] modify class --- .../java/us/codecraft/webmagic/model/samples/IteyeBlog.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/IteyeBlog.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/IteyeBlog.java index 4be2ede..38b6980 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/IteyeBlog.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/IteyeBlog.java @@ -10,7 +10,7 @@ import us.codecraft.webmagic.model.annotation.TargetUrl; * @date: 13-8-2
* Time: 上午7:52
*/ -@TargetUrl("http://dengminhui.iteye.com/blog/*") +@TargetUrl("http://*.iteye.com/blog/*") public class IteyeBlog implements Blog{ @ExtractBy("//title") @@ -28,7 +28,7 @@ public class IteyeBlog implements Blog{ } public static void main(String[] args) { - OOSpider.create(Site.me().addStartUrl("http://dengminhui.iteye.com/blog"), IteyeBlog.class).run(); + OOSpider.create(Site.me().addStartUrl("http://*.iteye.com/blog"), IteyeBlog.class).run(); } public String getTitle() { From c6132e0746123d501c1304e817389f591a5a6ed3 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 7 Aug 2013 22:32:58 +0800 Subject: [PATCH 70/84] some refector --- .../codecraft/webmagic/main/QuickStarter.java | 47 ++++++++++++------- 1 file changed, 29 insertions(+), 18 deletions(-) diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/main/QuickStarter.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/main/QuickStarter.java index e40b371..65940e0 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/main/QuickStarter.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/main/QuickStarter.java @@ -19,17 +19,41 @@ import java.util.Scanner; */ public class QuickStarter { - public static void main(String[] args) { - Map clazzMap = new LinkedHashMap(); + private static Map clazzMap; + + private static Map urlMap; + + private static void init(){ + clazzMap = new LinkedHashMap(); clazzMap.put("1", OschinaBlog.class); clazzMap.put("2", IteyeBlog.class); clazzMap.put("3", News163.class); - Map urlMap = new LinkedHashMap(); + urlMap = new LinkedHashMap(); urlMap.put("1", "http://my.oschina.net/flashsword/blog"); urlMap.put("2", "http://flashsword20.iteye.com/"); urlMap.put("3", "http://news.163.com/"); - Scanner stdin = new Scanner(System.in); + } + + public static void main(String[] args) { + init(); String key = null; + key = readKey(key); + System.out.println("The demo started and will last 20 seconds..."); + //Start spider + 
OOSpider.create(Site.me().addStartUrl(urlMap.get(key)), clazzMap.get(key)).pipeline(new PagedPipeline()).pipeline(new ConsolePipeline()).runAsync(); + + try { + Thread.sleep(20000); + } catch (InterruptedException e) { + e.printStackTrace(); + } + System.out.println("The demo stopped!"); + System.out.println("To more usage, try to customize your own Spider!"); + System.exit(0); + } + + private static String readKey(String key) { + Scanner stdin = new Scanner(System.in); System.out.println("Choose a Spider demo:"); for (Map.Entry classEntry : clazzMap.entrySet()) { System.out.println(classEntry.getKey()+"\t" + classEntry.getValue() + "\t" + urlMap.get(classEntry.getKey())); @@ -41,19 +65,6 @@ public class QuickStarter { key = null; } } - System.out.println("The demo started and will last 20 seconds..."); - - //Start spider - OOSpider.create(Site.me().addStartUrl(urlMap.get(key)), clazzMap.get(key)).pipeline(new PagedPipeline()).pipeline(new ConsolePipeline()).runAsync(); - - - try { - Thread.sleep(20000); - } catch (InterruptedException e) { - e.printStackTrace(); - } - System.out.println("The demo stopped!"); - System.out.println("To more usage, try to customize your own Spider!"); - System.exit(0); + return key; } } From 268bd8d0c4b1e2385e2d5f97749869c06792122c Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 7 Aug 2013 23:04:10 +0800 Subject: [PATCH 71/84] remove saxon to extension --- webmagic-core/pom.xml | 5 --- .../us/codecraft/webmagic/selector/Html.java | 6 --- .../webmagic/selector/PlainText.java | 5 --- .../webmagic/selector/Selectable.java | 8 ---- .../webmagic/selector/SelectorFactory.java | 4 -- .../webmagic/selector/SaxonTest.java | 45 ------------------- webmagic-extension/pom.xml | 4 ++ .../webmagic/selector/Xpath2Selector.java | 0 .../webmagic/selector/XpathSelectorTest.java | 36 --------------- 9 files changed, 4 insertions(+), 109 deletions(-) delete mode 100644 webmagic-core/src/test/java/us/codecraft/webmagic/selector/SaxonTest.java 
rename {webmagic-core => webmagic-extension}/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java (100%) rename {webmagic-core => webmagic-extension}/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java (98%) diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index d2c48b2..cf42d2a 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -27,11 +27,6 @@ commons-lang3
- - net.sf.saxon - Saxon-HE - - log4j log4j diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java index 79d62a0..114eef9 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java @@ -63,12 +63,6 @@ public class Html extends PlainText { return selectList(xpathSelector, strings); } - @Override - public Selectable xpath2(String xpath) { - Xpath2Selector xpathSelector = SelectorFactory.getInstatnce().newXpath2Selector(xpath); - return selectList(xpathSelector, strings); - } - @Override public Selectable $(String selector) { CssSelector cssSelector = new CssSelector(selector); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java index 4fff6da..d06a531 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java @@ -34,11 +34,6 @@ public class PlainText implements Selectable { throw new UnsupportedOperationException(); } - @Override - public Selectable xpath2(String xpath) { - throw new UnsupportedOperationException(); - } - @Override public Selectable $(String selector) { throw new UnsupportedOperationException(); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java index cea501d..42f3d10 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java @@ -18,14 +18,6 @@ public interface Selectable { */ public Selectable xpath(String xpath); - /** - * select list with xpath 2.0 syntax - * - * @param xpath - * @return new Selectable after extract - */ - 
public Selectable xpath2(String xpath); - /** * select list with css selector * diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SelectorFactory.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SelectorFactory.java index 9abb1ce..1dd56e0 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SelectorFactory.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SelectorFactory.java @@ -34,10 +34,6 @@ public class SelectorFactory { return newSelector(XpathSelector.class, xpath); } - public Xpath2Selector newXpath2Selector(String xpath) { - return newSelector(Xpath2Selector.class, xpath); - } - public SmartContentSelector newSmartContentSelector(){ return newSelector(SmartContentSelector.class); } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/SaxonTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/SaxonTest.java deleted file mode 100644 index 05a8906..0000000 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/SaxonTest.java +++ /dev/null @@ -1,45 +0,0 @@ -package us.codecraft.webmagic.selector; - -import net.sf.saxon.xpath.XPathFactoryImpl; -import org.htmlcleaner.CleanerProperties; -import org.htmlcleaner.DomSerializer; -import org.htmlcleaner.HtmlCleaner; -import org.htmlcleaner.TagNode; -import org.junit.Test; -import org.w3c.dom.Document; -import org.w3c.dom.NodeList; - -import javax.xml.xpath.XPath; -import javax.xml.xpath.XPathConstants; -import javax.xml.xpath.XPathExpression; -import javax.xml.xpath.XPathFactoryConfigurationException; - -/** - * @author code4crafter@gmail.com
- * @date: 13-8-2
- * Time: 下午5:48
- */ -public class SaxonTest { - - @Test - public void test() throws XPathFactoryConfigurationException { -// System.setProperty("javax.xml.xpath.XPathFactory:" + NamespaceConstant.OBJECT_MODEL_SAXON, "net.sf.saxon.xpath.XPathFactoryImpl"); -// XPathFactory xpf = XPathFactory.newInstance(NamespaceConstant.OBJECT_MODEL_SAXON); - String xml = "#BBB##CCC##DDD#"; - try { - HtmlCleaner htmlCleaner = new HtmlCleaner(); - TagNode tagNode = htmlCleaner.clean(""); - Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode); - - javax.xml.xpath.XPathFactory factory = XPathFactoryImpl.newInstance(); - XPath xpath = factory.newXPath(); - XPathExpression expr = xpath.compile("//a[matches(.,'#...#')]"); - - Object result = expr.evaluate(document, XPathConstants.NODESET); - NodeList nodes = (NodeList) result; - System.out.println(nodes); - } catch (Exception e) { - e.printStackTrace(); - } - } -} diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index 63034f2..843c2c3 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -27,6 +27,10 @@ webmagic-core ${project.version}
+ + net.sf.saxon + Saxon-HE + junit junit diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java similarity index 100% rename from webmagic-core/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java rename to webmagic-extension/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java similarity index 98% rename from webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java rename to webmagic-extension/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java index 2f663c9..9f32a8f 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java @@ -1380,42 +1380,6 @@ public class XpathSelectorTest { System.out.println(xpathSelector.select(text)); } - //http://sourceforge.net/mailarchive/forum.php?thread_name=4EA92A8A.6080202%40saxonica.com&forum_name=saxon-help - @Test - public void testSaxon() { - String text = "

眉山:扎实推进农业农村工作 促农持续增收
\n" + - "2013-07-31 23:29:45   来源:眉山网      责任编辑:张斯炜

"; - try { - HtmlCleaner htmlCleaner = new HtmlCleaner(); - TagNode tagNode = htmlCleaner.clean(text); - Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode); - XPathEvaluator xPathEvaluator = new XPathEvaluator(); - xPathEvaluator.setNamespaceContext(new NamespaceContextImpl(new NamespaceResolver() { - - - @Override - public String getURIForPrefix(String s, boolean b) { - return NamespaceConstant.FN; - } - - @Override - public Iterator iteratePrefixes() { - return Collections.singletonList("fn").iterator(); - } - })); - XPathExpression expr = xPathEvaluator.compile("fn:substring-before(//h1,'\n')"); - Object result = expr.evaluate(document, XPathConstants.STRING); - Assert.assertNotNull(result); - } catch (Exception e) { - e.printStackTrace(); - } - Xpath2Selector xpath2Selector = new Xpath2Selector("fn:substring-before(//h1,'\n')"); - String select = xpath2Selector.select(text); - Assert.assertNotNull(select); - Assert.assertNotNull(xpath2Selector.selectList(text)); - - } - @Test public void testXpath2Selector() { Xpath2Selector xpath2Selector = new Xpath2Selector("//a/@href"); From 521fbad9871f805796f0e19c29a0752b6e66b72e Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 7 Aug 2013 23:21:28 +0800 Subject: [PATCH 72/84] move xpath2.0 support to seperate package --- pom.xml | 1 + webmagic-extension/pom.xml | 4 --- .../webmagic/model/PageModelExtractor.java | 26 +++++----------- .../webmagic/model/annotation/ExtractBy.java | 4 +-- .../webmagic/model/annotation/ExtractBy2.java | 5 ++-- .../webmagic/model/annotation/ExtractBy3.java | 4 +-- .../model/annotation/ExtractByRaw.java | 4 +-- webmagic-saxon/README.md | 3 ++ webmagic-saxon/pom.xml | 30 +++++++++++++++++++ .../webmagic/selector/Xpath2Selector.java | 0 .../webmagic/selector/XpathSelectorTest.java | 17 ----------- 11 files changed, 50 insertions(+), 48 deletions(-) create mode 100644 webmagic-saxon/README.md create mode 100644 webmagic-saxon/pom.xml rename {webmagic-extension 
=> webmagic-saxon}/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java (100%) rename {webmagic-extension => webmagic-saxon}/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java (99%) diff --git a/pom.xml b/pom.xml index 1497bb0..cb354e2 100644 --- a/pom.xml +++ b/pom.xml @@ -14,6 +14,7 @@ webmagic-samples/ webmagic-selenium/ webmagic-lucene/ + webmagic-saxon/
diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index 843c2c3..63034f2 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -27,10 +27,6 @@ webmagic-core ${project.version} - - net.sf.saxon - Saxon-HE - junit junit diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java index 445bdd9..158e74d 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java @@ -110,11 +110,8 @@ class PageModelExtractor { case XPath: selector = new XpathSelector(value); break; - case XPath2: - selector = new Xpath2Selector(value); - break; default: - selector = new Xpath2Selector(value); + selector = new XpathSelector(value); } fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.Html, extractBy.notNull(), extractBy.multi()); Method setterMethod = getSetterMethod(clazz, field); @@ -140,11 +137,8 @@ class PageModelExtractor { case XPath: selector = new XpathSelector(value); break; - case XPath2: - selector = new Xpath2Selector(value); - break; default: - selector = new Xpath2Selector(value); + selector = new XpathSelector(value); } fieldExtractor.setSelector(new AndSelector(fieldExtractor.getSelector(), selector)); } @@ -165,11 +159,8 @@ class PageModelExtractor { case XPath: selector = new XpathSelector(value); break; - case XPath2: - selector = new Xpath2Selector(value); - break; default: - selector = new Xpath2Selector(value); + selector = new XpathSelector(value); } fieldExtractor.setSelector(new AndSelector(fieldExtractor.getSelector(), selector)); } @@ -191,11 +182,8 @@ class PageModelExtractor { case XPath: selector = new XpathSelector(value); break; - case XPath2: - selector = new Xpath2Selector(value); - break; default: - selector = new Xpath2Selector(value); + 
selector = new XpathSelector(value); } fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.RawHtml, extractByRaw.notNull(), extractByRaw.multi()); Method setterMethod = getSetterMethod(clazz, field); @@ -228,7 +216,7 @@ class PageModelExtractor { targetUrlPatterns.add(Pattern.compile("(" + s.replace(".", "\\.").replace("*", "[^\"'#]*") + ")")); } if (!targetUrl.sourceRegion().equals("")) { - targetUrlRegionSelector = new Xpath2Selector(targetUrl.sourceRegion()); + targetUrlRegionSelector = new XpathSelector(targetUrl.sourceRegion()); } } annotation = clazz.getAnnotation(HelpUrl.class); @@ -239,13 +227,13 @@ class PageModelExtractor { helpUrlPatterns.add(Pattern.compile("(" + s.replace(".", "\\.").replace("*", "[^\"'#]*") + ")")); } if (!helpUrl.sourceRegion().equals("")) { - helpUrlRegionSelector = new Xpath2Selector(helpUrl.sourceRegion()); + helpUrlRegionSelector = new XpathSelector(helpUrl.sourceRegion()); } } annotation = clazz.getAnnotation(ExtractBy.class); if (annotation != null) { ExtractBy extractBy = (ExtractBy) annotation; - extractor = new Extractor(new Xpath2Selector(extractBy.value()), Extractor.Source.Html, extractBy.notNull(), extractBy.multi()); + extractor = new Extractor(new XpathSelector(extractBy.value()), Extractor.Source.Html, extractBy.notNull(), extractBy.multi()); } } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy.java index af8946a..2fcdb82 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy.java @@ -16,9 +16,9 @@ public @interface ExtractBy { String value(); - public enum Type {XPath2, XPath, Regex, Css} + public enum Type {XPath, Regex, Css} - Type type() default Type.XPath2; + Type type() default Type.XPath; boolean notNull() default true; diff 
--git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy2.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy2.java index f68b7d6..ad720b3 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy2.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy2.java @@ -6,6 +6,7 @@ import java.lang.annotation.Target; /** * 定义类或者字段的抽取规则。
+ * * @author code4crafter@gmail.com
* @date: 13-8-1
* Time: 下午8:40
@@ -16,8 +17,8 @@ public @interface ExtractBy2 { String value(); - public enum Type {XPath2, XPath, Regex, Css} + public enum Type {XPath, Regex, Css} - Type type() default Type.XPath2; + Type type() default Type.XPath; } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy3.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy3.java index f3212a6..023360e 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy3.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy3.java @@ -16,8 +16,8 @@ public @interface ExtractBy3 { String value(); - public enum Type {XPath2, XPath, Regex, Css} + public enum Type { XPath, Regex, Css} - Type type() default Type.XPath2; + Type type() default Type.XPath; } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByRaw.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByRaw.java index 9692732..1bd3da1 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByRaw.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByRaw.java @@ -16,9 +16,9 @@ public @interface ExtractByRaw { String value(); - public enum Type {XPath2, XPath, Regex, Css} + public enum Type {XPath, Regex, Css} - Type type() default Type.XPath2; + Type type() default Type.XPath; boolean notNull() default true; diff --git a/webmagic-saxon/README.md b/webmagic-saxon/README.md new file mode 100644 index 0000000..0471c68 --- /dev/null +++ b/webmagic-saxon/README.md @@ -0,0 +1,3 @@ +webmagic-extension +------- +webmagic的扩展模块,依赖Saxon进行xpath2.0解析支持。Saxon依赖包太大,不作为默认模块引入。 \ No newline at end of file diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml new file mode 100644 index 0000000..a2db768 --- /dev/null +++ b/webmagic-saxon/pom.xml @@ -0,0 +1,30 @@ + + + + us.codecraft + 
webmagic + 0.2.0 + + 4.0.0 + + webmagic-saxon + + + + us.codecraft + webmagic-core + ${project.version} + + + net.sf.saxon + Saxon-HE + + + junit + junit + + + + \ No newline at end of file diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java b/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java similarity index 100% rename from webmagic-extension/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java rename to webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java b/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java similarity index 99% rename from webmagic-extension/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java rename to webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java index 9f32a8f..b623040 100644 --- a/webmagic-extension/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java +++ b/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java @@ -1,25 +1,8 @@ package us.codecraft.webmagic.selector; -import net.sf.saxon.Configuration; -import net.sf.saxon.lib.NamespaceConstant; -import net.sf.saxon.om.NamespaceResolver; -import net.sf.saxon.pull.NamespaceContextImpl; -import net.sf.saxon.xpath.JAXPXPathStaticContext; -import net.sf.saxon.xpath.XPathEvaluator; -import net.sf.saxon.xpath.XPathFactoryImpl; -import org.htmlcleaner.CleanerProperties; -import org.htmlcleaner.DomSerializer; -import org.htmlcleaner.HtmlCleaner; -import org.htmlcleaner.TagNode; import org.junit.Assert; import org.junit.Ignore; import org.junit.Test; -import org.w3c.dom.Document; -import org.w3c.dom.NodeList; - -import javax.xml.xpath.*; -import java.util.Collections; -import java.util.Iterator; /** * @author code4crafter@gmail.com
Date: 13-4-21 Time: 上午10:06 From 48fd0033942433d89c3a838b491c88fbc3971bcb Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 7 Aug 2013 23:32:22 +0800 Subject: [PATCH 73/84] remove freemarker dep --- webmagic-samples/pom.xml | 5 -- .../webmagic/pipeline/FreemarkerPipeline.java | 57 ------------------- 2 files changed, 62 deletions(-) delete mode 100644 webmagic-samples/src/main/java/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index d8963ff..9d00d2f 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -22,11 +22,6 @@ webmagic-extension ${project.version}
- - org.freemarker - freemarker - 2.3.15 - junit junit diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java deleted file mode 100644 index 3742062..0000000 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java +++ /dev/null @@ -1,57 +0,0 @@ -package us.codecraft.webmagic.pipeline; - -import freemarker.template.Configuration; -import freemarker.template.Template; -import freemarker.template.TemplateException; -import org.apache.commons.codec.digest.DigestUtils; -import us.codecraft.webmagic.ResultItems; -import us.codecraft.webmagic.Task; - -import java.io.File; -import java.io.FileWriter; -import java.io.IOException; -import java.io.PrintWriter; - -/** - * @author code4crafter@gmail.com
- * Date: 13-6-8 - * Time: 下午9:00 - */ -public class FreemarkerPipeline implements Pipeline { - - private Configuration configuration; - - private Template template; - - private String path = "/data/temp/webmagic/ftl/"; - - public FreemarkerPipeline(String template, String path) throws IOException { - configuration = new Configuration(); - configuration.setDirectoryForTemplateLoading(new File(this.getClass().getClassLoader().getResource("ftl/").getFile())); - this.template = configuration.getTemplate(template); - this.path = path; - new File(path); - } - - public FreemarkerPipeline(String template) throws IOException { - this(template, "/data/temp/webmagic/ftl/"); - } - - - @Override - public void process(ResultItems resultItems, Task task) { - String path = this.path + "" + task.getUUID() + "/"; - File file = new File(path); - if (!file.exists()) { - file.mkdirs(); - } - try { - PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".html")); - template.process(resultItems.getAll(), printWriter); - printWriter.close(); - } catch (TemplateException e) { - } catch (IOException e) { - e.printStackTrace(); - } - } -} From b713497a6b288d67960b77196fdef66722ec9d45 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 7 Aug 2013 23:33:10 +0800 Subject: [PATCH 74/84] remove freemarker --- .../processor/DiandianProcessorTest.java | 37 ------------------- 1 file changed, 37 deletions(-) delete mode 100644 webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiandianProcessorTest.java diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiandianProcessorTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiandianProcessorTest.java deleted file mode 100644 index cf587f1..0000000 --- a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiandianProcessorTest.java +++ /dev/null @@ -1,37 +0,0 @@ -package us.codecraft.webmagic.processor; - -import 
org.junit.Ignore; -import org.junit.Test; -import us.codecraft.webmagic.Spider; -import us.codecraft.webmagic.pipeline.ConsolePipeline; -import us.codecraft.webmagic.pipeline.FreemarkerPipeline; -import us.codecraft.webmagic.samples.DiandianBlogProcessor; -import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler; - -import java.io.IOException; - -/** - * @author code4crafter@gmail.com
- * Date: 13-6-9 - * Time: 上午8:02 - */ -public class DiandianProcessorTest { - - @Ignore - @Test - public void test() throws IOException { - DiandianBlogProcessor diaoyuwengProcessor = new DiandianBlogProcessor(); - //pipeline是抓取结束后的处理 - //ftl文件放到classpath:ftl/文件夹下 - //默认放到/data/temp/webmagic/ftl/[domain]目录下 - FreemarkerPipeline pipeline = new FreemarkerPipeline("wordpress.ftl"); - //Spider.me()是简化写法,其实就是new一个啦 - //Spider.pipeline()设定一个pipeline,支持链式调用 - //ConsolePipeline输出结果到控制台 - //FileCacheQueueSchedular保存url,支持断点续传,临时文件输出到/data/temp/webmagic/cache目录 - //Spider.run()执行 - - Spider.create(diaoyuwengProcessor).pipeline(new ConsolePipeline()).pipeline(pipeline).scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")). - run(); - } -} From f76def231df9884f25a3a40adfb49423943da74c Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 7 Aug 2013 23:36:48 +0800 Subject: [PATCH 75/84] remove freemarker dp --- .../webmagic/processor/DiaoyuwengProcessorTest.java | 4 ++-- .../webmagic/processor/SinablogProcessorTest.java | 7 +++---- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java index 69a535c..0371eb2 100644 --- a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java +++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java @@ -4,7 +4,7 @@ import org.junit.Ignore; import org.junit.Test; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.pipeline.FilePipeline; -import us.codecraft.webmagic.pipeline.FreemarkerPipeline; +import us.codecraft.webmagic.pipeline.JsonFilePipeline; import us.codecraft.webmagic.samples.DiaoyuwengProcessor; import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler; @@ -21,7 +21,7 @@ public class DiaoyuwengProcessorTest { @Test public void test() 
throws IOException { DiaoyuwengProcessor diaoyuwengProcessor = new DiaoyuwengProcessor(); - FreemarkerPipeline pipeline = new FreemarkerPipeline("wordpress.ftl"); + JsonFilePipeline pipeline = new JsonFilePipeline("/data/webmagic/"); Spider.create(diaoyuwengProcessor).pipeline(new FilePipeline()).pipeline(pipeline).scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")). run(); } diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/SinablogProcessorTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/SinablogProcessorTest.java index a44fe35..026f8d5 100644 --- a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/SinablogProcessorTest.java +++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/SinablogProcessorTest.java @@ -4,7 +4,7 @@ import org.junit.Ignore; import org.junit.Test; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.pipeline.FilePipeline; -import us.codecraft.webmagic.pipeline.FreemarkerPipeline; +import us.codecraft.webmagic.pipeline.JsonFilePipeline; import us.codecraft.webmagic.samples.SinaBlogProcesser; import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler; @@ -22,9 +22,8 @@ public class SinablogProcessorTest { public void test() throws IOException { SinaBlogProcesser sinaBlogProcesser = new SinaBlogProcesser(); //pipeline是抓取结束后的处理 - //ftl文件放到classpath:ftl/文件夹下 - //默认放到/data/temp/webmagic/ftl/[domain]目录下 - FreemarkerPipeline pipeline = new FreemarkerPipeline("wordpress.ftl"); + //默认放到/data/webmagic/ftl/[domain]目录下 + JsonFilePipeline pipeline = new JsonFilePipeline("/data/webmagic/"); //Spider.me()是简化写法,其实就是new一个啦 //Spider.pipeline()设定一个pipeline,支持链式调用 //ConsolePipeline输出结果到控制台 From c78de7bcbb2e85cd5df1bcff63af3e3a5236fa3f Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Thu, 8 Aug 2013 13:10:05 +0800 Subject: [PATCH 76/84] update notnull default to false --- .../java/us/codecraft/webmagic/model/annotation/ExtractBy.java | 2 +- 
.../us/codecraft/webmagic/model/annotation/ExtractByRaw.java | 2 +- .../us/codecraft/webmagic/model/annotation/ExtractByUrl.java | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy.java index 2fcdb82..bcd5706 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy.java @@ -20,7 +20,7 @@ public @interface ExtractBy { Type type() default Type.XPath; - boolean notNull() default true; + boolean notNull() default false; boolean multi() default false; diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByRaw.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByRaw.java index 1bd3da1..f774d3e 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByRaw.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByRaw.java @@ -20,7 +20,7 @@ public @interface ExtractByRaw { Type type() default Type.XPath; - boolean notNull() default true; + boolean notNull() default false; boolean multi() default false; diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByUrl.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByUrl.java index 52f4a57..d57ec6e 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByUrl.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByUrl.java @@ -16,7 +16,7 @@ public @interface ExtractByUrl{ String value() default ""; - boolean notNull() default true; + boolean notNull() default false; boolean multi() default false; From 
f41c8ef7554a8eecf1238815235dbb240d3ccb0f Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Thu, 8 Aug 2013 13:53:04 +0800 Subject: [PATCH 77/84] remove uncore package from pom --- pom.xml | 3 --- 1 file changed, 3 deletions(-) diff --git a/pom.xml b/pom.xml index cb354e2..cacce99 100644 --- a/pom.xml +++ b/pom.xml @@ -12,9 +12,6 @@ webmagic-core webmagic-extension/ webmagic-samples/ - webmagic-selenium/ - webmagic-lucene/ - webmagic-saxon/ From e04d6253fa3a61fc8846b7d4b154704cbf4ea245 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 9 Aug 2013 00:03:14 +0800 Subject: [PATCH 78/84] add manual --- webmagic manual.md | 246 ++++++++++++++++++ .../webmagic/model/samples/OschinaBlog.java | 27 +- .../samples/OschinaBlogPageProcesser.java | 22 +- 3 files changed, 270 insertions(+), 25 deletions(-) create mode 100644 webmagic manual.md diff --git a/webmagic manual.md b/webmagic manual.md new file mode 100644 index 0000000..e275f73 --- /dev/null +++ b/webmagic manual.md @@ -0,0 +1,246 @@ +webmagic使用手册 +------ +>webmagic是一个开源的Java垂直爬虫框架,目标是简化爬虫的开发流程,让开发者专注于逻辑功能的开发。webmagic的核心非常简单,但是覆盖爬虫的整个流程,也是很好的学习爬虫开发的材料。 + +>web爬虫是一种技术,webmagic致力于将这种技术的实现成本降低,但是出于对资源提供者的尊重,webmagic不会做反封锁的事情,包括:验证码破解、代理切换、自动登录、抓取静态资源等。 + +>作者黄亿华([code4crafter@gmail.com](code4crafter@gmail.com))目前就职于大众点评,曾经在前公司进行过一年的垂直爬虫的开发。webmagic就是为了解决爬虫开发的一些重复劳动而产生的框架。有使用不便或者问题,欢迎在github[提交issue](https://github.com/code4craft/webmagic/issues),或者在[oschina讨论模块](http://www.oschina.net/question)提问。 + +>webmagic的架构和设计参考了以下两个项目,感谢以下两个项目的作者: + +>python爬虫 **scrapy** [https://github.com/scrapy/scrapy](https://github.com/scrapy/scrapy) + +>Java爬虫 **Spiderman** [https://gitcafe.com/laiweiwei/Spiderman](https://gitcafe.com/laiweiwei/Spiderman) + +--------- + +## 快速开始 + +### 使用maven + +webmagic使用maven管理依赖,你可以直接下载webmagic源码进行编译: + + git clone https://github.com/code4craft/webmagic.git + mvn clean install + +安装后,在项目中添加对应的依赖即可使用webmagic: + + + us.codecraft + webmagic-core + 0.2.0 + + + us.codecraft + webmagic-extension + 0.2.0 + 
+ +#### 项目结构 + +webmagic主要包括两个包: + +* **webmagic-core** + + webmagic核心部分,只包含爬虫基本模块和基本抽取器。webmagic-core的目标是成为网页爬虫的一个教科书般的实现。 + +* **webmagic-extension** + + webmagic的扩展模块,提供一些更方便的编写爬虫的工具。包括注解格式定义爬虫、JSON、分布式等支持。 + +webmagic还包含两个可用的扩展包,因为这两个包都依赖了比较重量级的工具,所以从主要包中抽离出来: + +* **webmagic-saxon** + + webmagic与Saxon结合的模块。Saxon是一个XPath、XSLT的解析工具,webmagic依赖Saxon来进行XPath2.0语法解析支持。 + +* **webmagic-selenium** + + webmagic与Selenium结合的模块。Selenium是一个模拟浏览器进行页面渲染的工具,webmagic依赖Selenium进行动态页面的抓取。 + +在项目中,你可以根据需要依赖不同的包。 + +### 不使用maven + +不使用maven的用户,可以下载这个二进制打包版本(感谢[oschina](http://www.oschina.net/)): + + git clone http://git.oschina.net/flashsword20/webmagic-bin.git + +在`bin/lib`目录下,有项目依赖的所有jar包,直接在IDE里import即可。 + +### 第一个爬虫 + +#### 定制PageProcessor + +PageProcessor是webmagic-core的一部分,定制一个PageProcessor即可实现自己的爬虫逻辑。以下是抓取osc博客的一段代码: + + public class OschinaBlogPageProcesser implements PageProcessor { + + private Site site = Site.me().setDomain("my.oschina.net") + .addStartUrl("http://my.oschina.net/flashsword/blog"); + + @Override + public void process(Page page) { + List links = page.getHtml().links().regex("http://my\\.oschina\\.net/flashsword/blog/\\d+").all(); + page.addTargetRequests(links); + page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1").toString()); + page.putField("content", page.getHtml().$("div.content").toString()); + page.putField("tags",page.getHtml().xpath("//div[@class='BlogTags']/a/text()").all()); + } + + @Override + public Site getSite() { + return site; + + } + + public static void main(String[] args) { + Spider.create(new OschinaBlogPageProcesser()) + .pipeline(new ConsolePipeline()).run(); + } + } + +这里通过page.addTargetRequests()方法来增加要抓取的URL,并通过page.putField()来保存抽取结果。page.getHtml().xpath()则是按照某个规则对结果进行抽取,这里抽取支持链式调用。调用结束后,toString()表示转化为单个String,all()则转化为一个String列表。 + +Spider是爬虫的入口类。Pipeline是结果输出和持久化的接口,这里ConsolePipeline表示结果输出到控制台。 + +执行这个main方法,即可在控制台看到抓取结果。webmagic默认有3秒抓取间隔,请耐心等待。 + +#### 使用注解 + 
+PageProcessor的方式灵活、强大,但是没有解决两个问题: + +* 对于一个站点,如果想抓取多种格式的URL,那么必须在PageProcessor中写判断逻辑,代码难以管理。 +* 抓取结果没有对应Model,并不符合Java程序开发习惯,与一些框架也无法很好整合。 + +webmagic-extension包括了注解方式编写爬虫的方法,只需基于一个POJO增加注解即可完成一个爬虫。以下仍然是抓取oschina博客的一段代码,功能与OschinaBlogPageProcesser完全相同: + + @TargetUrl("http://my.oschina.net/flashsword/blog/\\d+") + public class OschinaBlog { + + @ExtractBy("//title") + private String title; + + @ExtractBy(value = "div.BlogContent",type = ExtractBy.Type.Css) + private String content; + + @ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true) + private List<String> tags; + + public static void main(String[] args) { + OOSpider.create( + Site.me().addStartUrl("http://my.oschina.net/flashsword/blog"), + new ConsolePageModelPipeline(), OschinaBlog.class).run(); + } + } + +这个例子定义了一个Model类,Model类的字段'title'、'content'、'tags'均为要抽取的属性。这个类在Pipeline里是可以复用的。 + +"TargetUrl"关键词表示要抓取的URL,这里使用了正则表达式,匹配 "http://my.oschina.net/flashsword/blog/150039" 格式的URL。webmagic对正则表达式进行了修改,"."仅表示字符"."而不代表任意字符,而"\*"则代表了".\*",例如"http://\*.oschina.net/\*"代表了oschina所有的二级域名下的URL。 + +"ExtractBy"关键词定义了字段抽取的规则。抽取的规则默认使用[**XPath**](http://www.w3school.com.cn/xpath/),也可以选择使用CSS Selector、正则表达式。使用"ExtractBy"注解提取的字段(设置type),目前只能为String或者`List<String>`类型(multi=true时有效)。"ExtractBy"支持链式抽取,通过增加注解"ExtractBy2"、"ExtractBy3"实现。 + +OOSpider是注解式爬虫的入口,这里调用create()方法将OschinaBlog这个类加入到爬虫的抽取中,这里是可以传入多个类的,OOSpider会根据TargetUrl调用不同的Model进行解析。 + +可以通过定义PageModelPipeline来选择结果输出方式。这里new ConsolePageModelPipeline()是PageModelPipeline的一个实现,会将结果输出到控制台。 + +注解方式其实也是通过一个PageProcessor的实现--ModelPageProcessor完成,因此对webmagic-core代码没有任何影响。 + +## 核心架构解析 + +webmagic-core是爬虫的核心框架。此部分摘自作者的博文 +[webmagic的设计机制及原理-如何开发一个Java爬虫](http://my.oschina.net/flashsword/blog/145796)。 + +### webmagic-core的模块划分 + +webmagic-core参考了scrapy的模块划分,分为Spider(整个爬虫的调度框架)、Downloader(页面下载)、PageProcessor(链接提取和页面分析)、Scheduler(URL管理)、Pipeline(离线分析和持久化)几部分。只不过scrapy通过middleware实现扩展,而webmagic则通过定义这几个接口,并将其不同的实现注入主框架类Spider来实现扩展。 + 
+![image](http://code4craft.github.io/images/posts/webmagic.png) + +#### Spider类(核心调度) + +Spider是爬虫的入口类,Spider的接口调用采用了链式的API设计,其他功能全部通过接口注入Spider实现,下面是启动一个比较复杂的Spider的例子。 + + + Spider.create(sinaBlogProcessor) + .scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")) + .pipeline(new FilePipeline()) + .thread(10).run(); + + +Spider的核心处理流程非常简单,代码如下: + + + private void processRequest(Request request) { + Page page = downloader.download(request, this); + if (page == null) { + sleep(site.getSleepTime()); + return; + } + pageProcessor.process(page); + addRequest(page); + for (Pipeline pipeline : pipelines) { + pipeline.process(page, this); + } + sleep(site.getSleepTime()); + } + +#### Downloader(页面下载) + +大部分爬虫都是通过模拟http请求,接收并分析响应来完成。这方面,JDK自带的**HttpURLConnection**可以满足最简单的需要,而**Apache HttpClient**(4.0后整合到HttpComponents项目中)则是开发复杂爬虫的不二之选。它支持自定义HTTP头(对于爬虫比较有用的就是User-agent、cookie等)、自动redirect、连接复用、cookie保留、设置代理等诸多强大的功能。 + +webmagic使用了HttpClient 4.2,并封装到了**HttpClientDownloader**。学习HttpClient的使用对于构建高性能爬虫是非常有帮助的,官方的[Tutorial](http://hc.apache.org/httpcomponents-client-ga/tutorial/html/)就是很好的学习资料。目前webmagic对HttpClient的使用仍在初步阶段,不过对于一般抓取任务,已经够用了。 + +对于一些Javascript动态加载的网页,仅仅使用http模拟下载工具,并不能取到页面的内容。这方面的思路有两种:一种是抽丝剥茧,分析js的逻辑,再用爬虫去重现它(比如在网页中提取关键数据,再用这些数据去构造Ajax请求,最后直接从响应体获取想要的数据); +另一种就是:内置一个浏览器,直接获取最后加载完的页面。这方面,js可以使用**PhantomJS**,它内部集成了webkit。而Java可以使用**Selenium**,这是一个非常强大的浏览器模拟工具。webmagic-selenium包中整合了Selenium到`SeleniumDownloader`,可以直接进行动态加载页面的抓取。 + +#### PageProcessor(页面分析及链接抽取) + +页面分析可以说是垂直爬虫最复杂的一部分,在webmagic里,PageProcessor是定制爬虫的核心。通过编写一个实现PageProcessor接口的类,就可以定制一个自己的爬虫。 + +**Selector**是webmagic为了简化页面抽取开发的独立模块,是webmagic的主要着力点。这里整合了CSS Selector、XPath和正则表达式,并可以进行链式的抽取,很容易就实现强大的功能。即使你使用自己开发的爬虫工具,webmagic的Selector仍然值得一试。 + +例如,我已经下载了一个页面,现在要抽取某个区域的所有包含"blog"的链接,我可以这样写: + + + //content是用别的爬虫工具抽取到的正文 + String content = "blabla"; + List<String> links = Html.create(content) + .$("div.title") //css 选择,Java里虽然很少有$符号出现,不过貌似$作为方法名是合法的 + .xpath("//@href") //提取链接 + .regex(".*blog.*") 
//正则匹配过滤 + .all(); //转换为string列表 + + +另外,webmagic的抓取链接需要显式地调用`Page.addTargetRequests()`去添加,这也是为了灵活性考虑的(很多时候,下一步的URL不是单纯的页面href链接,可能会根据页面模块进行抽取,甚至可能是自己拼凑出来的)。 + +webmagic包括一个对于页面正文的自动抽取的功能**SmartContentSelector**。相信用过Evernote Clearly都会对其自动抽取正文的技术印象深刻。这个技术又叫**Readability**。当然webmagic对Readability的实现还比较粗略,但是仍有一些学习价值。 + +基于Saxon,webmagic提供了XPath2.0语法的支持。XPath2.0语法支持内部函数、逻辑控制等,是一门完整的语言,如果你熟悉语法,倒是不妨一试(需要引入webmagic-saxon包)。 + +#### Scheduler(URL管理) + + +URL管理的问题可大可小。对于小规模的抓取,URL管理是很简单的。我们只需要将待抓取URL和已抓取URL分开保存,并进行去重即可。使用JDK内置的集合类型Set、List或者Queue都可以满足需要。如果我们要进行多线程抓取,则可以选择线程安全的容器,例如LinkedBlockingQueue以及ConcurrentHashMap。 + +因为小规模的URL管理非常简单,很多框架都并不将其抽象为一个模块,而是直接融入到代码中。但是实际上,抽象出Scheduler模块,会使得框架的解耦程度上升一个档次,并非常容易进行横向扩展,这也是我从scrapy中学到的。 + +在webmagic的设计中,除了Scheduler模块,其他的处理-从下载、解析到持久化,每个任务都是互相独立的,因此可以通过多个Spider共用一个Scheduler来进行扩展。排除去重的因素,URL管理天生就是一个队列,我们可以很方便的用分布式的队列工具去扩展它,也可以基于mysql、redis或者mongodb这样的存储工具来构造一个队列,这样构建一个多线程乃至分布式的爬虫就轻而易举了。 + +URL去重也是一个比较复杂的问题。如果数据量较少,则使用hash的方式就能很好解决。数据量较大的情况下,可以使用Bloom Filter或者更复杂的方式。 + +webmagic目前有两个Scheduler的实现,**QueueScheduler**是一个简单的内存队列,速度较快,并且是线程安全的,**FileCacheQueueScheduler**则是一个文件队列,它可以用于耗时较长的下载任务,在任务中途停止后,下次执行仍然从中止的URL开始继续爬取。 + +webmagic有一个基于redis的Scheduler实现**RedisScheduler**。通过使用同一台redis服务器存储URL,webmagic可以很容易的在多机部署,从而达到分布式爬虫的效果。 + + +#### Pipeline-离线处理和持久化 + + +Pipeline其实也是容易被忽略的一部分。大家都知道持久化的重要性,但是很多框架都选择直接在页面抽取的时候将持久化一起完成,例如crawler4j。但是Pipeline真正的好处是,将页面的在线分析和离线处理拆分开来,可以在一些线程里进行下载,另一些线程里进行处理和持久化。 + +你可以扩展Pipeline来实现抽取结果的持久化,将其保存到你想要保存的地方-本地文件、数据库、mongodb等等。Pipeline的处理目前还是在线的,但是修改为离线的也并不困难。 + +webmagic目前只支持控制台输出和文件持久化,但是持久化到数据库也是很容易的。这里不妨看一下[webmagic结合JFinal持久化到数据库的一段代码](http://www.oschina.net/code/snippet_190591_23456)。因为JFinal目前还不支持maven,所以并没有放到webmagic-samples里来。 + +## 示例 \ No newline at end of file diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java index 6409492..95a7891 100644 --- 
a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java @@ -1,10 +1,12 @@ package us.codecraft.webmagic.model.samples; import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.model.annotation.ExtractBy; +import us.codecraft.webmagic.model.ConsolePageModelPipeline; import us.codecraft.webmagic.model.OOSpider; +import us.codecraft.webmagic.model.annotation.ExtractBy; import us.codecraft.webmagic.model.annotation.TargetUrl; -import us.codecraft.webmagic.pipeline.JsonFilePipeline; + +import java.util.List; /** * @author code4crafter@gmail.com
@@ -12,7 +14,7 @@ import us.codecraft.webmagic.pipeline.JsonFilePipeline; * Time: 上午7:52
*/ @TargetUrl("http://my.oschina.net/flashsword/blog/\\d+") -public class OschinaBlog implements Blog{ +public class OschinaBlog { @ExtractBy("//title") private String title; @@ -20,23 +22,12 @@ public class OschinaBlog implements Blog{ @ExtractBy(value = "div.BlogContent",type = ExtractBy.Type.Css) private String content; - @Override - public String toString() { - return "OschinaBlog{" + - "title='" + title + '\'' + - ", content='" + content + '\'' + - '}'; - } + @ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true) + private List tags; public static void main(String[] args) { - OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog"), OschinaBlog.class).pipeline(new JsonFilePipeline()).run(); + OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog") + ,new ConsolePageModelPipeline(), OschinaBlog.class).run(); } - public String getTitle() { - return title; - } - - public String getContent() { - return content; - } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java index f2dbe8e..8ba7063 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java @@ -2,6 +2,8 @@ package us.codecraft.webmagic.samples; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.pipeline.ConsolePipeline; import us.codecraft.webmagic.processor.PageProcessor; import java.util.List; @@ -13,18 +15,24 @@ import java.util.List; */ public class OschinaBlogPageProcesser implements PageProcessor { + private Site site = Site.me().setDomain("my.oschina.net").addStartUrl("http://my.oschina.net/flashsword/blog"); + @Override public void process(Page page) { - List strings = 
page.getHtml().links().regex("(http://my\\.oschina\\.net)").all(); - page.addTargetRequests(strings); - page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1")); - page.putField("content", page.getHtml().smartContent()); - page.putField("author", page.getUrl().regex("my\\.oschina\\.net/(\\w+)/blog/\\d+")); + List links = page.getHtml().links().regex("http://my\\.oschina\\.net/flashsword/blog/\\d+").all(); + page.addTargetRequests(links); + page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1").toString()); + page.putField("content", page.getHtml().$("div.content").toString()); + page.putField("tags",page.getHtml().xpath("//div[@class='BlogTags']/a/text()").all()); } @Override public Site getSite() { - return Site.me().setDomain("my.oschina.net").addStartUrl("http://www.oschina.net/"). - setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); + return site; + + } + + public static void main(String[] args) { + Spider.create(new OschinaBlogPageProcesser()).pipeline(new ConsolePipeline()).run(); } } From 31a84a1998d8f30897e14e5ec599bce057ff06a3 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 9 Aug 2013 00:48:35 +0800 Subject: [PATCH 79/84] update manual --- webmagic manual.md | 86 ++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 71 insertions(+), 15 deletions(-) diff --git a/webmagic manual.md b/webmagic manual.md index e275f73..9d243c0 100644 --- a/webmagic manual.md +++ b/webmagic manual.md @@ -108,11 +108,6 @@ Spider是爬虫的入口类。Pipeline是结果输出和持久化的接口,这 #### 使用注解 -PageProcessor的方式灵活、强大,但是没有解决两个问题: - -* 对于一个站点,如果想抓取多种格式的URL,那么必须在PageProcesser中写判断逻辑,代码难以管理。 -* 抓取结果没有对应Model,并不符合Java程序开发习惯,与一些框架也无法很好整合。 - webmagic-extension包括了注解方式编写爬虫的方法,只需基于一个POJO增加注解即可完成一个爬虫。以下仍然是抓取oschina博客的一段代码,功能与OschinaBlogPageProcesser完全相同: @TargetUrl("http://my.oschina.net/flashsword/blog/\\d+") @@ -136,19 
+131,15 @@ webmagic-extension包括了注解方式编写爬虫的方法,只需基于一 这个例子定义了一个Model类,Model类的字段'title'、'content'、'tags'均为要抽取的属性。这个类在Pipeline里是可以复用的。 -"TargetUrl"关键词表示要抓取的URL,这里使用了正则表达式,匹配 "http://my.oschina.net/flashsword/blog/150039" 格式的URL。webmagic对正则表达式进行了修改,"."仅表示字符"."而不代表任意字符,而"\*"则代表了".\*",例如"http://\*.oschina.net/\*"代表了oschina所有的二级域名下的URL。 +关于注解的使用方式,在后面会专门讲到。 -"ExtractBy"关键词定义了字段抽取的规则。抽取的规则默认使用[**XPath**](http://www.w3school.com.cn/xpath/),也可以选择使用CSS Selector、正则表达式。使用"ExtractBy"注解提取的字段(设置type),目前只能为String或者List类型(multi=true时有效)。"ExtractBy"支持链式抽取,通过增加注解"ExtractBy2"、"ExtractBy3"实现。 - -OOSpider是注解式爬虫的入口,这里调用create()方法将OschinaBlog这个类加入到爬虫的抽取中,这里是可以传入多个类的,OOSpider会根据TargetUrl调用不同的Model进行解析。 - -可以通过定义PageModelPipeline来选择结果输出方式。这里new ConsolePageModelPipeline()是PageModelPipeline的一个实现,会将结果输出到控制台。 - -注解方式其实也是通过一个PageProcessor的实现--ModelPageProcessor完成,因此对webmagic-core代码没有任何影响。 +------ ## 核心架构解析 -webmagic-core是爬虫的核心框架。此部分摘自作者的博文 +webmagic-core是爬虫的核心框架,只包括一个爬虫各功能模块的核心功能。webmagic-core的目标是成为网页爬虫的一个教科书般的实现。 + +此节部分内容摘自作者的博文 [webmagic的设计机制及原理-如何开发一个Java爬虫](http://my.oschina.net/flashsword/blog/145796)。 ### webmagic-core的模块划分 @@ -243,4 +234,69 @@ Pipeline其实也是容易被忽略的一部分。大家都知道持久化的重 webmagic目前只支持控制台输出和文件持久化,但是持久化到数据库也是很容易的。这里不妨看一下[webmagic结合JFinal持久化到数据库的一段代码](http://www.oschina.net/code/snippet_190591_23456)。因为JFinal目前还不支持maven,所以并没有放到webmagic-samples里来。 -## 示例 \ No newline at end of file +------ + +## 注解模块 + +webmagic-extension包括注解模块。为什么会有注解方式? 
+ +因为PageProcessor的方式灵活、强大,但是没有解决两个问题: + +* 对于一个站点,如果想抓取多种格式的URL,那么必须在PageProcesser中写判断逻辑,代码难以管理。 +* 抓取结果没有对应Model,并不符合Java程序开发习惯,与一些框架也无法很好整合。 + +注解的核心是Model类,本身是一个POJO,这个Model类用于传递、保存页面最终抓取结果数据。注解方式直接将抽取与数据绑定,以便于编写和维护。 + +注解方式其实也是通过一个PageProcessor的实现--ModelPageProcessor完成,因此对webmagic-core代码没有任何影响。 + +注解部分包括以下内容: + +* ### TargetUrl + + "TargetUrl"表示这个Model对应要抓取的URL,它包含两层意思:符合这个条件的URL会被加入抓取队列;符合这个条件的URL会被这个Model抓取。TargetUrl可以`sourceRegion`指定提取URL的区域(仅支持XPath)。 + + TargetUrl使用了正则表达式,匹配 "http://my.oschina.net/flashsword/blog/150039" 格式的URL。webmagic对正则表达式进行了修改,"."仅表示字符"."而不代表任意字符,而"\*"则代表了".\*",例如"http://\*.oschina.net/\*"代表了oschina所有的二级域名下的URL。 + + 与TargetUrl相似的还有`HelpUrl`,HelpUrl表示:仅仅抓取该URL用作链接提取,并不对它进行内容抽取。例如博客正文页对应TargetUrl,而列表页则对应HelpUrl。 + +* ### ExtractBy + + * #### 用于字段 + + "ExtractBy"可用于类以及字段。用于字段时,定义了字段抽取的规则。抽取的规则默认使用[**XPath**](http://www.w3school.com.cn/xpath/),也可以选择使用CSS Selector、正则表达式(通过设置type)。 + + ExtractBy还有几个扩展属性。`multi`表示是否抽取列表,当然,设置为multi时,你需要一个List字段去容纳它。`notnull`则表示,此字段不允许为null,若为null则放弃整个对象。 + + * #### 用于类 + "ExtractBy"用于类时,则限定了字段抽取的区域。用于类时仍支持multi,multi则表示一个页面可以抽取到多个对象。 + + * #### ExtractByRaw & ExtractByUrl + + 在类使用"ExtractBy"修饰后,字段的"ExtractBy"使用的是其抽取的结果,如果仍然想要抽取原HTML,可以使用"ExtractByRaw"。与此类似的还有"ExtractByUrl",表示从URL重抽取信息。ExtractByUrl只支持正则表达式。 + + * #### ExtractBy2 ExtractBy3 + + "ExtractBy"、"ExtractByRaw"支持链式抽取,通过增加注解"ExtractBy2"、"ExtractBy3"实现。 + +* ### AfterExtractor + + AfterExtractor接口是对注解方式抽取能力不足的补充。实现AfterExtractor接口后,会在**使用注解方式填充完字段后**调用`afterProcess()`方法,在这个方法中可以直接访问已抽取的字段、补充需要抽取的字段,甚至做一些简单的输出和持久化操作(并不是很建议这么做)。这部分可以参考[webmagic结合JFinal持久化到数据库的一段代码](http://www.oschina.net/code/snippet_190591_23456)。 + +* ### OOSpider + OOSpider是注解式爬虫的入口,这里调用`create()`方法将OschinaBlog这个类加入到爬虫的抽取中,这里是可以传入多个类的,OOSpider会根据TargetUrl调用不同的Model进行解析。 + +* ### PageModelPipeline + 可以通过定义PageModelPipeline来选择结果输出方式。这里new ConsolePageModelPipeline()是PageModelPipeline的一个实现,会将结果输出到控制台。 + +* ### 分页 + 
处理单项数据分页(例如单条新闻多个页面)是爬虫一个比较头疼的问题。webmagic有一个对于分页的实现,通过实现`PagedModel`接口即可。webmagic-samples里有一个抓取网易新闻的类:`us.codecraft.webmagic.model.samples.News163`。关于分页,这里有一篇对于webmagic分页实现的详细说明的文章[关于爬虫实现分页的一些思考](http://my.oschina.net/flashsword/blog/150039)。 + 目前分页功能还没有分布式实现。 + +-------- + +## 分布式 + webmagic-extension中,通过redis来管理URL,达到分布式的效果。具体实现方式只有一个类:`us.codecraft.webmagic.scheduler.RedisScheduler`。 + + + + From 7c0eba5e3f94912874b1953411e3b3ed63c337df Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 9 Aug 2013 00:49:32 +0800 Subject: [PATCH 80/84] update manual --- webmagic manual.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/webmagic manual.md b/webmagic manual.md index 9d243c0..d009a97 100644 --- a/webmagic manual.md +++ b/webmagic manual.md @@ -295,7 +295,8 @@ webmagic-extension包括注解模块。为什么会有注解方式? -------- ## 分布式 - webmagic-extension中,通过redis来管理URL,达到分布式的效果。具体实现方式只有一个类:`us.codecraft.webmagic.scheduler.RedisScheduler`。 + +webmagic-extension中,通过redis来管理URL,达到分布式的效果。具体实现方式只有一个类:`us.codecraft.webmagic.scheduler.RedisScheduler`。 From fcfa2c30c7aee8ebe24d01aa25727efbe33ed1f1 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 9 Aug 2013 20:36:27 +0800 Subject: [PATCH 81/84] complete docs --- .../webmagic/model/ModelPipeline.java | 3 ++- .../webmagic/model/annotation/ExtractBy.java | 25 ++++++++++++++++++- .../webmagic/model/annotation/ExtractBy2.java | 2 +- .../webmagic/model/annotation/ExtractBy3.java | 2 +- .../model/annotation/ExtractByRaw.java | 24 +++++++++++++++++- .../model/annotation/ExtractByUrl.java | 17 +++++++++++++ .../webmagic/model/annotation/HelpUrl.java | 10 ++++++++ .../webmagic/model/annotation/TargetUrl.java | 11 ++++++++ .../webmagic/pipeline/JsonFilePipeline.java | 2 +- .../webmagic/pipeline/PagedPipeline.java | 9 ++++--- 10 files changed, 96 insertions(+), 9 deletions(-) diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPipeline.java 
b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPipeline.java index 84db455..efb724e 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPipeline.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPipeline.java @@ -11,7 +11,8 @@ import java.util.Map; import java.util.concurrent.ConcurrentHashMap; /** - * 基于Pipeline的扩展点。
+ * 基于Pipeline的扩展点,用于实现注解格式的Pipeline。
+ * 与PageModelPipeline是一对多的关系(原谅作者没有更好的名字了)。
* @author code4crafter@gmail.com
* @date: 13-8-2
* Time: 上午10:47
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy.java index bcd5706..168387b 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy.java @@ -6,22 +6,45 @@ import java.lang.annotation.Target; /** * 定义类或者字段的抽取规则。
+ * * @author code4crafter@gmail.com
* @date: 13-8-1
* Time: 下午8:40
*/ @Retention(java.lang.annotation.RetentionPolicy.RUNTIME) -@Target({ElementType.FIELD,ElementType.TYPE}) +@Target({ElementType.FIELD, ElementType.TYPE}) public @interface ExtractBy { + /** + * 抽取规则 + * + * @return 抽取规则 + */ String value(); public enum Type {XPath, Regex, Css} + /** + * 抽取规则类型,支持XPath、Css selector、正则表达式,默认是XPath + * + * @return 抽取规则类型 + */ Type type() default Type.XPath; + /** + * 是否是不能为空的关键字段,若notNull为true,则对应字段抽取不到时,丢弃整个类,默认为false + * + * @return 是否是不能为空的关键字段 + */ boolean notNull() default false; + /** + * 是否抽取多个结果
+ * 用于字段时,需要List来盛放结果
+ * 用于类时,表示单页抽取多个对象
+ * + * @return 是否抽取多个结果 + */ boolean multi() default false; } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy2.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy2.java index ad720b3..99ebd76 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy2.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy2.java @@ -5,7 +5,7 @@ import java.lang.annotation.Retention; import java.lang.annotation.Target; /** - * 定义类或者字段的抽取规则。
+ * 定义类或者字段的抽取规则,只能在ExtractBy、ExtractByRaw之后使用。
* * @author code4crafter@gmail.com
* @date: 13-8-1
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy3.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy3.java index 023360e..77910f8 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy3.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy3.java @@ -5,7 +5,7 @@ import java.lang.annotation.Retention; import java.lang.annotation.Target; /** - * 定义类或者字段的抽取规则。
+ * 定义类或者字段的抽取规则,只能在ExtractBy、ExtractByRaw之后使用。
* @author code4crafter@gmail.com
* @date: 13-8-1
* Time: 下午8:40
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByRaw.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByRaw.java index f774d3e..caa87de 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByRaw.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByRaw.java @@ -6,22 +6,44 @@ import java.lang.annotation.Target; /** * 对于在Class级别就使用过ExtractBy的类,在字段中想抽取全部内容可使用此方法。
+ * * @author code4crafter@gmail.com
* @date: 13-8-1
* Time: 下午8:40
*/ @Retention(java.lang.annotation.RetentionPolicy.RUNTIME) -@Target({ElementType.FIELD,ElementType.TYPE}) +@Target({ElementType.FIELD, ElementType.TYPE}) public @interface ExtractByRaw { + /** + * 抽取规则 + * + * @return 抽取规则 + */ String value(); public enum Type {XPath, Regex, Css} + /** + * 抽取规则类型,支持XPath、Css selector、正则表达式,默认是XPath + * + * @return 抽取规则类型 + */ Type type() default Type.XPath; + /** + * 是否是不能为空的关键字段,若notNull为true,则对应字段抽取不到时,丢弃整个类,默认为false + * + * @return 是否是不能为空的关键字段 + */ boolean notNull() default false; + /** + * 是否抽取多个结果
+ * 需要List来盛放结果
+ * + * @return 是否抽取多个结果 + */ boolean multi() default false; } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByUrl.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByUrl.java index d57ec6e..401e469 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByUrl.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByUrl.java @@ -14,10 +14,27 @@ import java.lang.annotation.Target; @Target({ElementType.FIELD}) public @interface ExtractByUrl{ + /** + * 抽取规则,支持正则表达式 + * + * @return 抽取规则 + */ String value() default ""; + /** + * 是否是不能为空的关键字段,若notNull为true,则对应字段抽取不到时,丢弃整个类,默认为false + * + * @return 是否是不能为空的关键字段 + */ boolean notNull() default false; + /** + * 是否抽取多个结果
+ * 用于字段时,需要List来盛放结果
+ * ExtractByUrl仅可用于字段,不支持用于类
+ * + * @return 是否抽取多个结果 + */ boolean multi() default false; } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/HelpUrl.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/HelpUrl.java index 3437e44..0b2a2ec 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/HelpUrl.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/HelpUrl.java @@ -14,7 +14,17 @@ import java.lang.annotation.Target; @Target({ElementType.TYPE}) public @interface HelpUrl { + /** + * 某个类对应的URL规则列表
+ * webmagic对正则表达式进行了修改,"."仅表示字符"."而不代表任意字符,而"*"则代表了".*",例如"http://*.oschina.net/*"代表了oschina所有的二级域名下的URL。
+ * + * @return 抽取规则 + */ String[] value(); + /** + * 指定提取URL的区域(仅支持XPath) + * @return 指定提取URL的区域 + */ String sourceRegion() default ""; } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/TargetUrl.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/TargetUrl.java index c747da6..0caf190 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/TargetUrl.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/TargetUrl.java @@ -6,6 +6,7 @@ import java.lang.annotation.Target; /** * 定义某个类抽取的范围和来源,sourceRegion可以用xpath语法限定抽取区域。
+ * * @author code4crafter@gmail.com
* @date: 13-8-1
* Time: 下午8:40
@@ -14,8 +15,18 @@ import java.lang.annotation.Target; @Target({ElementType.TYPE}) public @interface TargetUrl { + /** + * 某个类对应的URL规则列表
+ * webmagic对正则表达式进行了修改,"."仅表示字符"."而不代表任意字符,而"*"则代表了".*",例如"http://*.oschina.net/*"代表了oschina所有的二级域名下的URL。
+ * + * @return 抽取规则 + */ String[] value(); + /** + * 指定提取URL的区域(仅支持XPath) + * @return 指定提取URL的区域 + */ String sourceRegion() default ""; } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePipeline.java index dbe1a00..53dba9e 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePipeline.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePipeline.java @@ -12,7 +12,7 @@ import java.io.IOException; import java.io.PrintWriter; /** - * 持久化到文件的接口。 + * JSON格式持久化到文件的接口。 * * @author code4crafter@gmail.com
* Date: 13-4-21 diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/PagedPipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/PagedPipeline.java index b692da2..1753842 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/PagedPipeline.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/PagedPipeline.java @@ -9,6 +9,9 @@ import java.util.*; import java.util.concurrent.ConcurrentHashMap; /** + * 用于实现分页的Pipeline。
+ * 在使用redis做分布式爬虫时,请不要使用此功能。
+ * * @author code4crafter@gmail.com
* @date: 13-8-4
* Time: 下午5:15
@@ -34,7 +37,7 @@ public class PagedPipeline implements Pipeline { if (o instanceof PagedModel) { PagedModel pagedModel = (PagedModel) o; pageMap.put(pagedModel.getPageKey(), pagedModel.getPage(), Boolean.TRUE); - if (pagedModel.getOtherPages()!=null){ + if (pagedModel.getOtherPages() != null) { for (String otherPage : pagedModel.getOtherPages()) { Boolean aBoolean = pageMap.get(pagedModel.getPageKey(), otherPage); if (aBoolean == null) { @@ -70,8 +73,8 @@ public class PagedPipeline implements Pipeline { } }); PagedModel value = entryList.get(0).getValue(); - for (int i=1;i Date: Fri, 9 Aug 2013 20:38:28 +0800 Subject: [PATCH 82/84] add docs --- webmagic manual.md | 181 +++++++++++++++++++++++++++++---------------- 1 file changed, 119 insertions(+), 62 deletions(-) diff --git a/webmagic manual.md b/webmagic manual.md index d009a97..0c681c5 100644 --- a/webmagic manual.md +++ b/webmagic manual.md @@ -4,7 +4,7 @@ webmagic使用手册 >web爬虫是一种技术,webmagic致力于将这种技术的实现成本降低,但是出于对资源提供者的尊重,webmagic不会做反封锁的事情,包括:验证码破解、代理切换、自动登录、抓取静态资源等。 ->作者黄亿华([code4crafter@gmail.com](code4crafter@gmail.com))目前就职于大众点评,曾经在前公司进行过一年的垂直爬虫的开发。webmagic就是为了解决爬虫开发的一些重复劳动而产生的框架。有使用不便或者问题,欢迎在github[提交issue](https://github.com/code4craft/webmagic/issues),或者在[oschina讨论模块](http://www.oschina.net/question)提问。 +>作者黄亿华([code4crafter@gmail.com](code4crafter@gmail.com))目前就职于大众点评,曾经在前公司进行过一年的垂直爬虫的开发,webmagic就是为了解决爬虫开发的一些重复劳动而产生的框架。 >webmagic的架构和设计参考了以下两个项目,感谢以下两个项目的作者: @@ -12,7 +12,10 @@ webmagic使用手册 >Java爬虫 **Spiderman** [https://gitcafe.com/laiweiwei/Spiderman](https://gitcafe.com/laiweiwei/Spiderman) ---------- +>webmagic遵循[Apache 2.0协议](http://www.apache.org/licenses/LICENSE-2.0.html),你可以自由进行使用和修改。有使用不便或者问题,欢迎在github[提交issue](https://github.com/code4craft/webmagic/issues),或者在[oschina讨论模块](http://www.oschina.net/question)提问。 + +
+ ## 快速开始 @@ -66,7 +69,7 @@ webmagic还包含两个可用的扩展包,因为这两个包都依赖了比较 git clone http://git.oschina.net/flashsword20/webmagic-bin.git -在`bin/lib`目录下,有项目依赖的所有jar包,直接在IDE里import即可。 +在**bin/lib**目录下,有项目依赖的所有jar包,直接在IDE里import即可。 ### 第一个爬虫 @@ -131,11 +134,12 @@ webmagic-extension包括了注解方式编写爬虫的方法,只需基于一 这个例子定义了一个Model类,Model类的字段'title'、'content'、'tags'均为要抽取的属性。这个类在Pipeline里是可以复用的。 -关于注解的使用方式,在后面会专门讲到。 +注解的详细使用方式见后文中的webmagic-extension注解模块。 ------- +
-## 核心架构解析 + +## webmagic-core webmagic-core是爬虫的核心框架,只包括一个爬虫各功能模块的核心功能。webmagic-core的目标是成为网页爬虫的一个教科书般的实现。 @@ -147,12 +151,12 @@ webmagic-core是爬虫的核心框架,只包括一个爬虫各功能模块的 webmagic-core参考了scrapy的模块划分,分为Spider(整个爬虫的调度框架)、Downloader(页面下载)、PageProcessor(链接提取和页面分析)、Scheduler(URL管理)、Pipeline(离线分析和持久化)几部分。只不过scrapy通过middleware实现扩展,而webmagic则通过定义这几个接口,并将其不同的实现注入主框架类Spider来实现扩展。 ![image](http://code4craft.github.io/images/posts/webmagic.png) +
#### Spider类(核心调度) -Spider是爬虫的入口类,Spider的接口调用采用了链式的API设计,其他功能全部通过接口注入Spider实现,下面是启动一个比较复杂的Spider的例子。 +**Spider**是爬虫的入口类,Spider的接口调用采用了链式的API设计,其他功能全部通过接口注入Spider实现,下面是启动一个比较复杂的Spider的例子。 - Spider.create(sinaBlogProcessor) .scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")) .pipeline(new FilePipeline()) @@ -175,68 +179,115 @@ Spider的核心处理流程非常简单,代码如下: } sleep(site.getSleepTime()); } - -#### Downloader(页面下载) - -大部分爬虫都是通过模拟http请求,接收并分析响应来完成。这方面,JDK自带的**HttpURLConnection**可以满足最简单的需要,而**Apache HttpClient**(4.0后整合到HttpCompenent项目中)则是开发复杂爬虫的不二之选。它支持自定义HTTP头(对于爬虫比较有用的就是User-agent、cookie等)、自动redirect、连接复用、cookie保留、设置代理等诸多强大的功能。 - -webmagic使用了HttpClient 4.2,并封装到了**HttpClientDownloader**。学习HttpClient的使用对于构建高性能爬虫是非常有帮助的,官方的[Tutorial](http://hc.apache.org/httpcomponents-client-ga/tutorial/html/)就是很好的学习资料。目前webmagic对HttpClient的使用仍在初步阶段,不过对于一般抓取任务,已经够用了。 - -对于一些Javascript动态加载的网页,仅仅使用http模拟下载工具,并不能取到页面的内容。这方面的思路有两种:一种是抽丝剥茧,分析js的逻辑,再用爬虫去重现它(比如在网页中提取关键数据,再用这些数据去构造Ajax请求,最后直接从响应体获取想要的数据); -另一种就是:内置一个浏览器,直接获取最后加载完的页面。这方面,js可以使用**PhantomJS**,它内部集成了webkit。而Java可以使用**Selenium**,这是一个非常强大的浏览器模拟工具。webmagic-selenium包中整合了Selenium到`SeleniumDownloader`,可以直接进行动态加载页面的抓取。 - + #### PageProcessor(页面分析及链接抽取) -页面分析可以说是垂直爬虫最复杂的一部分,在webmagic里,PageProcessor是定制爬虫的核心。通过编写一个实现PageProcessor接口的类,就可以定制一个自己的爬虫。 +页面分析是垂直爬虫中需要定制的部分。在webmagic-core里,通过实现**PageProcessor**接口来实现定制爬虫。PageProcessor有两个核心方法:public void process(Page page)和public Site getSite() 。 -**Selector**是webmagic为了简化页面抽取开发的独立模块,是webmagic的主要着力点。这里整合了CSS Selector、XPath和正则表达式,并可以进行链式的抽取,很容易就实现强大的功能。即使你使用自己开发的爬虫工具,webmagic的Selector仍然值得一试。 +* public void process(Page page) -例如,我已经下载了一个页面,现在要抽取某个区域的所有包含"blog"的链接,我可以这样写: + 通过对**Page**对象的操作,实现爬虫逻辑。Page对象包括两个最重要的方法:addTargetRequests()可以添加URL到待抓取队列,put()可以将结果保存供后续处理。 + Page的数据可以通过Page.getHtml()和Page.getUrl()获取。 + +* public Site getSite() + + **Site**对象定义了爬虫的域名、起始地址、抓取间隔、编码等信息。 + +**Selector**是webmagic为了简化页面抽取开发的独立模块,是webmagic-core的主要着力点。这里整合了CSS Selector、XPath和正则表达式,并可以进行链式的抽取。 
//content是用别的爬虫工具抽取到的正文 - String content = "blabla"; - List links = Html.create(content) + List links = page.getHtml() .$("div.title") //css 选择,Java里虽然很少有$符号出现,不过貌似$作为方法名是合法的 .xpath("//@href") //提取链接 .regex(".*blog.*") //正则匹配过滤 .all(); //转换为string列表 +webmagic包括一个对于页面正文的自动抽取的类**SmartContentSelector**。相信用过Evernote Clearly都会对其自动抽取正文的技术印象深刻。这个技术又叫**Readability**。当然webmagic对Readability的实现还比较粗略,但是仍有一些学习价值。 -另外,webmagic的抓取链接需要显示的调用`Page.addTargetRequests()`去添加,这也是为了灵活性考虑的(很多时候,下一步的URL不是单纯的页面href链接,可能会根据页面模块进行抽取,甚至可能是自己拼凑出来的)。 +基于Saxon,webmagic提供了XPath2.0语法的支持。XPath2.0语法支持内部函数、逻辑控制等,是一门完整的语言,如果你熟悉XPath2.0语法,倒是不妨一试(需要引入**webmagic-saxon**包)。 -webmagic包括一个对于页面正文的自动抽取的功能**SmartContentSelector**。相信用过Evernote Clearly都会对其自动抽取正文的技术印象深刻。这个技术又叫**Readability**。当然webmagic对Readability的实现还比较粗略,但是仍有一些学习价值。 +**webmagic-samples**包里有一些为某个站点定制的PageProcessor,供学习之用。 -基于Saxon,webmagic提供了XPath2.0语法的支持。XPath2.0语法支持内部函数、逻辑控制等,是一门完整的语言,如果你熟悉语法,倒是不妨一试(需要引入webmagic-saxon包)。 +#### Downloader(页面下载) + +**Downloader**是webmagic中下载页面的接口,主要方法: + +* public Page download(Request request, Task task) + + **Request**对象封装了待抓取的URL及其他信息,而Page则包含了页面下载后的Html及其他信息。Task是一个包装了任务对应的Site信息的抽象接口。 + +* public void setThread(int thread) + + 因为Downloader一般会涉及连接池等功能,而这些功能与多线程密切相关,所以定义了此方法。 + +目前有几个Downloader的实现: + +* HttpClientDownloader + + 集成了**Apache HttpClient**的Downloader。Apache HttpClient(4.0后整合到HttpCompenent项目中)是强大的Java http下载器,它支持自定义HTTP头(对于爬虫比较有用的就是User-agent、cookie等)、自动redirect、连接复用、cookie保留、设置代理等诸多强大的功能。 + +* SeleniumDownloader + + 对于一些Javascript动态加载的网页,仅仅使用http模拟下载工具,并不能取到页面的内容。这方面的思路有两种:一种是抽丝剥茧,分析js的逻辑,再用爬虫去重现它;另一种就是:内置一个浏览器,直接获取最后加载完的页面。**webmagic-selenium**包中整合了Selenium到SeleniumDownloader,可以直接进行动态加载页面的抓取。 #### Scheduler(URL管理) +**Scheduler**是webmagic的管理模块,通过实现Scheduler可以定制自己的URL管理器。Scheduler包括两个主要方法: -URL管理的问题可大可小。对于小规模的抓取,URL管理是很简单的。我们只需要将待抓取URL和未抓取URL分开保存,并进行去重即可。使用JDK内置的集合类型Set、List或者Queue都可以满足需要。如果我们要进行多线程抓取,则可以选择线程安全的容器,例如LinkedBlockingQueue以及ConcurrentHashMap。 +* public void push(Request request,Task task) 
+ + 将待抓取URL加入Scheduler。Request对象是对URL的一个封装,还包括优先级、以及一个供存储数据的Map。Task仍然用于区分不同任务,在多个任务公用一个Scheduler时可以此进行区分。 -因为小规模的URL管理非常简单,很多框架都并不将其抽象为一个模块,而是直接融入到代码中。但是实际上,抽象出Scheduler模块,会使得框架的解耦程度上升一个档次,并非常容易进行横向扩展,这也是我从scrapy中学到的。 +* public Request poll(Task task) + + 从Scheduler里取出一条请求,并进行后续执行。 -在webmagic的设计中,除了Scheduler模块,其他的处理-从下载、解析到持久化,每个任务都是互相独立的,因此可以通过多个Spider共用一个Scheduler来进行扩展。排除去重的因素,URL管理天生就是一个队列,我们可以很方便的用分布式的队列工具去扩展它,也可以基于mysql、redis或者mongodb这样的存储工具来构造一个队列,这样构建一个多线程乃至分布式的爬虫就轻而易举了。 +webmagic目前有三个Scheduler的实现: -URL去重也是一个比较复杂的问题。如果数据量较少,则使用hash的方式就能很好解决。数据量较大的情况下,可以使用Bloom Filter或者更复杂的方式。 +* QueueScheduler + + 一个简单的内存队列,速度较快,并且是线程安全的。 + +* FileCacheQueueScheduler + + 使用文件保存队列,它可以用于耗时较长的下载任务,在任务中途停止后(手动停止或者程序崩溃),下次执行仍然从中止的URL开始继续爬取。 + +* RedisScheduler + + 使用redis存储URL队列。通过使用同一台redis服务器存储URL,webmagic可以很容易的在多机部署,从而达到分布式爬虫的效果。 -webmagic目前有两个Scheduler的实现,**QueueScheduler**是一个简单的内存队列,速度较快,并且是线程安全的,**FileCacheQueueScheduler**则是一个文件队列,它可以用于耗时较长的下载任务,在任务中途停止后,下次执行仍然从中止的URL开始继续爬取。 +#### Pipeline(后续处理和持久化) -webmagic有一个基于redis的Scheduler实现**RedisScheduler**。通过使用同一台redis服务器存储URL,webmagic可以很容易的在多机部署,从而达到分布式爬虫的效果。 +**Pipeline**是最终抽取结果进行输出和持久化的接口。它只包括一个方法: +* public void process(ResultItems resultItems,Task task) + + **ResultItems**是集成了抽取结果的对象。通过ResultItems.get(key)可以获取抽取结果。Task同样是用于区分不同任务的对象。 + +webmagic包括以下几个Pipeline的实现: -#### Pipeline-离线处理和持久化 +* ConsolePipeline + + 直接输出结果到控制台,测试时使用。 + +* FilePipeline + + 输出结果到文件,每个URL单独保存到一个页面,以URL的MD5结果作为文件名。通过构造函数`public FilePipeline(String path)`定义存储路径,**以下使用文件持久化的类,多数都使用此方法指定路径**。 + +* JsonFilePipeline + + 以JSON输出结果到文件(.json后缀),其他与FilePipeline相同。 +webmagic目前不支持持久化到数据库,但是结合其他工具,持久化到数据库也是很容易的。这里不妨看一下[webmagic结合JFinal持久化到数据库的一段代码](http://www.oschina.net/code/snippet_190591_23456)。因为JFinal目前还不支持maven,所以这段代码并没有放到webmagic-samples里来。 -Pipeline其实也是容易被忽略的一部分。大家都知道持久化的重要性,但是很多框架都选择直接在页面抽取的时候将持久化一起完成,例如crawer4j。但是Pipeline真正的好处是,将页面的在线分析和离线处理拆分开来,可以在一些线程里进行下载,另一些线程里进行处理和持久化。 +
-你可以扩展Pipeline来实现抽取结果的持久化,将其保存到你想要保存的地方-本地文件、数据库、mongodb等等。Pipeline的处理目前还是在线的,但是修改为离线的也并不困难。 +## webmagic-extension -webmagic目前只支持控制台输出和文件持久化,但是持久化到数据库也是很容易的。这里不妨看一下[webmagic结合JFinal持久化到数据库的一段代码](http://www.oschina.net/code/snippet_190591_23456)。因为JFinal目前还不支持maven,所以并没有放到webmagic-samples里来。 +webmagic-extension是为了开发爬虫更方便而实现的一些功能模块。这些功能完全基于webmagic-core的框架,包括注解形式编写爬虫、分页、分布式等功能。 ------- - -## 注解模块 +### 注解模块 webmagic-extension包括注解模块。为什么会有注解方式? @@ -251,53 +302,59 @@ webmagic-extension包括注解模块。为什么会有注解方式? 注解部分包括以下内容: -* ### TargetUrl +* #### TargetUrl - "TargetUrl"表示这个Model对应要抓取的URL,它包含两层意思:符合这个条件的URL会被加入抓取队列;符合这个条件的URL会被这个Model抓取。TargetUrl可以`sourceRegion`指定提取URL的区域(仅支持XPath)。 + "TargetUrl"表示这个Model对应要抓取的URL,它包含两层意思:符合这个条件的URL会被加入抓取队列;符合这个条件的URL会被这个Model抓取。TargetUrl可以**sourceRegion**指定提取URL的区域(仅支持XPath)。 TargetUrl使用了正则表达式,匹配 "http://my.oschina.net/flashsword/blog/150039" 格式的URL。webmagic对正则表达式进行了修改,"."仅表示字符"."而不代表任意字符,而"\*"则代表了".\*",例如"http://\*.oschina.net/\*"代表了oschina所有的二级域名下的URL。 - 与TargetUrl相似的还有`HelpUrl`,HelpUrl表示:仅仅抓取该URL用作链接提取,并不对它进行内容抽取。例如博客正文页对应TargetUrl,而列表页则对应HelpUrl。 + 与TargetUrl相似的还有**HelpUrl**,HelpUrl表示:仅仅抓取该URL用作链接提取,并不对它进行内容抽取。例如博客正文页对应TargetUrl,而列表页则对应HelpUrl。 -* ### ExtractBy +* #### ExtractBy - * #### 用于字段 + * ##### 用于字段 "ExtractBy"可用于类以及字段。用于字段时,定义了字段抽取的规则。抽取的规则默认使用[**XPath**](http://www.w3school.com.cn/xpath/),也可以选择使用CSS Selector、正则表达式(通过设置type)。 - ExtractBy还有几个扩展属性。`multi`表示是否抽取列表,当然,设置为multi时,你需要一个List字段去容纳它。`notnull`则表示,此字段不允许为null,若为null则放弃整个对象。 + ExtractBy还有几个扩展属性。**multi**表示是否抽取列表,当然,设置为multi时,你需要一个List字段去容纳它。**notnull**则表示,此字段不允许为null,若为null则放弃整个对象。 - * #### 用于类 + * ##### 用于类 "ExtractBy"用于类时,则限定了字段抽取的区域。用于类时仍支持multi,multi则表示一个页面可以抽取到多个对象。 - * #### ExtractByRaw & ExtractByUrl + * ##### ExtractByRaw & ExtractByUrl - 在类使用"ExtractBy"修饰后,字段的"ExtractBy"使用的是其抽取的结果,如果仍然想要抽取原HTML,可以使用"ExtractByRaw"。与此类似的还有"ExtractByUrl",表示从URL重抽取信息。ExtractByUrl只支持正则表达式。 + 
在类使用"ExtractBy"修饰后,字段的"ExtractBy"使用的是其抽取的结果,如果仍然想要抽取原HTML,可以使用"ExtractByRaw"。与此类似的还有"ExtractByUrl",表示从URL中抽取信息。ExtractByUrl只支持正则表达式。 - * #### ExtractBy2 ExtractBy3 + * ##### ExtractBy2 ExtractBy3 "ExtractBy"、"ExtractByRaw"支持链式抽取,通过增加注解"ExtractBy2"、"ExtractBy3"实现。 -* ### AfterExtractor +* #### AfterExtractor - AfterExtractor接口是对注解方式抽取能力不足的补充。实现AfterExtractor接口后,会在**使用注解方式填充完字段后**调用`afterProcess()`方法,在这个方法中可以直接访问已抽取的字段、补充需要抽取的字段,甚至做一些简单的输出和持久化操作(并不是很建议这么做)。这部分可以参考[webmagic结合JFinal持久化到数据库的一段代码](http://www.oschina.net/code/snippet_190591_23456)。 + AfterExtractor接口是对注解方式抽取能力不足的补充。实现AfterExtractor接口后,会在**使用注解方式填充完字段后**调用**afterProcess()**方法,在这个方法中可以直接访问已抽取的字段、补充需要抽取的字段,甚至做一些简单的输出和持久化操作(并不是很建议这么做)。这部分可以参考[webmagic结合JFinal持久化到数据库的一段代码](http://www.oschina.net/code/snippet_190591_23456)。 -* ### OOSpider - OOSpider是注解式爬虫的入口,这里调用`create()`方法将OschinaBlog这个类加入到爬虫的抽取中,这里是可以传入多个类的,OOSpider会根据TargetUrl调用不同的Model进行解析。 +* #### OOSpider + OOSpider是注解式爬虫的入口,这里调用**create()**方法将OschinaBlog这个类加入到爬虫的抽取中,这里是可以传入多个类的,例如: + + OOSpider.create( + Site.me().addStartUrl("http://www.oschina.net"), + new ConsolePageModelPipeline(), + OschinaBlog.class,OschinaAnswer.class).run(); + + OOSpider会根据TargetUrl调用不同的Model进行解析。 -* ### PageModelPipeline +* #### PageModelPipeline 可以通过定义PageModelPipeline来选择结果输出方式。这里new ConsolePageModelPipeline()是PageModelPipeline的一个实现,会将结果输出到控制台。 -* ### 分页 - 处理单项数据分页(例如单条新闻多个页面)是爬虫一个比较头疼的问题。webmagic有一个对于分页的实现,通过实现`PagedModel`接口即可。webmagic-samples里有一个抓取网易新闻的类:`us.codecraft.webmagic.model.samples.News163`。关于分页,这里有一篇对于webmagic分页实现的详细说明的文章[关于爬虫实现分页的一些思考](http://my.oschina.net/flashsword/blog/150039)。 - 目前分页功能还没有分布式实现。 +* #### 分页 --------- - -## 分布式 - -webmagic-extension中,通过redis来管理URL,达到分布式的效果。具体实现方式只有一个类:`us.codecraft.webmagic.scheduler.RedisScheduler`。 + 处理单项数据分页(例如单条新闻多个页面)是爬虫一个比较头疼的问题。webmagic目前对于分页的解决方案是:在注解模式下,Model通过实现**PagedModel**接口,并引入PagedPipeline作为第一个Pipeline来实现。具体可以参考webmagic-samples中抓取网易新闻的代码:**us.codecraft.webmagic.model.samples.News163**。 + 
关于分页,这里有一篇对于webmagic分页实现的详细说明的文章[关于爬虫实现分页的一些思考](http://my.oschina.net/flashsword/blog/150039)。 + 目前分页功能还没有分布式实现,如果使用RedisScheduler进行分布式爬取,请不要使用分页功能。 + +### 分布式 +webmagic-extension中,通过redis来管理URL,达到分布式的效果。但是对于分布式爬虫,仅仅程序能够分布式运行,还满足不了大规模抓取的需要,webmagic可能后期会加入一些任务管理和监控的功能,也欢迎各位用户为webmagic提交代码,做出贡献。 From 17d2d98cecbe85c28785bc4e191bd31777d0a659 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 9 Aug 2013 20:43:06 +0800 Subject: [PATCH 83/84] remove invalid @date --- .../src/main/java/us/codecraft/webmagic/ResultItems.java | 2 +- .../main/java/us/codecraft/webmagic/downloader/Destroyable.java | 2 +- .../main/java/us/codecraft/webmagic/selector/AndSelector.java | 2 +- .../main/java/us/codecraft/webmagic/selector/OrSelector.java | 2 +- .../src/main/java/us/codecraft/webmagic/PagedModel.java | 2 +- .../main/java/us/codecraft/webmagic/model/AfterExtractor.java | 2 +- .../us/codecraft/webmagic/model/ConsolePageModelPipeline.java | 2 +- .../src/main/java/us/codecraft/webmagic/model/Extractor.java | 2 +- .../main/java/us/codecraft/webmagic/model/FieldExtractor.java | 2 +- .../java/us/codecraft/webmagic/model/ModelPageProcessor.java | 2 +- .../main/java/us/codecraft/webmagic/model/ModelPipeline.java | 2 +- .../src/main/java/us/codecraft/webmagic/model/OOSpider.java | 2 +- .../java/us/codecraft/webmagic/model/PageModelExtractor.java | 2 +- .../java/us/codecraft/webmagic/model/PageModelPipeline.java | 2 +- .../java/us/codecraft/webmagic/model/annotation/ExtractBy.java | 2 +- .../java/us/codecraft/webmagic/model/annotation/ExtractBy2.java | 2 +- .../java/us/codecraft/webmagic/model/annotation/ExtractBy3.java | 2 +- .../us/codecraft/webmagic/model/annotation/ExtractByRaw.java | 2 +- .../us/codecraft/webmagic/model/annotation/ExtractByUrl.java | 2 +- .../java/us/codecraft/webmagic/model/annotation/HelpUrl.java | 2 +- .../java/us/codecraft/webmagic/model/annotation/TargetUrl.java | 2 +- .../main/java/us/codecraft/webmagic/pipeline/PagedPipeline.java | 2 +- 
.../java/us/codecraft/webmagic/scheduler/RedisScheduler.java | 2 +- .../src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java | 2 +- .../main/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java | 2 +- .../us/codecraft/webmagic/scheduler/RedisSchedulerTest.java | 2 +- .../java/us/codecraft/webmagic/pipeline/LucenePipeline.java | 2 +- .../test/java/us/codecraft/webmagic/lucene/OschinaBlog.java | 2 +- .../src/main/java/us/codecraft/webmagic/main/QuickStarter.java | 2 +- .../src/main/java/us/codecraft/webmagic/model/samples/Blog.java | 2 +- .../java/us/codecraft/webmagic/model/samples/IteyeBlog.java | 2 +- .../main/java/us/codecraft/webmagic/model/samples/News163.java | 2 +- .../java/us/codecraft/webmagic/model/samples/OschinaAnswer.java | 2 +- .../java/us/codecraft/webmagic/model/samples/OschinaBlog.java | 2 +- .../java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java | 2 +- .../webmagic/downloader/selenium/SeleniumDownloader.java | 2 +- .../codecraft/webmagic/downloader/selenium/WebDriverPool.java | 2 +- .../java/us/codecraft/webmagic/downloader/SeleniumTest.java | 2 +- .../webmagic/downloader/selenium/SeleniumDownloaderTest.java | 2 +- .../webmagic/downloader/selenium/WebDriverPoolTest.java | 2 +- .../java/us/codecraft/webmagic/samples/HuabanProcessor.java | 2 +- 41 files changed, 41 insertions(+), 41 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java b/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java index c91a270..7a8e5c3 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java @@ -6,7 +6,7 @@ import java.util.Map; /** * 保存抽取结果的类,由PageProcessor处理得到,传递给{@link us.codecraft.webmagic.pipeline.Pipeline}进行持久化。
* @author code4crafter@gmail.com
- * @date: 13-7-25
+ * Date: 13-7-25
* Time: 下午12:20
*/ public class ResultItems { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Destroyable.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Destroyable.java index 2b040fa..6dcbde1 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Destroyable.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Destroyable.java @@ -3,7 +3,7 @@ package us.codecraft.webmagic.downloader; /** * 比较占用资源的服务可以实现该接口,Spider会在结束时调用destroy()释放资源。
* @author code4crafter@gmail.com
- * @date: 13-7-26
+ * Date: 13-7-26
* Time: 下午3:10
*/ public interface Destroyable { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AndSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AndSelector.java index e6bbbb8..997b6cf 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AndSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AndSelector.java @@ -5,7 +5,7 @@ import java.util.List; /** * @author code4crafter@gmail.com
- * @date: 13-8-3
+ * Date: 13-8-3
* Time: 下午5:29
*/ public class AndSelector implements Selector { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/OrSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/OrSelector.java index dca1b34..48f9fb9 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/OrSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/OrSelector.java @@ -5,7 +5,7 @@ import java.util.List; /** * @author code4crafter@gmail.com
- * @date: 13-8-3
+ * Date: 13-8-3
* Time: 下午5:29
*/ public class OrSelector implements Selector { diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/PagedModel.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/PagedModel.java index 95e1a83..7d46cc2 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/PagedModel.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/PagedModel.java @@ -4,7 +4,7 @@ import java.util.Collection; /** * @author code4crafter@gmail.com
- * @date: 13-8-4
+ * Date: 13-8-4
* Time: 下午5:18
*/ public interface PagedModel { diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/AfterExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/AfterExtractor.java index 0117081..3927d11 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/AfterExtractor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/AfterExtractor.java @@ -6,7 +6,7 @@ import us.codecraft.webmagic.Page; * 实现这个接口即可在抽取后进行后处理。
* * @author code4crafter@gmail.com
- * @date: 13-8-3
+ * Date: 13-8-3
* Time: 上午9:42
*/ public interface AfterExtractor { diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ConsolePageModelPipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ConsolePageModelPipeline.java index e5485a1..c841f10 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ConsolePageModelPipeline.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ConsolePageModelPipeline.java @@ -5,7 +5,7 @@ import us.codecraft.webmagic.Task; /** * @author code4crafter@gmail.com
- * @date: 13-8-3
+ * Date: 13-8-3
* Time: 下午3:41
*/ public class ConsolePageModelPipeline implements PageModelPipeline { diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/Extractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/Extractor.java index 82c7dbb..0494076 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/Extractor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/Extractor.java @@ -4,7 +4,7 @@ import us.codecraft.webmagic.selector.Selector; /** * @author code4crafter@gmail.com
- * @date: 13-8-1
+ * Date: 13-8-1
* Time: 下午9:48
*/ class Extractor { diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java index 17a55c8..4ec1bbc 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java @@ -7,7 +7,7 @@ import java.lang.reflect.Method; /** * @author code4crafter@gmail.com
- * @date: 13-8-1
+ * Date: 13-8-1
* Time: 下午9:48
*/ class FieldExtractor extends Extractor{ diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPageProcessor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPageProcessor.java index 84563ce..af762ec 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPageProcessor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPageProcessor.java @@ -16,7 +16,7 @@ import java.util.regex.Pattern; /** * 基于PageProcessor的扩展点。
* @author code4crafter@gmail.com
- * @date: 13-8-1
+ * Date: 13-8-1
* Time: 下午8:46
*/ class ModelPageProcessor implements PageProcessor { diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPipeline.java index efb724e..07d6c5a 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPipeline.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPipeline.java @@ -14,7 +14,7 @@ import java.util.concurrent.ConcurrentHashMap; * 基于Pipeline的扩展点,用于实现注解格式的Pipeline。
* 与PageModelPipeline是一对多的关系(原谅作者没有更好的名字了)。
* @author code4crafter@gmail.com
- * @date: 13-8-2
+ * Date: 13-8-2
* Time: 上午10:47
*/ class ModelPipeline implements Pipeline { diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/OOSpider.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/OOSpider.java index 8a3739d..e5a41e1 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/OOSpider.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/OOSpider.java @@ -6,7 +6,7 @@ import us.codecraft.webmagic.Spider; /** * 基于Model的Spider,封装后的入口类。
* @author code4crafter@gmail.com
- * @date: 13-8-3
+ * Date: 13-8-3
* Time: 上午9:51
*/ public class OOSpider extends Spider { diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java index 158e74d..2f9004b 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java @@ -17,7 +17,7 @@ import java.util.regex.Pattern; * Model主要逻辑类。将一个带注解的POJO转换为一个PageModelExtractor。
* * @author code4crafter@gmail.com
- * @date: 13-8-1
+ * Date: 13-8-1
* Time: 下午9:33
*/ class PageModelExtractor { diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelPipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelPipeline.java index bd3aa95..a70137f 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelPipeline.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelPipeline.java @@ -4,7 +4,7 @@ import us.codecraft.webmagic.Task; /** * @author code4crafter@gmail.com
- * @date: 13-8-3
+ * Date: 13-8-3
* Time: 上午9:34
*/ public interface PageModelPipeline { diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy.java index 168387b..8c12ce1 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy.java @@ -8,7 +8,7 @@ import java.lang.annotation.Target; * 定义类或者字段的抽取规则。
* * @author code4crafter@gmail.com
- * @date: 13-8-1
+ * Date: 13-8-1
* Time: 下午8:40
*/ @Retention(java.lang.annotation.RetentionPolicy.RUNTIME) diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy2.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy2.java index 99ebd76..2a4f080 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy2.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy2.java @@ -8,7 +8,7 @@ import java.lang.annotation.Target; * 定义类或者字段的抽取规则,只能在Extract、ExtractByRaw之后使用。
* * @author code4crafter@gmail.com
- * @date: 13-8-1
+ * Date: 13-8-1
* Time: 下午8:40
*/ @Retention(java.lang.annotation.RetentionPolicy.RUNTIME) diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy3.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy3.java index 77910f8..741682d 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy3.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy3.java @@ -7,7 +7,7 @@ import java.lang.annotation.Target; /** * 定义类或者字段的抽取规则,只能在Extract、ExtractByRaw之后使用。
* @author code4crafter@gmail.com
- * @date: 13-8-1
+ * Date: 13-8-1
* Time: 下午8:40
*/ @Retention(java.lang.annotation.RetentionPolicy.RUNTIME) diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByRaw.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByRaw.java index caa87de..a3ae3e5 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByRaw.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByRaw.java @@ -8,7 +8,7 @@ import java.lang.annotation.Target; * 对于在Class级别就使用过ExtractBy的类,在字段中想抽取全部内容可使用此方法。
* * @author code4crafter@gmail.com
- * @date: 13-8-1
+ * Date: 13-8-1
* Time: 下午8:40
*/ @Retention(java.lang.annotation.RetentionPolicy.RUNTIME) diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByUrl.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByUrl.java index 401e469..51b5f0d 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByUrl.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByUrl.java @@ -7,7 +7,7 @@ import java.lang.annotation.Target; /** * 定义类或者字段的抽取规则(从url中抽取,只支持正则表达式)。
* @author code4crafter@gmail.com
- * @date: 13-8-1
+ * Date: 13-8-1
* Time: 下午8:40
*/ @Retention(java.lang.annotation.RetentionPolicy.RUNTIME) diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/HelpUrl.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/HelpUrl.java index 0b2a2ec..9a0cce4 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/HelpUrl.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/HelpUrl.java @@ -7,7 +7,7 @@ import java.lang.annotation.Target; /** * 定义辅助爬取的url。
* @author code4crafter@gmail.com
- * @date: 13-8-1
+ * Date: 13-8-1
* Time: 下午8:40
*/ @Retention(java.lang.annotation.RetentionPolicy.RUNTIME) diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/TargetUrl.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/TargetUrl.java index 0caf190..e12fca3 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/TargetUrl.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/TargetUrl.java @@ -8,7 +8,7 @@ import java.lang.annotation.Target; * 定义某个类抽取的范围和来源,sourceRegion可以用xpath语法限定抽取区域。
* * @author code4crafter@gmail.com
- * @date: 13-8-1
+ * Date: 13-8-1
* Time: 下午8:40
*/ @Retention(java.lang.annotation.RetentionPolicy.RUNTIME) diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/PagedPipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/PagedPipeline.java index 1753842..beda667 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/PagedPipeline.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/PagedPipeline.java @@ -13,7 +13,7 @@ import java.util.concurrent.ConcurrentHashMap; * 在使用redis做分布式爬虫时,请不要使用此功能。
* * @author code4crafter@gmail.com
- * @date: 13-8-4
+ * Date: 13-8-4
* Time: 下午5:15
*/ public class PagedPipeline implements Pipeline { diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java index c9992db..e26ed9d 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java @@ -13,7 +13,7 @@ import us.codecraft.webmagic.schedular.Scheduler; * 使用redis管理url,构建一个分布式的爬虫。
* * @author code4crafter@gmail.com
- * @date: 13-7-25
+ * Date: 13-7-25
* Time: 上午7:07
*/ public class RedisScheduler implements Scheduler { diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java index a834528..b4a477f 100755 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java @@ -4,7 +4,7 @@ import java.util.Map; /** * @author code4crafter@gmail.com - * @date Dec 14, 2012 + * Date Dec 14, 2012 */ public class DoubleKeyMap extends MultiKeyMapBase { private Map> map; diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java index 256097a..89fdc9a 100755 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java @@ -2,7 +2,7 @@ package us.codecraft.webmagic.utils; /** * @author code4crafter@gmail.com - * @date Dec 14, 2012 + * Date Dec 14, 2012 */ import java.util.HashMap; diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/scheduler/RedisSchedulerTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/scheduler/RedisSchedulerTest.java index f0cbb3d..0819e43 100644 --- a/webmagic-extension/src/test/java/us/codecraft/webmagic/scheduler/RedisSchedulerTest.java +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/scheduler/RedisSchedulerTest.java @@ -9,7 +9,7 @@ import us.codecraft.webmagic.Task; /** * @author code4crafter@gmail.com
- * @date: 13-7-25
+ * Date: 13-7-25
* Time: 上午7:51
*/ public class RedisSchedulerTest { diff --git a/webmagic-lucene/src/main/java/us/codecraft/webmagic/pipeline/LucenePipeline.java b/webmagic-lucene/src/main/java/us/codecraft/webmagic/pipeline/LucenePipeline.java index 724ac7e..6fe2702 100644 --- a/webmagic-lucene/src/main/java/us/codecraft/webmagic/pipeline/LucenePipeline.java +++ b/webmagic-lucene/src/main/java/us/codecraft/webmagic/pipeline/LucenePipeline.java @@ -26,7 +26,7 @@ import java.util.Map; /** * @author code4crafter@gmail.com
- * @date: 13-8-5
+ * Date: 13-8-5
* Time: 下午2:11
*/ public class LucenePipeline implements Pipeline { diff --git a/webmagic-lucene/src/main/test/java/us/codecraft/webmagic/lucene/OschinaBlog.java b/webmagic-lucene/src/main/test/java/us/codecraft/webmagic/lucene/OschinaBlog.java index 4480f0b..b350370 100644 --- a/webmagic-lucene/src/main/test/java/us/codecraft/webmagic/lucene/OschinaBlog.java +++ b/webmagic-lucene/src/main/test/java/us/codecraft/webmagic/lucene/OschinaBlog.java @@ -13,7 +13,7 @@ import java.util.List; /** * @author code4crafter@gmail.com
- * @date: 13-8-2
+ * Date: 13-8-2
* Time: 上午7:52
*/ @TargetUrl("http://my.oschina.net/flashsword/blog/\\d+") diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/main/QuickStarter.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/main/QuickStarter.java index 65940e0..52be272 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/main/QuickStarter.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/main/QuickStarter.java @@ -14,7 +14,7 @@ import java.util.Scanner; /** * @author code4crafter@gmail.com
- * @date: 13-8-7
+ * Date: 13-8-7
* Time: 下午9:24
*/ public class QuickStarter { diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Blog.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Blog.java index 484861b..509aaf9 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Blog.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Blog.java @@ -2,7 +2,7 @@ package us.codecraft.webmagic.model.samples; /** * @author code4crafter@gmail.com
- * @date: 13-8-2
+ * Date: 13-8-2
* Time: 上午8:10
*/ public interface Blog { diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/IteyeBlog.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/IteyeBlog.java index 38b6980..ae94525 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/IteyeBlog.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/IteyeBlog.java @@ -7,7 +7,7 @@ import us.codecraft.webmagic.model.annotation.TargetUrl; /** * @author code4crafter@gmail.com
- * @date: 13-8-2
+ * Date: 13-8-2
* Time: 上午7:52
*/ @TargetUrl("http://*.iteye.com/blog/*") diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java index 6baa8ae..8c0e32d 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java @@ -16,7 +16,7 @@ import java.util.List; /** * @author code4crafter@gmail.com
- * @date: 13-8-4
+ * Date: 13-8-4
* Time: 下午8:17
*/ @TargetUrl("http://news.163.com/\\d+/\\d+/\\d+/\\w+*.html") diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaAnswer.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaAnswer.java index 02b8a9c..e878633 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaAnswer.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaAnswer.java @@ -9,7 +9,7 @@ import us.codecraft.webmagic.model.annotation.TargetUrl; /** * @author code4crafter@gmail.com
- * @date: 13-8-3
+ * Date: 13-8-3
* Time: 下午8:25
*/ @TargetUrl("http://www.oschina.net/question/\\d+_\\d+*") diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java index 95a7891..c1e3ea3 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java @@ -10,7 +10,7 @@ import java.util.List; /** * @author code4crafter@gmail.com
- * @date: 13-8-2
+ * Date: 13-8-2
* Time: 上午7:52
*/ @TargetUrl("http://my.oschina.net/flashsword/blog/\\d+") diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java index da846e8..c0b3f73 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java @@ -8,7 +8,7 @@ import us.codecraft.webmagic.processor.PageProcessor; /** * @author code4crafter@gmail.com
- * @date: 13-7-26
+ * Date: 13-7-26
* Time: 上午7:31
*/ public class IteyeBlogProcessor implements PageProcessor { diff --git a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java index e95f27c..0fa0eea 100644 --- a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java +++ b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java @@ -22,7 +22,7 @@ import java.util.Map; * 需要下载Selenium driver支持。
* * @author code4crafter@gmail.com
- * @date: 13-7-26
+ * Date: 13-7-26
* Time: 下午1:37
*/ public class SeleniumDownloader implements Downloader, Destroyable { diff --git a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java index c763a99..71ba290 100644 --- a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java +++ b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java @@ -12,7 +12,7 @@ import java.util.concurrent.atomic.AtomicInteger; /** * @author code4crafter@gmail.com
- * @date: 13-7-26
+ * Date: 13-7-26
* Time: 下午1:41
*/ class WebDriverPool { diff --git a/webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/SeleniumTest.java b/webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/SeleniumTest.java index 2c19033..b7bcd80 100644 --- a/webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/SeleniumTest.java +++ b/webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/SeleniumTest.java @@ -14,7 +14,7 @@ import java.util.Map; /** * @author code4crafter@gmail.com
- * @date: 13-7-26
+ * Date: 13-7-26
* Time: 下午12:27
*/ public class SeleniumTest { diff --git a/webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloaderTest.java b/webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloaderTest.java index fe98e8f..2b8c247 100644 --- a/webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloaderTest.java +++ b/webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloaderTest.java @@ -9,7 +9,7 @@ import us.codecraft.webmagic.Task; /** * @author code4crafter@gmail.com
- * @date: 13-7-26
+ * Date: 13-7-26
* Time: 下午2:46
*/ public class SeleniumDownloaderTest { diff --git a/webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/selenium/WebDriverPoolTest.java b/webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/selenium/WebDriverPoolTest.java index 4d5d275..a711a19 100644 --- a/webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/selenium/WebDriverPoolTest.java +++ b/webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/selenium/WebDriverPoolTest.java @@ -6,7 +6,7 @@ import org.openqa.selenium.WebDriver; /** * @author code4crafter@gmail.com
- * @date: 13-7-26
+ * Date: 13-7-26
* Time: 下午2:12
*/ public class WebDriverPoolTest { diff --git a/webmagic-selenium/src/test/java/us/codecraft/webmagic/samples/HuabanProcessor.java b/webmagic-selenium/src/test/java/us/codecraft/webmagic/samples/HuabanProcessor.java index 6b3d8a0..1696a3f 100644 --- a/webmagic-selenium/src/test/java/us/codecraft/webmagic/samples/HuabanProcessor.java +++ b/webmagic-selenium/src/test/java/us/codecraft/webmagic/samples/HuabanProcessor.java @@ -11,7 +11,7 @@ import us.codecraft.webmagic.processor.PageProcessor; * 花瓣网抽取器。
* 使用Selenium做页面动态渲染。
* @author code4crafter@gmail.com
- * @date: 13-7-26
+ * Date: 13-7-26
* Time: 下午4:08
*/ public class HuabanProcessor implements PageProcessor { From 21eca688e991668af6f0747272bd1c42c263dba1 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 9 Aug 2013 20:56:33 +0800 Subject: [PATCH 84/84] complete docs --- .../src/main/java/us/codecraft/webmagic/Spider.java | 4 ++-- .../webmagic/{schedular => scheduler}/QueueScheduler.java | 2 +- .../webmagic/{schedular => scheduler}/Scheduler.java | 2 +- .../codecraft/webmagic/{schedular => scheduler}/package.html | 0 .../java/us/codecraft/webmagic/model/annotation/package.html | 5 +++++ .../webmagic/scheduler/FileCacheQueueScheduler.java | 1 - .../java/us/codecraft/webmagic/scheduler/RedisScheduler.java | 1 - .../main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java | 4 ++-- 8 files changed, 11 insertions(+), 8 deletions(-) rename webmagic-core/src/main/java/us/codecraft/webmagic/{schedular => scheduler}/QueueScheduler.java (96%) rename webmagic-core/src/main/java/us/codecraft/webmagic/{schedular => scheduler}/Scheduler.java (95%) rename webmagic-core/src/main/java/us/codecraft/webmagic/{schedular => scheduler}/package.html (100%) create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/package.html diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 878c63e..cf62796 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -8,8 +8,8 @@ import us.codecraft.webmagic.downloader.HttpClientDownloader; import us.codecraft.webmagic.pipeline.ConsolePipeline; import us.codecraft.webmagic.pipeline.Pipeline; import us.codecraft.webmagic.processor.PageProcessor; -import us.codecraft.webmagic.schedular.QueueScheduler; -import us.codecraft.webmagic.schedular.Scheduler; +import us.codecraft.webmagic.scheduler.QueueScheduler; +import us.codecraft.webmagic.scheduler.Scheduler; import 
us.codecraft.webmagic.utils.ThreadUtils; import java.util.ArrayList; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/QueueScheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java similarity index 96% rename from webmagic-core/src/main/java/us/codecraft/webmagic/schedular/QueueScheduler.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java index 613e406..723b5f9 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/QueueScheduler.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java @@ -1,4 +1,4 @@ -package us.codecraft.webmagic.schedular; +package us.codecraft.webmagic.scheduler; import org.apache.log4j.Logger; import us.codecraft.webmagic.Request; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/Scheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/Scheduler.java similarity index 95% rename from webmagic-core/src/main/java/us/codecraft/webmagic/schedular/Scheduler.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/Scheduler.java index 8d9649b..fc39b45 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/Scheduler.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/Scheduler.java @@ -1,4 +1,4 @@ -package us.codecraft.webmagic.schedular; +package us.codecraft.webmagic.scheduler; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/package.html b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/package.html similarity index 100% rename from webmagic-core/src/main/java/us/codecraft/webmagic/schedular/package.html rename to webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/package.html diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/package.html 
b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/package.html new file mode 100644 index 0000000..1e3004f --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/package.html @@ -0,0 +1,5 @@ + + +webmagic注解抓取方式所定义的注解。 + + diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java index c294f09..a8dc23a 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java @@ -4,7 +4,6 @@ import org.apache.commons.lang3.math.NumberUtils; import org.apache.log4j.Logger; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; -import us.codecraft.webmagic.schedular.Scheduler; import java.io.*; import java.util.LinkedHashSet; diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java index e26ed9d..e7c5bcd 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java @@ -7,7 +7,6 @@ import redis.clients.jedis.JedisPool; import redis.clients.jedis.JedisPoolConfig; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; -import us.codecraft.webmagic.schedular.Scheduler; /** * 使用redis管理url,构建一个分布式的爬虫。
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java index b4a477f..b284a15 100755 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java @@ -42,7 +42,7 @@ public class DoubleKeyMap extends MultiKeyMapBase { /** * @param key - * @return + * @return map */ public Map get(K1 key) { return map.get(key); @@ -51,7 +51,7 @@ public class DoubleKeyMap extends MultiKeyMapBase { /** * @param key1 * @param key2 - * @return + * @return value */ public V get(K1 key1, K2 key2) { if (get(key1) == null) {