From 9e6d55dbeec35b57de6c7e6bc1f0212e7ba8f353 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 28 Sep 2013 07:52:32 +0800 Subject: [PATCH 01/38] update manual for objectformatter --- user-manual.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/user-manual.md b/user-manual.md index 2127e57..18988f5 100644 --- a/user-manual.md +++ b/user-manual.md @@ -395,9 +395,21 @@ webmagic-extension包括注解模块。为什么会有注解方式? * #### 类型转换 + webmagic的注解模式直接对抽取结果进行类型转换,通过`ObjectFormatter`实现。webmagic内置了基本类型的支持,这样抽取结果的字段可以是任意基本类型(需要保证抽取结果能够被转换到对应类型)。 + +```java + @ExtractBy("//ul[@class='pagehead-actions']/li[1]//a[@class='social-count js-social-count']/text()") + private int star; +``` +抽取结果也可以是`java.util.Date`类型,不过需要指定日期格式化的方式: + +```java @Formatter("yyyy-MM-dd HH:mm") @ExtractBy("//div[@class='BlogStat']/regex('\\d+-\\d+-\\d+\\s+\\d+:\\d+')") private Date date; +``` + +你也可以编写一个实现`ObjectFormatter`接口的类,进行自己的类型解析。要使用自己的类,需要调用`ObjectFormatters.put()`对这个类进行注册。 * #### AfterExtractor From 122cebfa5fca43686ba8bf4f4c2dcf30c016a81a Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 28 Sep 2013 07:54:42 +0800 Subject: [PATCH 02/38] update manual for objectformatter --- user-manual.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/user-manual.md b/user-manual.md index 18988f5..d9d6488 100644 --- a/user-manual.md +++ b/user-manual.md @@ -395,7 +395,7 @@ webmagic-extension包括注解模块。为什么会有注解方式? * #### 类型转换 - webmagic的注解模式直接对抽取结果进行类型转换,通过`ObjectFormatter`实现。webmagic内置了基本类型的支持,这样抽取结果的字段可以是任意基本类型(需要保证抽取结果能够被转换到对应类型)。 + webmagic的注解模式支持对抽取结果进行类型转换,这样抽取结果并不需要是String类型,而可以是任意类型。webmagic内置了基本类型的支持(需要保证抽取结果能够被转换到对应类型)。 ```java @ExtractBy("//ul[@class='pagehead-actions']/li[1]//a[@class='social-count js-social-count']/text()") @@ -410,7 +410,6 @@ webmagic-extension包括注解模块。为什么会有注解方式? 
``` 你也可以编写一个实现`ObjectFormatter`接口的类,进行自己的类型解析。要使用自己的类,需要调用`ObjectFormatters.put()`对这个类进行注册。 - * #### AfterExtractor From e7657cca304fc49c3ba1e2c2f83bbd30b8030724 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 28 Sep 2013 08:00:00 +0800 Subject: [PATCH 03/38] update some code for annotation --- user-manual.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/user-manual.md b/user-manual.md index d9d6488..9bb2b1b 100644 --- a/user-manual.md +++ b/user-manual.md @@ -354,6 +354,10 @@ webmagic-extension包括注解模块。为什么会有注解方式? @ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true) private List tags; + + @Formatter("yyyy-MM-dd HH:mm") + @ExtractBy("//div[@class='BlogStat']/regex('\\d+-\\d+-\\d+\\s+\\d+:\\d+')") + private Date date; public static void main(String[] args) { OOSpider.create( From 719100d6e681f5916fa71c64fd7099c00009eec5 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 28 Sep 2013 08:02:29 +0800 Subject: [PATCH 04/38] update xsoup usage doc --- user-manual.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/user-manual.md b/user-manual.md index 9bb2b1b..b487efa 100644 --- a/user-manual.md +++ b/user-manual.md @@ -213,7 +213,7 @@ Spider还包括一个方法test(String url),该方法只抓取一个单独的 webmagic包括一个对于页面正文的自动抽取的类**SmartContentSelector**。相信用过Evernote Clearly都会对其自动抽取正文的技术印象深刻。这个技术又叫**Readability**。当然webmagic对Readability的实现还比较粗略,但是仍有一些学习价值。 -webmagic的XPath解析使用了作者另一个开源项目:基于Jsoup的XPath解析器[Xsoup](https://github.com/code4craft/xsoup),Xsoup对XPath的语法进行了一些扩展,支持一些自定义的函数。 +webmagic的XPath解析使用了作者另一个开源项目:基于Jsoup的XPath解析器[Xsoup](https://github.com/code4craft/xsoup),Xsoup对XPath的语法进行了一些扩展,支持一些自定义的函数。这些函数的使用方式都是在XPath末尾加上`/name-of-function()`,例如:`"//div[@class='BlogStat']/regex('\\d+-\\d+-\\d+\\s+\\d+:\\d+')"`。 From 3b00190f9981bf005dae504496989269cc906cb2 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Thu, 10 Oct 2013 00:40:44 +0800 Subject: [PATCH 05/38] api without implementation for #28: add specific url crawl --- 
.../webmagic/example/BaiduBaike.java | 28 ++++++++++++++ .../us/codecraft/webmagic/model/OOSpider.java | 27 ++++++++++++-- .../model/annotation/UrlTemplate.java | 37 +++++++++++++++++++ .../webmagic/model/direct/Param.java | 15 ++++++++ .../webmagic/model/samples/Kr36NewsModel.java | 20 ++++++++-- 5 files changed, 120 insertions(+), 7 deletions(-) create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/example/BaiduBaike.java create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/UrlTemplate.java create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/model/direct/Param.java diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/BaiduBaike.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/BaiduBaike.java new file mode 100644 index 0000000..becc311 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/BaiduBaike.java @@ -0,0 +1,28 @@ +package us.codecraft.webmagic.example; + +import us.codecraft.webmagic.model.OOSpider; +import us.codecraft.webmagic.model.annotation.ExtractBy; +import us.codecraft.webmagic.model.annotation.UrlTemplate; +import us.codecraft.webmagic.model.direct.Param; + +import java.util.ArrayList; +import java.util.List; + +/** + * @author code4crafter@gmail.com + */ +@UrlTemplate("http://baike.baidu.com/search/word?word=${word}&enc=utf8") +public class BaiduBaike { + + private String word; + + @ExtractBy("//div[@id='lemmaContent-0']//div[@class='para']/allText()") + private String description; + + public static void main(String[] args) { + List words = new ArrayList(); + words.add(new Param().put("word","红烧肉")); + OOSpider.direct(words, BaiduBaike.class).thread(10).run(); + } + +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/OOSpider.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/OOSpider.java index 3cee9ad..efa5faf 100644 --- 
a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/OOSpider.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/OOSpider.java @@ -2,8 +2,11 @@ package us.codecraft.webmagic.model; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.model.direct.Param; import us.codecraft.webmagic.processor.PageProcessor; +import java.util.Collection; + /** * The spider for page model extractor.
* In webmagic, we call a POJO containing extract result as "page model".
@@ -22,13 +25,14 @@ import us.codecraft.webmagic.processor.PageProcessor; * {@literal @}ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true) * private List tags; * } - + * * And start the spider by: *
  *   OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog")
  *        ,new JsonFilePageModelPipeline(), OschinaBlog.class).run();
  * }
- 
+ * + * * @author code4crafter@gmail.com
* @since 0.2.0 */ @@ -49,6 +53,7 @@ public class OOSpider extends Spider { /** * create a spider + * * @param site * @param pageModelPipeline * @param pageModels @@ -57,7 +62,7 @@ public class OOSpider extends Spider { this(ModelPageProcessor.create(site, pageModels)); this.modelPipeline = new ModelPipeline(); super.addPipeline(modelPipeline); - if (pageModelPipeline!=null){ + if (pageModelPipeline != null) { for (Class pageModel : pageModels) { this.modelPipeline.put(pageModel, pageModelPipeline); } @@ -72,6 +77,22 @@ public class OOSpider extends Spider { return new OOSpider(site, pageModelPipeline, pageModels); } + public static OOSpider direct(Site site, PageModelPipeline pageModelPipeline, Class... pageModels) { + return new OOSpider(site, pageModelPipeline, pageModels); + } + + public static OOSpider direct(PageModelPipeline pageModelPipeline, Class... pageModels) { + return new OOSpider(null, pageModelPipeline, pageModels); + } + + public static OOSpider direct(Class... pageModels) { + return new OOSpider(null, null, pageModels); + } + + public static OOSpider direct(Collection params,Class... pageModels) { + return new OOSpider(null, null, pageModels); + } + public OOSpider addPageModel(PageModelPipeline pageModelPipeline, Class... pageModels) { for (Class pageModel : pageModels) { modelPageProcessor.addPageModel(pageModel); diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/UrlTemplate.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/UrlTemplate.java new file mode 100644 index 0000000..a940a64 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/UrlTemplate.java @@ -0,0 +1,37 @@ +package us.codecraft.webmagic.model.annotation; + +import java.lang.annotation.ElementType; +import java.lang.annotation.Retention; +import java.lang.annotation.Target; + +/** + * Define the url patterns for class.
+ * All urls matching the pattern will be crawled and extracted for new objects.
+ * + * @author code4crafter@gmail.com
+ * @since 0.3.3 + */ +@Retention(java.lang.annotation.RetentionPolicy.RUNTIME) +@Target({ElementType.TYPE}) +public @interface UrlTemplate { + + /** + * The url patterns for class.
+ * Uses regular expression syntax with some changes:
+ * "." stands for the literal character "." instead of "any character".
+ * "*" stands for any legal url character repeated 0-n times ([^"'#]*) instead of "any length".
+ * + * @return the url patterns for class + */ + String value(); + + /** + * Define the region for url extracting.
+ * Only support XPath.
+ * When sourceRegion is set, the urls will be extracted only from the region instead of entire content.
+ * + * @return the region for url extracting + */ + String encoding() default "utf8"; + +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/direct/Param.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/direct/Param.java new file mode 100644 index 0000000..c66e854 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/direct/Param.java @@ -0,0 +1,15 @@ +package us.codecraft.webmagic.model.direct; + +import java.util.LinkedHashMap; + +/** + * @author code4crafter@gmail.com + */ +public class Param extends LinkedHashMap{ + + @Override + public Param put(String key, Object value) { + super.put(key, value); + return this; + } +} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Kr36NewsModel.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Kr36NewsModel.java index de3fdf5..b381c96 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Kr36NewsModel.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Kr36NewsModel.java @@ -1,12 +1,12 @@ package us.codecraft.webmagic.model.samples; import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.model.ConsolePageModelPipeline; import us.codecraft.webmagic.model.OOSpider; import us.codecraft.webmagic.model.annotation.ExtractBy; import us.codecraft.webmagic.model.annotation.ExtractByUrl; import us.codecraft.webmagic.model.annotation.HelpUrl; import us.codecraft.webmagic.model.annotation.TargetUrl; +import us.codecraft.webmagic.pipeline.JsonFilePageModelPipeline; /** * @author code4crafter@gmail.com
@@ -18,14 +18,26 @@ public class Kr36NewsModel { @ExtractBy("//h1[@class='entry-title sep10']") private String title; - @ExtractBy("//div[@class='mainContent sep-10']") + @ExtractBy("//div[@class='mainContent sep-10']/tidyText()") private String content; @ExtractByUrl private String url; public static void main(String[] args) { - OOSpider.create(Site.me().addStartUrl("http://www.36kr.com/"), new ConsolePageModelPipeline(), - Kr36NewsModel.class).run(); + OOSpider.create(Site.me().addStartUrl("http://www.36kr.com/").setSleepTime(0),new JsonFilePageModelPipeline(), + Kr36NewsModel.class).thread(20).run(); + } + + public String getTitle() { + return title; + } + + public String getContent() { + return content; + } + + public String getUrl() { + return url; } } From 1a2c84ea78d1f9cc20f59efc3a5a6954810f6683 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 11 Oct 2013 07:36:16 +0800 Subject: [PATCH 06/38] #27 add timeout config to site --- .../src/main/java/us/codecraft/webmagic/Site.java | 14 ++++++++++++++ .../webmagic/downloader/HttpClientPool.java | 4 ++-- .../us/codecraft/webmagic/example/BaiduBaike.java | 10 ++++++++-- .../java/us/codecraft/webmagic/model/OOSpider.java | 2 +- 4 files changed, 25 insertions(+), 5 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java index 4c7b992..0817335 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java @@ -32,6 +32,8 @@ public class Site { private int cycleRetryTimes = 0; + private int timeOut = 2000; + private static final Set DEFAULT_STATUS_CODE_SET = new HashSet(); private Set acceptStatCode = DEFAULT_STATUS_CODE_SET; @@ -131,6 +133,18 @@ public class Site { return charset; } + public int getTimeOut() { + return timeOut; + } + + /** + * set timeout for downloader in ms + * @param timeOut + */ + public void setTimeOut(int timeOut) { + 
this.timeOut = timeOut; + } + /** * Set acceptStatCode.
* When status code of http response is in acceptStatCodes, it will be processed.
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java index f2fffad..52e2f99 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java @@ -52,8 +52,8 @@ public class HttpClientPool { if (site != null && site.getUserAgent() != null) { params.setParameter(CoreProtocolPNames.USER_AGENT, site.getUserAgent()); } - params.setIntParameter(CoreConnectionPNames.SO_TIMEOUT, 1000); - params.setIntParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, 2000); + params.setIntParameter(CoreConnectionPNames.SO_TIMEOUT, site.getTimeOut()); + params.setIntParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, site.getTimeOut()); HttpProtocolParamBean paramsBean = new HttpProtocolParamBean(params); paramsBean.setVersion(HttpVersion.HTTP_1_1); diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/BaiduBaike.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/BaiduBaike.java index becc311..b82b8aa 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/BaiduBaike.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/BaiduBaike.java @@ -1,5 +1,7 @@ package us.codecraft.webmagic.example; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.model.AfterExtractor; import us.codecraft.webmagic.model.OOSpider; import us.codecraft.webmagic.model.annotation.ExtractBy; import us.codecraft.webmagic.model.annotation.UrlTemplate; @@ -12,17 +14,21 @@ import java.util.List; * @author code4crafter@gmail.com */ @UrlTemplate("http://baike.baidu.com/search/word?word=${word}&enc=utf8") -public class BaiduBaike { +public class BaiduBaike implements AfterExtractor{ private String word; @ExtractBy("//div[@id='lemmaContent-0']//div[@class='para']/allText()") private String 
description; + @Override + public void afterProcess(Page page) { + + } + public static void main(String[] args) { List words = new ArrayList(); words.add(new Param().put("word","红烧肉")); OOSpider.direct(words, BaiduBaike.class).thread(10).run(); } - } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/OOSpider.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/OOSpider.java index efa5faf..a64ca29 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/OOSpider.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/OOSpider.java @@ -89,7 +89,7 @@ public class OOSpider extends Spider { return new OOSpider(null, null, pageModels); } - public static OOSpider direct(Collection params,Class... pageModels) { + public static OOSpider direct(Collection params, Class... pageModels) { return new OOSpider(null, null, pageModels); } From 16e12e3bc936382a6503823fe21169120d9978a0 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 11 Oct 2013 08:37:21 +0800 Subject: [PATCH 07/38] #27 customize http header for downloader --- .../main/java/us/codecraft/webmagic/Site.java | 32 +++++++++++++++++-- .../downloader/HttpClientDownloader.java | 8 +++++ .../webmagic/downloader/HttpClientPool.java | 5 ++- 3 files changed, 39 insertions(+), 6 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java index 0817335..a84ba48 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java @@ -8,8 +8,8 @@ import java.util.*; * Object contains setting for crawler.
* * @author code4crafter@gmail.com
- * @since 0.1.0 * @see us.codecraft.webmagic.processor.PageProcessor + * @since 0.1.0 */ public class Site { @@ -38,6 +38,14 @@ public class Site { private Set acceptStatCode = DEFAULT_STATUS_CODE_SET; + private Map headers = new HashMap(); + + public static interface HeaderConst { + + public static final String REFERER = "Referer"; + } + + static { DEFAULT_STATUS_CODE_SET.add(200); } @@ -139,10 +147,12 @@ public class Site { /** * set timeout for downloader in ms + * * @param timeOut */ - public void setTimeOut(int timeOut) { + public Site setTimeOut(int timeOut) { this.timeOut = timeOut; + return this; } /** @@ -216,7 +226,7 @@ public class Site { } /** - * Get retry times when download fail immediately, 0 by default.
+ * Get the number of times a failed download is retried immediately, 0 by default.
* * @return retry times when download fail */ @@ -224,6 +234,22 @@ public class Site { return retryTimes; } + public Map getHeaders() { + return headers; + } + + /** + * Put an Http header for downloader.
+ * Use {@link #addCookie(String, String)} for cookie and {@link #setUserAgent(String)} for user-agent.
+ * @param key key of http header, there are some keys constant in {@link HeaderConst} + * @param value value of header + * @return + */ + public Site addHeader(String key, String value){ + headers.put(key,value); + return this; + } + /** * Set retry times when download fail, 0 by default.
* diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 82a4a9a..b6f0034 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -19,6 +19,7 @@ import us.codecraft.webmagic.utils.UrlUtils; import java.io.IOException; import java.util.HashSet; +import java.util.Map; import java.util.Set; @@ -66,10 +67,12 @@ public class HttpClientDownloader implements Downloader { int retryTimes = 0; Set acceptStatCode; String charset = null; + Map headers = null; if (site != null) { retryTimes = site.getRetryTimes(); acceptStatCode = site.getAcceptStatCode(); charset = site.getCharset(); + headers = site.getHeaders(); } else { acceptStatCode = new HashSet(); acceptStatCode.add(200); @@ -78,6 +81,11 @@ public class HttpClientDownloader implements Downloader { HttpClient httpClient = HttpClientPool.getInstance(poolSize).getClient(site); try { HttpGet httpGet = new HttpGet(request.getUrl()); + if (headers!=null){ + for (Map.Entry headerEntry : headers.entrySet()) { + httpGet.addHeader(headerEntry.getKey(),headerEntry.getValue()); + } + } HttpResponse httpResponse = null; int tried = 0; boolean retry; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java index 52e2f99..c256ac4 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java @@ -54,7 +54,7 @@ public class HttpClientPool { } params.setIntParameter(CoreConnectionPNames.SO_TIMEOUT, site.getTimeOut()); params.setIntParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, site.getTimeOut()); - + 
params.setParameter(ClientPNames.COOKIE_POLICY, CookiePolicy.BEST_MATCH); HttpProtocolParamBean paramsBean = new HttpProtocolParamBean(params); paramsBean.setVersion(HttpVersion.HTTP_1_1); if (site != null && site.getCharset() != null) { @@ -73,8 +73,7 @@ public class HttpClientPool { if (site != null) { generateCookie(httpClient, site); } - httpClient.getParams().setIntParameter("http.socket.timeout", 60000); - httpClient.getParams().setParameter(ClientPNames.COOKIE_POLICY, CookiePolicy.BEST_MATCH); + return httpClient; } From 5a226387e0f344555603e762b69afc4a9f0ebd57 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 11 Oct 2013 11:32:44 +0800 Subject: [PATCH 08/38] #27 nullpointer fix --- .../codecraft/webmagic/downloader/HttpClientPool.java | 10 +++++++--- .../webmagic/downloader/HttpClientDownloaderTest.java | 1 + 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java index c256ac4..f3b72b6 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java @@ -51,9 +51,14 @@ public class HttpClientPool { HttpParams params = new BasicHttpParams(); if (site != null && site.getUserAgent() != null) { params.setParameter(CoreProtocolPNames.USER_AGENT, site.getUserAgent()); + params.setIntParameter(CoreConnectionPNames.SO_TIMEOUT, site.getTimeOut()); + params.setIntParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, site.getTimeOut()); + } else { + params.setIntParameter(CoreConnectionPNames.SO_TIMEOUT, 3000); + params.setIntParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, 3000); } - params.setIntParameter(CoreConnectionPNames.SO_TIMEOUT, site.getTimeOut()); - params.setIntParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, site.getTimeOut()); + + 
params.setParameter(ClientPNames.COOKIE_POLICY, CookiePolicy.BEST_MATCH); HttpProtocolParamBean paramsBean = new HttpProtocolParamBean(params); paramsBean.setVersion(HttpVersion.HTTP_1_1); @@ -73,7 +78,6 @@ public class HttpClientPool { if (site != null) { generateCookie(httpClient, site); } - return httpClient; } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java index 936aece..b5ecada 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java @@ -22,4 +22,5 @@ public class HttpClientDownloaderTest { Page download = httpClientDownloader.download(new Request("http://www.diandian.com"), site.toTask()); Assert.assertTrue(download.getHtml().toString().contains("flashsword30")); } + } From 7fb44d2eec333f61504c210a3a95e1b24d933596 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Mon, 14 Oct 2013 23:22:04 +0800 Subject: [PATCH 09/38] #30 reuse PoolingClientConnectionManager for HttpClientDownloader --- .../downloader/HttpClientDownloader.java | 12 ++++++- .../webmagic/downloader/HttpClientPool.java | 32 ++++++------------- webmagic-samples/pom.xml | 2 +- 3 files changed, 22 insertions(+), 24 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index b6f0034..1bee564 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -34,6 +34,8 @@ public class HttpClientDownloader implements Downloader { private Logger logger = Logger.getLogger(getClass()); + private HttpClientPool httpClientPool; + private int 
poolSize = 1; /** @@ -58,6 +60,13 @@ public class HttpClientDownloader implements Downloader { return (Html) page.getHtml(); } + private HttpClientPool getHttpClientPool(){ + if (httpClientPool==null){ + httpClientPool = new HttpClientPool(poolSize); + } + return httpClientPool; + } + @Override public Page download(Request request, Task task) { Site site = null; @@ -78,7 +87,7 @@ public class HttpClientDownloader implements Downloader { acceptStatCode.add(200); } logger.info("downloading page " + request.getUrl()); - HttpClient httpClient = HttpClientPool.getInstance(poolSize).getClient(site); + HttpClient httpClient = getHttpClientPool().getClient(site); try { HttpGet httpGet = new HttpGet(request.getUrl()); if (headers!=null){ @@ -150,6 +159,7 @@ public class HttpClientDownloader implements Downloader { @Override public void setThread(int thread) { poolSize = thread; + httpClientPool = new HttpClientPool(thread); } private void handleGzip(HttpResponse httpResponse) { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java index f3b72b6..c882836 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java @@ -24,23 +24,19 @@ import java.util.Map; */ public class HttpClientPool { - public static volatile HttpClientPool INSTANCE; - - public static HttpClientPool getInstance(int poolSize) { - if (INSTANCE == null) { - synchronized (HttpClientPool.class) { - if (INSTANCE == null) { - INSTANCE = new HttpClientPool(poolSize); - } - } - } - return INSTANCE; - } - private int poolSize; - private HttpClientPool(int poolSize) { + private PoolingClientConnectionManager connectionManager; + + public HttpClientPool(int poolSize) { this.poolSize = poolSize; + SchemeRegistry schemeRegistry = new SchemeRegistry(); + schemeRegistry.register(new 
Scheme("http", 80, PlainSocketFactory.getSocketFactory())); + schemeRegistry.register(new Scheme("https", 443, SSLSocketFactory.getSocketFactory())); + + connectionManager = new PoolingClientConnectionManager(schemeRegistry); + connectionManager.setMaxTotal(poolSize); + connectionManager.setDefaultMaxPerRoute(100); } public HttpClient getClient(Site site) { @@ -58,7 +54,6 @@ public class HttpClientPool { params.setIntParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, 3000); } - params.setParameter(ClientPNames.COOKIE_POLICY, CookiePolicy.BEST_MATCH); HttpProtocolParamBean paramsBean = new HttpProtocolParamBean(params); paramsBean.setVersion(HttpVersion.HTTP_1_1); @@ -67,13 +62,6 @@ public class HttpClientPool { } paramsBean.setUseExpectContinue(false); - SchemeRegistry schemeRegistry = new SchemeRegistry(); - schemeRegistry.register(new Scheme("http", 80, PlainSocketFactory.getSocketFactory())); - schemeRegistry.register(new Scheme("https", 443, SSLSocketFactory.getSocketFactory())); - - PoolingClientConnectionManager connectionManager = new PoolingClientConnectionManager(schemeRegistry); - connectionManager.setMaxTotal(poolSize); - connectionManager.setDefaultMaxPerRoute(100); DefaultHttpClient httpClient = new DefaultHttpClient(connectionManager, params); if (site != null) { generateCookie(httpClient, site); diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index bf97b75..79238b0 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -5,7 +5,7 @@ webmagic-parent us.codecraft - 0.3.2 + 0.3.3-SNAPSHOT 4.0.0 From 2e496402dc4789145bde88bd58c3199932fc65f1 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Thu, 24 Oct 2013 13:16:48 +0800 Subject: [PATCH 10/38] add more warning for 0.3.3 --- .../us/codecraft/webmagic/example/BaiduBaike.java | 2 ++ .../java/us/codecraft/webmagic/model/OOSpider.java | 12 ++++++++++++ 2 files changed, 14 insertions(+) diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/BaiduBaike.java 
b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/BaiduBaike.java index b82b8aa..9e63055 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/BaiduBaike.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/BaiduBaike.java @@ -11,6 +11,8 @@ import java.util.ArrayList; import java.util.List; /** + * @since 0.3.3 + * NO implement yet!!!!!!!!!!!! * @author code4crafter@gmail.com */ @UrlTemplate("http://baike.baidu.com/search/word?word=${word}&enc=utf8") diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/OOSpider.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/OOSpider.java index a64ca29..c43b6e8 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/OOSpider.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/OOSpider.java @@ -77,14 +77,26 @@ public class OOSpider extends Spider { return new OOSpider(site, pageModelPipeline, pageModels); } + /** + * @since 0.3.3 + * NO implement yet! + */ public static OOSpider direct(Site site, PageModelPipeline pageModelPipeline, Class... pageModels) { return new OOSpider(site, pageModelPipeline, pageModels); } + /** + * @since 0.3.3 + * NO implement yet! + */ public static OOSpider direct(PageModelPipeline pageModelPipeline, Class... pageModels) { return new OOSpider(null, pageModelPipeline, pageModels); } + /** + * @since 0.3.3 + * NO implement yet! + */ public static OOSpider direct(Class... 
pageModels) { return new OOSpider(null, null, pageModels); } From 43b79f284a4b3c5719b7a53c97addfcfdb049988 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 25 Oct 2013 21:14:24 +0800 Subject: [PATCH 11/38] update user-manual --- user-manual.md | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/user-manual.md b/user-manual.md index b487efa..6f331f2 100644 --- a/user-manual.md +++ b/user-manual.md @@ -1,5 +1,5 @@ webmagic使用手册 ------- +======== >webmagic是一个开源的Java垂直爬虫框架,目标是简化爬虫的开发流程,让开发者专注于逻辑功能的开发。webmagic的核心非常简单,但是覆盖爬虫的整个流程,也是很好的学习爬虫开发的材料。 >web爬虫是一种技术,webmagic致力于将这种技术的实现成本降低,但是出于对资源提供者的尊重,webmagic不会做反封锁的事情,包括:验证码破解、代理切换、自动登录等。 @@ -16,8 +16,9 @@ webmagic使用手册
+-------- -## 快速开始 +## 下载及安装 ### 使用maven @@ -66,9 +67,11 @@ webmagic还包含两个可用的扩展包,因为这两个包都依赖了比较 在**bin/lib**目录下,有项目依赖的所有jar包,直接在IDE里import即可。 -### 第一个爬虫 +-------- -#### 定制PageProcessor +## 第一个爬虫 + +### 定制PageProcessor PageProcessor是webmagic-core的一部分,定制一个PageProcessor即可实现自己的爬虫逻辑。以下是抓取osc博客的一段代码: @@ -141,6 +144,9 @@ webmagic-extension包括了注解方式编写爬虫的方法,只需基于一
+-------- + +## 详细介绍 ## webmagic-core @@ -325,6 +331,8 @@ webmagic目前不支持持久化到数据库,但是结合其他工具,持久
+----- + ## webmagic-extension webmagic-extension是为了开发爬虫更方便而实现的一些功能模块。这些功能完全基于webmagic-core的框架,包括注解形式编写爬虫、分页、分布式等功能。 From dbfb6b5803fa45d4c936c166538ea213778d679d Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 25 Oct 2013 21:15:12 +0800 Subject: [PATCH 12/38] update user-manual --- user-manual.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/user-manual.md b/user-manual.md index 6f331f2..653c82a 100644 --- a/user-manual.md +++ b/user-manual.md @@ -146,7 +146,7 @@ webmagic-extension包括了注解方式编写爬虫的方法,只需基于一 -------- -## 详细介绍 +## 模块详细介绍 ## webmagic-core From a3f9ad198f18cc58a6dd46a2f92d8bfb9c0ab397 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Thu, 31 Oct 2013 21:52:43 +0800 Subject: [PATCH 13/38] refactor multi thread code in Spider --- pom.xml | 5 + webmagic-core/pom.xml | 6 + .../java/us/codecraft/webmagic/Spider.java | 671 +++++++++--------- .../codecraft/webmagic/utils/ThreadUtils.java | 20 +- .../us/codecraft/webmagic/SpiderTest.java | 2 +- 5 files changed, 360 insertions(+), 344 deletions(-) diff --git a/pom.xml b/pom.xml index 8d25a66..8f9837f 100644 --- a/pom.xml +++ b/pom.xml @@ -63,6 +63,11 @@ httpclient 4.2.4 + + com.google.guava + guava + 15.0 + us.codecraft xsoup diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index f68114a..3d89e5c 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -20,6 +20,12 @@ junit + + com.google.guava + guava + 15.0 + + org.apache.commons commons-lang3 diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 829546b..149f0a8 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -42,7 +42,7 @@ import java.util.concurrent.atomic.AtomicInteger; * Spider.create(new SimplePageProcessor("http://my.oschina.net/", * "http://my.oschina.net/*blog/*"))
* .scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).run();
- * + * * @author code4crafter@gmail.com
* @see Downloader * @see Scheduler @@ -52,381 +52,380 @@ import java.util.concurrent.atomic.AtomicInteger; */ public class Spider implements Runnable, Task { - protected Downloader downloader; + protected Downloader downloader; - protected List pipelines = new ArrayList(); + protected List pipelines = new ArrayList(); - protected PageProcessor pageProcessor; + protected PageProcessor pageProcessor; - protected List startUrls; + protected List startUrls; - protected Site site; + protected Site site; - protected String uuid; + protected String uuid; - protected Scheduler scheduler = new QueueScheduler(); + protected Scheduler scheduler = new QueueScheduler(); - protected Logger logger = Logger.getLogger(getClass()); + protected Logger logger = Logger.getLogger(getClass()); - protected ExecutorService executorService; + protected ExecutorService executorService; - protected int threadNum = 1; + protected int threadNum = 1; - protected AtomicInteger stat = new AtomicInteger(STAT_INIT); + protected AtomicInteger stat = new AtomicInteger(STAT_INIT); - protected final static int STAT_INIT = 0; + protected final static int STAT_INIT = 0; - protected final static int STAT_RUNNING = 1; + protected final static int STAT_RUNNING = 1; - protected final static int STAT_STOPPED = 2; + protected final static int STAT_STOPPED = 2; - /** - * create a spider with pageProcessor. - * - * @param pageProcessor - * @return new spider - * @see PageProcessor - */ - public static Spider create(PageProcessor pageProcessor) { - return new Spider(pageProcessor); - } + /** + * create a spider with pageProcessor. + * + * @param pageProcessor + * @return new spider + * @see PageProcessor + */ + public static Spider create(PageProcessor pageProcessor) { + return new Spider(pageProcessor); + } - /** - * create a spider with pageProcessor. 
- * - * @param pageProcessor - */ - public Spider(PageProcessor pageProcessor) { - this.pageProcessor = pageProcessor; - this.site = pageProcessor.getSite(); - this.startUrls = pageProcessor.getSite().getStartUrls(); - } + /** + * create a spider with pageProcessor. + * + * @param pageProcessor + */ + public Spider(PageProcessor pageProcessor) { + this.pageProcessor = pageProcessor; + this.site = pageProcessor.getSite(); + this.startUrls = pageProcessor.getSite().getStartUrls(); + } - /** - * Set startUrls of Spider.
- * Prior to startUrls of Site. - * - * @param startUrls - * @return this - */ - public Spider startUrls(List startUrls) { - checkIfRunning(); - this.startUrls = startUrls; - return this; - } + /** + * Set startUrls of Spider.
+ * Prior to startUrls of Site. + * + * @param startUrls + * @return this + */ + public Spider startUrls(List startUrls) { + checkIfRunning(); + this.startUrls = startUrls; + return this; + } - /** - * Set an uuid for spider.
- * Default uuid is domain of site.
- * - * @param uuid - * @return this - */ - public Spider setUUID(String uuid) { - this.uuid = uuid; - return this; - } + /** + * Set an uuid for spider.
+ * Default uuid is domain of site.
+ * + * @param uuid + * @return this + */ + public Spider setUUID(String uuid) { + this.uuid = uuid; + return this; + } - /** - * set scheduler for Spider - * - * @param scheduler - * @return this - * @Deprecated - * @see #setScheduler(us.codecraft.webmagic.scheduler.Scheduler) - */ - public Spider scheduler(Scheduler scheduler) { - return setScheduler(scheduler); - } + /** + * set scheduler for Spider + * + * @param scheduler + * @return this + * @Deprecated + * @see #setScheduler(us.codecraft.webmagic.scheduler.Scheduler) + */ + public Spider scheduler(Scheduler scheduler) { + return setScheduler(scheduler); + } - /** - * set scheduler for Spider - * - * @param scheduler - * @return this - * @see Scheduler - * @since 0.2.1 - */ - public Spider setScheduler(Scheduler scheduler) { - checkIfRunning(); - this.scheduler = scheduler; - return this; - } + /** + * set scheduler for Spider + * + * @param scheduler + * @return this + * @see Scheduler + * @since 0.2.1 + */ + public Spider setScheduler(Scheduler scheduler) { + checkIfRunning(); + this.scheduler = scheduler; + return this; + } - /** - * add a pipeline for Spider - * - * @param pipeline - * @return this - * @see #setPipeline(us.codecraft.webmagic.pipeline.Pipeline) - * @deprecated - */ - public Spider pipeline(Pipeline pipeline) { - return addPipeline(pipeline); - } + /** + * add a pipeline for Spider + * + * @param pipeline + * @return this + * @see #setPipeline(us.codecraft.webmagic.pipeline.Pipeline) + * @deprecated + */ + public Spider pipeline(Pipeline pipeline) { + return addPipeline(pipeline); + } - /** - * add a pipeline for Spider - * - * @param pipeline - * @return this - * @see Pipeline - * @since 0.2.1 - */ - public Spider addPipeline(Pipeline pipeline) { - checkIfRunning(); - this.pipelines.add(pipeline); - return this; - } + /** + * add a pipeline for Spider + * + * @param pipeline + * @return this + * @see Pipeline + * @since 0.2.1 + */ + public Spider addPipeline(Pipeline pipeline) { + 
checkIfRunning(); + this.pipelines.add(pipeline); + return this; + } - /** - * clear the pipelines set - * - * @return this - */ - public Spider clearPipeline() { - pipelines = new ArrayList(); - return this; - } + /** + * clear the pipelines set + * + * @return this + */ + public Spider clearPipeline() { + pipelines = new ArrayList(); + return this; + } - /** - * set the downloader of spider - * - * @param downloader - * @return this - * @see #setDownloader(us.codecraft.webmagic.downloader.Downloader) - * @deprecated - */ - public Spider downloader(Downloader downloader) { - return setDownloader(downloader); - } + /** + * set the downloader of spider + * + * @param downloader + * @return this + * @see #setDownloader(us.codecraft.webmagic.downloader.Downloader) + * @deprecated + */ + public Spider downloader(Downloader downloader) { + return setDownloader(downloader); + } - /** - * set the downloader of spider - * - * @param downloader - * @return this - * @see Downloader - */ - public Spider setDownloader(Downloader downloader) { - checkIfRunning(); - this.downloader = downloader; - return this; - } + /** + * set the downloader of spider + * + * @param downloader + * @return this + * @see Downloader + */ + public Spider setDownloader(Downloader downloader) { + checkIfRunning(); + this.downloader = downloader; + return this; + } - protected void checkComponent() { - if (downloader == null) { - this.downloader = new HttpClientDownloader(); - } - if (pipelines.isEmpty()) { - pipelines.add(new ConsolePipeline()); - } - downloader.setThread(threadNum); - } + protected void initComponent() { + if (downloader == null) { + this.downloader = new HttpClientDownloader(); + } + if (pipelines.isEmpty()) { + pipelines.add(new ConsolePipeline()); + } + downloader.setThread(threadNum); + executorService = ThreadUtils.newFixedThreadPool(threadNum); + if (startUrls != null) { + for (String startUrl : startUrls) { + scheduler.push(new Request(startUrl), this); + } + 
startUrls.clear(); + } + } - @Override - public void run() { - if (!stat.compareAndSet(STAT_INIT, STAT_RUNNING) && !stat.compareAndSet(STAT_STOPPED, STAT_RUNNING)) { - throw new IllegalStateException("Spider is already running!"); - } - checkComponent(); - if (startUrls != null) { - for (String startUrl : startUrls) { - scheduler.push(new Request(startUrl), this); - } - startUrls.clear(); - } - Request request = scheduler.poll(this); + @Override + public void run() { + checkRunningStat(); + initComponent(); logger.info("Spider " + getUUID() + " started!"); - // single thread - if (threadNum <= 1) { - while (request != null && stat.compareAndSet(STAT_RUNNING, STAT_RUNNING)) { - processRequest(request); - request = scheduler.poll(this); - } - } else { - synchronized (this) { - this.executorService = ThreadUtils.newFixedThreadPool(threadNum); - } - // multi thread - final AtomicInteger threadAlive = new AtomicInteger(0); - while (true && stat.compareAndSet(STAT_RUNNING, STAT_RUNNING)) { - if (request == null) { - // when no request found but some thread is alive, sleep a - // while. 
- try { - Thread.sleep(100); - } catch (InterruptedException e) { - } - } else { - final Request requestFinal = request; - threadAlive.incrementAndGet(); - executorService.execute(new Runnable() { - @Override - public void run() { - processRequest(requestFinal); - threadAlive.decrementAndGet(); - } - }); - } - request = scheduler.poll(this); - if (threadAlive.get() == 0) { - request = scheduler.poll(this); - if (request == null) { - break; - } - } - } - executorService.shutdown(); - } - stat.compareAndSet(STAT_RUNNING, STAT_STOPPED); - // release some resources - destroy(); - } + final AtomicInteger threadAlive = new AtomicInteger(0); + while (!Thread.currentThread().isInterrupted() && stat.get() == STAT_RUNNING) { + Request request = scheduler.poll(this); + if (request == null) { + if (threadAlive.get() == 0) { + break; + } + // when no request found but some thread is alive, sleep a + // while. + try { + Thread.sleep(100); + } catch (InterruptedException e) { + } + } else { + final Request requestFinal = request; + threadAlive.incrementAndGet(); + executorService.execute(new Runnable() { + @Override + public void run() { + try { + processRequest(requestFinal); + } catch (Exception e) { + logger.error("download "+requestFinal+" error",e); + } finally { + threadAlive.decrementAndGet(); + } + } + }); + } + } + executorService.shutdown(); + stat.set(STAT_STOPPED); + // release some resources + destroy(); + } - protected void destroy() { - destroyEach(downloader); - destroyEach(pageProcessor); - for (Pipeline pipeline : pipelines) { - destroyEach(pipeline); - } - } + private void checkRunningStat() { + while (true) { + int statNow = stat.get(); + if (statNow == STAT_RUNNING) { + throw new IllegalStateException("Spider is already running!"); + } + if (stat.compareAndSet(statNow, STAT_RUNNING)) { + break; + } + } + } - private void destroyEach(Object object) { - if (object instanceof Closeable) { - try { - ((Closeable) object).close(); - } catch (IOException e) { - 
e.printStackTrace(); - } - } - } + protected void destroy() { + destroyEach(downloader); + destroyEach(pageProcessor); + for (Pipeline pipeline : pipelines) { + destroyEach(pipeline); + } + } - /** - * Process specific urls without url discovering. - * - * @param urls - * urls to process - */ - public void test(String... urls) { - checkComponent(); - if (urls.length > 0) { - for (String url : urls) { - processRequest(new Request(url)); - } - } - } + private void destroyEach(Object object) { + if (object instanceof Closeable) { + try { + ((Closeable) object).close(); + } catch (IOException e) { + e.printStackTrace(); + } + } + } - protected void processRequest(Request request) { - Page page = downloader.download(request, this); - if (page == null) { - sleep(site.getSleepTime()); - return; - } - // for cycle retry - if (page.getHtml() == null) { - addRequest(page); - sleep(site.getSleepTime()); - return; - } - pageProcessor.process(page); - addRequest(page); - if (!page.getResultItems().isSkip()) { - for (Pipeline pipeline : pipelines) { - pipeline.process(page.getResultItems(), this); - } - } - sleep(site.getSleepTime()); - } + /** + * Process specific urls without url discovering. + * + * @param urls urls to process + */ + public void test(String... 
urls) { + initComponent(); + if (urls.length > 0) { + for (String url : urls) { + processRequest(new Request(url)); + } + } + } - protected void sleep(int time) { - try { - Thread.sleep(time); - } catch (InterruptedException e) { - e.printStackTrace(); - } - } + protected void processRequest(Request request) { + Page page = downloader.download(request, this); + if (page == null) { + sleep(site.getSleepTime()); + return; + } + // for cycle retry + if (page.getHtml() == null) { + addRequest(page); + sleep(site.getSleepTime()); + return; + } + pageProcessor.process(page); + addRequest(page); + if (!page.getResultItems().isSkip()) { + for (Pipeline pipeline : pipelines) { + pipeline.process(page.getResultItems(), this); + } + } + sleep(site.getSleepTime()); + } - protected void addRequest(Page page) { - if (CollectionUtils.isNotEmpty(page.getTargetRequests())) { - for (Request request : page.getTargetRequests()) { - scheduler.push(request, this); - } - } - } + protected void sleep(int time) { + try { + Thread.sleep(time); + } catch (InterruptedException e) { + e.printStackTrace(); + } + } - protected void checkIfRunning() { - if (!stat.compareAndSet(STAT_INIT, STAT_INIT) && !stat.compareAndSet(STAT_STOPPED, STAT_STOPPED)) { - throw new IllegalStateException("Spider is already running!"); - } - } + protected void addRequest(Page page) { + if (CollectionUtils.isNotEmpty(page.getTargetRequests())) { + for (Request request : page.getTargetRequests()) { + scheduler.push(request, this); + } + } + } - public void runAsync() { - Thread thread = new Thread(this); - thread.setDaemon(false); - thread.start(); - } + protected void checkIfRunning() { + if (stat.get() == STAT_RUNNING) { + throw new IllegalStateException("Spider is already running!"); + } + } - public void start() { - runAsync(); - } + public void runAsync() { + Thread thread = new Thread(this); + thread.setDaemon(false); + thread.start(); + } - public void stop() { - if (stat.compareAndSet(STAT_RUNNING, 
STAT_STOPPED)) { - if (executorService != null) { - executorService.shutdown(); - } - logger.info("Spider " + getUUID() + " stop success!"); - } else { - logger.info("Spider " + getUUID() + " stop fail!"); - } - } + public void start() { + runAsync(); + } - public void stopAndDestroy() { - stop(); - destroy(); - } + public void stop() { + if (stat.compareAndSet(STAT_RUNNING, STAT_STOPPED)) { + if (executorService != null) { + executorService.shutdown(); + } + logger.info("Spider " + getUUID() + " stop success!"); + } else { + logger.info("Spider " + getUUID() + " stop fail!"); + } + } - /** - * start with more than one threads - * - * @param threadNum - * @return this - */ - public Spider thread(int threadNum) { - checkIfRunning(); - this.threadNum = threadNum; - if (threadNum <= 0) { - throw new IllegalArgumentException("threadNum should be more than one!"); - } - if (threadNum == 1) { - return this; - } - return this; - } + public void stopAndDestroy() { + stop(); + destroy(); + } - /** - * switch off xsoup - * - * @return - */ - public static void xsoupOff() { - EnvironmentUtil.setUseXsoup(false); - } + /** + * start with more than one threads + * + * @param threadNum + * @return this + */ + public Spider thread(int threadNum) { + checkIfRunning(); + this.threadNum = threadNum; + if (threadNum <= 0) { + throw new IllegalArgumentException("threadNum should be more than one!"); + } + if (threadNum == 1) { + return this; + } + return this; + } - @Override - public String getUUID() { - if (uuid != null) { - return uuid; - } - if (site != null) { - return site.getDomain(); - } - return null; - } + /** + * switch off xsoup + * + * @return + */ + public static void xsoupOff() { + EnvironmentUtil.setUseXsoup(false); + } - @Override - public Site getSite() { - return site; - } + @Override + public String getUUID() { + if (uuid != null) { + return uuid; + } + if (site != null) { + return site.getDomain(); + } + return null; + } + + @Override + public Site getSite() { + 
return site; + } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/ThreadUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/ThreadUtils.java index ba9774d..cdfe6d0 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/ThreadUtils.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/ThreadUtils.java @@ -1,5 +1,7 @@ package us.codecraft.webmagic.utils; +import com.google.common.util.concurrent.MoreExecutors; + import java.util.concurrent.ExecutorService; import java.util.concurrent.SynchronousQueue; import java.util.concurrent.ThreadPoolExecutor; @@ -11,11 +13,15 @@ import java.util.concurrent.TimeUnit; */ public class ThreadUtils { - public static ExecutorService newFixedThreadPool(int threadSize) { - if (threadSize <= 1) { - throw new IllegalArgumentException("ThreadSize must be greater than 1!"); - } - return new ThreadPoolExecutor(threadSize - 1, threadSize - 1, 0L, TimeUnit.MILLISECONDS, - new SynchronousQueue(), new ThreadPoolExecutor.CallerRunsPolicy()); - } + public static ExecutorService newFixedThreadPool(int threadSize) { + if (threadSize <= 0) { + throw new IllegalArgumentException("ThreadSize must be greater than 0!"); + } + if (threadSize == 1) { + return MoreExecutors.sameThreadExecutor(); + + } + return new ThreadPoolExecutor(threadSize - 1, threadSize - 1, 0L, TimeUnit.MILLISECONDS, + new SynchronousQueue(), new ThreadPoolExecutor.CallerRunsPolicy()); + } } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/SpiderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/SpiderTest.java index 75c1ba1..3add86c 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/SpiderTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/SpiderTest.java @@ -18,7 +18,7 @@ public class SpiderTest { public void process(ResultItems resultItems, Task task) { System.out.println(1); } - }).thread(2); + }).thread(1); spider.start(); Thread.sleep(10000); spider.stop(); From 
352887870c15a1601288508a1840c925576f1961 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Thu, 31 Oct 2013 22:22:14 +0800 Subject: [PATCH 14/38] remove shutdown call --- .../java/us/codecraft/webmagic/Spider.java | 31 +++++++++++++------ 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 149f0a8..65ee7af 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -74,6 +74,8 @@ public class Spider implements Runnable, Task { protected AtomicInteger stat = new AtomicInteger(STAT_INIT); + protected boolean exitWhenComplete = false; + protected final static int STAT_INIT = 0; protected final static int STAT_RUNNING = 1; @@ -240,7 +242,7 @@ public class Spider implements Runnable, Task { while (!Thread.currentThread().isInterrupted() && stat.get() == STAT_RUNNING) { Request request = scheduler.poll(this); if (request == null) { - if (threadAlive.get() == 0) { + if (threadAlive.get() == 0 && exitWhenComplete) { break; } // when no request found but some thread is alive, sleep a @@ -258,7 +260,7 @@ public class Spider implements Runnable, Task { try { processRequest(requestFinal); } catch (Exception e) { - logger.error("download "+requestFinal+" error",e); + logger.error("download " + requestFinal + " error", e); } finally { threadAlive.decrementAndGet(); } @@ -372,20 +374,12 @@ public class Spider implements Runnable, Task { public void stop() { if (stat.compareAndSet(STAT_RUNNING, STAT_STOPPED)) { - if (executorService != null) { - executorService.shutdown(); - } logger.info("Spider " + getUUID() + " stop success!"); } else { logger.info("Spider " + getUUID() + " stop fail!"); } } - public void stopAndDestroy() { - stop(); - destroy(); - } - /** * start with more than one threads * @@ -413,6 +407,23 @@ public class Spider implements 
Runnable, Task { EnvironmentUtil.setUseXsoup(false); } + public boolean isExitWhenComplete() { + return exitWhenComplete; + } + + /** + * Exit when complete.
+ * True: exit when all urls of the site are downloaded.
+ * False: do not exit until stop() is called manually.
+ * + * @param exitWhenComplete + * @return + */ + public Spider setExitWhenComplete(boolean exitWhenComplete) { + this.exitWhenComplete = exitWhenComplete; + return this; + } + @Override public String getUUID() { if (uuid != null) { From b4fcf4116830c332992665e218551db64cd215b4 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Thu, 31 Oct 2013 22:41:02 +0800 Subject: [PATCH 15/38] add exit when comlete option --- .../java/us/codecraft/webmagic/Spider.java | 51 ++++++++++++++++--- .../example/OschinaBlogPageProcesser.java | 2 +- 2 files changed, 46 insertions(+), 7 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 65ee7af..1c4160d 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -18,6 +18,8 @@ import java.util.ArrayList; import java.util.List; import java.util.concurrent.ExecutorService; import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.locks.Condition; +import java.util.concurrent.locks.ReentrantLock; /** * Entrance of a crawler.
@@ -74,7 +76,7 @@ public class Spider implements Runnable, Task { protected AtomicInteger stat = new AtomicInteger(STAT_INIT); - protected boolean exitWhenComplete = false; + protected boolean exitWhenComplete = true; protected final static int STAT_INIT = 0; @@ -82,6 +84,10 @@ public class Spider implements Runnable, Task { protected final static int STAT_STOPPED = 2; + private ReentrantLock newUrlLock = new ReentrantLock(); + + private Condition newUrlCondition = newUrlLock.newCondition(); + /** * create a spider with pageProcessor. * @@ -245,11 +251,15 @@ public class Spider implements Runnable, Task { if (threadAlive.get() == 0 && exitWhenComplete) { break; } - // when no request found but some thread is alive, sleep a - // while. + // wait until new url added try { - Thread.sleep(100); - } catch (InterruptedException e) { + newUrlLock.lock(); + try { + newUrlCondition.await(); + } catch (InterruptedException e) { + } + } finally { + newUrlLock.unlock(); } } else { final Request requestFinal = request; @@ -263,6 +273,7 @@ public class Spider implements Runnable, Task { logger.error("download " + requestFinal + " error", e); } finally { threadAlive.decrementAndGet(); + signalNewUrl(); } } }); @@ -351,11 +362,16 @@ public class Spider implements Runnable, Task { protected void addRequest(Page page) { if (CollectionUtils.isNotEmpty(page.getTargetRequests())) { for (Request request : page.getTargetRequests()) { - scheduler.push(request, this); + addRequest(request); } } } + private void addRequest(Request request) { + scheduler.push(request, this); + + } + protected void checkIfRunning() { if (stat.get() == STAT_RUNNING) { throw new IllegalStateException("Spider is already running!"); @@ -368,6 +384,29 @@ public class Spider implements Runnable, Task { thread.start(); } + /** + * Add urls to crawl.
+ * + * @param urls + * @return + */ + public Spider addUrl(String... urls) { + for (String url : urls) { + addRequest(new Request(url)); + } + signalNewUrl(); + return this; + } + + private void signalNewUrl() { + try { + newUrlLock.lock(); + newUrlCondition.signalAll(); + } finally { + newUrlLock.unlock(); + } + } + public void start() { runAsync(); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/OschinaBlogPageProcesser.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/OschinaBlogPageProcesser.java index fa8dab6..2c53b2d 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/OschinaBlogPageProcesser.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/OschinaBlogPageProcesser.java @@ -34,6 +34,6 @@ public class OschinaBlogPageProcesser implements PageProcessor { } public static void main(String[] args) { - Spider.create(new OschinaBlogPageProcesser()).thread(2).run(); + Spider.create(new OschinaBlogPageProcesser()).thread(10).run(); } } From 84976c81ecc355d7ddb37415495c0b03f818c2ea Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Thu, 31 Oct 2013 22:48:18 +0800 Subject: [PATCH 16/38] remove useless code --- webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java | 3 --- 1 file changed, 3 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 1c4160d..9f1c479 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -431,9 +431,6 @@ public class Spider implements Runnable, Task { if (threadNum <= 0) { throw new IllegalArgumentException("threadNum should be more than one!"); } - if (threadNum == 1) { - return this; - } return this; } From 1446ada7327e08c2c96659b4288b9883d4b96a9f Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Thu, 31 Oct 2013 22:50:22 
+0800 Subject: [PATCH 17/38] some refactor --- .../java/us/codecraft/webmagic/Spider.java | 22 +++++++++++-------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 9f1c479..da0d98a 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -252,15 +252,7 @@ public class Spider implements Runnable, Task { break; } // wait until new url added - try { - newUrlLock.lock(); - try { - newUrlCondition.await(); - } catch (InterruptedException e) { - } - } finally { - newUrlLock.unlock(); - } + waitNewUrl(); } else { final Request requestFinal = request; threadAlive.incrementAndGet(); @@ -398,6 +390,18 @@ public class Spider implements Runnable, Task { return this; } + private void waitNewUrl() { + try { + newUrlLock.lock(); + try { + newUrlCondition.await(); + } catch (InterruptedException e) { + } + } finally { + newUrlLock.unlock(); + } + } + private void signalNewUrl() { try { newUrlLock.lock(); From 6fa82a418ba31109c3c032976a7ec16bef6bcdc1 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sun, 3 Nov 2013 20:20:50 +0800 Subject: [PATCH 18/38] #29 seed urls with more information --- .../main/java/us/codecraft/webmagic/Site.java | 106 +++++++++++++----- .../java/us/codecraft/webmagic/Spider.java | 42 +++++-- .../example/OschinaBlogPageProcesser.java | 2 +- .../us/codecraft/webmagic/utils/UrlUtils.java | 21 +++- 4 files changed, 132 insertions(+), 39 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java index a84ba48..b5f8865 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java @@ -24,7 +24,7 @@ public class Site { /** * startUrls is the urls the crawler to start 
with. */ - private List startUrls = new ArrayList(); + private List startRequests = new ArrayList(); private int sleepTime = 3000; @@ -38,7 +38,7 @@ public class Site { private Set acceptStatCode = DEFAULT_STATUS_CODE_SET; - private Map headers = new HashMap(); + private Map headers = new HashMap(); public static interface HeaderConst { @@ -182,9 +182,16 @@ public class Site { * get start urls * * @return start urls + * @see #getStartRequests + * @deprecated */ + @Deprecated public List getStartUrls() { - return startUrls; + return UrlUtils.convertToUrls(startRequests); + } + + public List getStartRequests() { + return startRequests; } /** @@ -194,11 +201,19 @@ public class Site { * @return this */ public Site addStartUrl(String startUrl) { - this.startUrls.add(startUrl); - if (domain == null) { - if (startUrls.size() > 0) { - domain = UrlUtils.getDomain(startUrls.get(0)); - } + return addStartRequest(new Request(startUrl)); + } + + /** + * Add a url to start url.
+ * + * @param startRequest + * @return this + */ + public Site addStartRequest(Request startRequest) { + this.startRequests.add(startRequest); + if (domain == null && startRequest.getUrl() != null) { + domain = UrlUtils.getDomain(startRequest.getUrl()); } return this; } @@ -241,12 +256,13 @@ public class Site { /** * Put an Http header for downloader.
* Use {@link #addCookie(String, String)} for cookie and {@link #setUserAgent(String)} for user-agent.
- * @param key key of http header, there are some keys constant in {@link HeaderConst} + * + * @param key key of http header, there are some keys constant in {@link HeaderConst} * @param value value of header * @return */ - public Site addHeader(String key, String value){ - headers.put(key,value); + public Site addHeader(String key, String value) { + headers.put(key, value); return this; } @@ -279,23 +295,6 @@ public class Site { return this; } - @Override - public boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - - Site site = (Site) o; - - if (acceptStatCode != null ? !acceptStatCode.equals(site.acceptStatCode) : site.acceptStatCode != null) - return false; - if (!domain.equals(site.domain)) return false; - if (!startUrls.equals(site.startUrls)) return false; - if (charset != null ? !charset.equals(site.charset) : site.charset != null) return false; - if (userAgent != null ? !userAgent.equals(site.userAgent) : site.userAgent != null) return false; - - return true; - } - public Task toTask() { return new Task() { @Override @@ -310,13 +309,60 @@ public class Site { }; } + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + Site site = (Site) o; + + if (cycleRetryTimes != site.cycleRetryTimes) return false; + if (retryTimes != site.retryTimes) return false; + if (sleepTime != site.sleepTime) return false; + if (timeOut != site.timeOut) return false; + if (acceptStatCode != null ? !acceptStatCode.equals(site.acceptStatCode) : site.acceptStatCode != null) + return false; + if (charset != null ? !charset.equals(site.charset) : site.charset != null) return false; + if (cookies != null ? !cookies.equals(site.cookies) : site.cookies != null) return false; + if (domain != null ? !domain.equals(site.domain) : site.domain != null) return false; + if (headers != null ? 
!headers.equals(site.headers) : site.headers != null) return false; + if (startRequests != null ? !startRequests.equals(site.startRequests) : site.startRequests != null) + return false; + if (userAgent != null ? !userAgent.equals(site.userAgent) : site.userAgent != null) return false; + + return true; + } + @Override public int hashCode() { - int result = domain.hashCode(); - result = 31 * result + (startUrls != null ? startUrls.hashCode() : 0); + int result = domain != null ? domain.hashCode() : 0; result = 31 * result + (userAgent != null ? userAgent.hashCode() : 0); + result = 31 * result + (cookies != null ? cookies.hashCode() : 0); result = 31 * result + (charset != null ? charset.hashCode() : 0); + result = 31 * result + (startRequests != null ? startRequests.hashCode() : 0); + result = 31 * result + sleepTime; + result = 31 * result + retryTimes; + result = 31 * result + cycleRetryTimes; + result = 31 * result + timeOut; result = 31 * result + (acceptStatCode != null ? acceptStatCode.hashCode() : 0); + result = 31 * result + (headers != null ? 
headers.hashCode() : 0); return result; } + + @Override + public String toString() { + return "Site{" + + "domain='" + domain + '\'' + + ", userAgent='" + userAgent + '\'' + + ", cookies=" + cookies + + ", charset='" + charset + '\'' + + ", startRequests=" + startRequests + + ", sleepTime=" + sleepTime + + ", retryTimes=" + retryTimes + + ", cycleRetryTimes=" + cycleRetryTimes + + ", timeOut=" + timeOut + + ", acceptStatCode=" + acceptStatCode + + ", headers=" + headers + + '}'; + } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index da0d98a..54f51d9 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -11,6 +11,7 @@ import us.codecraft.webmagic.scheduler.QueueScheduler; import us.codecraft.webmagic.scheduler.Scheduler; import us.codecraft.webmagic.utils.EnvironmentUtil; import us.codecraft.webmagic.utils.ThreadUtils; +import us.codecraft.webmagic.utils.UrlUtils; import java.io.Closeable; import java.io.IOException; @@ -60,7 +61,7 @@ public class Spider implements Runnable, Task { protected PageProcessor pageProcessor; - protected List startUrls; + protected List startRequests; protected Site site; @@ -107,7 +108,7 @@ public class Spider implements Runnable, Task { public Spider(PageProcessor pageProcessor) { this.pageProcessor = pageProcessor; this.site = pageProcessor.getSite(); - this.startUrls = pageProcessor.getSite().getStartUrls(); + this.startRequests = pageProcessor.getSite().getStartRequests(); } /** @@ -119,7 +120,20 @@ public class Spider implements Runnable, Task { */ public Spider startUrls(List startUrls) { checkIfRunning(); - this.startUrls = startUrls; + this.startRequests = UrlUtils.convertToRequests(startUrls); + return this; + } + + /** + * Set startUrls of Spider.
+ * Prior to startUrls of Site. + * + * @param startUrls + * @return this + */ + public Spider startRequest(List startRequests) { + checkIfRunning(); + this.startRequests = startRequests; return this; } @@ -231,11 +245,11 @@ public class Spider implements Runnable, Task { } downloader.setThread(threadNum); executorService = ThreadUtils.newFixedThreadPool(threadNum); - if (startUrls != null) { - for (String startUrl : startUrls) { - scheduler.push(new Request(startUrl), this); + if (startRequests != null) { + for (Request request : startRequests) { + scheduler.push(request, this); } - startUrls.clear(); + startRequests.clear(); } } @@ -390,6 +404,20 @@ public class Spider implements Runnable, Task { return this; } + /** + * Add urls with information to crawl.
+ * + * @param urls + * @return + */ + public Spider addRequest(Request... requests) { + for (Request request : requests) { + addRequest(request); + } + signalNewUrl(); + return this; + } + private void waitNewUrl() { try { newUrlLock.lock(); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/OschinaBlogPageProcesser.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/OschinaBlogPageProcesser.java index 2c53b2d..fa8dab6 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/OschinaBlogPageProcesser.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/OschinaBlogPageProcesser.java @@ -34,6 +34,6 @@ public class OschinaBlogPageProcesser implements PageProcessor { } public static void main(String[] args) { - Spider.create(new OschinaBlogPageProcesser()).thread(10).run(); + Spider.create(new OschinaBlogPageProcesser()).thread(2).run(); } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java index 9ca776d..e45f948 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java @@ -1,10 +1,13 @@ package us.codecraft.webmagic.utils; import org.apache.commons.lang3.StringUtils; +import us.codecraft.webmagic.Request; import java.net.MalformedURLException; import java.net.URL; import java.nio.charset.Charset; +import java.util.ArrayList; +import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -18,7 +21,7 @@ public class UrlUtils { /** * canonicalizeUrl - * + *

* Borrowed from Jsoup. * * @param url @@ -85,6 +88,22 @@ public class UrlUtils { return stringBuilder.toString(); } + public static List convertToRequests(List urls) { + List requestList = new ArrayList(urls.size()); + for (String url : urls) { + requestList.add(new Request(url)); + } + return requestList; + } + + public static List convertToUrls(List requests) { + List urlList = new ArrayList(requests.size()); + for (Request request : requests) { + urlList.add(request.getUrl()); + } + return urlList; + } + private static final Pattern patternForCharset = Pattern.compile("charset\\s*=\\s*['\"]*([^\\s;'\"]*)"); public static String getCharset(String contentType) { From 583a0eba8c04d29ab30bac74138a586aa4edc475 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sun, 3 Nov 2013 20:24:26 +0800 Subject: [PATCH 19/38] #29 refactor some method name --- .../src/main/java/us/codecraft/webmagic/Spider.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 54f51d9..32bd3e1 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -343,12 +343,12 @@ public class Spider implements Runnable, Task { } // for cycle retry if (page.getHtml() == null) { - addRequest(page); + extractAndAddRequests(page); sleep(site.getSleepTime()); return; } pageProcessor.process(page); - addRequest(page); + extractAndAddRequests(page); if (!page.getResultItems().isSkip()) { for (Pipeline pipeline : pipelines) { pipeline.process(page.getResultItems(), this); @@ -365,7 +365,7 @@ public class Spider implements Runnable, Task { } } - protected void addRequest(Page page) { + protected void extractAndAddRequests(Page page) { if (CollectionUtils.isNotEmpty(page.getTargetRequests())) { for (Request request : page.getTargetRequests()) { addRequest(request); From 
160a149b0590da553b96bddabfe023b5ddf8941d Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sun, 3 Nov 2013 23:10:09 +0800 Subject: [PATCH 20/38] todo bugfix --- .../src/main/java/us/codecraft/webmagic/Spider.java | 2 +- .../codecraft/webmagic/downloader/HttpClientDownloader.java | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 32bd3e1..04ac894 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -482,7 +482,7 @@ public class Spider implements Runnable, Task { /** * Exit when complete.
* True: exit when all url of the site is downloaded.
- * False: not exit until call stop manually.
+ * False: not exit until call stop() manually.
* * @param exitWhenComplete * @return diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 1bee564..d6ee8c1 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -90,11 +90,15 @@ public class HttpClientDownloader implements Downloader { HttpClient httpClient = getHttpClientPool().getClient(site); try { HttpGet httpGet = new HttpGet(request.getUrl()); + if (headers!=null){ for (Map.Entry headerEntry : headers.entrySet()) { httpGet.addHeader(headerEntry.getKey(),headerEntry.getValue()); } } + if (!httpGet.containsHeader("Accept-Encoding")) { + httpGet.addHeader("Accept-Encoding", "gzip"); + } HttpResponse httpResponse = null; int tried = 0; boolean retry; @@ -168,6 +172,7 @@ public class HttpClientDownloader implements Downloader { HeaderElement[] codecs = ceheader.getElements(); for (HeaderElement codec : codecs) { if (codec.getName().equalsIgnoreCase("gzip")) { + //todo bugfix httpResponse.setEntity( new GzipDecompressingEntity(httpResponse.getEntity())); } From edfc319c454a6812841d1a9c9c01f622ee1c1293 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Mon, 4 Nov 2013 00:06:30 +0800 Subject: [PATCH 21/38] update httpclient to 4.3.1 --- pom.xml | 2 +- .../downloader/HttpClientDownloader.java | 129 +++++++----------- .../webmagic/downloader/HttpClientPool.java | 97 +++++++------ 3 files changed, 108 insertions(+), 120 deletions(-) diff --git a/pom.xml b/pom.xml index 8f9837f..918ab6a 100644 --- a/pom.xml +++ b/pom.xml @@ -61,7 +61,7 @@ org.apache.httpcomponents httpclient - 4.2.4 + 4.3.1 com.google.guava diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java 
b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index d6ee8c1..ce4f8cb 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -1,13 +1,11 @@ package us.codecraft.webmagic.downloader; -import org.apache.commons.io.IOUtils; -import org.apache.http.Header; -import org.apache.http.HeaderElement; import org.apache.http.HttpResponse; import org.apache.http.annotation.ThreadSafe; -import org.apache.http.client.HttpClient; -import org.apache.http.client.entity.GzipDecompressingEntity; +import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpGet; +import org.apache.http.impl.client.CloseableHttpClient; +import org.apache.http.util.EntityUtils; import org.apache.log4j.Logger; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; @@ -34,7 +32,7 @@ public class HttpClientDownloader implements Downloader { private Logger logger = Logger.getLogger(getClass()); - private HttpClientPool httpClientPool; + private volatile CloseableHttpClient httpClient; private int poolSize = 1; @@ -60,11 +58,15 @@ public class HttpClientDownloader implements Downloader { return (Html) page.getHtml(); } - private HttpClientPool getHttpClientPool(){ - if (httpClientPool==null){ - httpClientPool = new HttpClientPool(poolSize); + private CloseableHttpClient getHttpClient(Site site) { + if (httpClient == null) { + synchronized (this) { + if (httpClient == null) { + httpClient = new HttpClientPool(poolSize).getClient(site); + } + } } - return httpClientPool; + return httpClient; } @Override @@ -73,12 +75,10 @@ public class HttpClientDownloader implements Downloader { if (task != null) { site = task.getSite(); } - int retryTimes = 0; Set acceptStatCode; String charset = null; - Map headers = null; + Map headers = null; if (site != null) { - retryTimes = 
site.getRetryTimes(); acceptStatCode = site.getAcceptStatCode(); charset = site.getCharset(); headers = site.getHeaders(); @@ -87,54 +87,17 @@ public class HttpClientDownloader implements Downloader { acceptStatCode.add(200); } logger.info("downloading page " + request.getUrl()); - HttpClient httpClient = getHttpClientPool().getClient(site); + HttpGet httpGet = new HttpGet(request.getUrl()); + if (headers != null) { + for (Map.Entry headerEntry : headers.entrySet()) { + httpGet.addHeader(headerEntry.getKey(), headerEntry.getValue()); + } + } + CloseableHttpResponse httpResponse = null; try { - HttpGet httpGet = new HttpGet(request.getUrl()); - - if (headers!=null){ - for (Map.Entry headerEntry : headers.entrySet()) { - httpGet.addHeader(headerEntry.getKey(),headerEntry.getValue()); - } - } - if (!httpGet.containsHeader("Accept-Encoding")) { - httpGet.addHeader("Accept-Encoding", "gzip"); - } - HttpResponse httpResponse = null; - int tried = 0; - boolean retry; - do { - try { - httpResponse = httpClient.execute(httpGet); - retry = false; - } catch (IOException e) { - tried++; - - if (tried > retryTimes) { - logger.warn("download page " + request.getUrl() + " error", e); - if (site.getCycleRetryTimes() > 0) { - Page page = new Page(); - Object cycleTriedTimesObject = request.getExtra(Request.CYCLE_TRIED_TIMES); - if (cycleTriedTimesObject == null) { - page.addTargetRequest(request.setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, 1)); - } else { - int cycleTriedTimes = (Integer) cycleTriedTimesObject; - cycleTriedTimes++; - if (cycleTriedTimes >= site.getCycleRetryTimes()) { - return null; - } - page.addTargetRequest(request.setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, 1)); - } - return page; - } - return null; - } - logger.info("download page " + request.getUrl() + " error, retry the " + tried + " time!"); - retry = true; - } - } while (retry); + httpResponse = getHttpClient(site).execute(httpGet); int statusCode = 
httpResponse.getStatusLine().getStatusCode(); if (acceptStatCode.contains(statusCode)) { - handleGzip(httpResponse); //charset if (charset == null) { String value = httpResponse.getEntity().getContentType().getValue(); @@ -143,16 +106,43 @@ public class HttpClientDownloader implements Downloader { return handleResponse(request, charset, httpResponse, task); } else { logger.warn("code error " + statusCode + "\t" + request.getUrl()); + return null; } - } catch (Exception e) { + } catch (IOException e) { logger.warn("download page " + request.getUrl() + " error", e); + if (site.getCycleRetryTimes() > 0) { + return addToCycleRetry(request, site); + } + return null; + } finally { + try { + if (httpResponse != null) { + httpResponse.close(); + } + } catch (IOException e) { + logger.warn("close response fail", e); + } } - return null; + } + + private Page addToCycleRetry(Request request, Site site) { + Page page = new Page(); + Object cycleTriedTimesObject = request.getExtra(Request.CYCLE_TRIED_TIMES); + if (cycleTriedTimesObject == null) { + page.addTargetRequest(request.setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, 1)); + } else { + int cycleTriedTimes = (Integer) cycleTriedTimesObject; + cycleTriedTimes++; + if (cycleTriedTimes >= site.getCycleRetryTimes()) { + return null; + } + page.addTargetRequest(request.setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, 1)); + } + return page; } protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException { - String content = IOUtils.toString(httpResponse.getEntity().getContent(), - charset); + String content = EntityUtils.toString(httpResponse.getEntity(), charset); Page page = new Page(); page.setHtml(new Html(UrlUtils.fixAllRelativeHrefs(content, request.getUrl()))); page.setUrl(new PlainText(request.getUrl())); @@ -163,20 +153,5 @@ public class HttpClientDownloader implements Downloader { @Override public void setThread(int thread) { poolSize = thread; - 
httpClientPool = new HttpClientPool(thread); - } - - private void handleGzip(HttpResponse httpResponse) { - Header ceheader = httpResponse.getEntity().getContentEncoding(); - if (ceheader != null) { - HeaderElement[] codecs = ceheader.getElements(); - for (HeaderElement codec : codecs) { - if (codec.getName().equalsIgnoreCase("gzip")) { - //todo bugfix - httpResponse.setEntity( - new GzipDecompressingEntity(httpResponse.getEntity())); - } - } - } } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java index c882836..43ee94d 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java @@ -1,72 +1,85 @@ package us.codecraft.webmagic.downloader; -import org.apache.http.HttpVersion; +import org.apache.http.*; import org.apache.http.client.CookieStore; -import org.apache.http.client.HttpClient; -import org.apache.http.client.params.ClientPNames; -import org.apache.http.client.params.CookiePolicy; -import org.apache.http.conn.scheme.PlainSocketFactory; -import org.apache.http.conn.scheme.Scheme; -import org.apache.http.conn.scheme.SchemeRegistry; -import org.apache.http.conn.ssl.SSLSocketFactory; -import org.apache.http.impl.client.BasicCookieStore; -import org.apache.http.impl.client.DefaultHttpClient; -import org.apache.http.impl.conn.PoolingClientConnectionManager; +import org.apache.http.client.entity.GzipDecompressingEntity; +import org.apache.http.config.Registry; +import org.apache.http.config.RegistryBuilder; +import org.apache.http.conn.socket.ConnectionSocketFactory; +import org.apache.http.conn.socket.PlainConnectionSocketFactory; +import org.apache.http.conn.ssl.SSLConnectionSocketFactory; +import org.apache.http.impl.client.*; +import org.apache.http.impl.conn.PoolingHttpClientConnectionManager; import 
org.apache.http.impl.cookie.BasicClientCookie; -import org.apache.http.params.*; +import org.apache.http.protocol.HttpContext; import us.codecraft.webmagic.Site; +import java.io.IOException; import java.util.Map; /** * @author code4crafter@gmail.com
- * @since 0.1.0 + * @since 0.3.3 */ public class HttpClientPool { - private int poolSize; - - private PoolingClientConnectionManager connectionManager; + private PoolingHttpClientConnectionManager connectionManager; public HttpClientPool(int poolSize) { - this.poolSize = poolSize; - SchemeRegistry schemeRegistry = new SchemeRegistry(); - schemeRegistry.register(new Scheme("http", 80, PlainSocketFactory.getSocketFactory())); - schemeRegistry.register(new Scheme("https", 443, SSLSocketFactory.getSocketFactory())); - - connectionManager = new PoolingClientConnectionManager(schemeRegistry); + Registry reg = RegistryBuilder.create() + .register("http", PlainConnectionSocketFactory.INSTANCE) + .register("https", SSLConnectionSocketFactory.getSocketFactory()) + .build(); + PoolingHttpClientConnectionManager connectionManager = new PoolingHttpClientConnectionManager(reg); connectionManager.setMaxTotal(poolSize); connectionManager.setDefaultMaxPerRoute(100); } - public HttpClient getClient(Site site) { + public CloseableHttpClient getClient(Site site) { return generateClient(site); } - private HttpClient generateClient(Site site) { - HttpParams params = new BasicHttpParams(); + private CloseableHttpClient generateClient(Site site) { + HttpClientBuilder httpClientBuilder = HttpClients.custom().setConnectionManager(connectionManager); if (site != null && site.getUserAgent() != null) { - params.setParameter(CoreProtocolPNames.USER_AGENT, site.getUserAgent()); - params.setIntParameter(CoreConnectionPNames.SO_TIMEOUT, site.getTimeOut()); - params.setIntParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, site.getTimeOut()); + httpClientBuilder.setUserAgent(site.getUserAgent()); } else { - params.setIntParameter(CoreConnectionPNames.SO_TIMEOUT, 3000); - params.setIntParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, 3000); + httpClientBuilder.setUserAgent(""); } + httpClientBuilder.addInterceptorFirst(new HttpRequestInterceptor() { - params.setParameter(ClientPNames.COOKIE_POLICY, 
CookiePolicy.BEST_MATCH); - HttpProtocolParamBean paramsBean = new HttpProtocolParamBean(params); - paramsBean.setVersion(HttpVersion.HTTP_1_1); - if (site != null && site.getCharset() != null) { - paramsBean.setContentCharset(site.getCharset()); - } - paramsBean.setUseExpectContinue(false); + public void process( + final HttpRequest request, + final HttpContext context) throws HttpException, IOException { + if (!request.containsHeader("Accept-Encoding")) { + request.addHeader("Accept-Encoding", "gzip"); + } - DefaultHttpClient httpClient = new DefaultHttpClient(connectionManager, params); - if (site != null) { - generateCookie(httpClient, site); - } - return httpClient; + } + }).addInterceptorFirst(new HttpResponseInterceptor() { + + public void process( + final HttpResponse response, + final HttpContext context) throws HttpException, IOException { + HttpEntity entity = response.getEntity(); + if (entity != null) { + Header ceheader = entity.getContentEncoding(); + if (ceheader != null) { + HeaderElement[] codecs = ceheader.getElements(); + for (int i = 0; i < codecs.length; i++) { + if (codecs[i].getName().equalsIgnoreCase("gzip")) { + response.setEntity( + new GzipDecompressingEntity(response.getEntity())); + return; + } + } + } + } + } + + }); + httpClientBuilder.setRetryHandler(new DefaultHttpRequestRetryHandler(site.getRetryTimes(),true)); + return httpClientBuilder.build(); } private void generateCookie(DefaultHttpClient httpClient, Site site) { From 09153ff71506eb502ec341c29f332b5eed24ff3c Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Mon, 4 Nov 2013 00:47:09 +0800 Subject: [PATCH 22/38] #22 http proxy support #32 update httpclient to 4.3.1 --- .../main/java/us/codecraft/webmagic/Site.java | 17 +++++++++++ .../downloader/HttpClientDownloader.java | 30 ++++++++++++++----- .../webmagic/downloader/HttpClientPool.java | 4 ++- .../webmagic/model/samples/OschinaBlog.java | 13 ++++++-- 4 files changed, 52 insertions(+), 12 deletions(-) diff --git 
a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java index b5f8865..33e9b8f 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java @@ -1,5 +1,6 @@ package us.codecraft.webmagic; +import org.apache.http.HttpHost; import us.codecraft.webmagic.utils.UrlUtils; import java.util.*; @@ -40,6 +41,8 @@ public class Site { private Map headers = new HashMap(); + private HttpHost httpProxy; + public static interface HeaderConst { public static final String REFERER = "Referer"; @@ -295,6 +298,20 @@ public class Site { return this; } + public HttpHost getHttpProxy() { + return httpProxy; + } + + /** + * set up httpProxy for this site + * @param httpProxy + * @return + */ + public Site setHttpProxy(HttpHost httpProxy) { + this.httpProxy = httpProxy; + return this; + } + public Task toTask() { return new Task() { @Override diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index ce4f8cb..2da585f 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -1,9 +1,11 @@ package us.codecraft.webmagic.downloader; +import com.google.common.collect.Sets; import org.apache.http.HttpResponse; import org.apache.http.annotation.ThreadSafe; +import org.apache.http.client.config.RequestConfig; import org.apache.http.client.methods.CloseableHttpResponse; -import org.apache.http.client.methods.HttpGet; +import org.apache.http.client.methods.RequestBuilder; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.util.EntityUtils; import org.apache.log4j.Logger; @@ -16,7 +18,7 @@ import us.codecraft.webmagic.selector.PlainText; import 
us.codecraft.webmagic.utils.UrlUtils; import java.io.IOException; -import java.util.HashSet; +import java.util.HashMap; import java.util.Map; import java.util.Set; @@ -32,7 +34,7 @@ public class HttpClientDownloader implements Downloader { private Logger logger = Logger.getLogger(getClass()); - private volatile CloseableHttpClient httpClient; + private final Map httpClients = new HashMap(); private int poolSize = 1; @@ -59,10 +61,16 @@ public class HttpClientDownloader implements Downloader { } private CloseableHttpClient getHttpClient(Site site) { + if (site == null) { + return new HttpClientPool(poolSize).getClient(null); + } + String domain = site.getDomain(); + CloseableHttpClient httpClient = httpClients.get(domain); if (httpClient == null) { synchronized (this) { if (httpClient == null) { httpClient = new HttpClientPool(poolSize).getClient(site); + httpClients.put(domain, httpClient); } } } @@ -83,19 +91,25 @@ public class HttpClientDownloader implements Downloader { charset = site.getCharset(); headers = site.getHeaders(); } else { - acceptStatCode = new HashSet(); - acceptStatCode.add(200); + acceptStatCode = Sets.newHashSet(200); } logger.info("downloading page " + request.getUrl()); - HttpGet httpGet = new HttpGet(request.getUrl()); + RequestBuilder requestBuilder = RequestBuilder.get().setUri(request.getUrl()); if (headers != null) { for (Map.Entry headerEntry : headers.entrySet()) { - httpGet.addHeader(headerEntry.getKey(), headerEntry.getValue()); + requestBuilder.addHeader(headerEntry.getKey(), headerEntry.getValue()); } } + RequestConfig.Builder requestConfigBuilder = RequestConfig.custom() + .setConnectionRequestTimeout(site.getTimeOut()) + .setConnectTimeout(site.getTimeOut()); + if (site.getHttpProxy()!=null){ + requestConfigBuilder.setProxy(site.getHttpProxy()); + } + requestBuilder.setConfig(requestConfigBuilder.build()); CloseableHttpResponse httpResponse = null; try { - httpResponse = getHttpClient(site).execute(httpGet); + httpResponse = 
getHttpClient(site).execute(requestBuilder.build()); int statusCode = httpResponse.getStatusLine().getStatusCode(); if (acceptStatCode.contains(statusCode)) { //charset diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java index 43ee94d..62d8718 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java @@ -78,7 +78,9 @@ public class HttpClientPool { } }); - httpClientBuilder.setRetryHandler(new DefaultHttpRequestRetryHandler(site.getRetryTimes(),true)); + if (site!=null){ + httpClientBuilder.setRetryHandler(new DefaultHttpRequestRetryHandler(site.getRetryTimes(),true)); + } return httpClientBuilder.build(); } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java index a7f51ad..d6b9c9d 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java @@ -1,10 +1,12 @@ package us.codecraft.webmagic.model.samples; +import org.apache.http.HttpHost; import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Task; import us.codecraft.webmagic.model.OOSpider; +import us.codecraft.webmagic.model.PageModelPipeline; import us.codecraft.webmagic.model.annotation.ExtractBy; import us.codecraft.webmagic.model.annotation.TargetUrl; -import us.codecraft.webmagic.pipeline.JsonFilePageModelPipeline; import java.util.List; @@ -24,8 +26,13 @@ public class OschinaBlog{ private List tags; public static void main(String[] args) { - OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog") - ,new JsonFilePageModelPipeline(), OschinaBlog.class).run(); + 
OOSpider.create(Site.me().setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36").addStartUrl("http://my.oschina.net/flashsword/blog").setSleepTime(0).setHttpProxy(new HttpHost("127.0.0.1",8888)) + ,new PageModelPipeline() { + @Override + public void process(Object o, Task task) { + + } + }, OschinaBlog.class).thread(10).run(); } public String getTitle() { From 3c6fced48e3efe2883646d53176af4073bbcca70 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Mon, 4 Nov 2013 00:53:01 +0800 Subject: [PATCH 23/38] update connection client --- .../java/us/codecraft/webmagic/downloader/HttpClientPool.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java index 62d8718..5c80ba5 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java @@ -30,7 +30,7 @@ public class HttpClientPool { .register("http", PlainConnectionSocketFactory.INSTANCE) .register("https", SSLConnectionSocketFactory.getSocketFactory()) .build(); - PoolingHttpClientConnectionManager connectionManager = new PoolingHttpClientConnectionManager(reg); + connectionManager = new PoolingHttpClientConnectionManager(reg); connectionManager.setMaxTotal(poolSize); connectionManager.setDefaultMaxPerRoute(100); } From a37f40e6e60b6c038522cd643bd836166ee4e7d1 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Mon, 4 Nov 2013 00:59:48 +0800 Subject: [PATCH 24/38] add cookie supoort --- .../codecraft/webmagic/downloader/HttpClientDownloader.java | 6 ++++-- .../us/codecraft/webmagic/downloader/HttpClientPool.java | 5 +++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git 
a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 2da585f..cdbc55a 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -3,6 +3,7 @@ package us.codecraft.webmagic.downloader; import com.google.common.collect.Sets; import org.apache.http.HttpResponse; import org.apache.http.annotation.ThreadSafe; +import org.apache.http.client.config.CookieSpecs; import org.apache.http.client.config.RequestConfig; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.RequestBuilder; @@ -102,8 +103,9 @@ public class HttpClientDownloader implements Downloader { } RequestConfig.Builder requestConfigBuilder = RequestConfig.custom() .setConnectionRequestTimeout(site.getTimeOut()) - .setConnectTimeout(site.getTimeOut()); - if (site.getHttpProxy()!=null){ + .setConnectTimeout(site.getTimeOut()) + .setCookieSpec(CookieSpecs.BEST_MATCH); + if (site.getHttpProxy() != null) { requestConfigBuilder.setProxy(site.getHttpProxy()); } requestBuilder.setConfig(requestConfigBuilder.build()); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java index 5c80ba5..a0ab74a 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java @@ -81,10 +81,11 @@ public class HttpClientPool { if (site!=null){ httpClientBuilder.setRetryHandler(new DefaultHttpRequestRetryHandler(site.getRetryTimes(),true)); } + generateCookie(httpClientBuilder,site); return httpClientBuilder.build(); } - private void generateCookie(DefaultHttpClient httpClient, Site site) { + 
private void generateCookie(HttpClientBuilder httpClientBuilder, Site site) { CookieStore cookieStore = new BasicCookieStore(); if (site.getCookies() != null) { for (Map.Entry cookieEntry : site.getCookies().entrySet()) { @@ -93,7 +94,7 @@ public class HttpClientPool { cookieStore.addCookie(cookie); } } - httpClient.setCookieStore(cookieStore); + httpClientBuilder.setDefaultCookieStore(cookieStore); } } From ed3f3583cc0600b90e8e6f815f27261517df2f09 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Mon, 4 Nov 2013 01:03:23 +0800 Subject: [PATCH 25/38] downloader refactor --- .../codecraft/webmagic/downloader/HttpClientDownloader.java | 4 ++-- .../{HttpClientPool.java => HttpClientGenerator.java} | 4 ++-- .../us/codecraft/webmagic/downloader/HttpClientPool-cmnt.xml | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) rename webmagic-core/src/main/java/us/codecraft/webmagic/downloader/{HttpClientPool.java => HttpClientGenerator.java} (97%) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index cdbc55a..c4a0c01 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -63,14 +63,14 @@ public class HttpClientDownloader implements Downloader { private CloseableHttpClient getHttpClient(Site site) { if (site == null) { - return new HttpClientPool(poolSize).getClient(null); + return new HttpClientGenerator(poolSize).getClient(null); } String domain = site.getDomain(); CloseableHttpClient httpClient = httpClients.get(domain); if (httpClient == null) { synchronized (this) { if (httpClient == null) { - httpClient = new HttpClientPool(poolSize).getClient(site); + httpClient = new HttpClientGenerator(poolSize).getClient(site); httpClients.put(domain, httpClient); } } diff --git 
a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java similarity index 97% rename from webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java index a0ab74a..dbc3828 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java @@ -21,11 +21,11 @@ import java.util.Map; * @author code4crafter@gmail.com
* @since 0.3.3 */ -public class HttpClientPool { +public class HttpClientGenerator { private PoolingHttpClientConnectionManager connectionManager; - public HttpClientPool(int poolSize) { + public HttpClientGenerator(int poolSize) { Registry reg = RegistryBuilder.create() .register("http", PlainConnectionSocketFactory.INSTANCE) .register("https", SSLConnectionSocketFactory.getSocketFactory()) diff --git a/zh_docs/us/codecraft/webmagic/downloader/HttpClientPool-cmnt.xml b/zh_docs/us/codecraft/webmagic/downloader/HttpClientPool-cmnt.xml index 9c7ef38..c1416d3 100644 --- a/zh_docs/us/codecraft/webmagic/downloader/HttpClientPool-cmnt.xml +++ b/zh_docs/us/codecraft/webmagic/downloader/HttpClientPool-cmnt.xml @@ -4,7 +4,7 @@ Sat Aug 17 14:14:45 CST 2013 - + Date: 13-4-21 Time: 下午12:29 From c18b603399019b40e4120258149d74d46aa3a7dd Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Mon, 4 Nov 2013 07:09:44 +0800 Subject: [PATCH 26/38] optimize long compare --- .../webmagic/scheduler/PriorityScheduler.java | 5 +++-- .../codecraft/webmagic/utils/NumberUtils.java | 17 +++++++++++++++++ .../webmagic/model/samples/OschinaBlog.java | 6 ++++-- 3 files changed, 24 insertions(+), 4 deletions(-) create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/utils/NumberUtils.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/PriorityScheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/PriorityScheduler.java index 7ce44f0..fa951e1 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/PriorityScheduler.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/PriorityScheduler.java @@ -4,6 +4,7 @@ import org.apache.http.annotation.ThreadSafe; import org.apache.log4j.Logger; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.utils.NumberUtils; import java.util.Comparator; import java.util.HashSet; @@ -30,14 +31,14 @@ public class PriorityScheduler 
implements Scheduler { private PriorityBlockingQueue priorityQueuePlus = new PriorityBlockingQueue(INITIAL_CAPACITY, new Comparator() { @Override public int compare(Request o1, Request o2) { - return -(new Long(o1.getPriority()).compareTo(o2.getPriority())); + return -NumberUtils.compareLong(o1.getPriority(), o2.getPriority()); } }); private PriorityBlockingQueue priorityQueueMinus = new PriorityBlockingQueue(INITIAL_CAPACITY, new Comparator() { @Override public int compare(Request o1, Request o2) { - return -(new Long(o1.getPriority()).compareTo(o2.getPriority())); + return -NumberUtils.compareLong(o1.getPriority(), o2.getPriority()); } }); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/NumberUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/NumberUtils.java new file mode 100644 index 0000000..55e1851 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/NumberUtils.java @@ -0,0 +1,17 @@ +package us.codecraft.webmagic.utils; + +/** + * @author yihua.huang@dianping.com + */ +public abstract class NumberUtils { + + public static int compareLong(long o1, long o2) { + if (o1 < o2) { + return -1; + } else if (o1 == o2) { + return 0; + } else { + return 1; + } + } +} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java index d6b9c9d..9e7fa88 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java @@ -1,6 +1,5 @@ package us.codecraft.webmagic.model.samples; -import org.apache.http.HttpHost; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.model.OOSpider; @@ -26,7 +25,10 @@ public class OschinaBlog{ private List tags; public static void main(String[] args) { - 
OOSpider.create(Site.me().setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36").addStartUrl("http://my.oschina.net/flashsword/blog").setSleepTime(0).setHttpProxy(new HttpHost("127.0.0.1",8888)) + OOSpider.create(Site.me() + .setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36").addStartUrl("http://my.oschina.net/flashsword/blog") + .setSleepTime(0) + .setRetryTimes(3) ,new PageModelPipeline() { @Override public void process(Object o, Task task) { From 86cfefb58ceb2781ae2d374886b4794f1b7ba2a1 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Mon, 4 Nov 2013 07:19:32 +0800 Subject: [PATCH 27/38] update test --- .../webmagic/model/samples/Kr36NewsModel.java | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Kr36NewsModel.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Kr36NewsModel.java index b381c96..472b47e 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Kr36NewsModel.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Kr36NewsModel.java @@ -1,12 +1,13 @@ package us.codecraft.webmagic.model.samples; import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Task; import us.codecraft.webmagic.model.OOSpider; +import us.codecraft.webmagic.model.PageModelPipeline; import us.codecraft.webmagic.model.annotation.ExtractBy; import us.codecraft.webmagic.model.annotation.ExtractByUrl; import us.codecraft.webmagic.model.annotation.HelpUrl; import us.codecraft.webmagic.model.annotation.TargetUrl; -import us.codecraft.webmagic.pipeline.JsonFilePageModelPipeline; /** * @author code4crafter@gmail.com
@@ -25,8 +26,13 @@ public class Kr36NewsModel { private String url; public static void main(String[] args) { - OOSpider.create(Site.me().addStartUrl("http://www.36kr.com/").setSleepTime(0),new JsonFilePageModelPipeline(), - Kr36NewsModel.class).thread(20).run(); + //Just for benchmark + OOSpider.create(Site.me().addStartUrl("http://www.36kr.com/").setSleepTime(0), new PageModelPipeline() { + @Override + public void process(Object o, Task task) { + + } + },Kr36NewsModel.class).thread(20).run(); } public String getTitle() { From 8f774afc84898673d58ef79bb2b0ca28fb9ccd80 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 6 Nov 2013 06:41:04 +0800 Subject: [PATCH 28/38] add direct download --- .../us/codecraft/webmagic/ResultItems.java | 9 +++ .../main/java/us/codecraft/webmagic/Site.java | 24 ++++++ .../java/us/codecraft/webmagic/Spider.java | 77 +++++++++++++++++-- .../downloader/HttpClientGenerator.java | 77 ++++++++++--------- .../webmagic/pipeline/CollectorPipeline.java | 25 ++++++ .../example/BaiduBaikePageProcesser.java | 48 ++++++++++++ .../example/GithubRepoPageProcesser.java | 4 +- .../example/OschinaBlogPageProcesser.java | 4 +- .../us/codecraft/webmagic/utils/UrlUtils.java | 5 +- .../webmagic/example/BaiduBaike.java | 2 +- 10 files changed, 225 insertions(+), 50 deletions(-) create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/CollectorPipeline.java create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/BaiduBaikePageProcesser.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java b/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java index e055270..4791e77 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java @@ -68,4 +68,13 @@ public class ResultItems { this.skip = skip; return this; } + + @Override + public String toString() { + return "ResultItems{" + + 
"fields=" + fields + + ", request=" + request + + ", skip=" + skip + + '}'; + } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java index 33e9b8f..22015c3 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java @@ -43,6 +43,8 @@ public class Site { private HttpHost httpProxy; + private boolean useGzip = true; + public static interface HeaderConst { public static final String REFERER = "Referer"; @@ -199,7 +201,10 @@ public class Site { /** * Add a url to start url.
+ * Because urls are more a Spider's property than Site, move it to {@link Spider#addUrl(String...)}} * + * @deprecated + * @see Spider#addUrl(String...) * @param startUrl * @return this */ @@ -209,7 +214,10 @@ public class Site { /** * Add a url to start url.
+ * Because urls are more a Spider's property than Site, move it to {@link Spider#addRequest(Request...)}} * + * @deprecated + * @see Spider#addRequest(Request...) * @param startUrl * @return this */ @@ -312,6 +320,22 @@ public class Site { return this; } + public boolean isUseGzip() { + return useGzip; + } + + /** + * Whether use gzip.
+ * Default is true, you can set it to false to disable gzip. + * + * @param useGzip + * @return + */ + public Site setUseGzip(boolean useGzip) { + this.useGzip = useGzip; + return this; + } + public Task toTask() { return new Task() { @Override diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 04ac894..9a580bd 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -1,9 +1,11 @@ package us.codecraft.webmagic; +import com.google.common.collect.Lists; import org.apache.commons.collections.CollectionUtils; import org.apache.log4j.Logger; import us.codecraft.webmagic.downloader.Downloader; import us.codecraft.webmagic.downloader.HttpClientDownloader; +import us.codecraft.webmagic.pipeline.CollectorPipeline; import us.codecraft.webmagic.pipeline.ConsolePipeline; import us.codecraft.webmagic.pipeline.Pipeline; import us.codecraft.webmagic.processor.PageProcessor; @@ -16,7 +18,9 @@ import us.codecraft.webmagic.utils.UrlUtils; import java.io.Closeable; import java.io.IOException; import java.util.ArrayList; +import java.util.Collection; import java.util.List; +import java.util.UUID; import java.util.concurrent.ExecutorService; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.locks.Condition; @@ -85,6 +89,10 @@ public class Spider implements Runnable, Task { protected final static int STAT_STOPPED = 2; + protected boolean spawnUrl = true; + + protected boolean destroyWhenExit = true; + private ReentrantLock newUrlLock = new ReentrantLock(); private Condition newUrlCondition = newUrlLock.newCondition(); @@ -244,7 +252,9 @@ public class Spider implements Runnable, Task { pipelines.add(new ConsolePipeline()); } downloader.setThread(threadNum); - executorService = ThreadUtils.newFixedThreadPool(threadNum); + if (executorService == null || 
executorService.isShutdown()) { + executorService = ThreadUtils.newFixedThreadPool(threadNum); + } if (startRequests != null) { for (Request request : startRequests) { scheduler.push(request, this); @@ -285,10 +295,11 @@ public class Spider implements Runnable, Task { }); } } - executorService.shutdown(); stat.set(STAT_STOPPED); // release some resources - destroy(); + if (destroyWhenExit) { + close(); + } } private void checkRunningStat() { @@ -303,12 +314,13 @@ public class Spider implements Runnable, Task { } } - protected void destroy() { + public void close() { destroyEach(downloader); destroyEach(pageProcessor); for (Pipeline pipeline : pipelines) { destroyEach(pipeline); } + executorService.shutdown(); } private void destroyEach(Object object) { @@ -366,7 +378,7 @@ public class Spider implements Runnable, Task { } protected void extractAndAddRequests(Page page) { - if (CollectionUtils.isNotEmpty(page.getTargetRequests())) { + if (spawnUrl && CollectionUtils.isNotEmpty(page.getTargetRequests())) { for (Request request : page.getTargetRequests()) { addRequest(request); } @@ -374,8 +386,10 @@ public class Spider implements Runnable, Task { } private void addRequest(Request request) { + if (site.getDomain() == null && request != null && request.getUrl() != null) { + site.setDomain(UrlUtils.getDomain(request.getUrl())); + } scheduler.push(request, this); - } protected void checkIfRunning() { @@ -391,7 +405,7 @@ public class Spider implements Runnable, Task { } /** - * Add urls to crawl.
+ * Add urls to crawl.
* * @param urls * @return @@ -404,6 +418,34 @@ public class Spider implements Runnable, Task { return this; } + /** + * Download urls synchronizing. + * + * @param urls + * @return + */ + public List getAll(Collection urls) { + destroyWhenExit = false; + spawnUrl = false; + startRequests = UrlUtils.convertToRequests(urls); + CollectorPipeline collectorPipeline = new CollectorPipeline(); + pipelines.add(collectorPipeline); + run(); + spawnUrl = true; + destroyWhenExit = true; + return collectorPipeline.getCollector(); + } + + public ResultItems get(String url) { + List urls = Lists.newArrayList(url); + List resultItemses = getAll(urls); + if (resultItemses != null && resultItemses.size() > 0) { + return resultItemses.get(0); + } else { + return null; + } + } + /** * Add urls with information to crawl.
* @@ -492,6 +534,24 @@ public class Spider implements Runnable, Task { return this; } + public boolean isSpawnUrl() { + return spawnUrl; + } + + /** + * Whether add urls extracted to download.
+ * Add urls to download when it is true, and just download seed urls when it is false.
+ * DO NOT set it unless you know what it means! + * + * @param spawnUrl + * @return + * @since 0.4.0 + */ + public Spider setSpawnUrl(boolean spawnUrl) { + this.spawnUrl = spawnUrl; + return this; + } + @Override public String getUUID() { if (uuid != null) { @@ -500,7 +560,8 @@ public class Spider implements Runnable, Task { if (site != null) { return site.getDomain(); } - return null; + uuid = UUID.randomUUID().toString(); + return uuid; } @Override diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java index dbc3828..a3319a0 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java @@ -1,8 +1,9 @@ package us.codecraft.webmagic.downloader; -import org.apache.http.*; +import org.apache.http.HttpException; +import org.apache.http.HttpRequest; +import org.apache.http.HttpRequestInterceptor; import org.apache.http.client.CookieStore; -import org.apache.http.client.entity.GzipDecompressingEntity; import org.apache.http.config.Registry; import org.apache.http.config.RegistryBuilder; import org.apache.http.conn.socket.ConnectionSocketFactory; @@ -19,7 +20,7 @@ import java.util.Map; /** * @author code4crafter@gmail.com
- * @since 0.3.3 + * @since 0.4.0 */ public class HttpClientGenerator { @@ -46,42 +47,48 @@ public class HttpClientGenerator { } else { httpClientBuilder.setUserAgent(""); } - httpClientBuilder.addInterceptorFirst(new HttpRequestInterceptor() { + if (site == null || site.isUseGzip()) { + httpClientBuilder.addInterceptorFirst(new HttpRequestInterceptor() { - public void process( - final HttpRequest request, - final HttpContext context) throws HttpException, IOException { - if (!request.containsHeader("Accept-Encoding")) { - request.addHeader("Accept-Encoding", "gzip"); - } - - } - }).addInterceptorFirst(new HttpResponseInterceptor() { - - public void process( - final HttpResponse response, - final HttpContext context) throws HttpException, IOException { - HttpEntity entity = response.getEntity(); - if (entity != null) { - Header ceheader = entity.getContentEncoding(); - if (ceheader != null) { - HeaderElement[] codecs = ceheader.getElements(); - for (int i = 0; i < codecs.length; i++) { - if (codecs[i].getName().equalsIgnoreCase("gzip")) { - response.setEntity( - new GzipDecompressingEntity(response.getEntity())); - return; - } - } + public void process( + final HttpRequest request, + final HttpContext context) throws HttpException, IOException { + if (!request.containsHeader("Accept-Encoding")) { + request.addHeader("Accept-Encoding", "gzip"); } - } - } - }); - if (site!=null){ - httpClientBuilder.setRetryHandler(new DefaultHttpRequestRetryHandler(site.getRetryTimes(),true)); + } + }); } - generateCookie(httpClientBuilder,site); +// httpClientBuilder.disableContentCompression().addInterceptorFirst(new HttpResponseInterceptor() { +// +// public void process( +// final HttpResponse response, +// final HttpContext context) throws HttpException, IOException { +// if (response.getStatusLine().getStatusCode() != 200) { +// return; +// } +// HttpEntity entity = response.getEntity(); +// if (entity != null) { +// Header ceheader = entity.getContentEncoding(); +// if 
(ceheader != null) { +// HeaderElement[] codecs = ceheader.getElements(); +// for (int i = 0; i < codecs.length; i++) { +// if (codecs[i].getName().equalsIgnoreCase("gzip")) { +// response.setEntity( +// new GzipDecompressingEntity(response.getEntity())); +// return; +// } +// } +// } +// } +// } +// +// }); + if (site != null) { + httpClientBuilder.setRetryHandler(new DefaultHttpRequestRetryHandler(site.getRetryTimes(), true)); + } + generateCookie(httpClientBuilder, site); return httpClientBuilder.build(); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/CollectorPipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/CollectorPipeline.java new file mode 100644 index 0000000..012c4c5 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/CollectorPipeline.java @@ -0,0 +1,25 @@ +package us.codecraft.webmagic.pipeline; + +import us.codecraft.webmagic.ResultItems; +import us.codecraft.webmagic.Task; + +import java.util.ArrayList; +import java.util.List; + +/** + * @author code4crafter@gmail.com + * @since 0.4.0 + */ +public class CollectorPipeline implements Pipeline{ + + private List collector = new ArrayList(); + + @Override + public void process(ResultItems resultItems, Task task) { + collector.add(resultItems); + } + + public List getCollector() { + return collector; + } +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/BaiduBaikePageProcesser.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/BaiduBaikePageProcesser.java new file mode 100644 index 0000000..b3e7d78 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/BaiduBaikePageProcesser.java @@ -0,0 +1,48 @@ +package us.codecraft.webmagic.processor.example; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.ResultItems; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Spider; +import 
us.codecraft.webmagic.processor.PageProcessor; + +import java.util.ArrayList; +import java.util.List; + +/** + * @author code4crafter@gmail.com
+ * @since 0.4.0 + */ +public class BaiduBaikePageProcesser implements PageProcessor { + + private Site site = Site.me()//.setHttpProxy(new HttpHost("127.0.0.1",8888)) + .setCharset("utf-8").setRetryTimes(3).setSleepTime(1000).setUseGzip(true); + + @Override + public void process(Page page) { + page.putField("name", page.getHtml().$("h1.title div.lemmaTitleH1","text").toString()); + page.putField("description", page.getHtml().xpath("//div[@id='lemmaContent-0']//div[@class='para']/allText()")); + } + + @Override + public Site getSite() { + return site; + } + + public static void main(String[] args) { + Spider spider = Spider.create(new BaiduBaikePageProcesser()).thread(2); + List list = new ArrayList(); + String urlTemplate = "http://baike.baidu.com/search/word?word=%s&pic=1&sug=1&enc=utf8"; + list.add(String.format(urlTemplate,"水力发电")); + list.add(String.format(urlTemplate,"风力发电")); + list.add(String.format(urlTemplate,"太阳能")); + list.add(String.format(urlTemplate,"地热发电")); + list.add(String.format(urlTemplate,"众数")); + list.add(String.format(urlTemplate,"地热发电")); + List resultItemses = spider.getAll(list); + for (ResultItems resultItemse : resultItemses) { + System.out.println(resultItemse.getAll()); + } + spider.close(); + } +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/GithubRepoPageProcesser.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/GithubRepoPageProcesser.java index 0e7e3b9..47f904f 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/GithubRepoPageProcesser.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/GithubRepoPageProcesser.java @@ -11,7 +11,7 @@ import us.codecraft.webmagic.processor.PageProcessor; */ public class GithubRepoPageProcesser implements PageProcessor { - private Site site = Site.me().addStartUrl("https://github.com/code4craft").setRetryTimes(3).setSleepTime(100); + private Site site = 
Site.me().setRetryTimes(3).setSleepTime(100); @Override public void process(Page page) { @@ -31,6 +31,6 @@ public class GithubRepoPageProcesser implements PageProcessor { } public static void main(String[] args) { - Spider.create(new GithubRepoPageProcesser()).thread(5).run(); + Spider.create(new GithubRepoPageProcesser()).addUrl("https://github.com/code4craft").thread(5).run(); } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/OschinaBlogPageProcesser.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/OschinaBlogPageProcesser.java index fa8dab6..4ef830d 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/OschinaBlogPageProcesser.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/OschinaBlogPageProcesser.java @@ -12,7 +12,7 @@ import java.util.List; */ public class OschinaBlogPageProcesser implements PageProcessor { - private Site site = Site.me().setDomain("my.oschina.net").addStartUrl("http://my.oschina.net/flashsword/blog"); + private Site site = Site.me().setDomain("my.oschina.net"); @Override public void process(Page page) { @@ -34,6 +34,6 @@ public class OschinaBlogPageProcesser implements PageProcessor { } public static void main(String[] args) { - Spider.create(new OschinaBlogPageProcesser()).thread(2).run(); + Spider.create(new OschinaBlogPageProcesser()).addUrl("http://my.oschina.net/flashsword/blog").thread(2).run(); } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java index e45f948..456b3cc 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java @@ -7,6 +7,7 @@ import java.net.MalformedURLException; import java.net.URL; import java.nio.charset.Charset; import java.util.ArrayList; +import java.util.Collection; import java.util.List; 
import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -88,7 +89,7 @@ public class UrlUtils { return stringBuilder.toString(); } - public static List convertToRequests(List urls) { + public static List convertToRequests(Collection urls) { List requestList = new ArrayList(urls.size()); for (String url : urls) { requestList.add(new Request(url)); @@ -96,7 +97,7 @@ public class UrlUtils { return requestList; } - public static List convertToUrls(List requests) { + public static List convertToUrls(Collection requests) { List urlList = new ArrayList(requests.size()); for (Request request : requests) { urlList.add(request.getUrl()); diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/BaiduBaike.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/BaiduBaike.java index 9e63055..edd167d 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/BaiduBaike.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/BaiduBaike.java @@ -11,7 +11,7 @@ import java.util.ArrayList; import java.util.List; /** - * @since 0.3.3 + * @since 0.4.0 * NO implement yet!!!!!!!!!!!! 
* @author code4crafter@gmail.com */ From 00b0a751b4ab877b7f87bd55193e4b6565306f2e Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 6 Nov 2013 06:57:58 +0800 Subject: [PATCH 29/38] #33 ignore 'content-encoding' when redirect --- .../downloader/HttpClientGenerator.java | 48 ++++++++----------- .../example/BaiduBaikePageProcesser.java | 1 - 2 files changed, 20 insertions(+), 29 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java index a3319a0..92ba6f8 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java @@ -1,9 +1,8 @@ package us.codecraft.webmagic.downloader; -import org.apache.http.HttpException; -import org.apache.http.HttpRequest; -import org.apache.http.HttpRequestInterceptor; +import org.apache.http.*; import org.apache.http.client.CookieStore; +import org.apache.http.client.protocol.ResponseContentEncoding; import org.apache.http.config.Registry; import org.apache.http.config.RegistryBuilder; import org.apache.http.conn.socket.ConnectionSocketFactory; @@ -60,31 +59,24 @@ public class HttpClientGenerator { } }); } -// httpClientBuilder.disableContentCompression().addInterceptorFirst(new HttpResponseInterceptor() { -// -// public void process( -// final HttpResponse response, -// final HttpContext context) throws HttpException, IOException { -// if (response.getStatusLine().getStatusCode() != 200) { -// return; -// } -// HttpEntity entity = response.getEntity(); -// if (entity != null) { -// Header ceheader = entity.getContentEncoding(); -// if (ceheader != null) { -// HeaderElement[] codecs = ceheader.getElements(); -// for (int i = 0; i < codecs.length; i++) { -// if (codecs[i].getName().equalsIgnoreCase("gzip")) { -// response.setEntity( -// new 
GzipDecompressingEntity(response.getEntity())); -// return; -// } -// } -// } -// } -// } -// -// }); + // Http client has some problem handling compressing entity for redirect + // So I disable it and do it manually + // https://issues.apache.org/jira/browse/HTTPCLIENT-1432 + httpClientBuilder.disableContentCompression(); + httpClientBuilder.addInterceptorFirst(new HttpResponseInterceptor() { + + private ResponseContentEncoding contentEncoding = new ResponseContentEncoding(); + + public void process( + final HttpResponse response, + final HttpContext context) throws HttpException, IOException { + if (response.getStatusLine().getStatusCode() == 301 || response.getStatusLine().getStatusCode() == 302) { + return; + } + contentEncoding.process(response, context); + } + + }); if (site != null) { httpClientBuilder.setRetryHandler(new DefaultHttpRequestRetryHandler(site.getRetryTimes(), true)); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/BaiduBaikePageProcesser.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/BaiduBaikePageProcesser.java index b3e7d78..071b7e6 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/BaiduBaikePageProcesser.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/BaiduBaikePageProcesser.java @@ -37,7 +37,6 @@ public class BaiduBaikePageProcesser implements PageProcessor { list.add(String.format(urlTemplate,"风力发电")); list.add(String.format(urlTemplate,"太阳能")); list.add(String.format(urlTemplate,"地热发电")); - list.add(String.format(urlTemplate,"众数")); list.add(String.format(urlTemplate,"地热发电")); List resultItemses = spider.getAll(list); for (ResultItems resultItemse : resultItemses) { From 807aefe9df05b6cfceab36d32350e54ec0e8672a Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 6 Nov 2013 07:37:34 +0800 Subject: [PATCH 30/38] change EntityUtil to IOUtil because some encoding error --- 
.../codecraft/webmagic/downloader/HttpClientDownloader.java | 4 ++-- .../webmagic/processor/example/BaiduBaikePageProcesser.java | 2 +- .../main/java/us/codecraft/webmagic/example/BaiduBaike.java | 3 --- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index c4a0c01..5c51916 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -1,6 +1,7 @@ package us.codecraft.webmagic.downloader; import com.google.common.collect.Sets; +import org.apache.commons.io.IOUtils; import org.apache.http.HttpResponse; import org.apache.http.annotation.ThreadSafe; import org.apache.http.client.config.CookieSpecs; @@ -8,7 +9,6 @@ import org.apache.http.client.config.RequestConfig; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.RequestBuilder; import org.apache.http.impl.client.CloseableHttpClient; -import org.apache.http.util.EntityUtils; import org.apache.log4j.Logger; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; @@ -158,7 +158,7 @@ public class HttpClientDownloader implements Downloader { } protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException { - String content = EntityUtils.toString(httpResponse.getEntity(), charset); + String content = IOUtils.toString(httpResponse.getEntity().getContent(), charset); Page page = new Page(); page.setHtml(new Html(UrlUtils.fixAllRelativeHrefs(content, request.getUrl()))); page.setUrl(new PlainText(request.getUrl())); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/BaiduBaikePageProcesser.java 
b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/BaiduBaikePageProcesser.java index 071b7e6..af03166 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/BaiduBaikePageProcesser.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/BaiduBaikePageProcesser.java @@ -16,7 +16,7 @@ import java.util.List; public class BaiduBaikePageProcesser implements PageProcessor { private Site site = Site.me()//.setHttpProxy(new HttpHost("127.0.0.1",8888)) - .setCharset("utf-8").setRetryTimes(3).setSleepTime(1000).setUseGzip(true); + .setRetryTimes(3).setSleepTime(1000).setUseGzip(true); @Override public void process(Page page) { diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/BaiduBaike.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/BaiduBaike.java index edd167d..6f901ff 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/BaiduBaike.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/BaiduBaike.java @@ -4,7 +4,6 @@ import us.codecraft.webmagic.Page; import us.codecraft.webmagic.model.AfterExtractor; import us.codecraft.webmagic.model.OOSpider; import us.codecraft.webmagic.model.annotation.ExtractBy; -import us.codecraft.webmagic.model.annotation.UrlTemplate; import us.codecraft.webmagic.model.direct.Param; import java.util.ArrayList; @@ -12,10 +11,8 @@ import java.util.List; /** * @since 0.4.0 - * NO implement yet!!!!!!!!!!!! 
* @author code4crafter@gmail.com */ -@UrlTemplate("http://baike.baidu.com/search/word?word=${word}&enc=utf8") public class BaiduBaike implements AfterExtractor{ private String word; From 6e32a19f807fbbf7821dee4b9ff57db021f7672b Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 6 Nov 2013 12:46:50 +0800 Subject: [PATCH 31/38] update api for direct download --- .../java/us/codecraft/webmagic/Spider.java | 15 ++++-- .../webmagic/pipeline/CollectorPipeline.java | 25 ++++------ .../ResultItemsCollectorPipeline.java | 26 ++++++++++ .../example/BaiduBaikePageProcesser.java | 2 +- .../webmagic/example/BaiduBaike.java | 28 +++++++---- .../webmagic/example/GithubRepo.java | 5 +- .../webmagic/example/OschinaBlog.java | 5 +- .../model/ConsolePageModelPipeline.java | 1 + .../webmagic/model/ModelPipeline.java | 1 + .../us/codecraft/webmagic/model/OOSpider.java | 50 +++++++------------ .../model/PageModelCollectorPipeline.java | 46 +++++++++++++++++ .../webmagic/model/PageModelExtractor.java | 2 +- .../pipeline/CollectorPageModelPipeline.java | 23 +++++++++ .../pipeline/FilePageModelPipeline.java | 1 - .../pipeline/JsonFilePageModelPipeline.java | 1 - .../PageModelPipeline.java | 2 +- .../webmagic/MockPageModelPipeline.java | 2 +- .../webmagic/model/GithubRepoTest.java | 1 + .../webmagic/model/samples/Kr36NewsModel.java | 2 +- .../webmagic/model/samples/OschinaBlog.java | 2 +- .../webmagic/model/OOSpider-cmnt.xml | 2 +- .../webmagic/model/PageModelPipeline-cmnt.xml | 2 +- 22 files changed, 167 insertions(+), 77 deletions(-) create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ResultItemsCollectorPipeline.java create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelCollectorPipeline.java create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/CollectorPageModelPipeline.java rename webmagic-extension/src/main/java/us/codecraft/webmagic/{model => pipeline}/PageModelPipeline.java (86%) diff 
--git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 9a580bd..667a71e 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -6,6 +6,7 @@ import org.apache.log4j.Logger; import us.codecraft.webmagic.downloader.Downloader; import us.codecraft.webmagic.downloader.HttpClientDownloader; import us.codecraft.webmagic.pipeline.CollectorPipeline; +import us.codecraft.webmagic.pipeline.ResultItemsCollectorPipeline; import us.codecraft.webmagic.pipeline.ConsolePipeline; import us.codecraft.webmagic.pipeline.Pipeline; import us.codecraft.webmagic.processor.PageProcessor; @@ -424,21 +425,25 @@ public class Spider implements Runnable, Task { * @param urls * @return */ - public List getAll(Collection urls) { + public List getAll(Collection urls) { destroyWhenExit = false; spawnUrl = false; startRequests = UrlUtils.convertToRequests(urls); - CollectorPipeline collectorPipeline = new CollectorPipeline(); + CollectorPipeline collectorPipeline = getCollectorPipeline(); pipelines.add(collectorPipeline); run(); spawnUrl = true; destroyWhenExit = true; - return collectorPipeline.getCollector(); + return collectorPipeline.getCollected(); } - public ResultItems get(String url) { + protected CollectorPipeline getCollectorPipeline() { + return new ResultItemsCollectorPipeline(); + } + + public T get(String url) { List urls = Lists.newArrayList(url); - List resultItemses = getAll(urls); + List resultItemses = getAll(urls); if (resultItemses != null && resultItemses.size() > 0) { return resultItemses.get(0); } else { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/CollectorPipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/CollectorPipeline.java index 012c4c5..7242f43 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/CollectorPipeline.java +++ 
b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/CollectorPipeline.java @@ -1,25 +1,20 @@ package us.codecraft.webmagic.pipeline; -import us.codecraft.webmagic.ResultItems; -import us.codecraft.webmagic.Task; - -import java.util.ArrayList; import java.util.List; /** + * Pipeline that can collect and store results.
+ * Used for {@link us.codecraft.webmagic.Spider#getAll(java.util.Collection)} + * * @author code4crafter@gmail.com * @since 0.4.0 */ -public class CollectorPipeline implements Pipeline{ +public interface CollectorPipeline extends Pipeline { - private List collector = new ArrayList(); - - @Override - public void process(ResultItems resultItems, Task task) { - collector.add(resultItems); - } - - public List getCollector() { - return collector; - } + /** + * Get all results collected. + * + * @return collected results + */ + public List getCollected(); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ResultItemsCollectorPipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ResultItemsCollectorPipeline.java new file mode 100644 index 0000000..cf45ec8 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ResultItemsCollectorPipeline.java @@ -0,0 +1,26 @@ +package us.codecraft.webmagic.pipeline; + +import us.codecraft.webmagic.ResultItems; +import us.codecraft.webmagic.Task; + +import java.util.ArrayList; +import java.util.List; + +/** + * @author code4crafter@gmail.com + * @since 0.4.0 + */ +public class ResultItemsCollectorPipeline implements CollectorPipeline { + + private List collector = new ArrayList(); + + @Override + public void process(ResultItems resultItems, Task task) { + collector.add(resultItems); + } + + @Override + public List getCollected() { + return collector; + } +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/BaiduBaikePageProcesser.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/BaiduBaikePageProcesser.java index af03166..34f0134 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/BaiduBaikePageProcesser.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/BaiduBaikePageProcesser.java @@ -38,7 +38,7 @@ public class BaiduBaikePageProcesser implements PageProcessor 
{ list.add(String.format(urlTemplate,"太阳能")); list.add(String.format(urlTemplate,"地热发电")); list.add(String.format(urlTemplate,"地热发电")); - List resultItemses = spider.getAll(list); + List resultItemses = spider.getAll(list); for (ResultItems resultItemse : resultItemses) { System.out.println(resultItemse.getAll()); } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/BaiduBaike.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/BaiduBaike.java index 6f901ff..0fd138e 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/BaiduBaike.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/BaiduBaike.java @@ -1,10 +1,8 @@ package us.codecraft.webmagic.example; -import us.codecraft.webmagic.Page; -import us.codecraft.webmagic.model.AfterExtractor; +import us.codecraft.webmagic.Site; import us.codecraft.webmagic.model.OOSpider; import us.codecraft.webmagic.model.annotation.ExtractBy; -import us.codecraft.webmagic.model.direct.Param; import java.util.ArrayList; import java.util.List; @@ -13,21 +11,31 @@ import java.util.List; * @since 0.4.0 * @author code4crafter@gmail.com */ -public class BaiduBaike implements AfterExtractor{ +public class BaiduBaike{ - private String word; + @ExtractBy("//h1[@class=title]/div[@class=lemmaTitleH1]/text()") + private String name; @ExtractBy("//div[@id='lemmaContent-0']//div[@class='para']/allText()") private String description; @Override - public void afterProcess(Page page) { - + public String toString() { + return "BaiduBaike{" + + "name='" + name + '\'' + + ", description='" + description + '\'' + + '}'; } public static void main(String[] args) { - List words = new ArrayList(); - words.add(new Param().put("word","红烧肉")); - OOSpider.direct(words, BaiduBaike.class).thread(10).run(); + List list = new ArrayList(); + String urlTemplate = "http://baike.baidu.com/search/word?word=%s&pic=1&sug=1&enc=utf8"; + list.add(String.format(urlTemplate,"水力发电")); + 
list.add(String.format(urlTemplate,"风力发电")); + list.add(String.format(urlTemplate,"太阳能")); + list.add(String.format(urlTemplate,"地热发电")); + list.add(String.format(urlTemplate, "地热发电")); + List baiduBaikes = OOSpider.create(Site.me().setSleepTime(100), BaiduBaike.class).getAll(list); + System.out.println(baiduBaikes); } } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/GithubRepo.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/GithubRepo.java index 58441cb..427cdf7 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/GithubRepo.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/GithubRepo.java @@ -41,8 +41,9 @@ public class GithubRepo implements HasKey { private String url; public static void main(String[] args) { - OOSpider.create(Site.me().addStartUrl("https://github.com/code4craft").setSleepTime(100) - , new ConsolePageModelPipeline(), GithubRepo.class).thread(10).run(); + OOSpider.create(Site.me().setSleepTime(100) + , new ConsolePageModelPipeline(), GithubRepo.class) + .addUrl("https://github.com/code4craft").thread(10).run(); } @Override diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/OschinaBlog.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/OschinaBlog.java index 1545f88..5bd8ddd 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/OschinaBlog.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/OschinaBlog.java @@ -31,8 +31,9 @@ public class OschinaBlog { private Date date; public static void main(String[] args) { - OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog") - , new JsonFilePageModelPipeline("/data/webmagic/"), OschinaBlog.class).run(); + OOSpider.create(Site.me() + , new JsonFilePageModelPipeline("/data/webmagic/"), OschinaBlog.class) + .addUrl("http://my.oschina.net/flashsword/blog").run(); } public String getTitle() { diff --git 
a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ConsolePageModelPipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ConsolePageModelPipeline.java index abf411c..f4740c9 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ConsolePageModelPipeline.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ConsolePageModelPipeline.java @@ -2,6 +2,7 @@ package us.codecraft.webmagic.model; import org.apache.commons.lang3.builder.ToStringBuilder; import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.pipeline.PageModelPipeline; /** * Print page model in console.
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPipeline.java index 071cb26..593178f 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPipeline.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPipeline.java @@ -3,6 +3,7 @@ package us.codecraft.webmagic.model; import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.model.annotation.ExtractBy; +import us.codecraft.webmagic.pipeline.PageModelPipeline; import us.codecraft.webmagic.pipeline.Pipeline; import java.lang.annotation.Annotation; diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/OOSpider.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/OOSpider.java index c43b6e8..3133308 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/OOSpider.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/OOSpider.java @@ -2,10 +2,12 @@ package us.codecraft.webmagic.model; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; -import us.codecraft.webmagic.model.direct.Param; +import us.codecraft.webmagic.pipeline.CollectorPipeline; +import us.codecraft.webmagic.pipeline.PageModelPipeline; import us.codecraft.webmagic.processor.PageProcessor; -import java.util.Collection; +import java.util.ArrayList; +import java.util.List; /** * The spider for page model extractor.
@@ -36,12 +38,16 @@ import java.util.Collection; * @author code4crafter@gmail.com
* @since 0.2.0 */ -public class OOSpider extends Spider { +public class OOSpider extends Spider { private ModelPageProcessor modelPageProcessor; private ModelPipeline modelPipeline; + private PageModelPipeline pageModelPipeline; + + private List pageModelClasses = new ArrayList(); + protected OOSpider(ModelPageProcessor modelPageProcessor) { super(modelPageProcessor); this.modelPageProcessor = modelPageProcessor; @@ -62,13 +68,19 @@ public class OOSpider extends Spider { this(ModelPageProcessor.create(site, pageModels)); this.modelPipeline = new ModelPipeline(); super.addPipeline(modelPipeline); - if (pageModelPipeline != null) { - for (Class pageModel : pageModels) { + for (Class pageModel : pageModels) { + if (pageModelPipeline != null) { this.modelPipeline.put(pageModel, pageModelPipeline); } + pageModelClasses.add(pageModel); } } + @Override + protected CollectorPipeline getCollectorPipeline() { + return new PageModelCollectorPipeline(pageModelClasses.get(0)); + } + public static OOSpider create(Site site, Class... pageModels) { return new OOSpider(site, null, pageModels); } @@ -77,34 +89,6 @@ public class OOSpider extends Spider { return new OOSpider(site, pageModelPipeline, pageModels); } - /** - * @since 0.3.3 - * NO implement yet! - */ - public static OOSpider direct(Site site, PageModelPipeline pageModelPipeline, Class... pageModels) { - return new OOSpider(site, pageModelPipeline, pageModels); - } - - /** - * @since 0.3.3 - * NO implement yet! - */ - public static OOSpider direct(PageModelPipeline pageModelPipeline, Class... pageModels) { - return new OOSpider(null, pageModelPipeline, pageModels); - } - - /** - * @since 0.3.3 - * NO implement yet! - */ - public static OOSpider direct(Class... pageModels) { - return new OOSpider(null, null, pageModels); - } - - public static OOSpider direct(Collection params, Class... 
pageModels) { - return new OOSpider(null, null, pageModels); - } - public OOSpider addPageModel(PageModelPipeline pageModelPipeline, Class... pageModels) { for (Class pageModel : pageModels) { modelPageProcessor.addPageModel(pageModel); diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelCollectorPipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelCollectorPipeline.java new file mode 100644 index 0000000..b61f112 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelCollectorPipeline.java @@ -0,0 +1,46 @@ +package us.codecraft.webmagic.model; + +import us.codecraft.webmagic.ResultItems; +import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.model.annotation.ExtractBy; +import us.codecraft.webmagic.pipeline.CollectorPageModelPipeline; +import us.codecraft.webmagic.pipeline.CollectorPipeline; + +import java.lang.annotation.Annotation; +import java.util.List; + +/** + * @author code4crafter@gmail.com + * @since 0.4.0 + */ +class PageModelCollectorPipeline implements CollectorPipeline { + + private final CollectorPageModelPipeline classPipeline = new CollectorPageModelPipeline(); + + private final Class clazz; + + PageModelCollectorPipeline(Class clazz) { + this.clazz = clazz; + } + + @Override + public List getCollected() { + return classPipeline.getCollected(); + } + + @Override + public synchronized void process(ResultItems resultItems, Task task) { + Object o = resultItems.get(clazz.getCanonicalName()); + if (o != null) { + Annotation annotation = clazz.getAnnotation(ExtractBy.class); + if (annotation == null || !((ExtractBy) annotation).multi()) { + classPipeline.process((T) o, task); + } else { + List list = (List) o; + for (Object o1 : list) { + classPipeline.process((T) o1, task); + } + } + } + } +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java 
b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java index c78bd31..a079988 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java @@ -195,7 +195,7 @@ class PageModelExtractor { private void initClassExtractors() { Annotation annotation = clazz.getAnnotation(TargetUrl.class); if (annotation == null) { - targetUrlPatterns.add(Pattern.compile(".*")); + targetUrlPatterns.add(Pattern.compile("(.*)")); } else { TargetUrl targetUrl = (TargetUrl) annotation; String[] value = targetUrl.value(); diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/CollectorPageModelPipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/CollectorPageModelPipeline.java new file mode 100644 index 0000000..b6e0b1b --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/CollectorPageModelPipeline.java @@ -0,0 +1,23 @@ +package us.codecraft.webmagic.pipeline; + +import us.codecraft.webmagic.Task; + +import java.util.ArrayList; +import java.util.List; + +/** + * @author code4crafter@gmail.com + */ +public class CollectorPageModelPipeline implements PageModelPipeline { + + private List collected = new ArrayList(); + + @Override + public synchronized void process(T t, Task task) { + collected.add(t); + } + + public List getCollected() { + return collected; + } +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/FilePageModelPipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/FilePageModelPipeline.java index 5586863..273b18b 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/FilePageModelPipeline.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/FilePageModelPipeline.java @@ -5,7 +5,6 @@ import org.apache.commons.lang3.builder.ToStringBuilder; import 
org.apache.log4j.Logger; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.model.HasKey; -import us.codecraft.webmagic.model.PageModelPipeline; import us.codecraft.webmagic.utils.FilePersistentBase; import java.io.FileWriter; diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePageModelPipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePageModelPipeline.java index 228ec8c..4e35dfe 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePageModelPipeline.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePageModelPipeline.java @@ -6,7 +6,6 @@ import org.apache.commons.lang3.builder.ToStringBuilder; import org.apache.log4j.Logger; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.model.HasKey; -import us.codecraft.webmagic.model.PageModelPipeline; import us.codecraft.webmagic.utils.FilePersistentBase; import java.io.FileWriter; diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelPipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/PageModelPipeline.java similarity index 86% rename from webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelPipeline.java rename to webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/PageModelPipeline.java index 2cb3808..382f71d 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelPipeline.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/PageModelPipeline.java @@ -1,4 +1,4 @@ -package us.codecraft.webmagic.model; +package us.codecraft.webmagic.pipeline; import us.codecraft.webmagic.Task; diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/MockPageModelPipeline.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/MockPageModelPipeline.java index ea7601b..f94efce 100644 --- 
a/webmagic-extension/src/test/java/us/codecraft/webmagic/MockPageModelPipeline.java +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/MockPageModelPipeline.java @@ -1,7 +1,7 @@ package us.codecraft.webmagic; import junit.framework.Assert; -import us.codecraft.webmagic.model.PageModelPipeline; +import us.codecraft.webmagic.pipeline.PageModelPipeline; /** * @author code4crafter@gmail.com diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepoTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepoTest.java index d6e1bf0..b719bf0 100644 --- a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepoTest.java +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepoTest.java @@ -6,6 +6,7 @@ import us.codecraft.webmagic.MockDownloader; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.example.GithubRepo; +import us.codecraft.webmagic.pipeline.PageModelPipeline; /** * @author code4crafter@gmail.com
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Kr36NewsModel.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Kr36NewsModel.java index 472b47e..936f132 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Kr36NewsModel.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Kr36NewsModel.java @@ -3,7 +3,7 @@ package us.codecraft.webmagic.model.samples; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.model.OOSpider; -import us.codecraft.webmagic.model.PageModelPipeline; +import us.codecraft.webmagic.pipeline.PageModelPipeline; import us.codecraft.webmagic.model.annotation.ExtractBy; import us.codecraft.webmagic.model.annotation.ExtractByUrl; import us.codecraft.webmagic.model.annotation.HelpUrl; diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java index 9e7fa88..468b855 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java @@ -3,7 +3,7 @@ package us.codecraft.webmagic.model.samples; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.model.OOSpider; -import us.codecraft.webmagic.model.PageModelPipeline; +import us.codecraft.webmagic.pipeline.PageModelPipeline; import us.codecraft.webmagic.model.annotation.ExtractBy; import us.codecraft.webmagic.model.annotation.TargetUrl; diff --git a/zh_docs/us/codecraft/webmagic/model/OOSpider-cmnt.xml b/zh_docs/us/codecraft/webmagic/model/OOSpider-cmnt.xml index 2fd60a7..232c509 100644 --- a/zh_docs/us/codecraft/webmagic/model/OOSpider-cmnt.xml +++ b/zh_docs/us/codecraft/webmagic/model/OOSpider-cmnt.xml @@ -12,7 +12,7 @@ ]]> - + @param site @param pageModelPipeline diff 
--git a/zh_docs/us/codecraft/webmagic/model/PageModelPipeline-cmnt.xml b/zh_docs/us/codecraft/webmagic/model/PageModelPipeline-cmnt.xml index ac65729..64fb524 100644 --- a/zh_docs/us/codecraft/webmagic/model/PageModelPipeline-cmnt.xml +++ b/zh_docs/us/codecraft/webmagic/model/PageModelPipeline-cmnt.xml @@ -4,7 +4,7 @@ Sat Aug 17 14:14:46 CST 2013 - + Date: 13-8-3
Time: 上午9:34
From e046bb072391bd7ec06e5f4a310e60048b5fb58a Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 6 Nov 2013 12:48:14 +0800 Subject: [PATCH 32/38] remove useless code --- .../ResultItemsCollectorPipeline.java | 2 +- .../model/annotation/UrlTemplate.java | 37 ------------------- .../webmagic/model/direct/Param.java | 15 -------- 3 files changed, 1 insertion(+), 53 deletions(-) delete mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/UrlTemplate.java delete mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/model/direct/Param.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ResultItemsCollectorPipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ResultItemsCollectorPipeline.java index cf45ec8..abafa88 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ResultItemsCollectorPipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ResultItemsCollectorPipeline.java @@ -15,7 +15,7 @@ public class ResultItemsCollectorPipeline implements CollectorPipeline collector = new ArrayList(); @Override - public void process(ResultItems resultItems, Task task) { + public synchronized void process(ResultItems resultItems, Task task) { collector.add(resultItems); } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/UrlTemplate.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/UrlTemplate.java deleted file mode 100644 index a940a64..0000000 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/UrlTemplate.java +++ /dev/null @@ -1,37 +0,0 @@ -package us.codecraft.webmagic.model.annotation; - -import java.lang.annotation.ElementType; -import java.lang.annotation.Retention; -import java.lang.annotation.Target; - -/** - * Define the url patterns for class.
- * All urls matching the pattern will be crawled and extracted for new objects.
- * - * @author code4crafter@gmail.com
- * @since 0.3.3 - */ -@Retention(java.lang.annotation.RetentionPolicy.RUNTIME) -@Target({ElementType.TYPE}) -public @interface UrlTemplate { - - /** - * The url patterns for class.
- * Use regex expression with some changes:
- * "." stand for literal character "." instead of "any character".
- * "*" stand for any legal character for url in 0-n length ([^"'#]*) instead of "any length".
- * - * @return the url patterns for class - */ - String value(); - - /** - * Define the region for url extracting.
- * Only support XPath.
- * When sourceRegion is set, the urls will be extracted only from the region instead of entire content.
- * - * @return the region for url extracting - */ - String encoding() default "utf8"; - -} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/direct/Param.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/direct/Param.java deleted file mode 100644 index c66e854..0000000 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/direct/Param.java +++ /dev/null @@ -1,15 +0,0 @@ -package us.codecraft.webmagic.model.direct; - -import java.util.LinkedHashMap; - -/** - * @author code4crafter@gmail.com - */ -public class Param extends LinkedHashMap{ - - @Override - public Param put(String key, Object value) { - super.put(key, value); - return this; - } -} From 425df085236a9b6b216cb00a5be4a12644f0211a Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 6 Nov 2013 12:50:45 +0800 Subject: [PATCH 33/38] update version to 0.4.0 --- pom.xml | 2 +- webmagic-core/pom.xml | 2 +- webmagic-extension/pom.xml | 2 +- webmagic-lucene/pom.xml | 2 +- webmagic-samples/pom.xml | 2 +- webmagic-saxon/pom.xml | 2 +- webmagic-selenium/pom.xml | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/pom.xml b/pom.xml index 918ab6a..8e68d49 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ 7 us.codecraft - 0.3.3-SNAPSHOT + 0.4.0-SNAPSHOT 4.0.0 pom diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 3d89e5c..a03a5c7 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.3.3-SNAPSHOT + 0.4.0-SNAPSHOT 4.0.0 diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index 1e36b79..1c42ea5 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.3.3-SNAPSHOT + 0.4.0-SNAPSHOT 4.0.0 diff --git a/webmagic-lucene/pom.xml b/webmagic-lucene/pom.xml index 97946cc..2b5c5f1 100644 --- a/webmagic-lucene/pom.xml +++ b/webmagic-lucene/pom.xml @@ -5,7 +5,7 @@ webmagic-parent us.codecraft - 0.2.1 + 
0.4.0-SNAPSHOT 4.0.0 diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index 79238b0..52f524d 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -5,7 +5,7 @@ webmagic-parent us.codecraft - 0.3.3-SNAPSHOT + 0.4.0-SNAPSHOT 4.0.0 diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index 98b86a6..8fae211 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -5,7 +5,7 @@ webmagic-parent us.codecraft - 0.3.2 + 0.4.0-SNAPSHOT 4.0.0 diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index 685a59f..6551e4e 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -5,7 +5,7 @@ webmagic-parent us.codecraft - 0.3.2 + 0.4.0-SNAPSHOT 4.0.0 From fd6d2fd6f89bd1b2b051d271924d1938e0288ba6 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 6 Nov 2013 21:19:14 +0800 Subject: [PATCH 34/38] try to keepalive TCP connection --- .../main/java/us/codecraft/webmagic/Site.java | 4 ++-- .../java/us/codecraft/webmagic/Spider.java | 5 ++++- .../downloader/HttpClientDownloader.java | 10 +++++----- .../downloader/HttpClientGenerator.java | 11 +++++++++-- .../example/BaiduBaikePageProcesser.java | 8 ++++++-- webmagic-core/src/main/resources/log4j.xml | 5 +++++ .../codecraft/webmagic/example/BaiduBaike.java | 18 +++++++++++++----- .../webmagic/example/OschinaBlog.java | 2 +- 8 files changed, 45 insertions(+), 18 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java index 22015c3..e83e85f 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java @@ -27,13 +27,13 @@ public class Site { */ private List startRequests = new ArrayList(); - private int sleepTime = 3000; + private int sleepTime = 5000; private int retryTimes = 0; private int cycleRetryTimes = 0; - private int timeOut = 2000; + private int timeOut = 5000; private static final 
Set DEFAULT_STATUS_CODE_SET = new HashSet(); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 667a71e..84beccb 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -428,7 +428,10 @@ public class Spider implements Runnable, Task { public List getAll(Collection urls) { destroyWhenExit = false; spawnUrl = false; - startRequests = UrlUtils.convertToRequests(urls); + startRequests.clear(); + for (Request request : UrlUtils.convertToRequests(urls)) { + addRequest(request); + } CollectorPipeline collectorPipeline = getCollectorPipeline(); pipelines.add(collectorPipeline); run(); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 5c51916..93bcfe5 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -37,7 +37,7 @@ public class HttpClientDownloader implements Downloader { private final Map httpClients = new HashMap(); - private int poolSize = 1; + private HttpClientGenerator httpClientGenerator = new HttpClientGenerator(); /** * A simple method to download a url. 
@@ -63,14 +63,14 @@ public class HttpClientDownloader implements Downloader { private CloseableHttpClient getHttpClient(Site site) { if (site == null) { - return new HttpClientGenerator(poolSize).getClient(null); + return httpClientGenerator.getClient(null); } String domain = site.getDomain(); CloseableHttpClient httpClient = httpClients.get(domain); if (httpClient == null) { synchronized (this) { if (httpClient == null) { - httpClient = new HttpClientGenerator(poolSize).getClient(site); + httpClient = httpClientGenerator.getClient(site); httpClients.put(domain, httpClient); } } @@ -105,7 +105,7 @@ public class HttpClientDownloader implements Downloader { .setConnectionRequestTimeout(site.getTimeOut()) .setConnectTimeout(site.getTimeOut()) .setCookieSpec(CookieSpecs.BEST_MATCH); - if (site.getHttpProxy() != null) { + if (site != null && site.getHttpProxy() != null) { requestConfigBuilder.setProxy(site.getHttpProxy()); } requestBuilder.setConfig(requestConfigBuilder.build()); @@ -168,6 +168,6 @@ public class HttpClientDownloader implements Downloader { @Override public void setThread(int thread) { - poolSize = thread; + httpClientGenerator.setPoolSize(thread); } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java index 92ba6f8..12f59d5 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java @@ -5,6 +5,7 @@ import org.apache.http.client.CookieStore; import org.apache.http.client.protocol.ResponseContentEncoding; import org.apache.http.config.Registry; import org.apache.http.config.RegistryBuilder; +import org.apache.http.config.SocketConfig; import org.apache.http.conn.socket.ConnectionSocketFactory; import org.apache.http.conn.socket.PlainConnectionSocketFactory; import 
org.apache.http.conn.ssl.SSLConnectionSocketFactory; @@ -25,16 +26,20 @@ public class HttpClientGenerator { private PoolingHttpClientConnectionManager connectionManager; - public HttpClientGenerator(int poolSize) { + public HttpClientGenerator() { Registry reg = RegistryBuilder.create() .register("http", PlainConnectionSocketFactory.INSTANCE) .register("https", SSLConnectionSocketFactory.getSocketFactory()) .build(); connectionManager = new PoolingHttpClientConnectionManager(reg); - connectionManager.setMaxTotal(poolSize); connectionManager.setDefaultMaxPerRoute(100); } + public HttpClientGenerator setPoolSize(int poolSize){ + connectionManager.setMaxTotal(poolSize); + return this; + } + public CloseableHttpClient getClient(Site site) { return generateClient(site); } @@ -59,6 +64,8 @@ public class HttpClientGenerator { } }); } + SocketConfig socketConfig = SocketConfig.custom().setSoKeepAlive(true).setTcpNoDelay(true).build(); + httpClientBuilder.setDefaultSocketConfig(socketConfig); // Http client has some problem handling compressing entity for redirect // So I disable it and do it manually // https://issues.apache.org/jira/browse/HTTPCLIENT-1432 diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/BaiduBaikePageProcesser.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/BaiduBaikePageProcesser.java index 34f0134..866d090 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/BaiduBaikePageProcesser.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/BaiduBaikePageProcesser.java @@ -30,10 +30,14 @@ public class BaiduBaikePageProcesser implements PageProcessor { } public static void main(String[] args) { + //single download Spider spider = Spider.create(new BaiduBaikePageProcesser()).thread(2); - List list = new ArrayList(); String urlTemplate = "http://baike.baidu.com/search/word?word=%s&pic=1&sug=1&enc=utf8"; - list.add(String.format(urlTemplate,"水力发电")); + 
ResultItems resultItems = spider.get(String.format(urlTemplate, "水力发电")); + System.out.println(resultItems); + + //multidownload + List list = new ArrayList(); list.add(String.format(urlTemplate,"风力发电")); list.add(String.format(urlTemplate,"太阳能")); list.add(String.format(urlTemplate,"地热发电")); diff --git a/webmagic-core/src/main/resources/log4j.xml b/webmagic-core/src/main/resources/log4j.xml index a6630f8..9084694 100644 --- a/webmagic-core/src/main/resources/log4j.xml +++ b/webmagic-core/src/main/resources/log4j.xml @@ -13,6 +13,11 @@ + + + + + diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/BaiduBaike.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/BaiduBaike.java index 0fd138e..96ff24e 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/BaiduBaike.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/BaiduBaike.java @@ -28,14 +28,22 @@ public class BaiduBaike{ } public static void main(String[] args) { - List list = new ArrayList(); + OOSpider ooSpider = OOSpider.create(Site.me().setSleepTime(0), BaiduBaike.class); + //single download String urlTemplate = "http://baike.baidu.com/search/word?word=%s&pic=1&sug=1&enc=utf8"; - list.add(String.format(urlTemplate,"水力发电")); + BaiduBaike baike = ooSpider.get("http://baike.baidu.com/search/word?word=httpclient&pic=1&sug=1&enc=utf8"); + System.out.println(baike); + + //multidownload + List list = new ArrayList(); list.add(String.format(urlTemplate,"风力发电")); list.add(String.format(urlTemplate,"太阳能")); list.add(String.format(urlTemplate,"地热发电")); - list.add(String.format(urlTemplate, "地热发电")); - List baiduBaikes = OOSpider.create(Site.me().setSleepTime(100), BaiduBaike.class).getAll(list); - System.out.println(baiduBaikes); + list.add(String.format(urlTemplate,"地热发电")); + List resultItemses = ooSpider.getAll(list); + for (BaiduBaike resultItemse : resultItemses) { + System.out.println(resultItemse); + } + ooSpider.close(); } } diff 
--git a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/OschinaBlog.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/OschinaBlog.java index 5bd8ddd..f72efe0 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/OschinaBlog.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/OschinaBlog.java @@ -31,7 +31,7 @@ public class OschinaBlog { private Date date; public static void main(String[] args) { - OOSpider.create(Site.me() + OOSpider.create(Site.me().setSleepTime(0) , new JsonFilePageModelPipeline("/data/webmagic/"), OschinaBlog.class) .addUrl("http://my.oschina.net/flashsword/blog").run(); } From fe6d9bb2e29c6970363f25428cda379e33da9821 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 6 Nov 2013 21:53:39 +0800 Subject: [PATCH 35/38] get keep-alive rework --- .../codecraft/webmagic/downloader/HttpClientDownloader.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 93bcfe5..4286054 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -9,6 +9,7 @@ import org.apache.http.client.config.RequestConfig; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.RequestBuilder; import org.apache.http.impl.client.CloseableHttpClient; +import org.apache.http.util.EntityUtils; import org.apache.log4j.Logger; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; @@ -133,7 +134,8 @@ public class HttpClientDownloader implements Downloader { } finally { try { if (httpResponse != null) { - httpResponse.close(); + //ensure the connection is released back to pool + 
EntityUtils.consume(httpResponse.getEntity()); } } catch (IOException e) { logger.warn("close response fail", e); From 72dfdc300c856f449c171e1c43a1de69e05fdb35 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 6 Nov 2013 22:01:09 +0800 Subject: [PATCH 36/38] update docs --- README.md | 4 ++-- en_docs/README.md | 4 ++-- user-manual.md | 6 +++--- zh_docs/README.md | 4 ++-- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 2406d12..35e52bc 100644 --- a/README.md +++ b/README.md @@ -25,12 +25,12 @@ Add dependencies to your pom.xml: us.codecraft webmagic-core - 0.3.2 + 0.4.0 us.codecraft webmagic-extension - 0.3.2 + 0.4.0 ## Get Started: diff --git a/en_docs/README.md b/en_docs/README.md index d88ea19..82b82a8 100644 --- a/en_docs/README.md +++ b/en_docs/README.md @@ -28,12 +28,12 @@ Add dependencies to your project: us.codecraft webmagic-core - 0.3.2 + 0.4.0 us.codecraft webmagic-extension - 0.3.2 + 0.4.0 ## Get Started: diff --git a/user-manual.md b/user-manual.md index 653c82a..ddc35fc 100644 --- a/user-manual.md +++ b/user-manual.md @@ -27,12 +27,12 @@ webmagic使用maven管理依赖,在项目中添加对应的依赖即可使用w us.codecraft webmagic-core - 0.3.2 + 0.4.0 us.codecraft webmagic-extension - 0.3.2 + 0.4.0 #### 项目结构 @@ -140,7 +140,7 @@ webmagic-extension包括了注解方式编写爬虫的方法,只需基于一 这个例子定义了一个Model类,Model类的字段'title'、'content'、'tags'均为要抽取的属性。这个类在Pipeline里是可以复用的。 -注解的详细使用方式见后文中得webmagic-extension注解模块。 +注解的详细使用方式见后文中的webmagic-extension注解模块。
diff --git a/zh_docs/README.md b/zh_docs/README.md index 1931d71..e6961d8 100644 --- a/zh_docs/README.md +++ b/zh_docs/README.md @@ -34,12 +34,12 @@ webmagic使用maven管理依赖,在项目中添加对应的依赖即可使用w us.codecraft webmagic-core - 0.3.2 + 0.4.0 us.codecraft webmagic-extension - 0.3.2 + 0.4.0 #### 项目结构 From 35f96c5a5e7c3e949125b74d6a6d2b497f578d9d Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 6 Nov 2013 22:16:06 +0800 Subject: [PATCH 37/38] fix docs --- release-note.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/release-note.md b/release-note.md index 001568b..ae5fc56 100755 --- a/release-note.md +++ b/release-note.md @@ -1,5 +1,7 @@ Release Notes ---- +See old versions in [https://github.com/code4craft/webmagic/releases](https://github.com/code4craft/webmagic/releases) + *2012-9-4* `version:0.3.0` * Change default XPath selector from HtmlCleaner to [Xsoup](https://github.com/code4craft/xsoup). From 0b4fadc24db519dd418b92fcfeb3072669993aef Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 6 Nov 2013 22:17:47 +0800 Subject: [PATCH 38/38] [maven-release-plugin] prepare release webmagic-0.4.0 --- pom.xml | 4 ++-- webmagic-core/pom.xml | 2 +- webmagic-extension/pom.xml | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pom.xml b/pom.xml index 8e68d49..7de932a 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ 7 us.codecraft - 0.4.0-SNAPSHOT + 0.4.0 4.0.0 pom @@ -36,7 +36,7 @@ scm:git:git@github.com:code4craft/webmagic.git scm:git:git@github.com:code4craft/webmagic.git git@github.com:code4craft/webmagic.git - HEAD + webmagic-0.4.0 diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index a03a5c7..708b7aa 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.4.0-SNAPSHOT + 0.4.0 4.0.0 diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index 1c42ea5..a71d682 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -3,7 +3,7 @@ us.codecraft 
webmagic-parent - 0.4.0-SNAPSHOT + 0.4.0 4.0.0