diff --git a/README.md b/README.md index 2406d12..35e52bc 100644 --- a/README.md +++ b/README.md @@ -25,12 +25,12 @@ Add dependencies to your pom.xml: us.codecraft webmagic-core - 0.3.2 + 0.4.0 us.codecraft webmagic-extension - 0.3.2 + 0.4.0 ## Get Started: diff --git a/en_docs/README.md b/en_docs/README.md index d88ea19..82b82a8 100644 --- a/en_docs/README.md +++ b/en_docs/README.md @@ -28,12 +28,12 @@ Add dependencies to your project: us.codecraft webmagic-core - 0.3.2 + 0.4.0 us.codecraft webmagic-extension - 0.3.2 + 0.4.0 ## Get Started: diff --git a/pom.xml b/pom.xml index 65cce5f..e34042b 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ 7 us.codecraft - 0.3.2 + 0.4.0 4.0.0 pom @@ -36,7 +36,7 @@ scm:git:git@github.com:code4craft/webmagic.git scm:git:git@github.com:code4craft/webmagic.git git@github.com:code4craft/webmagic.git - HEAD + webmagic-0.4.0 @@ -62,7 +62,12 @@ org.apache.httpcomponents httpclient - 4.2.4 + 4.3.1 + + + com.google.guava + guava + 15.0 us.codecraft diff --git a/release-note.md b/release-note.md index 001568b..ae5fc56 100755 --- a/release-note.md +++ b/release-note.md @@ -1,5 +1,7 @@ Release Notes ---- +See old versions in [https://github.com/code4craft/webmagic/releases](https://github.com/code4craft/webmagic/releases) + *2012-9-4* `version:0.3.0` * Change default XPath selector from HtmlCleaner to [Xsoup](https://github.com/code4craft/xsoup). diff --git a/user-manual.md b/user-manual.md index 2127e57..ddc35fc 100644 --- a/user-manual.md +++ b/user-manual.md @@ -1,5 +1,5 @@ webmagic使用手册 ------- +======== >webmagic是一个开源的Java垂直爬虫框架,目标是简化爬虫的开发流程,让开发者专注于逻辑功能的开发。webmagic的核心非常简单,但是覆盖爬虫的整个流程,也是很好的学习爬虫开发的材料。 >web爬虫是一种技术,webmagic致力于将这种技术的实现成本降低,但是出于对资源提供者的尊重,webmagic不会做反封锁的事情,包括:验证码破解、代理切换、自动登录等。 @@ -16,8 +16,9 @@ webmagic使用手册
+-------- -## 快速开始 +## 下载及安装 ### 使用maven @@ -26,12 +27,12 @@ webmagic使用maven管理依赖,在项目中添加对应的依赖即可使用w us.codecraft webmagic-core - 0.3.2 + 0.4.0 us.codecraft webmagic-extension - 0.3.2 + 0.4.0 #### 项目结构 @@ -66,9 +67,11 @@ webmagic还包含两个可用的扩展包,因为这两个包都依赖了比较 在**bin/lib**目录下,有项目依赖的所有jar包,直接在IDE里import即可。 -### 第一个爬虫 +-------- -#### 定制PageProcessor +## 第一个爬虫 + +### 定制PageProcessor PageProcessor是webmagic-core的一部分,定制一个PageProcessor即可实现自己的爬虫逻辑。以下是抓取osc博客的一段代码: @@ -137,10 +140,13 @@ webmagic-extension包括了注解方式编写爬虫的方法,只需基于一 这个例子定义了一个Model类,Model类的字段'title'、'content'、'tags'均为要抽取的属性。这个类在Pipeline里是可以复用的。 -注解的详细使用方式见后文中得webmagic-extension注解模块。 +注解的详细使用方式见后文中的webmagic-extension注解模块。
+-------- + +## 模块详细介绍 ## webmagic-core @@ -213,7 +219,7 @@ Spider还包括一个方法test(String url),该方法只抓取一个单独的 webmagic包括一个对于页面正文的自动抽取的类**SmartContentSelector**。相信用过Evernote Clearly都会对其自动抽取正文的技术印象深刻。这个技术又叫**Readability**。当然webmagic对Readability的实现还比较粗略,但是仍有一些学习价值。 -webmagic的XPath解析使用了作者另一个开源项目:基于Jsoup的XPath解析器[Xsoup](https://github.com/code4craft/xsoup),Xsoup对XPath的语法进行了一些扩展,支持一些自定义的函数。 +webmagic的XPath解析使用了作者另一个开源项目:基于Jsoup的XPath解析器[Xsoup](https://github.com/code4craft/xsoup),Xsoup对XPath的语法进行了一些扩展,支持一些自定义的函数。这些函数的使用方式都是在XPath末尾加上`/name-of-function()`,例如:`"//div[@class='BlogStat']/regex('\\d+-\\d+-\\d+\\s+\\d+:\\d+')"`。 @@ -325,6 +331,8 @@ webmagic目前不支持持久化到数据库,但是结合其他工具,持久
+----- + ## webmagic-extension webmagic-extension是为了开发爬虫更方便而实现的一些功能模块。这些功能完全基于webmagic-core的框架,包括注解形式编写爬虫、分页、分布式等功能。 @@ -354,6 +362,10 @@ webmagic-extension包括注解模块。为什么会有注解方式? @ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true) private List tags; + + @Formatter("yyyy-MM-dd HH:mm") + @ExtractBy("//div[@class='BlogStat']/regex('\\d+-\\d+-\\d+\\s+\\d+:\\d+')") + private Date date; public static void main(String[] args) { OOSpider.create( @@ -395,10 +407,21 @@ webmagic-extension包括注解模块。为什么会有注解方式? * #### 类型转换 + webmagic的注解模式支持对抽取结果进行类型转换,这样抽取结果并不需要是String类型,而可以是任意类型。webmagic内置了基本类型的支持(需要保证抽取结果能够被转换到对应类型)。 + +```java + @ExtractBy("//ul[@class='pagehead-actions']/li[1]//a[@class='social-count js-social-count']/text()") + private int star; +``` +抽取结果也可以是`java.util.Date`类型,不过需要指定日期格式化的方式: + +```java @Formatter("yyyy-MM-dd HH:mm") @ExtractBy("//div[@class='BlogStat']/regex('\\d+-\\d+-\\d+\\s+\\d+:\\d+')") private Date date; +``` +你也可以编写一个实现`ObjectFormatter`接口的类,进行自己的类型解析。要使用自己的类,需要调用`ObjectFormatters.put()`对这个类进行注册。 * #### AfterExtractor diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index ec8a90b..708b7aa 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.3.2 + 0.4.0 4.0.0 @@ -20,6 +20,12 @@ junit + + com.google.guava + guava + 15.0 + + org.apache.commons commons-lang3 diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java b/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java index e055270..4791e77 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java @@ -68,4 +68,13 @@ public class ResultItems { this.skip = skip; return this; } + + @Override + public String toString() { + return "ResultItems{" + + "fields=" + fields + + ", request=" + request + + ", skip=" + skip + + '}'; + } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java 
b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java index 4c7b992..e83e85f 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java @@ -1,5 +1,6 @@ package us.codecraft.webmagic; +import org.apache.http.HttpHost; import us.codecraft.webmagic.utils.UrlUtils; import java.util.*; @@ -8,8 +9,8 @@ import java.util.*; * Object contains setting for crawler.
* * @author code4crafter@gmail.com
- * @since 0.1.0 * @see us.codecraft.webmagic.processor.PageProcessor + * @since 0.1.0 */ public class Site { @@ -24,18 +25,32 @@ public class Site { /** * startUrls is the urls the crawler to start with. */ - private List startUrls = new ArrayList(); + private List startRequests = new ArrayList(); - private int sleepTime = 3000; + private int sleepTime = 5000; private int retryTimes = 0; private int cycleRetryTimes = 0; + private int timeOut = 5000; + private static final Set DEFAULT_STATUS_CODE_SET = new HashSet(); private Set acceptStatCode = DEFAULT_STATUS_CODE_SET; + private Map headers = new HashMap(); + + private HttpHost httpProxy; + + private boolean useGzip = true; + + public static interface HeaderConst { + + public static final String REFERER = "Referer"; + } + + static { DEFAULT_STATUS_CODE_SET.add(200); } @@ -131,6 +146,20 @@ public class Site { return charset; } + public int getTimeOut() { + return timeOut; + } + + /** + * set timeout for downloader in ms + * + * @param timeOut + */ + public Site setTimeOut(int timeOut) { + this.timeOut = timeOut; + return this; + } + /** * Set acceptStatCode.
* When status code of http response is in acceptStatCodes, it will be processed.
@@ -158,23 +187,44 @@ public class Site { * get start urls * * @return start urls + * @see #getStartRequests + * @deprecated */ + @Deprecated public List getStartUrls() { - return startUrls; + return UrlUtils.convertToUrls(startRequests); + } + + public List getStartRequests() { + return startRequests; } /** * Add a url to start url.
+ * Because urls are more a Spider's property than a Site's, this has moved to {@link Spider#addUrl(String...)} * + * @deprecated + * @see Spider#addUrl(String...) * @param startUrl * @return this */ public Site addStartUrl(String startUrl) { - this.startUrls.add(startUrl); - if (domain == null) { - if (startUrls.size() > 0) { - domain = UrlUtils.getDomain(startUrls.get(0)); - } + return addStartRequest(new Request(startUrl)); + } + + /** + * Add a url to start url.
+ * Because urls are more a Spider's property than a Site's, this has moved to {@link Spider#addRequest(Request...)} + * + * @deprecated + * @see Spider#addRequest(Request...) + * @param startRequest + * @return this + */ + public Site addStartRequest(Request startRequest) { + this.startRequests.add(startRequest); + if (domain == null && startRequest.getUrl() != null) { + domain = UrlUtils.getDomain(startRequest.getUrl()); } return this; } @@ -202,7 +252,7 @@ public class Site { } /** - * Get retry times when download fail immediately, 0 by default.
+ * Get the number of immediate retries when a download fails, 0 by default.
* * @return retry times when download fail */ @@ -210,6 +260,23 @@ public class Site { return retryTimes; } + public Map getHeaders() { + return headers; + } + + /** + * Put an Http header for downloader.
+ * Use {@link #addCookie(String, String)} for cookie and {@link #setUserAgent(String)} for user-agent.
+ * + * @param key key of http header, there are some keys constant in {@link HeaderConst} + * @param value value of header + * @return + */ + public Site addHeader(String key, String value) { + headers.put(key, value); + return this; + } + /** * Set retry times when download fail, 0 by default.
* @@ -239,21 +306,34 @@ public class Site { return this; } - @Override - public boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; + public HttpHost getHttpProxy() { + return httpProxy; + } - Site site = (Site) o; + /** + * set up httpProxy for this site + * @param httpProxy + * @return + */ + public Site setHttpProxy(HttpHost httpProxy) { + this.httpProxy = httpProxy; + return this; + } - if (acceptStatCode != null ? !acceptStatCode.equals(site.acceptStatCode) : site.acceptStatCode != null) - return false; - if (!domain.equals(site.domain)) return false; - if (!startUrls.equals(site.startUrls)) return false; - if (charset != null ? !charset.equals(site.charset) : site.charset != null) return false; - if (userAgent != null ? !userAgent.equals(site.userAgent) : site.userAgent != null) return false; + public boolean isUseGzip() { + return useGzip; + } - return true; + /** + * Whether use gzip.
+ * Default is true, you can set it to false to disable gzip. + * + * @param useGzip + * @return + */ + public Site setUseGzip(boolean useGzip) { + this.useGzip = useGzip; + return this; } public Task toTask() { @@ -270,13 +350,60 @@ public class Site { }; } + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + Site site = (Site) o; + + if (cycleRetryTimes != site.cycleRetryTimes) return false; + if (retryTimes != site.retryTimes) return false; + if (sleepTime != site.sleepTime) return false; + if (timeOut != site.timeOut) return false; + if (acceptStatCode != null ? !acceptStatCode.equals(site.acceptStatCode) : site.acceptStatCode != null) + return false; + if (charset != null ? !charset.equals(site.charset) : site.charset != null) return false; + if (cookies != null ? !cookies.equals(site.cookies) : site.cookies != null) return false; + if (domain != null ? !domain.equals(site.domain) : site.domain != null) return false; + if (headers != null ? !headers.equals(site.headers) : site.headers != null) return false; + if (startRequests != null ? !startRequests.equals(site.startRequests) : site.startRequests != null) + return false; + if (userAgent != null ? !userAgent.equals(site.userAgent) : site.userAgent != null) return false; + + return true; + } + @Override public int hashCode() { - int result = domain.hashCode(); - result = 31 * result + (startUrls != null ? startUrls.hashCode() : 0); + int result = domain != null ? domain.hashCode() : 0; result = 31 * result + (userAgent != null ? userAgent.hashCode() : 0); + result = 31 * result + (cookies != null ? cookies.hashCode() : 0); result = 31 * result + (charset != null ? charset.hashCode() : 0); + result = 31 * result + (startRequests != null ? 
startRequests.hashCode() : 0); + result = 31 * result + sleepTime; + result = 31 * result + retryTimes; + result = 31 * result + cycleRetryTimes; + result = 31 * result + timeOut; result = 31 * result + (acceptStatCode != null ? acceptStatCode.hashCode() : 0); + result = 31 * result + (headers != null ? headers.hashCode() : 0); return result; } + + @Override + public String toString() { + return "Site{" + + "domain='" + domain + '\'' + + ", userAgent='" + userAgent + '\'' + + ", cookies=" + cookies + + ", charset='" + charset + '\'' + + ", startRequests=" + startRequests + + ", sleepTime=" + sleepTime + + ", retryTimes=" + retryTimes + + ", cycleRetryTimes=" + cycleRetryTimes + + ", timeOut=" + timeOut + + ", acceptStatCode=" + acceptStatCode + + ", headers=" + headers + + '}'; + } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 829546b..84beccb 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -1,9 +1,12 @@ package us.codecraft.webmagic; +import com.google.common.collect.Lists; import org.apache.commons.collections.CollectionUtils; import org.apache.log4j.Logger; import us.codecraft.webmagic.downloader.Downloader; import us.codecraft.webmagic.downloader.HttpClientDownloader; +import us.codecraft.webmagic.pipeline.CollectorPipeline; +import us.codecraft.webmagic.pipeline.ResultItemsCollectorPipeline; import us.codecraft.webmagic.pipeline.ConsolePipeline; import us.codecraft.webmagic.pipeline.Pipeline; import us.codecraft.webmagic.processor.PageProcessor; @@ -11,13 +14,18 @@ import us.codecraft.webmagic.scheduler.QueueScheduler; import us.codecraft.webmagic.scheduler.Scheduler; import us.codecraft.webmagic.utils.EnvironmentUtil; import us.codecraft.webmagic.utils.ThreadUtils; +import us.codecraft.webmagic.utils.UrlUtils; import java.io.Closeable; import java.io.IOException; 
import java.util.ArrayList; +import java.util.Collection; import java.util.List; +import java.util.UUID; import java.util.concurrent.ExecutorService; import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.locks.Condition; +import java.util.concurrent.locks.ReentrantLock; /** * Entrance of a crawler.
@@ -42,7 +50,7 @@ import java.util.concurrent.atomic.AtomicInteger; * Spider.create(new SimplePageProcessor("http://my.oschina.net/", * "http://my.oschina.net/*blog/*"))
* .scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).run();
- * + * * @author code4crafter@gmail.com
* @see Downloader * @see Scheduler @@ -52,381 +60,520 @@ import java.util.concurrent.atomic.AtomicInteger; */ public class Spider implements Runnable, Task { - protected Downloader downloader; + protected Downloader downloader; - protected List pipelines = new ArrayList(); + protected List pipelines = new ArrayList(); - protected PageProcessor pageProcessor; + protected PageProcessor pageProcessor; - protected List startUrls; + protected List startRequests; - protected Site site; + protected Site site; - protected String uuid; + protected String uuid; - protected Scheduler scheduler = new QueueScheduler(); + protected Scheduler scheduler = new QueueScheduler(); - protected Logger logger = Logger.getLogger(getClass()); + protected Logger logger = Logger.getLogger(getClass()); - protected ExecutorService executorService; + protected ExecutorService executorService; - protected int threadNum = 1; + protected int threadNum = 1; - protected AtomicInteger stat = new AtomicInteger(STAT_INIT); + protected AtomicInteger stat = new AtomicInteger(STAT_INIT); - protected final static int STAT_INIT = 0; + protected boolean exitWhenComplete = true; - protected final static int STAT_RUNNING = 1; + protected final static int STAT_INIT = 0; - protected final static int STAT_STOPPED = 2; + protected final static int STAT_RUNNING = 1; - /** - * create a spider with pageProcessor. - * - * @param pageProcessor - * @return new spider - * @see PageProcessor - */ - public static Spider create(PageProcessor pageProcessor) { - return new Spider(pageProcessor); - } + protected final static int STAT_STOPPED = 2; - /** - * create a spider with pageProcessor. - * - * @param pageProcessor - */ - public Spider(PageProcessor pageProcessor) { - this.pageProcessor = pageProcessor; - this.site = pageProcessor.getSite(); - this.startUrls = pageProcessor.getSite().getStartUrls(); - } + protected boolean spawnUrl = true; - /** - * Set startUrls of Spider.
- * Prior to startUrls of Site. - * - * @param startUrls - * @return this - */ - public Spider startUrls(List startUrls) { - checkIfRunning(); - this.startUrls = startUrls; - return this; - } + protected boolean destroyWhenExit = true; - /** - * Set an uuid for spider.
- * Default uuid is domain of site.
- * - * @param uuid - * @return this - */ - public Spider setUUID(String uuid) { - this.uuid = uuid; - return this; - } + private ReentrantLock newUrlLock = new ReentrantLock(); - /** - * set scheduler for Spider - * - * @param scheduler - * @return this - * @Deprecated - * @see #setScheduler(us.codecraft.webmagic.scheduler.Scheduler) - */ - public Spider scheduler(Scheduler scheduler) { - return setScheduler(scheduler); - } + private Condition newUrlCondition = newUrlLock.newCondition(); - /** - * set scheduler for Spider - * - * @param scheduler - * @return this - * @see Scheduler - * @since 0.2.1 - */ - public Spider setScheduler(Scheduler scheduler) { - checkIfRunning(); - this.scheduler = scheduler; - return this; - } + /** + * create a spider with pageProcessor. + * + * @param pageProcessor + * @return new spider + * @see PageProcessor + */ + public static Spider create(PageProcessor pageProcessor) { + return new Spider(pageProcessor); + } - /** - * add a pipeline for Spider - * - * @param pipeline - * @return this - * @see #setPipeline(us.codecraft.webmagic.pipeline.Pipeline) - * @deprecated - */ - public Spider pipeline(Pipeline pipeline) { - return addPipeline(pipeline); - } + /** + * create a spider with pageProcessor. + * + * @param pageProcessor + */ + public Spider(PageProcessor pageProcessor) { + this.pageProcessor = pageProcessor; + this.site = pageProcessor.getSite(); + this.startRequests = pageProcessor.getSite().getStartRequests(); + } - /** - * add a pipeline for Spider - * - * @param pipeline - * @return this - * @see Pipeline - * @since 0.2.1 - */ - public Spider addPipeline(Pipeline pipeline) { - checkIfRunning(); - this.pipelines.add(pipeline); - return this; - } + /** + * Set startUrls of Spider.
+ * Prior to startUrls of Site. + * + * @param startUrls + * @return this + */ + public Spider startUrls(List startUrls) { + checkIfRunning(); + this.startRequests = UrlUtils.convertToRequests(startUrls); + return this; + } - /** - * clear the pipelines set - * - * @return this - */ - public Spider clearPipeline() { - pipelines = new ArrayList(); - return this; - } + /** + * Set startUrls of Spider.
+ * Prior to startUrls of Site. + * + * @param startUrls + * @return this + */ + public Spider startRequest(List startRequests) { + checkIfRunning(); + this.startRequests = startRequests; + return this; + } - /** - * set the downloader of spider - * - * @param downloader - * @return this - * @see #setDownloader(us.codecraft.webmagic.downloader.Downloader) - * @deprecated - */ - public Spider downloader(Downloader downloader) { - return setDownloader(downloader); - } + /** + * Set an uuid for spider.
+ * Default uuid is domain of site.
+ * + * @param uuid + * @return this + */ + public Spider setUUID(String uuid) { + this.uuid = uuid; + return this; + } - /** - * set the downloader of spider - * - * @param downloader - * @return this - * @see Downloader - */ - public Spider setDownloader(Downloader downloader) { - checkIfRunning(); - this.downloader = downloader; - return this; - } + /** + * set scheduler for Spider + * + * @param scheduler + * @return this + * @Deprecated + * @see #setScheduler(us.codecraft.webmagic.scheduler.Scheduler) + */ + public Spider scheduler(Scheduler scheduler) { + return setScheduler(scheduler); + } - protected void checkComponent() { - if (downloader == null) { - this.downloader = new HttpClientDownloader(); - } - if (pipelines.isEmpty()) { - pipelines.add(new ConsolePipeline()); - } - downloader.setThread(threadNum); - } + /** + * set scheduler for Spider + * + * @param scheduler + * @return this + * @see Scheduler + * @since 0.2.1 + */ + public Spider setScheduler(Scheduler scheduler) { + checkIfRunning(); + this.scheduler = scheduler; + return this; + } - @Override - public void run() { - if (!stat.compareAndSet(STAT_INIT, STAT_RUNNING) && !stat.compareAndSet(STAT_STOPPED, STAT_RUNNING)) { - throw new IllegalStateException("Spider is already running!"); - } - checkComponent(); - if (startUrls != null) { - for (String startUrl : startUrls) { - scheduler.push(new Request(startUrl), this); - } - startUrls.clear(); - } - Request request = scheduler.poll(this); + /** + * add a pipeline for Spider + * + * @param pipeline + * @return this + * @see #setPipeline(us.codecraft.webmagic.pipeline.Pipeline) + * @deprecated + */ + public Spider pipeline(Pipeline pipeline) { + return addPipeline(pipeline); + } + + /** + * add a pipeline for Spider + * + * @param pipeline + * @return this + * @see Pipeline + * @since 0.2.1 + */ + public Spider addPipeline(Pipeline pipeline) { + checkIfRunning(); + this.pipelines.add(pipeline); + return this; + } + + /** + * clear the pipelines set 
+ * + * @return this + */ + public Spider clearPipeline() { + pipelines = new ArrayList(); + return this; + } + + /** + * set the downloader of spider + * + * @param downloader + * @return this + * @see #setDownloader(us.codecraft.webmagic.downloader.Downloader) + * @deprecated + */ + public Spider downloader(Downloader downloader) { + return setDownloader(downloader); + } + + /** + * set the downloader of spider + * + * @param downloader + * @return this + * @see Downloader + */ + public Spider setDownloader(Downloader downloader) { + checkIfRunning(); + this.downloader = downloader; + return this; + } + + protected void initComponent() { + if (downloader == null) { + this.downloader = new HttpClientDownloader(); + } + if (pipelines.isEmpty()) { + pipelines.add(new ConsolePipeline()); + } + downloader.setThread(threadNum); + if (executorService == null || executorService.isShutdown()) { + executorService = ThreadUtils.newFixedThreadPool(threadNum); + } + if (startRequests != null) { + for (Request request : startRequests) { + scheduler.push(request, this); + } + startRequests.clear(); + } + } + + @Override + public void run() { + checkRunningStat(); + initComponent(); logger.info("Spider " + getUUID() + " started!"); - // single thread - if (threadNum <= 1) { - while (request != null && stat.compareAndSet(STAT_RUNNING, STAT_RUNNING)) { - processRequest(request); - request = scheduler.poll(this); - } - } else { - synchronized (this) { - this.executorService = ThreadUtils.newFixedThreadPool(threadNum); - } - // multi thread - final AtomicInteger threadAlive = new AtomicInteger(0); - while (true && stat.compareAndSet(STAT_RUNNING, STAT_RUNNING)) { - if (request == null) { - // when no request found but some thread is alive, sleep a - // while. 
- try { - Thread.sleep(100); - } catch (InterruptedException e) { - } - } else { - final Request requestFinal = request; - threadAlive.incrementAndGet(); - executorService.execute(new Runnable() { - @Override - public void run() { - processRequest(requestFinal); - threadAlive.decrementAndGet(); - } - }); - } - request = scheduler.poll(this); - if (threadAlive.get() == 0) { - request = scheduler.poll(this); - if (request == null) { - break; - } - } - } - executorService.shutdown(); - } - stat.compareAndSet(STAT_RUNNING, STAT_STOPPED); - // release some resources - destroy(); - } + final AtomicInteger threadAlive = new AtomicInteger(0); + while (!Thread.currentThread().isInterrupted() && stat.get() == STAT_RUNNING) { + Request request = scheduler.poll(this); + if (request == null) { + if (threadAlive.get() == 0 && exitWhenComplete) { + break; + } + // wait until new url added + waitNewUrl(); + } else { + final Request requestFinal = request; + threadAlive.incrementAndGet(); + executorService.execute(new Runnable() { + @Override + public void run() { + try { + processRequest(requestFinal); + } catch (Exception e) { + logger.error("download " + requestFinal + " error", e); + } finally { + threadAlive.decrementAndGet(); + signalNewUrl(); + } + } + }); + } + } + stat.set(STAT_STOPPED); + // release some resources + if (destroyWhenExit) { + close(); + } + } - protected void destroy() { - destroyEach(downloader); - destroyEach(pageProcessor); - for (Pipeline pipeline : pipelines) { - destroyEach(pipeline); - } - } + private void checkRunningStat() { + while (true) { + int statNow = stat.get(); + if (statNow == STAT_RUNNING) { + throw new IllegalStateException("Spider is already running!"); + } + if (stat.compareAndSet(statNow, STAT_RUNNING)) { + break; + } + } + } - private void destroyEach(Object object) { - if (object instanceof Closeable) { - try { - ((Closeable) object).close(); - } catch (IOException e) { - e.printStackTrace(); - } - } - } + public void close() { + 
destroyEach(downloader); + destroyEach(pageProcessor); + for (Pipeline pipeline : pipelines) { + destroyEach(pipeline); + } + executorService.shutdown(); + } - /** - * Process specific urls without url discovering. - * - * @param urls - * urls to process - */ - public void test(String... urls) { - checkComponent(); - if (urls.length > 0) { - for (String url : urls) { - processRequest(new Request(url)); - } - } - } + private void destroyEach(Object object) { + if (object instanceof Closeable) { + try { + ((Closeable) object).close(); + } catch (IOException e) { + e.printStackTrace(); + } + } + } - protected void processRequest(Request request) { - Page page = downloader.download(request, this); - if (page == null) { - sleep(site.getSleepTime()); - return; - } - // for cycle retry - if (page.getHtml() == null) { - addRequest(page); - sleep(site.getSleepTime()); - return; - } - pageProcessor.process(page); - addRequest(page); - if (!page.getResultItems().isSkip()) { - for (Pipeline pipeline : pipelines) { - pipeline.process(page.getResultItems(), this); - } - } - sleep(site.getSleepTime()); - } + /** + * Process specific urls without url discovering. + * + * @param urls urls to process + */ + public void test(String... 
urls) { + initComponent(); + if (urls.length > 0) { + for (String url : urls) { + processRequest(new Request(url)); + } + } + } - protected void sleep(int time) { - try { - Thread.sleep(time); - } catch (InterruptedException e) { - e.printStackTrace(); - } - } + protected void processRequest(Request request) { + Page page = downloader.download(request, this); + if (page == null) { + sleep(site.getSleepTime()); + return; + } + // for cycle retry + if (page.getHtml() == null) { + extractAndAddRequests(page); + sleep(site.getSleepTime()); + return; + } + pageProcessor.process(page); + extractAndAddRequests(page); + if (!page.getResultItems().isSkip()) { + for (Pipeline pipeline : pipelines) { + pipeline.process(page.getResultItems(), this); + } + } + sleep(site.getSleepTime()); + } - protected void addRequest(Page page) { - if (CollectionUtils.isNotEmpty(page.getTargetRequests())) { - for (Request request : page.getTargetRequests()) { - scheduler.push(request, this); - } - } - } + protected void sleep(int time) { + try { + Thread.sleep(time); + } catch (InterruptedException e) { + e.printStackTrace(); + } + } - protected void checkIfRunning() { - if (!stat.compareAndSet(STAT_INIT, STAT_INIT) && !stat.compareAndSet(STAT_STOPPED, STAT_STOPPED)) { - throw new IllegalStateException("Spider is already running!"); - } - } + protected void extractAndAddRequests(Page page) { + if (spawnUrl && CollectionUtils.isNotEmpty(page.getTargetRequests())) { + for (Request request : page.getTargetRequests()) { + addRequest(request); + } + } + } - public void runAsync() { - Thread thread = new Thread(this); - thread.setDaemon(false); - thread.start(); - } + private void addRequest(Request request) { + if (site.getDomain() == null && request != null && request.getUrl() != null) { + site.setDomain(UrlUtils.getDomain(request.getUrl())); + } + scheduler.push(request, this); + } - public void start() { - runAsync(); - } + protected void checkIfRunning() { + if (stat.get() == STAT_RUNNING) { + 
throw new IllegalStateException("Spider is already running!"); + } + } - public void stop() { - if (stat.compareAndSet(STAT_RUNNING, STAT_STOPPED)) { - if (executorService != null) { - executorService.shutdown(); - } - logger.info("Spider " + getUUID() + " stop success!"); - } else { - logger.info("Spider " + getUUID() + " stop fail!"); - } - } + public void runAsync() { + Thread thread = new Thread(this); + thread.setDaemon(false); + thread.start(); + } - public void stopAndDestroy() { - stop(); - destroy(); - } + /** + * Add urls to crawl.
+ * + * @param urls + * @return + */ + public Spider addUrl(String... urls) { + for (String url : urls) { + addRequest(new Request(url)); + } + signalNewUrl(); + return this; + } - /** - * start with more than one threads - * - * @param threadNum - * @return this - */ - public Spider thread(int threadNum) { - checkIfRunning(); - this.threadNum = threadNum; - if (threadNum <= 0) { - throw new IllegalArgumentException("threadNum should be more than one!"); - } - if (threadNum == 1) { - return this; - } - return this; - } + /** + * Download urls synchronizing. + * + * @param urls + * @return + */ + public List getAll(Collection urls) { + destroyWhenExit = false; + spawnUrl = false; + startRequests.clear(); + for (Request request : UrlUtils.convertToRequests(urls)) { + addRequest(request); + } + CollectorPipeline collectorPipeline = getCollectorPipeline(); + pipelines.add(collectorPipeline); + run(); + spawnUrl = true; + destroyWhenExit = true; + return collectorPipeline.getCollected(); + } - /** - * switch off xsoup - * - * @return - */ - public static void xsoupOff() { - EnvironmentUtil.setUseXsoup(false); - } + protected CollectorPipeline getCollectorPipeline() { + return new ResultItemsCollectorPipeline(); + } - @Override - public String getUUID() { - if (uuid != null) { - return uuid; - } - if (site != null) { - return site.getDomain(); - } - return null; - } + public T get(String url) { + List urls = Lists.newArrayList(url); + List resultItemses = getAll(urls); + if (resultItemses != null && resultItemses.size() > 0) { + return resultItemses.get(0); + } else { + return null; + } + } - @Override - public Site getSite() { - return site; - } + /** + * Add urls with information to crawl.
+ * + * @param urls + * @return + */ + public Spider addRequest(Request... requests) { + for (Request request : requests) { + addRequest(request); + } + signalNewUrl(); + return this; + } + + private void waitNewUrl() { + try { + newUrlLock.lock(); + try { + newUrlCondition.await(); + } catch (InterruptedException e) { + } + } finally { + newUrlLock.unlock(); + } + } + + private void signalNewUrl() { + try { + newUrlLock.lock(); + newUrlCondition.signalAll(); + } finally { + newUrlLock.unlock(); + } + } + + public void start() { + runAsync(); + } + + public void stop() { + if (stat.compareAndSet(STAT_RUNNING, STAT_STOPPED)) { + logger.info("Spider " + getUUID() + " stop success!"); + } else { + logger.info("Spider " + getUUID() + " stop fail!"); + } + } + + /** + * start with more than one threads + * + * @param threadNum + * @return this + */ + public Spider thread(int threadNum) { + checkIfRunning(); + this.threadNum = threadNum; + if (threadNum <= 0) { + throw new IllegalArgumentException("threadNum should be more than one!"); + } + return this; + } + + /** + * switch off xsoup + * + * @return + */ + public static void xsoupOff() { + EnvironmentUtil.setUseXsoup(false); + } + + public boolean isExitWhenComplete() { + return exitWhenComplete; + } + + /** + * Exit when complete.
+ * True: exit when all urls of the site have been downloaded.
+ * False: do not exit until stop() is called manually.
+ * + * @param exitWhenComplete + * @return + */ + public Spider setExitWhenComplete(boolean exitWhenComplete) { + this.exitWhenComplete = exitWhenComplete; + return this; + } + + public boolean isSpawnUrl() { + return spawnUrl; + } + + /** + * Whether add urls extracted to download.
+ * Add urls to download when it is true, and just download seed urls when it is false.
+ * DO NOT set it unless you know what it means! + * + * @param spawnUrl + * @return + * @since 0.4.0 + */ + public Spider setSpawnUrl(boolean spawnUrl) { + this.spawnUrl = spawnUrl; + return this; + } + + @Override + public String getUUID() { + if (uuid != null) { + return uuid; + } + if (site != null) { + return site.getDomain(); + } + uuid = UUID.randomUUID().toString(); + return uuid; + } + + @Override + public Site getSite() { + return site; + } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 82a4a9a..4286054 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -1,13 +1,15 @@ package us.codecraft.webmagic.downloader; +import com.google.common.collect.Sets; import org.apache.commons.io.IOUtils; -import org.apache.http.Header; -import org.apache.http.HeaderElement; import org.apache.http.HttpResponse; import org.apache.http.annotation.ThreadSafe; -import org.apache.http.client.HttpClient; -import org.apache.http.client.entity.GzipDecompressingEntity; -import org.apache.http.client.methods.HttpGet; +import org.apache.http.client.config.CookieSpecs; +import org.apache.http.client.config.RequestConfig; +import org.apache.http.client.methods.CloseableHttpResponse; +import org.apache.http.client.methods.RequestBuilder; +import org.apache.http.impl.client.CloseableHttpClient; +import org.apache.http.util.EntityUtils; import org.apache.log4j.Logger; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; @@ -18,7 +20,8 @@ import us.codecraft.webmagic.selector.PlainText; import us.codecraft.webmagic.utils.UrlUtils; import java.io.IOException; -import java.util.HashSet; +import java.util.HashMap; +import java.util.Map; import java.util.Set; @@ -33,7 +36,9 @@ public 
class HttpClientDownloader implements Downloader { private Logger logger = Logger.getLogger(getClass()); - private int poolSize = 1; + private final Map httpClients = new HashMap(); + + private HttpClientGenerator httpClientGenerator = new HttpClientGenerator(); /** * A simple method to download a url. @@ -57,63 +62,59 @@ public class HttpClientDownloader implements Downloader { return (Html) page.getHtml(); } + private CloseableHttpClient getHttpClient(Site site) { + if (site == null) { + return httpClientGenerator.getClient(null); + } + String domain = site.getDomain(); + CloseableHttpClient httpClient = httpClients.get(domain); + if (httpClient == null) { + synchronized (this) { + if (httpClient == null) { + httpClient = httpClientGenerator.getClient(site); + httpClients.put(domain, httpClient); + } + } + } + return httpClient; + } + @Override public Page download(Request request, Task task) { Site site = null; if (task != null) { site = task.getSite(); } - int retryTimes = 0; Set acceptStatCode; String charset = null; + Map headers = null; if (site != null) { - retryTimes = site.getRetryTimes(); acceptStatCode = site.getAcceptStatCode(); charset = site.getCharset(); + headers = site.getHeaders(); } else { - acceptStatCode = new HashSet(); - acceptStatCode.add(200); + acceptStatCode = Sets.newHashSet(200); } logger.info("downloading page " + request.getUrl()); - HttpClient httpClient = HttpClientPool.getInstance(poolSize).getClient(site); + RequestBuilder requestBuilder = RequestBuilder.get().setUri(request.getUrl()); + if (headers != null) { + for (Map.Entry headerEntry : headers.entrySet()) { + requestBuilder.addHeader(headerEntry.getKey(), headerEntry.getValue()); + } + } + RequestConfig.Builder requestConfigBuilder = RequestConfig.custom() + .setConnectionRequestTimeout(site.getTimeOut()) + .setConnectTimeout(site.getTimeOut()) + .setCookieSpec(CookieSpecs.BEST_MATCH); + if (site != null && site.getHttpProxy() != null) { + 
requestConfigBuilder.setProxy(site.getHttpProxy()); + } + requestBuilder.setConfig(requestConfigBuilder.build()); + CloseableHttpResponse httpResponse = null; try { - HttpGet httpGet = new HttpGet(request.getUrl()); - HttpResponse httpResponse = null; - int tried = 0; - boolean retry; - do { - try { - httpResponse = httpClient.execute(httpGet); - retry = false; - } catch (IOException e) { - tried++; - - if (tried > retryTimes) { - logger.warn("download page " + request.getUrl() + " error", e); - if (site.getCycleRetryTimes() > 0) { - Page page = new Page(); - Object cycleTriedTimesObject = request.getExtra(Request.CYCLE_TRIED_TIMES); - if (cycleTriedTimesObject == null) { - page.addTargetRequest(request.setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, 1)); - } else { - int cycleTriedTimes = (Integer) cycleTriedTimesObject; - cycleTriedTimes++; - if (cycleTriedTimes >= site.getCycleRetryTimes()) { - return null; - } - page.addTargetRequest(request.setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, 1)); - } - return page; - } - return null; - } - logger.info("download page " + request.getUrl() + " error, retry the " + tried + " time!"); - retry = true; - } - } while (retry); + httpResponse = getHttpClient(site).execute(requestBuilder.build()); int statusCode = httpResponse.getStatusLine().getStatusCode(); if (acceptStatCode.contains(statusCode)) { - handleGzip(httpResponse); //charset if (charset == null) { String value = httpResponse.getEntity().getContentType().getValue(); @@ -122,16 +123,44 @@ public class HttpClientDownloader implements Downloader { return handleResponse(request, charset, httpResponse, task); } else { logger.warn("code error " + statusCode + "\t" + request.getUrl()); + return null; } - } catch (Exception e) { + } catch (IOException e) { logger.warn("download page " + request.getUrl() + " error", e); + if (site.getCycleRetryTimes() > 0) { + return addToCycleRetry(request, site); + } + return null; + } finally { + try { + if (httpResponse != 
null) { + //ensure the connection is released back to pool + EntityUtils.consume(httpResponse.getEntity()); + } + } catch (IOException e) { + logger.warn("close response fail", e); + } } - return null; + } + + private Page addToCycleRetry(Request request, Site site) { + Page page = new Page(); + Object cycleTriedTimesObject = request.getExtra(Request.CYCLE_TRIED_TIMES); + if (cycleTriedTimesObject == null) { + page.addTargetRequest(request.setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, 1)); + } else { + int cycleTriedTimes = (Integer) cycleTriedTimesObject; + cycleTriedTimes++; + if (cycleTriedTimes >= site.getCycleRetryTimes()) { + return null; + } + page.addTargetRequest(request.setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, 1)); + } + return page; } protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException { - String content = IOUtils.toString(httpResponse.getEntity().getContent(), - charset); + String content = IOUtils.toString(httpResponse.getEntity().getContent(), charset); Page page = new Page(); page.setHtml(new Html(UrlUtils.fixAllRelativeHrefs(content, request.getUrl()))); page.setUrl(new PlainText(request.getUrl())); @@ -141,19 +170,6 @@ public class HttpClientDownloader implements Downloader { @Override public void setThread(int thread) { - poolSize = thread; - } - - private void handleGzip(HttpResponse httpResponse) { - Header ceheader = httpResponse.getEntity().getContentEncoding(); - if (ceheader != null) { - HeaderElement[] codecs = ceheader.getElements(); - for (HeaderElement codec : codecs) { - if (codec.getName().equalsIgnoreCase("gzip")) { - httpResponse.setEntity( - new GzipDecompressingEntity(httpResponse.getEntity())); - } - } - } + httpClientGenerator.setPoolSize(thread); } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java new file mode 
100644 index 0000000..12f59d5 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java @@ -0,0 +1,106 @@ +package us.codecraft.webmagic.downloader; + +import org.apache.http.*; +import org.apache.http.client.CookieStore; +import org.apache.http.client.protocol.ResponseContentEncoding; +import org.apache.http.config.Registry; +import org.apache.http.config.RegistryBuilder; +import org.apache.http.config.SocketConfig; +import org.apache.http.conn.socket.ConnectionSocketFactory; +import org.apache.http.conn.socket.PlainConnectionSocketFactory; +import org.apache.http.conn.ssl.SSLConnectionSocketFactory; +import org.apache.http.impl.client.*; +import org.apache.http.impl.conn.PoolingHttpClientConnectionManager; +import org.apache.http.impl.cookie.BasicClientCookie; +import org.apache.http.protocol.HttpContext; +import us.codecraft.webmagic.Site; + +import java.io.IOException; +import java.util.Map; + +/** + * @author code4crafter@gmail.com
+ * @since 0.4.0 + */ +public class HttpClientGenerator { + + private PoolingHttpClientConnectionManager connectionManager; + + public HttpClientGenerator() { + Registry reg = RegistryBuilder.create() + .register("http", PlainConnectionSocketFactory.INSTANCE) + .register("https", SSLConnectionSocketFactory.getSocketFactory()) + .build(); + connectionManager = new PoolingHttpClientConnectionManager(reg); + connectionManager.setDefaultMaxPerRoute(100); + } + + public HttpClientGenerator setPoolSize(int poolSize){ + connectionManager.setMaxTotal(poolSize); + return this; + } + + public CloseableHttpClient getClient(Site site) { + return generateClient(site); + } + + private CloseableHttpClient generateClient(Site site) { + HttpClientBuilder httpClientBuilder = HttpClients.custom().setConnectionManager(connectionManager); + if (site != null && site.getUserAgent() != null) { + httpClientBuilder.setUserAgent(site.getUserAgent()); + } else { + httpClientBuilder.setUserAgent(""); + } + if (site == null || site.isUseGzip()) { + httpClientBuilder.addInterceptorFirst(new HttpRequestInterceptor() { + + public void process( + final HttpRequest request, + final HttpContext context) throws HttpException, IOException { + if (!request.containsHeader("Accept-Encoding")) { + request.addHeader("Accept-Encoding", "gzip"); + } + + } + }); + } + SocketConfig socketConfig = SocketConfig.custom().setSoKeepAlive(true).setTcpNoDelay(true).build(); + httpClientBuilder.setDefaultSocketConfig(socketConfig); + // Http client has some problem handling compressing entity for redirect + // So I disable it and do it manually + // https://issues.apache.org/jira/browse/HTTPCLIENT-1432 + httpClientBuilder.disableContentCompression(); + httpClientBuilder.addInterceptorFirst(new HttpResponseInterceptor() { + + private ResponseContentEncoding contentEncoding = new ResponseContentEncoding(); + + public void process( + final HttpResponse response, + final HttpContext context) throws HttpException, 
IOException { + if (response.getStatusLine().getStatusCode() == 301 || response.getStatusLine().getStatusCode() == 302) { + return; + } + contentEncoding.process(response, context); + } + + }); + if (site != null) { + httpClientBuilder.setRetryHandler(new DefaultHttpRequestRetryHandler(site.getRetryTimes(), true)); + } + generateCookie(httpClientBuilder, site); + return httpClientBuilder.build(); + } + + private void generateCookie(HttpClientBuilder httpClientBuilder, Site site) { + CookieStore cookieStore = new BasicCookieStore(); + if (site.getCookies() != null) { + for (Map.Entry cookieEntry : site.getCookies().entrySet()) { + BasicClientCookie cookie = new BasicClientCookie(cookieEntry.getKey(), cookieEntry.getValue()); + cookie.setDomain(site.getDomain()); + cookieStore.addCookie(cookie); + } + } + httpClientBuilder.setDefaultCookieStore(cookieStore); + } + +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java deleted file mode 100644 index f2fffad..0000000 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java +++ /dev/null @@ -1,93 +0,0 @@ -package us.codecraft.webmagic.downloader; - -import org.apache.http.HttpVersion; -import org.apache.http.client.CookieStore; -import org.apache.http.client.HttpClient; -import org.apache.http.client.params.ClientPNames; -import org.apache.http.client.params.CookiePolicy; -import org.apache.http.conn.scheme.PlainSocketFactory; -import org.apache.http.conn.scheme.Scheme; -import org.apache.http.conn.scheme.SchemeRegistry; -import org.apache.http.conn.ssl.SSLSocketFactory; -import org.apache.http.impl.client.BasicCookieStore; -import org.apache.http.impl.client.DefaultHttpClient; -import org.apache.http.impl.conn.PoolingClientConnectionManager; -import org.apache.http.impl.cookie.BasicClientCookie; -import org.apache.http.params.*; -import us.codecraft.webmagic.Site; - 
-import java.util.Map; - -/** - * @author code4crafter@gmail.com
- * @since 0.1.0 - */ -public class HttpClientPool { - - public static volatile HttpClientPool INSTANCE; - - public static HttpClientPool getInstance(int poolSize) { - if (INSTANCE == null) { - synchronized (HttpClientPool.class) { - if (INSTANCE == null) { - INSTANCE = new HttpClientPool(poolSize); - } - } - } - return INSTANCE; - } - - private int poolSize; - - private HttpClientPool(int poolSize) { - this.poolSize = poolSize; - } - - public HttpClient getClient(Site site) { - return generateClient(site); - } - - private HttpClient generateClient(Site site) { - HttpParams params = new BasicHttpParams(); - if (site != null && site.getUserAgent() != null) { - params.setParameter(CoreProtocolPNames.USER_AGENT, site.getUserAgent()); - } - params.setIntParameter(CoreConnectionPNames.SO_TIMEOUT, 1000); - params.setIntParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, 2000); - - HttpProtocolParamBean paramsBean = new HttpProtocolParamBean(params); - paramsBean.setVersion(HttpVersion.HTTP_1_1); - if (site != null && site.getCharset() != null) { - paramsBean.setContentCharset(site.getCharset()); - } - paramsBean.setUseExpectContinue(false); - - SchemeRegistry schemeRegistry = new SchemeRegistry(); - schemeRegistry.register(new Scheme("http", 80, PlainSocketFactory.getSocketFactory())); - schemeRegistry.register(new Scheme("https", 443, SSLSocketFactory.getSocketFactory())); - - PoolingClientConnectionManager connectionManager = new PoolingClientConnectionManager(schemeRegistry); - connectionManager.setMaxTotal(poolSize); - connectionManager.setDefaultMaxPerRoute(100); - DefaultHttpClient httpClient = new DefaultHttpClient(connectionManager, params); - if (site != null) { - generateCookie(httpClient, site); - } - httpClient.getParams().setIntParameter("http.socket.timeout", 60000); - httpClient.getParams().setParameter(ClientPNames.COOKIE_POLICY, CookiePolicy.BEST_MATCH); - return httpClient; - } - - private void generateCookie(DefaultHttpClient httpClient, Site site) { - 
CookieStore cookieStore = new BasicCookieStore(); - if (site.getCookies() != null) { - for (Map.Entry cookieEntry : site.getCookies().entrySet()) { - BasicClientCookie cookie = new BasicClientCookie(cookieEntry.getKey(), cookieEntry.getValue()); - cookie.setDomain(site.getDomain()); - cookieStore.addCookie(cookie); - } - } - httpClient.setCookieStore(cookieStore); - } - -} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/CollectorPipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/CollectorPipeline.java new file mode 100644 index 0000000..7242f43 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/CollectorPipeline.java @@ -0,0 +1,20 @@ +package us.codecraft.webmagic.pipeline; + +import java.util.List; + +/** + * Pipeline that can collect and store results.
+ * Used for {@link us.codecraft.webmagic.Spider#getAll(java.util.Collection)} + * + * @author code4crafter@gmail.com + * @since 0.4.0 + */ +public interface CollectorPipeline extends Pipeline { + + /** + * Get all results collected. + * + * @return collected results + */ + public List getCollected(); +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ResultItemsCollectorPipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ResultItemsCollectorPipeline.java new file mode 100644 index 0000000..abafa88 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ResultItemsCollectorPipeline.java @@ -0,0 +1,26 @@ +package us.codecraft.webmagic.pipeline; + +import us.codecraft.webmagic.ResultItems; +import us.codecraft.webmagic.Task; + +import java.util.ArrayList; +import java.util.List; + +/** + * @author code4crafter@gmail.com + * @since 0.4.0 + */ +public class ResultItemsCollectorPipeline implements CollectorPipeline { + + private List collector = new ArrayList(); + + @Override + public synchronized void process(ResultItems resultItems, Task task) { + collector.add(resultItems); + } + + @Override + public List getCollected() { + return collector; + } +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/BaiduBaikePageProcesser.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/BaiduBaikePageProcesser.java new file mode 100644 index 0000000..866d090 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/BaiduBaikePageProcesser.java @@ -0,0 +1,51 @@ +package us.codecraft.webmagic.processor.example; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.ResultItems; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.processor.PageProcessor; + +import java.util.ArrayList; +import java.util.List; + +/** + * @author code4crafter@gmail.com
+ * @since 0.4.0 + */ +public class BaiduBaikePageProcesser implements PageProcessor { + + private Site site = Site.me()//.setHttpProxy(new HttpHost("127.0.0.1",8888)) + .setRetryTimes(3).setSleepTime(1000).setUseGzip(true); + + @Override + public void process(Page page) { + page.putField("name", page.getHtml().$("h1.title div.lemmaTitleH1","text").toString()); + page.putField("description", page.getHtml().xpath("//div[@id='lemmaContent-0']//div[@class='para']/allText()")); + } + + @Override + public Site getSite() { + return site; + } + + public static void main(String[] args) { + //single download + Spider spider = Spider.create(new BaiduBaikePageProcesser()).thread(2); + String urlTemplate = "http://baike.baidu.com/search/word?word=%s&pic=1&sug=1&enc=utf8"; + ResultItems resultItems = spider.get(String.format(urlTemplate, "水力发电")); + System.out.println(resultItems); + + //multidownload + List list = new ArrayList(); + list.add(String.format(urlTemplate,"风力发电")); + list.add(String.format(urlTemplate,"太阳能")); + list.add(String.format(urlTemplate,"地热发电")); + list.add(String.format(urlTemplate,"地热发电")); + List resultItemses = spider.getAll(list); + for (ResultItems resultItemse : resultItemses) { + System.out.println(resultItemse.getAll()); + } + spider.close(); + } +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/GithubRepoPageProcesser.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/GithubRepoPageProcesser.java index 0e7e3b9..47f904f 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/GithubRepoPageProcesser.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/GithubRepoPageProcesser.java @@ -11,7 +11,7 @@ import us.codecraft.webmagic.processor.PageProcessor; */ public class GithubRepoPageProcesser implements PageProcessor { - private Site site = Site.me().addStartUrl("https://github.com/code4craft").setRetryTimes(3).setSleepTime(100); + private 
Site site = Site.me().setRetryTimes(3).setSleepTime(100); @Override public void process(Page page) { @@ -31,6 +31,6 @@ public class GithubRepoPageProcesser implements PageProcessor { } public static void main(String[] args) { - Spider.create(new GithubRepoPageProcesser()).thread(5).run(); + Spider.create(new GithubRepoPageProcesser()).addUrl("https://github.com/code4craft").thread(5).run(); } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/OschinaBlogPageProcesser.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/OschinaBlogPageProcesser.java index fa8dab6..4ef830d 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/OschinaBlogPageProcesser.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/OschinaBlogPageProcesser.java @@ -12,7 +12,7 @@ import java.util.List; */ public class OschinaBlogPageProcesser implements PageProcessor { - private Site site = Site.me().setDomain("my.oschina.net").addStartUrl("http://my.oschina.net/flashsword/blog"); + private Site site = Site.me().setDomain("my.oschina.net"); @Override public void process(Page page) { @@ -34,6 +34,6 @@ public class OschinaBlogPageProcesser implements PageProcessor { } public static void main(String[] args) { - Spider.create(new OschinaBlogPageProcesser()).thread(2).run(); + Spider.create(new OschinaBlogPageProcesser()).addUrl("http://my.oschina.net/flashsword/blog").thread(2).run(); } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/PriorityScheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/PriorityScheduler.java index 7ce44f0..fa951e1 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/PriorityScheduler.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/PriorityScheduler.java @@ -4,6 +4,7 @@ import org.apache.http.annotation.ThreadSafe; import org.apache.log4j.Logger; import 
us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.utils.NumberUtils; import java.util.Comparator; import java.util.HashSet; @@ -30,14 +31,14 @@ public class PriorityScheduler implements Scheduler { private PriorityBlockingQueue priorityQueuePlus = new PriorityBlockingQueue(INITIAL_CAPACITY, new Comparator() { @Override public int compare(Request o1, Request o2) { - return -(new Long(o1.getPriority()).compareTo(o2.getPriority())); + return -NumberUtils.compareLong(o1.getPriority(), o2.getPriority()); } }); private PriorityBlockingQueue priorityQueueMinus = new PriorityBlockingQueue(INITIAL_CAPACITY, new Comparator() { @Override public int compare(Request o1, Request o2) { - return -(new Long(o1.getPriority()).compareTo(o2.getPriority())); + return -NumberUtils.compareLong(o1.getPriority(), o2.getPriority()); } }); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/NumberUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/NumberUtils.java new file mode 100644 index 0000000..55e1851 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/NumberUtils.java @@ -0,0 +1,17 @@ +package us.codecraft.webmagic.utils; + +/** + * @author yihua.huang@dianping.com + */ +public abstract class NumberUtils { + + public static int compareLong(long o1, long o2) { + if (o1 < o2) { + return -1; + } else if (o1 == o2) { + return 0; + } else { + return 1; + } + } +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/ThreadUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/ThreadUtils.java index ba9774d..cdfe6d0 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/ThreadUtils.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/ThreadUtils.java @@ -1,5 +1,7 @@ package us.codecraft.webmagic.utils; +import com.google.common.util.concurrent.MoreExecutors; + import java.util.concurrent.ExecutorService; import 
java.util.concurrent.SynchronousQueue; import java.util.concurrent.ThreadPoolExecutor; @@ -11,11 +13,15 @@ import java.util.concurrent.TimeUnit; */ public class ThreadUtils { - public static ExecutorService newFixedThreadPool(int threadSize) { - if (threadSize <= 1) { - throw new IllegalArgumentException("ThreadSize must be greater than 1!"); - } - return new ThreadPoolExecutor(threadSize - 1, threadSize - 1, 0L, TimeUnit.MILLISECONDS, - new SynchronousQueue(), new ThreadPoolExecutor.CallerRunsPolicy()); - } + public static ExecutorService newFixedThreadPool(int threadSize) { + if (threadSize <= 0) { + throw new IllegalArgumentException("ThreadSize must be greater than 0!"); + } + if (threadSize == 1) { + return MoreExecutors.sameThreadExecutor(); + + } + return new ThreadPoolExecutor(threadSize - 1, threadSize - 1, 0L, TimeUnit.MILLISECONDS, + new SynchronousQueue(), new ThreadPoolExecutor.CallerRunsPolicy()); + } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java index 9ca776d..456b3cc 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java @@ -1,10 +1,14 @@ package us.codecraft.webmagic.utils; import org.apache.commons.lang3.StringUtils; +import us.codecraft.webmagic.Request; import java.net.MalformedURLException; import java.net.URL; import java.nio.charset.Charset; +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -18,7 +22,7 @@ public class UrlUtils { /** * canonicalizeUrl - * + *

* Borrowed from Jsoup. * * @param url @@ -85,6 +89,22 @@ public class UrlUtils { return stringBuilder.toString(); } + public static List convertToRequests(Collection urls) { + List requestList = new ArrayList(urls.size()); + for (String url : urls) { + requestList.add(new Request(url)); + } + return requestList; + } + + public static List convertToUrls(Collection requests) { + List urlList = new ArrayList(requests.size()); + for (Request request : requests) { + urlList.add(request.getUrl()); + } + return urlList; + } + private static final Pattern patternForCharset = Pattern.compile("charset\\s*=\\s*['\"]*([^\\s;'\"]*)"); public static String getCharset(String contentType) { diff --git a/webmagic-core/src/main/resources/log4j.xml b/webmagic-core/src/main/resources/log4j.xml index a6630f8..9084694 100644 --- a/webmagic-core/src/main/resources/log4j.xml +++ b/webmagic-core/src/main/resources/log4j.xml @@ -13,6 +13,11 @@ + + + + + diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/SpiderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/SpiderTest.java index 75c1ba1..3add86c 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/SpiderTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/SpiderTest.java @@ -18,7 +18,7 @@ public class SpiderTest { public void process(ResultItems resultItems, Task task) { System.out.println(1); } - }).thread(2); + }).thread(1); spider.start(); Thread.sleep(10000); spider.stop(); diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java index 936aece..b5ecada 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java @@ -22,4 +22,5 @@ public class HttpClientDownloaderTest { Page download = 
httpClientDownloader.download(new Request("http://www.diandian.com"), site.toTask()); Assert.assertTrue(download.getHtml().toString().contains("flashsword30")); } + } diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index 816df53..a71d682 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.3.2 + 0.4.0 4.0.0 diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/BaiduBaike.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/BaiduBaike.java new file mode 100644 index 0000000..96ff24e --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/BaiduBaike.java @@ -0,0 +1,49 @@ +package us.codecraft.webmagic.example; + +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.model.OOSpider; +import us.codecraft.webmagic.model.annotation.ExtractBy; + +import java.util.ArrayList; +import java.util.List; + +/** + * @since 0.4.0 + * @author code4crafter@gmail.com + */ +public class BaiduBaike{ + + @ExtractBy("//h1[@class=title]/div[@class=lemmaTitleH1]/text()") + private String name; + + @ExtractBy("//div[@id='lemmaContent-0']//div[@class='para']/allText()") + private String description; + + @Override + public String toString() { + return "BaiduBaike{" + + "name='" + name + '\'' + + ", description='" + description + '\'' + + '}'; + } + + public static void main(String[] args) { + OOSpider ooSpider = OOSpider.create(Site.me().setSleepTime(0), BaiduBaike.class); + //single download + String urlTemplate = "http://baike.baidu.com/search/word?word=%s&pic=1&sug=1&enc=utf8"; + BaiduBaike baike = ooSpider.get("http://baike.baidu.com/search/word?word=httpclient&pic=1&sug=1&enc=utf8"); + System.out.println(baike); + + //multidownload + List list = new ArrayList(); + list.add(String.format(urlTemplate,"风力发电")); + list.add(String.format(urlTemplate,"太阳能")); + list.add(String.format(urlTemplate,"地热发电")); + 
list.add(String.format(urlTemplate,"地热发电")); + List resultItemses = ooSpider.getAll(list); + for (BaiduBaike resultItemse : resultItemses) { + System.out.println(resultItemse); + } + ooSpider.close(); + } +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/GithubRepo.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/GithubRepo.java index 58441cb..427cdf7 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/GithubRepo.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/GithubRepo.java @@ -41,8 +41,9 @@ public class GithubRepo implements HasKey { private String url; public static void main(String[] args) { - OOSpider.create(Site.me().addStartUrl("https://github.com/code4craft").setSleepTime(100) - , new ConsolePageModelPipeline(), GithubRepo.class).thread(10).run(); + OOSpider.create(Site.me().setSleepTime(100) + , new ConsolePageModelPipeline(), GithubRepo.class) + .addUrl("https://github.com/code4craft").thread(10).run(); } @Override diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/OschinaBlog.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/OschinaBlog.java index 1545f88..f72efe0 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/OschinaBlog.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/OschinaBlog.java @@ -31,8 +31,9 @@ public class OschinaBlog { private Date date; public static void main(String[] args) { - OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog") - , new JsonFilePageModelPipeline("/data/webmagic/"), OschinaBlog.class).run(); + OOSpider.create(Site.me().setSleepTime(0) + , new JsonFilePageModelPipeline("/data/webmagic/"), OschinaBlog.class) + .addUrl("http://my.oschina.net/flashsword/blog").run(); } public String getTitle() { diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ConsolePageModelPipeline.java 
b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ConsolePageModelPipeline.java index abf411c..f4740c9 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ConsolePageModelPipeline.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ConsolePageModelPipeline.java @@ -2,6 +2,7 @@ package us.codecraft.webmagic.model; import org.apache.commons.lang3.builder.ToStringBuilder; import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.pipeline.PageModelPipeline; /** * Print page model in console.
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPipeline.java index 071cb26..593178f 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPipeline.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPipeline.java @@ -3,6 +3,7 @@ package us.codecraft.webmagic.model; import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.model.annotation.ExtractBy; +import us.codecraft.webmagic.pipeline.PageModelPipeline; import us.codecraft.webmagic.pipeline.Pipeline; import java.lang.annotation.Annotation; diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/OOSpider.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/OOSpider.java index 3cee9ad..3133308 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/OOSpider.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/OOSpider.java @@ -2,8 +2,13 @@ package us.codecraft.webmagic.model; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.pipeline.CollectorPipeline; +import us.codecraft.webmagic.pipeline.PageModelPipeline; import us.codecraft.webmagic.processor.PageProcessor; +import java.util.ArrayList; +import java.util.List; + /** * The spider for page model extractor.
* In webmagic, we call a POJO containing extract result as "page model".
@@ -22,22 +27,27 @@ import us.codecraft.webmagic.processor.PageProcessor; * {@literal @}ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true) * private List tags; * } - + * * And start the spider by: *

  *   OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog")
  *        ,new JsonFilePageModelPipeline(), OschinaBlog.class).run();
  * }
- 
+ * + * * @author code4crafter@gmail.com
* @since 0.2.0 */ -public class OOSpider extends Spider { +public class OOSpider extends Spider { private ModelPageProcessor modelPageProcessor; private ModelPipeline modelPipeline; + private PageModelPipeline pageModelPipeline; + + private List pageModelClasses = new ArrayList(); + protected OOSpider(ModelPageProcessor modelPageProcessor) { super(modelPageProcessor); this.modelPageProcessor = modelPageProcessor; @@ -49,6 +59,7 @@ public class OOSpider extends Spider { /** * create a spider + * * @param site * @param pageModelPipeline * @param pageModels @@ -57,13 +68,19 @@ public class OOSpider extends Spider { this(ModelPageProcessor.create(site, pageModels)); this.modelPipeline = new ModelPipeline(); super.addPipeline(modelPipeline); - if (pageModelPipeline!=null){ - for (Class pageModel : pageModels) { + for (Class pageModel : pageModels) { + if (pageModelPipeline != null) { this.modelPipeline.put(pageModel, pageModelPipeline); } + pageModelClasses.add(pageModel); } } + @Override + protected CollectorPipeline getCollectorPipeline() { + return new PageModelCollectorPipeline(pageModelClasses.get(0)); + } + public static OOSpider create(Site site, Class... 
pageModels) { return new OOSpider(site, null, pageModels); } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelCollectorPipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelCollectorPipeline.java new file mode 100644 index 0000000..b61f112 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelCollectorPipeline.java @@ -0,0 +1,46 @@ +package us.codecraft.webmagic.model; + +import us.codecraft.webmagic.ResultItems; +import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.model.annotation.ExtractBy; +import us.codecraft.webmagic.pipeline.CollectorPageModelPipeline; +import us.codecraft.webmagic.pipeline.CollectorPipeline; + +import java.lang.annotation.Annotation; +import java.util.List; + +/** + * @author code4crafter@gmail.com + * @since 0.4.0 + */ +class PageModelCollectorPipeline implements CollectorPipeline { + + private final CollectorPageModelPipeline classPipeline = new CollectorPageModelPipeline(); + + private final Class clazz; + + PageModelCollectorPipeline(Class clazz) { + this.clazz = clazz; + } + + @Override + public List getCollected() { + return classPipeline.getCollected(); + } + + @Override + public synchronized void process(ResultItems resultItems, Task task) { + Object o = resultItems.get(clazz.getCanonicalName()); + if (o != null) { + Annotation annotation = clazz.getAnnotation(ExtractBy.class); + if (annotation == null || !((ExtractBy) annotation).multi()) { + classPipeline.process((T) o, task); + } else { + List list = (List) o; + for (Object o1 : list) { + classPipeline.process((T) o1, task); + } + } + } + } +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java index c78bd31..a079988 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java +++ 
b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java @@ -195,7 +195,7 @@ class PageModelExtractor { private void initClassExtractors() { Annotation annotation = clazz.getAnnotation(TargetUrl.class); if (annotation == null) { - targetUrlPatterns.add(Pattern.compile(".*")); + targetUrlPatterns.add(Pattern.compile("(.*)")); } else { TargetUrl targetUrl = (TargetUrl) annotation; String[] value = targetUrl.value(); diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/CollectorPageModelPipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/CollectorPageModelPipeline.java new file mode 100644 index 0000000..b6e0b1b --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/CollectorPageModelPipeline.java @@ -0,0 +1,23 @@ +package us.codecraft.webmagic.pipeline; + +import us.codecraft.webmagic.Task; + +import java.util.ArrayList; +import java.util.List; + +/** + * @author code4crafter@gmail.com + */ +public class CollectorPageModelPipeline implements PageModelPipeline { + + private List collected = new ArrayList(); + + @Override + public synchronized void process(T t, Task task) { + collected.add(t); + } + + public List getCollected() { + return collected; + } +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/FilePageModelPipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/FilePageModelPipeline.java index 5586863..273b18b 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/FilePageModelPipeline.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/FilePageModelPipeline.java @@ -5,7 +5,6 @@ import org.apache.commons.lang3.builder.ToStringBuilder; import org.apache.log4j.Logger; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.model.HasKey; -import us.codecraft.webmagic.model.PageModelPipeline; import us.codecraft.webmagic.utils.FilePersistentBase; 
import java.io.FileWriter; diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePageModelPipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePageModelPipeline.java index 228ec8c..4e35dfe 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePageModelPipeline.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePageModelPipeline.java @@ -6,7 +6,6 @@ import org.apache.commons.lang3.builder.ToStringBuilder; import org.apache.log4j.Logger; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.model.HasKey; -import us.codecraft.webmagic.model.PageModelPipeline; import us.codecraft.webmagic.utils.FilePersistentBase; import java.io.FileWriter; diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelPipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/PageModelPipeline.java similarity index 86% rename from webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelPipeline.java rename to webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/PageModelPipeline.java index 2cb3808..382f71d 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelPipeline.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/PageModelPipeline.java @@ -1,4 +1,4 @@ -package us.codecraft.webmagic.model; +package us.codecraft.webmagic.pipeline; import us.codecraft.webmagic.Task; diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/MockPageModelPipeline.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/MockPageModelPipeline.java index ea7601b..f94efce 100644 --- a/webmagic-extension/src/test/java/us/codecraft/webmagic/MockPageModelPipeline.java +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/MockPageModelPipeline.java @@ -1,7 +1,7 @@ package us.codecraft.webmagic; import junit.framework.Assert; -import 
us.codecraft.webmagic.model.PageModelPipeline; +import us.codecraft.webmagic.pipeline.PageModelPipeline; /** * @author code4crafter@gmail.com diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepoTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepoTest.java index d6e1bf0..b719bf0 100644 --- a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepoTest.java +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepoTest.java @@ -6,6 +6,7 @@ import us.codecraft.webmagic.MockDownloader; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.example.GithubRepo; +import us.codecraft.webmagic.pipeline.PageModelPipeline; /** * @author code4crafter@gmail.com
diff --git a/webmagic-lucene/pom.xml b/webmagic-lucene/pom.xml index 97946cc..2b5c5f1 100644 --- a/webmagic-lucene/pom.xml +++ b/webmagic-lucene/pom.xml @@ -5,7 +5,7 @@ webmagic-parent us.codecraft - 0.2.1 + 0.4.0-SNAPSHOT 4.0.0 diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index bf97b75..52f524d 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -5,7 +5,7 @@ webmagic-parent us.codecraft - 0.3.2 + 0.4.0-SNAPSHOT 4.0.0 diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Kr36NewsModel.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Kr36NewsModel.java index de3fdf5..936f132 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Kr36NewsModel.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Kr36NewsModel.java @@ -1,8 +1,9 @@ package us.codecraft.webmagic.model.samples; import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.model.ConsolePageModelPipeline; +import us.codecraft.webmagic.Task; import us.codecraft.webmagic.model.OOSpider; +import us.codecraft.webmagic.pipeline.PageModelPipeline; import us.codecraft.webmagic.model.annotation.ExtractBy; import us.codecraft.webmagic.model.annotation.ExtractByUrl; import us.codecraft.webmagic.model.annotation.HelpUrl; @@ -18,14 +19,31 @@ public class Kr36NewsModel { @ExtractBy("//h1[@class='entry-title sep10']") private String title; - @ExtractBy("//div[@class='mainContent sep-10']") + @ExtractBy("//div[@class='mainContent sep-10']/tidyText()") private String content; @ExtractByUrl private String url; public static void main(String[] args) { - OOSpider.create(Site.me().addStartUrl("http://www.36kr.com/"), new ConsolePageModelPipeline(), - Kr36NewsModel.class).run(); + //Just for benchmark + OOSpider.create(Site.me().addStartUrl("http://www.36kr.com/").setSleepTime(0), new PageModelPipeline() { + @Override + public void process(Object o, Task task) { + + } + 
},Kr36NewsModel.class).thread(20).run(); + } + + public String getTitle() { + return title; + } + + public String getContent() { + return content; + } + + public String getUrl() { + return url; } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java index a7f51ad..468b855 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java @@ -1,10 +1,11 @@ package us.codecraft.webmagic.model.samples; import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Task; import us.codecraft.webmagic.model.OOSpider; +import us.codecraft.webmagic.pipeline.PageModelPipeline; import us.codecraft.webmagic.model.annotation.ExtractBy; import us.codecraft.webmagic.model.annotation.TargetUrl; -import us.codecraft.webmagic.pipeline.JsonFilePageModelPipeline; import java.util.List; @@ -24,8 +25,16 @@ public class OschinaBlog{ private List tags; public static void main(String[] args) { - OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog") - ,new JsonFilePageModelPipeline(), OschinaBlog.class).run(); + OOSpider.create(Site.me() + .setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36").addStartUrl("http://my.oschina.net/flashsword/blog") + .setSleepTime(0) + .setRetryTimes(3) + ,new PageModelPipeline() { + @Override + public void process(Object o, Task task) { + + } + }, OschinaBlog.class).thread(10).run(); } public String getTitle() { diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index 98b86a6..8fae211 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -5,7 +5,7 @@ webmagic-parent us.codecraft - 0.3.2 + 0.4.0-SNAPSHOT 4.0.0 diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index 
685a59f..6551e4e 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -5,7 +5,7 @@ webmagic-parent us.codecraft - 0.3.2 + 0.4.0-SNAPSHOT 4.0.0 diff --git a/zh_docs/README.md b/zh_docs/README.md index 1931d71..e6961d8 100644 --- a/zh_docs/README.md +++ b/zh_docs/README.md @@ -34,12 +34,12 @@ webmagic使用maven管理依赖,在项目中添加对应的依赖即可使用w us.codecraft webmagic-core - 0.3.2 + 0.4.0 us.codecraft webmagic-extension - 0.3.2 + 0.4.0 #### 项目结构 diff --git a/zh_docs/us/codecraft/webmagic/downloader/HttpClientPool-cmnt.xml b/zh_docs/us/codecraft/webmagic/downloader/HttpClientPool-cmnt.xml index 9c7ef38..c1416d3 100644 --- a/zh_docs/us/codecraft/webmagic/downloader/HttpClientPool-cmnt.xml +++ b/zh_docs/us/codecraft/webmagic/downloader/HttpClientPool-cmnt.xml @@ -4,7 +4,7 @@ Sat Aug 17 14:14:45 CST 2013 - + Date: 13-4-21 Time: 下午12:29 diff --git a/zh_docs/us/codecraft/webmagic/model/OOSpider-cmnt.xml b/zh_docs/us/codecraft/webmagic/model/OOSpider-cmnt.xml index 2fd60a7..232c509 100644 --- a/zh_docs/us/codecraft/webmagic/model/OOSpider-cmnt.xml +++ b/zh_docs/us/codecraft/webmagic/model/OOSpider-cmnt.xml @@ -12,7 +12,7 @@ ]]> - + @param site @param pageModelPipeline diff --git a/zh_docs/us/codecraft/webmagic/model/PageModelPipeline-cmnt.xml b/zh_docs/us/codecraft/webmagic/model/PageModelPipeline-cmnt.xml index ac65729..64fb524 100644 --- a/zh_docs/us/codecraft/webmagic/model/PageModelPipeline-cmnt.xml +++ b/zh_docs/us/codecraft/webmagic/model/PageModelPipeline-cmnt.xml @@ -4,7 +4,7 @@ Sat Aug 17 14:14:46 CST 2013 - + Date: 13-8-3
Time: 上午9:34