From 86a20eabd97c863d5a79360af831c7084dace1f2 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 26 Jul 2013 14:41:30 +0800 Subject: [PATCH] fix a httpclient pool size bug --- .../main/java/us/codecraft/webmagic/Spider.java | 9 ++++++++- .../downloader/HttpClientDownloader.java | 15 +++++++++++++-- .../webmagic/downloader/HttpClientPool.java | 17 ++++++++++++----- .../webmagic/samples/GlobalProcessor.java | 4 ++-- 4 files changed, 35 insertions(+), 10 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index f306542..a568f93 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -39,7 +39,7 @@ import java.util.concurrent.atomic.AtomicInteger; */ public class Spider implements Runnable, Task { - private Downloader downloader = new HttpClientDownloader(); + private Downloader downloader; private List pipelines = new ArrayList(); @@ -139,12 +139,18 @@ public class Spider implements Runnable, Task { return this; } + protected void checkComponent() { + if (downloader == null) { + this.downloader = new HttpClientDownloader(); + } + } @Override public void run() { if (!stat.compareAndSet(STAT_INIT, STAT_RUNNING)) { throw new IllegalStateException("Spider is already running!"); } + checkComponent(); if (startUrls != null) { for (String startUrl : startUrls) { scheduler.push(new Request(startUrl), this); @@ -247,6 +253,7 @@ public class Spider implements Runnable, Task { if (threadNum <= 0) { throw new IllegalArgumentException("threadNum should be more than one!"); } + downloader = new HttpClientDownloader(threadNum); if (threadNum == 1) { return this; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 7eb6277..d763419 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -21,6 +21,7 @@ import java.io.IOException; /** * 封装了HttpClient的下载器。已实现指定次数重试、处理gzip、自定义UA/cookie等功能。
+ * * @author code4crafter@gmail.com
* Date: 13-4-21 * Time: 下午12:15 @@ -29,11 +30,21 @@ public class HttpClientDownloader implements Downloader { private Logger logger = Logger.getLogger(getClass()); + private int poolSize; + + public HttpClientDownloader(int poolSize) { + this.poolSize = poolSize; + } + + public HttpClientDownloader() { + this(5); + } + @Override public Page download(Request request, Task task) { Site site = task.getSite(); logger.info("downloading page " + request.getUrl()); - HttpClient httpClient = HttpClientPool.getInstance().getClient(site); + HttpClient httpClient = HttpClientPool.getInstance(poolSize).getClient(site); String charset = site.getCharset(); try { HttpGet httpGet = new HttpGet(request.getUrl()); @@ -50,7 +61,7 @@ public class HttpClientDownloader implements Downloader { logger.warn("download page " + request.getUrl() + " error", e); return null; } - logger.info("download page " + request.getUrl() + " error, retry the "+tried+" time!"); + logger.info("download page " + request.getUrl() + " error, retry the " + tried + " time!"); retry = true; } } while (retry); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java index 4e57e16..854f1e5 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java @@ -19,14 +19,21 @@ import java.util.Map; /** * @author code4crafter@gmail.com
- * Date: 13-4-21 - * Time: 下午12:29 + * Date: 13-4-21 + * Time: 下午12:29 */ public class HttpClientPool { - public static final HttpClientPool INSTANCE = new HttpClientPool(5); + public static volatile HttpClientPool INSTANCE; - public static HttpClientPool getInstance() { + public static HttpClientPool getInstance(int poolSize) { + if (INSTANCE == null) { + synchronized (HttpClientPool.class) { + if (INSTANCE == null) { + INSTANCE = new HttpClientPool(poolSize); + } + } + } return INSTANCE; } @@ -48,7 +55,7 @@ public class HttpClientPool { HttpProtocolParamBean paramsBean = new HttpProtocolParamBean(params); paramsBean.setVersion(HttpVersion.HTTP_1_1); - paramsBean.setContentCharset("UTF-8"); + paramsBean.setContentCharset(site.getCharset()); paramsBean.setUseExpectContinue(false); SchemeRegistry schemeRegistry = new SchemeRegistry(); diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GlobalProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GlobalProcessor.java index 0e3f9a3..2bdf342 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GlobalProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GlobalProcessor.java @@ -27,8 +27,8 @@ public class GlobalProcessor implements PageProcessor { @Override public Site getSite() { - if (site==null){ - site = Site.me().setDomain("www.2345.com") + if (site == null) { + site = Site.me().setDomain("www.2345.com").setSleepTime(0) .addStartUrl("http://www.2345.com/").addStartUrl("http://hao.360.cn/") .addStartUrl("http://www.baidu.com/s?wd=%E7%BD%91%E7%AB%99%E5%AF%BC%E8%88%AA&rsv_spt=1&issp=1&rsv_bp=0&ie=utf-8&tn=80039098_oem_dg&rsv_n=2&rsv_sug3=6&rsv_sug4=698&rsv_sug=0&rsv_sug1=3") .setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");