Fix an HttpClient pool size bug
parent
fcb09f2e08
commit
86a20eabd9
|
@ -39,7 +39,7 @@ import java.util.concurrent.atomic.AtomicInteger;
|
|||
*/
|
||||
public class Spider implements Runnable, Task {
|
||||
|
||||
private Downloader downloader = new HttpClientDownloader();
|
||||
private Downloader downloader;
|
||||
|
||||
private List<Pipeline> pipelines = new ArrayList<Pipeline>();
|
||||
|
||||
|
@ -139,12 +139,18 @@ public class Spider implements Runnable, Task {
|
|||
return this;
|
||||
}
|
||||
|
||||
protected void checkComponent() {
|
||||
if (downloader == null) {
|
||||
this.downloader = new HttpClientDownloader();
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void run() {
|
||||
if (!stat.compareAndSet(STAT_INIT, STAT_RUNNING)) {
|
||||
throw new IllegalStateException("Spider is already running!");
|
||||
}
|
||||
checkComponent();
|
||||
if (startUrls != null) {
|
||||
for (String startUrl : startUrls) {
|
||||
scheduler.push(new Request(startUrl), this);
|
||||
|
@ -247,6 +253,7 @@ public class Spider implements Runnable, Task {
|
|||
if (threadNum <= 0) {
|
||||
throw new IllegalArgumentException("threadNum should be more than one!");
|
||||
}
|
||||
downloader = new HttpClientDownloader(threadNum);
|
||||
if (threadNum == 1) {
|
||||
return this;
|
||||
}
|
||||
|
|
|
@ -21,6 +21,7 @@ import java.io.IOException;
|
|||
|
||||
/**
|
||||
* A downloader that wraps HttpClient. Supports a configurable number of retries, gzip handling, and a custom User-Agent/cookies.<br>
|
||||
*
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-4-21
|
||||
* Time: 下午12:15
|
||||
|
@ -29,11 +30,21 @@ public class HttpClientDownloader implements Downloader {
|
|||
|
||||
private Logger logger = Logger.getLogger(getClass());
|
||||
|
||||
private int poolSize;
|
||||
|
||||
public HttpClientDownloader(int poolSize) {
|
||||
this.poolSize = poolSize;
|
||||
}
|
||||
|
||||
public HttpClientDownloader() {
|
||||
this(5);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Page download(Request request, Task task) {
|
||||
Site site = task.getSite();
|
||||
logger.info("downloading page " + request.getUrl());
|
||||
HttpClient httpClient = HttpClientPool.getInstance().getClient(site);
|
||||
HttpClient httpClient = HttpClientPool.getInstance(poolSize).getClient(site);
|
||||
String charset = site.getCharset();
|
||||
try {
|
||||
HttpGet httpGet = new HttpGet(request.getUrl());
|
||||
|
@ -50,7 +61,7 @@ public class HttpClientDownloader implements Downloader {
|
|||
logger.warn("download page " + request.getUrl() + " error", e);
|
||||
return null;
|
||||
}
|
||||
logger.info("download page " + request.getUrl() + " error, retry the "+tried+" time!");
|
||||
logger.info("download page " + request.getUrl() + " error, retry the " + tried + " time!");
|
||||
retry = true;
|
||||
}
|
||||
} while (retry);
|
||||
|
|
|
@ -19,14 +19,21 @@ import java.util.Map;
|
|||
|
||||
/**
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-4-21
|
||||
* Time: 下午12:29
|
||||
* Date: 13-4-21
|
||||
* Time: 下午12:29
|
||||
*/
|
||||
public class HttpClientPool {
|
||||
|
||||
public static final HttpClientPool INSTANCE = new HttpClientPool(5);
|
||||
public static volatile HttpClientPool INSTANCE;
|
||||
|
||||
public static HttpClientPool getInstance() {
|
||||
public static HttpClientPool getInstance(int poolSize) {
|
||||
if (INSTANCE == null) {
|
||||
synchronized (HttpClientPool.class) {
|
||||
if (INSTANCE == null) {
|
||||
INSTANCE = new HttpClientPool(poolSize);
|
||||
}
|
||||
}
|
||||
}
|
||||
return INSTANCE;
|
||||
}
|
||||
|
||||
|
@ -48,7 +55,7 @@ public class HttpClientPool {
|
|||
|
||||
HttpProtocolParamBean paramsBean = new HttpProtocolParamBean(params);
|
||||
paramsBean.setVersion(HttpVersion.HTTP_1_1);
|
||||
paramsBean.setContentCharset("UTF-8");
|
||||
paramsBean.setContentCharset(site.getCharset());
|
||||
paramsBean.setUseExpectContinue(false);
|
||||
|
||||
SchemeRegistry schemeRegistry = new SchemeRegistry();
|
||||
|
|
|
@ -27,8 +27,8 @@ public class GlobalProcessor implements PageProcessor {
|
|||
|
||||
@Override
|
||||
public Site getSite() {
|
||||
if (site==null){
|
||||
site = Site.me().setDomain("www.2345.com")
|
||||
if (site == null) {
|
||||
site = Site.me().setDomain("www.2345.com").setSleepTime(0)
|
||||
.addStartUrl("http://www.2345.com/").addStartUrl("http://hao.360.cn/")
|
||||
.addStartUrl("http://www.baidu.com/s?wd=%E7%BD%91%E7%AB%99%E5%AF%BC%E8%88%AA&rsv_spt=1&issp=1&rsv_bp=0&ie=utf-8&tn=80039098_oem_dg&rsv_n=2&rsv_sug3=6&rsv_sug4=698&rsv_sug=0&rsv_sug1=3")
|
||||
.setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
|
||||
|
|
Loading…
Reference in New Issue