Fix an HttpClient pool size bug
parent
fcb09f2e08
commit
86a20eabd9
|
@ -39,7 +39,7 @@ import java.util.concurrent.atomic.AtomicInteger;
|
||||||
*/
|
*/
|
||||||
public class Spider implements Runnable, Task {
|
public class Spider implements Runnable, Task {
|
||||||
|
|
||||||
private Downloader downloader = new HttpClientDownloader();
|
private Downloader downloader;
|
||||||
|
|
||||||
private List<Pipeline> pipelines = new ArrayList<Pipeline>();
|
private List<Pipeline> pipelines = new ArrayList<Pipeline>();
|
||||||
|
|
||||||
|
@ -139,12 +139,18 @@ public class Spider implements Runnable, Task {
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
protected void checkComponent() {
|
||||||
|
if (downloader == null) {
|
||||||
|
this.downloader = new HttpClientDownloader();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void run() {
|
public void run() {
|
||||||
if (!stat.compareAndSet(STAT_INIT, STAT_RUNNING)) {
|
if (!stat.compareAndSet(STAT_INIT, STAT_RUNNING)) {
|
||||||
throw new IllegalStateException("Spider is already running!");
|
throw new IllegalStateException("Spider is already running!");
|
||||||
}
|
}
|
||||||
|
checkComponent();
|
||||||
if (startUrls != null) {
|
if (startUrls != null) {
|
||||||
for (String startUrl : startUrls) {
|
for (String startUrl : startUrls) {
|
||||||
scheduler.push(new Request(startUrl), this);
|
scheduler.push(new Request(startUrl), this);
|
||||||
|
@ -247,6 +253,7 @@ public class Spider implements Runnable, Task {
|
||||||
if (threadNum <= 0) {
|
if (threadNum <= 0) {
|
||||||
throw new IllegalArgumentException("threadNum should be more than one!");
|
throw new IllegalArgumentException("threadNum should be more than one!");
|
||||||
}
|
}
|
||||||
|
downloader = new HttpClientDownloader(threadNum);
|
||||||
if (threadNum == 1) {
|
if (threadNum == 1) {
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
|
@ -21,6 +21,7 @@ import java.io.IOException;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 封装了HttpClient的下载器。已实现指定次数重试、处理gzip、自定义UA/cookie等功能。<br>
|
* 封装了HttpClient的下载器。已实现指定次数重试、处理gzip、自定义UA/cookie等功能。<br>
|
||||||
|
*
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
* Date: 13-4-21
|
* Date: 13-4-21
|
||||||
* Time: 下午12:15
|
* Time: 下午12:15
|
||||||
|
@ -29,11 +30,21 @@ public class HttpClientDownloader implements Downloader {
|
||||||
|
|
||||||
private Logger logger = Logger.getLogger(getClass());
|
private Logger logger = Logger.getLogger(getClass());
|
||||||
|
|
||||||
|
private int poolSize;
|
||||||
|
|
||||||
|
public HttpClientDownloader(int poolSize) {
|
||||||
|
this.poolSize = poolSize;
|
||||||
|
}
|
||||||
|
|
||||||
|
public HttpClientDownloader() {
|
||||||
|
this(5);
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Page download(Request request, Task task) {
|
public Page download(Request request, Task task) {
|
||||||
Site site = task.getSite();
|
Site site = task.getSite();
|
||||||
logger.info("downloading page " + request.getUrl());
|
logger.info("downloading page " + request.getUrl());
|
||||||
HttpClient httpClient = HttpClientPool.getInstance().getClient(site);
|
HttpClient httpClient = HttpClientPool.getInstance(poolSize).getClient(site);
|
||||||
String charset = site.getCharset();
|
String charset = site.getCharset();
|
||||||
try {
|
try {
|
||||||
HttpGet httpGet = new HttpGet(request.getUrl());
|
HttpGet httpGet = new HttpGet(request.getUrl());
|
||||||
|
@ -50,7 +61,7 @@ public class HttpClientDownloader implements Downloader {
|
||||||
logger.warn("download page " + request.getUrl() + " error", e);
|
logger.warn("download page " + request.getUrl() + " error", e);
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
logger.info("download page " + request.getUrl() + " error, retry the "+tried+" time!");
|
logger.info("download page " + request.getUrl() + " error, retry the " + tried + " time!");
|
||||||
retry = true;
|
retry = true;
|
||||||
}
|
}
|
||||||
} while (retry);
|
} while (retry);
|
||||||
|
|
|
@ -24,9 +24,16 @@ import java.util.Map;
|
||||||
*/
|
*/
|
||||||
public class HttpClientPool {
|
public class HttpClientPool {
|
||||||
|
|
||||||
public static final HttpClientPool INSTANCE = new HttpClientPool(5);
|
public static volatile HttpClientPool INSTANCE;
|
||||||
|
|
||||||
public static HttpClientPool getInstance() {
|
public static HttpClientPool getInstance(int poolSize) {
|
||||||
|
if (INSTANCE == null) {
|
||||||
|
synchronized (HttpClientPool.class) {
|
||||||
|
if (INSTANCE == null) {
|
||||||
|
INSTANCE = new HttpClientPool(poolSize);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
return INSTANCE;
|
return INSTANCE;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -48,7 +55,7 @@ public class HttpClientPool {
|
||||||
|
|
||||||
HttpProtocolParamBean paramsBean = new HttpProtocolParamBean(params);
|
HttpProtocolParamBean paramsBean = new HttpProtocolParamBean(params);
|
||||||
paramsBean.setVersion(HttpVersion.HTTP_1_1);
|
paramsBean.setVersion(HttpVersion.HTTP_1_1);
|
||||||
paramsBean.setContentCharset("UTF-8");
|
paramsBean.setContentCharset(site.getCharset());
|
||||||
paramsBean.setUseExpectContinue(false);
|
paramsBean.setUseExpectContinue(false);
|
||||||
|
|
||||||
SchemeRegistry schemeRegistry = new SchemeRegistry();
|
SchemeRegistry schemeRegistry = new SchemeRegistry();
|
||||||
|
|
|
@ -27,8 +27,8 @@ public class GlobalProcessor implements PageProcessor {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Site getSite() {
|
public Site getSite() {
|
||||||
if (site==null){
|
if (site == null) {
|
||||||
site = Site.me().setDomain("www.2345.com")
|
site = Site.me().setDomain("www.2345.com").setSleepTime(0)
|
||||||
.addStartUrl("http://www.2345.com/").addStartUrl("http://hao.360.cn/")
|
.addStartUrl("http://www.2345.com/").addStartUrl("http://hao.360.cn/")
|
||||||
.addStartUrl("http://www.baidu.com/s?wd=%E7%BD%91%E7%AB%99%E5%AF%BC%E8%88%AA&rsv_spt=1&issp=1&rsv_bp=0&ie=utf-8&tn=80039098_oem_dg&rsv_n=2&rsv_sug3=6&rsv_sug4=698&rsv_sug=0&rsv_sug1=3")
|
.addStartUrl("http://www.baidu.com/s?wd=%E7%BD%91%E7%AB%99%E5%AF%BC%E8%88%AA&rsv_spt=1&issp=1&rsv_bp=0&ie=utf-8&tn=80039098_oem_dg&rsv_n=2&rsv_sug3=6&rsv_sug4=698&rsv_sug=0&rsv_sug1=3")
|
||||||
.setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
|
.setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
|
||||||
|
|
Loading…
Reference in New Issue