diff --git a/pom.xml b/pom.xml
index 8f9837f..918ab6a 100644
--- a/pom.xml
+++ b/pom.xml
@@ -61,7 +61,7 @@
org.apache.httpcomponents
httpclient
- 4.2.4
+ 4.3.1
com.google.guava
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
index d6ee8c1..ce4f8cb 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
@@ -1,13 +1,11 @@
package us.codecraft.webmagic.downloader;
-import org.apache.commons.io.IOUtils;
-import org.apache.http.Header;
-import org.apache.http.HeaderElement;
import org.apache.http.HttpResponse;
import org.apache.http.annotation.ThreadSafe;
-import org.apache.http.client.HttpClient;
-import org.apache.http.client.entity.GzipDecompressingEntity;
+import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
+import org.apache.http.impl.client.CloseableHttpClient;
+import org.apache.http.util.EntityUtils;
import org.apache.log4j.Logger;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
@@ -34,7 +32,7 @@ public class HttpClientDownloader implements Downloader {
private Logger logger = Logger.getLogger(getClass());
- private HttpClientPool httpClientPool;
+ private volatile CloseableHttpClient httpClient;
private int poolSize = 1;
@@ -60,11 +58,15 @@ public class HttpClientDownloader implements Downloader {
return (Html) page.getHtml();
}
- private HttpClientPool getHttpClientPool(){
- if (httpClientPool==null){
- httpClientPool = new HttpClientPool(poolSize);
+ /**
+ * Returns the downloader's HTTP client, creating it on first use.
+ *
+ * The client is built once from a new HttpClientPool sized by poolSize and then
+ * kept in the volatile httpClient field; creation is guarded by synchronized (this).
+ *
+ * NOTE(review): only the site passed on the first call is used to build the client;
+ * later calls return the cached instance whatever site they pass. Confirm that a
+ * downloader is never shared between sites with different settings.
+ *
+ * @param site site handed to HttpClientPool.getClient when the client is first built
+ * @return the cached client
+ */
+ private CloseableHttpClient getHttpClient(Site site) {
+ if (httpClient == null) {
+ synchronized (this) {
+ if (httpClient == null) {
+ httpClient = new HttpClientPool(poolSize).getClient(site);
+ }
+ }
}
- return httpClientPool;
+ return httpClient;
}
@Override
@@ -73,12 +75,10 @@ public class HttpClientDownloader implements Downloader {
if (task != null) {
site = task.getSite();
}
- int retryTimes = 0;
Set acceptStatCode;
String charset = null;
- Map headers = null;
+ Map headers = null;
if (site != null) {
- retryTimes = site.getRetryTimes();
acceptStatCode = site.getAcceptStatCode();
charset = site.getCharset();
headers = site.getHeaders();
@@ -87,54 +87,17 @@ public class HttpClientDownloader implements Downloader {
acceptStatCode.add(200);
}
logger.info("downloading page " + request.getUrl());
- HttpClient httpClient = getHttpClientPool().getClient(site);
+ HttpGet httpGet = new HttpGet(request.getUrl());
+ if (headers != null) {
+ for (Map.Entry headerEntry : headers.entrySet()) {
+ httpGet.addHeader(headerEntry.getKey(), headerEntry.getValue());
+ }
+ }
+ CloseableHttpResponse httpResponse = null;
try {
- HttpGet httpGet = new HttpGet(request.getUrl());
-
- if (headers!=null){
- for (Map.Entry headerEntry : headers.entrySet()) {
- httpGet.addHeader(headerEntry.getKey(),headerEntry.getValue());
- }
- }
- if (!httpGet.containsHeader("Accept-Encoding")) {
- httpGet.addHeader("Accept-Encoding", "gzip");
- }
- HttpResponse httpResponse = null;
- int tried = 0;
- boolean retry;
- do {
- try {
- httpResponse = httpClient.execute(httpGet);
- retry = false;
- } catch (IOException e) {
- tried++;
-
- if (tried > retryTimes) {
- logger.warn("download page " + request.getUrl() + " error", e);
- if (site.getCycleRetryTimes() > 0) {
- Page page = new Page();
- Object cycleTriedTimesObject = request.getExtra(Request.CYCLE_TRIED_TIMES);
- if (cycleTriedTimesObject == null) {
- page.addTargetRequest(request.setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, 1));
- } else {
- int cycleTriedTimes = (Integer) cycleTriedTimesObject;
- cycleTriedTimes++;
- if (cycleTriedTimes >= site.getCycleRetryTimes()) {
- return null;
- }
- page.addTargetRequest(request.setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, 1));
- }
- return page;
- }
- return null;
- }
- logger.info("download page " + request.getUrl() + " error, retry the " + tried + " time!");
- retry = true;
- }
- } while (retry);
+ httpResponse = getHttpClient(site).execute(httpGet);
int statusCode = httpResponse.getStatusLine().getStatusCode();
if (acceptStatCode.contains(statusCode)) {
- handleGzip(httpResponse);
//charset
if (charset == null) {
String value = httpResponse.getEntity().getContentType().getValue();
@@ -143,16 +106,43 @@ public class HttpClientDownloader implements Downloader {
return handleResponse(request, charset, httpResponse, task);
} else {
logger.warn("code error " + statusCode + "\t" + request.getUrl());
+ return null;
}
- } catch (Exception e) {
+ } catch (IOException e) {
logger.warn("download page " + request.getUrl() + " error", e);
+ if (site.getCycleRetryTimes() > 0) {
+ return addToCycleRetry(request, site);
+ }
+ return null;
+ } finally {
+ try {
+ if (httpResponse != null) {
+ httpResponse.close();
+ }
+ } catch (IOException e) {
+ logger.warn("close response fail", e);
+ }
}
- return null;
+ }
+
+ /**
+ * Re-queues a request whose download failed so that it is attempted again later.
+ *
+ * The number of attempts so far is kept in the request's extras under
+ * Request.CYCLE_TRIED_TIMES and compared against site.getCycleRetryTimes().
+ *
+ * @param request the request that failed
+ * @param site supplies the cycle retry limit
+ * @return a page carrying the re-queued request, or null once the limit is reached
+ */
+ private Page addToCycleRetry(Request request, Site site) {
+ Page page = new Page();
+ Object cycleTriedTimesObject = request.getExtra(Request.CYCLE_TRIED_TIMES);
+ if (cycleTriedTimesObject == null) {
+ page.addTargetRequest(request.setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, 1));
+ } else {
+ int cycleTriedTimes = (Integer) cycleTriedTimesObject;
+ cycleTriedTimes++;
+ if (cycleTriedTimes >= site.getCycleRetryTimes()) {
+ return null;
+ }
+ // Store the incremented count: writing the constant 1 back would reset the
+ // counter on every failure, so the limit above could never be reached.
+ page.addTargetRequest(request.setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, cycleTriedTimes));
+ }
+ return page;
}
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
- String content = IOUtils.toString(httpResponse.getEntity().getContent(),
- charset);
+ String content = EntityUtils.toString(httpResponse.getEntity(), charset);
Page page = new Page();
page.setHtml(new Html(UrlUtils.fixAllRelativeHrefs(content, request.getUrl())));
page.setUrl(new PlainText(request.getUrl()));
@@ -163,20 +153,5 @@ public class HttpClientDownloader implements Downloader {
@Override
public void setThread(int thread) {
poolSize = thread;
- httpClientPool = new HttpClientPool(thread);
- }
-
- private void handleGzip(HttpResponse httpResponse) {
- Header ceheader = httpResponse.getEntity().getContentEncoding();
- if (ceheader != null) {
- HeaderElement[] codecs = ceheader.getElements();
- for (HeaderElement codec : codecs) {
- if (codec.getName().equalsIgnoreCase("gzip")) {
- //todo bugfix
- httpResponse.setEntity(
- new GzipDecompressingEntity(httpResponse.getEntity()));
- }
- }
- }
}
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java
index c882836..43ee94d 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java
@@ -1,72 +1,85 @@
package us.codecraft.webmagic.downloader;
-import org.apache.http.HttpVersion;
+import org.apache.http.*;
import org.apache.http.client.CookieStore;
-import org.apache.http.client.HttpClient;
-import org.apache.http.client.params.ClientPNames;
-import org.apache.http.client.params.CookiePolicy;
-import org.apache.http.conn.scheme.PlainSocketFactory;
-import org.apache.http.conn.scheme.Scheme;
-import org.apache.http.conn.scheme.SchemeRegistry;
-import org.apache.http.conn.ssl.SSLSocketFactory;
-import org.apache.http.impl.client.BasicCookieStore;
-import org.apache.http.impl.client.DefaultHttpClient;
-import org.apache.http.impl.conn.PoolingClientConnectionManager;
+import org.apache.http.client.entity.GzipDecompressingEntity;
+import org.apache.http.config.Registry;
+import org.apache.http.config.RegistryBuilder;
+import org.apache.http.conn.socket.ConnectionSocketFactory;
+import org.apache.http.conn.socket.PlainConnectionSocketFactory;
+import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
+import org.apache.http.impl.client.*;
+import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.impl.cookie.BasicClientCookie;
-import org.apache.http.params.*;
+import org.apache.http.protocol.HttpContext;
import us.codecraft.webmagic.Site;
+import java.io.IOException;
import java.util.Map;
/**
* @author code4crafter@gmail.com
- * @since 0.1.0
+ * @since 0.3.3
*/
public class HttpClientPool {
- private int poolSize;
-
- private PoolingClientConnectionManager connectionManager;
+ private PoolingHttpClientConnectionManager connectionManager;
+ /**
+ * Creates the pooling connection manager used by the clients this pool generates.
+ *
+ * Registers plain and SSL socket factories for "http" and "https", caps the total
+ * number of connections at poolSize and allows up to 100 connections per route.
+ *
+ * @param poolSize maximum total number of pooled connections
+ */
public HttpClientPool(int poolSize) {
- this.poolSize = poolSize;
- SchemeRegistry schemeRegistry = new SchemeRegistry();
- schemeRegistry.register(new Scheme("http", 80, PlainSocketFactory.getSocketFactory()));
- schemeRegistry.register(new Scheme("https", 443, SSLSocketFactory.getSocketFactory()));
-
- connectionManager = new PoolingClientConnectionManager(schemeRegistry);
+ Registry reg = RegistryBuilder.create()
+ .register("http", PlainConnectionSocketFactory.INSTANCE)
+ .register("https", SSLConnectionSocketFactory.getSocketFactory())
+ .build();
+ // Assign the field, not a shadowing local: generateClient reads this.connectionManager,
+ // which would otherwise stay null and the pool limits below would never take effect.
+ connectionManager = new PoolingHttpClientConnectionManager(reg);
connectionManager.setMaxTotal(poolSize);
connectionManager.setDefaultMaxPerRoute(100);
}
- public HttpClient getClient(Site site) {
+ public CloseableHttpClient getClient(Site site) {
return generateClient(site);
}
- private HttpClient generateClient(Site site) {
- HttpParams params = new BasicHttpParams();
+ /**
+ * Builds a client on top of this pool's connection manager.
+ *
+ * The user agent comes from the site when it provides one, otherwise it is set to the
+ * empty string. A request interceptor adds "Accept-Encoding: gzip" when the request has
+ * no such header, and a response interceptor wraps gzip-encoded entities in a
+ * GzipDecompressingEntity. The retry count comes from the site, or is 0 without a site.
+ *
+ * NOTE(review): the code removed here also set socket/connect timeouts and called
+ * generateCookie; the builder-based version does neither. Confirm this is intended.
+ *
+ * @param site site settings to apply; may be null
+ * @return the configured client
+ */
+ private CloseableHttpClient generateClient(Site site) {
+ HttpClientBuilder httpClientBuilder = HttpClients.custom().setConnectionManager(connectionManager);
if (site != null && site.getUserAgent() != null) {
- params.setParameter(CoreProtocolPNames.USER_AGENT, site.getUserAgent());
- params.setIntParameter(CoreConnectionPNames.SO_TIMEOUT, site.getTimeOut());
- params.setIntParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, site.getTimeOut());
+ httpClientBuilder.setUserAgent(site.getUserAgent());
} else {
- params.setIntParameter(CoreConnectionPNames.SO_TIMEOUT, 3000);
- params.setIntParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, 3000);
+ httpClientBuilder.setUserAgent("");
}
+ httpClientBuilder.addInterceptorFirst(new HttpRequestInterceptor() {
- params.setParameter(ClientPNames.COOKIE_POLICY, CookiePolicy.BEST_MATCH);
- HttpProtocolParamBean paramsBean = new HttpProtocolParamBean(params);
- paramsBean.setVersion(HttpVersion.HTTP_1_1);
- if (site != null && site.getCharset() != null) {
- paramsBean.setContentCharset(site.getCharset());
- }
- paramsBean.setUseExpectContinue(false);
+ public void process(
+ final HttpRequest request,
+ final HttpContext context) throws HttpException, IOException {
+ if (!request.containsHeader("Accept-Encoding")) {
+ request.addHeader("Accept-Encoding", "gzip");
+ }
- DefaultHttpClient httpClient = new DefaultHttpClient(connectionManager, params);
- if (site != null) {
- generateCookie(httpClient, site);
- }
- return httpClient;
+ }
+ }).addInterceptorFirst(new HttpResponseInterceptor() {
+
+ public void process(
+ final HttpResponse response,
+ final HttpContext context) throws HttpException, IOException {
+ HttpEntity entity = response.getEntity();
+ if (entity != null) {
+ Header ceheader = entity.getContentEncoding();
+ if (ceheader != null) {
+ HeaderElement[] codecs = ceheader.getElements();
+ for (int i = 0; i < codecs.length; i++) {
+ if (codecs[i].getName().equalsIgnoreCase("gzip")) {
+ response.setEntity(
+ new GzipDecompressingEntity(response.getEntity()));
+ return;
+ }
+ }
+ }
+ }
+ }
+
+ });
+ // site may be null (the user-agent check above allows it), so do not dereference it
+ // unconditionally; without a site no retries are configured.
+ int retryTimes = (site != null) ? site.getRetryTimes() : 0;
+ httpClientBuilder.setRetryHandler(new DefaultHttpRequestRetryHandler(retryTimes, true));
+ return httpClientBuilder.build();
}
private void generateCookie(DefaultHttpClient httpClient, Site site) {