From edfc319c454a6812841d1a9c9c01f622ee1c1293 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Mon, 4 Nov 2013 00:06:30 +0800 Subject: [PATCH] update httpclient to 4.3.1 --- pom.xml | 2 +- .../downloader/HttpClientDownloader.java | 129 +++++++----------- .../webmagic/downloader/HttpClientPool.java | 97 +++++++------ 3 files changed, 108 insertions(+), 120 deletions(-) diff --git a/pom.xml b/pom.xml index 8f9837f..918ab6a 100644 --- a/pom.xml +++ b/pom.xml @@ -61,7 +61,7 @@ org.apache.httpcomponents httpclient - 4.2.4 + 4.3.1 com.google.guava diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index d6ee8c1..ce4f8cb 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -1,13 +1,11 @@ package us.codecraft.webmagic.downloader; -import org.apache.commons.io.IOUtils; -import org.apache.http.Header; -import org.apache.http.HeaderElement; import org.apache.http.HttpResponse; import org.apache.http.annotation.ThreadSafe; -import org.apache.http.client.HttpClient; -import org.apache.http.client.entity.GzipDecompressingEntity; +import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpGet; +import org.apache.http.impl.client.CloseableHttpClient; +import org.apache.http.util.EntityUtils; import org.apache.log4j.Logger; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; @@ -34,7 +32,7 @@ public class HttpClientDownloader implements Downloader { private Logger logger = Logger.getLogger(getClass()); - private HttpClientPool httpClientPool; + private volatile CloseableHttpClient httpClient; private int poolSize = 1; @@ -60,11 +58,15 @@ public class HttpClientDownloader implements Downloader { return (Html) page.getHtml(); } - private HttpClientPool getHttpClientPool(){ - if (httpClientPool==null){ - httpClientPool = new HttpClientPool(poolSize); + private CloseableHttpClient getHttpClient(Site site) { + if (httpClient == null) { + synchronized (this) { + if (httpClient == null) { + httpClient = new HttpClientPool(poolSize).getClient(site); + } + } } - return httpClientPool; + return httpClient; } @Override @@ -73,12 +75,10 @@ public class HttpClientDownloader implements Downloader { if (task != null) { site = task.getSite(); } - int retryTimes = 0; Set acceptStatCode; String charset = null; - Map headers = null; + Map headers = null; if (site != null) { - retryTimes = site.getRetryTimes(); acceptStatCode = site.getAcceptStatCode(); charset = site.getCharset(); headers = site.getHeaders(); @@ -87,54 +87,17 @@ public class HttpClientDownloader implements Downloader { acceptStatCode.add(200); } logger.info("downloading page " + request.getUrl()); - HttpClient httpClient = getHttpClientPool().getClient(site); + HttpGet httpGet = new HttpGet(request.getUrl()); + if (headers != null) { + for (Map.Entry headerEntry : headers.entrySet()) { + httpGet.addHeader(headerEntry.getKey(), headerEntry.getValue()); + } + } + CloseableHttpResponse httpResponse = null; try { - HttpGet httpGet = new HttpGet(request.getUrl()); - - if (headers!=null){ - for (Map.Entry headerEntry : headers.entrySet()) { - httpGet.addHeader(headerEntry.getKey(),headerEntry.getValue()); - } - } - if (!httpGet.containsHeader("Accept-Encoding")) { - httpGet.addHeader("Accept-Encoding", "gzip"); - } - HttpResponse httpResponse = null; - int tried = 0; - boolean retry; - do { - try { - httpResponse = httpClient.execute(httpGet); - retry = false; - } catch (IOException e) { - tried++; - - if (tried > retryTimes) { - logger.warn("download page " + request.getUrl() + " error", e); - if (site.getCycleRetryTimes() > 0) { - Page page = new Page(); - Object cycleTriedTimesObject = request.getExtra(Request.CYCLE_TRIED_TIMES); - if (cycleTriedTimesObject == null) { - page.addTargetRequest(request.setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, 1)); - } else { - int cycleTriedTimes = (Integer) cycleTriedTimesObject; - cycleTriedTimes++; - if (cycleTriedTimes >= site.getCycleRetryTimes()) { - return null; - } - page.addTargetRequest(request.setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, 1)); - } - return page; - } - return null; - } - logger.info("download page " + request.getUrl() + " error, retry the " + tried + " time!"); - retry = true; - } - } while (retry); + httpResponse = getHttpClient(site).execute(httpGet); int statusCode = httpResponse.getStatusLine().getStatusCode(); if (acceptStatCode.contains(statusCode)) { - handleGzip(httpResponse); //charset if (charset == null) { String value = httpResponse.getEntity().getContentType().getValue(); @@ -143,16 +106,43 @@ public class HttpClientDownloader implements Downloader { return handleResponse(request, charset, httpResponse, task); } else { logger.warn("code error " + statusCode + "\t" + request.getUrl()); + return null; } - } catch (Exception e) { + } catch (IOException e) { logger.warn("download page " + request.getUrl() + " error", e); + if (site.getCycleRetryTimes() > 0) { + return addToCycleRetry(request, site); + } + return null; + } finally { + try { + if (httpResponse != null) { + httpResponse.close(); + } + } catch (IOException e) { + logger.warn("close response fail", e); + } } - return null; + } + + private Page addToCycleRetry(Request request, Site site) { + Page page = new Page(); + Object cycleTriedTimesObject = request.getExtra(Request.CYCLE_TRIED_TIMES); + if (cycleTriedTimesObject == null) { + page.addTargetRequest(request.setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, 1)); + } else { + int cycleTriedTimes = (Integer) cycleTriedTimesObject; + cycleTriedTimes++; + if (cycleTriedTimes >= site.getCycleRetryTimes()) { + return null; + } + page.addTargetRequest(request.setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, 1)); + } + return page; } protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException { - String content = IOUtils.toString(httpResponse.getEntity().getContent(), - charset); + String content = EntityUtils.toString(httpResponse.getEntity(), charset); Page page = new Page(); page.setHtml(new Html(UrlUtils.fixAllRelativeHrefs(content, request.getUrl()))); page.setUrl(new PlainText(request.getUrl())); @@ -163,20 +153,5 @@ public class HttpClientDownloader implements Downloader { @Override public void setThread(int thread) { poolSize = thread; - httpClientPool = new HttpClientPool(thread); - } - - private void handleGzip(HttpResponse httpResponse) { - Header ceheader = httpResponse.getEntity().getContentEncoding(); - if (ceheader != null) { - HeaderElement[] codecs = ceheader.getElements(); - for (HeaderElement codec : codecs) { - if (codec.getName().equalsIgnoreCase("gzip")) { - //todo bugfix - httpResponse.setEntity( - new GzipDecompressingEntity(httpResponse.getEntity())); - } - } - } } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java index c882836..43ee94d 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java @@ -1,72 +1,85 @@ package us.codecraft.webmagic.downloader; -import org.apache.http.HttpVersion; +import org.apache.http.*; import org.apache.http.client.CookieStore; -import org.apache.http.client.HttpClient; -import org.apache.http.client.params.ClientPNames; -import org.apache.http.client.params.CookiePolicy; -import org.apache.http.conn.scheme.PlainSocketFactory; -import org.apache.http.conn.scheme.Scheme; -import org.apache.http.conn.scheme.SchemeRegistry; -import org.apache.http.conn.ssl.SSLSocketFactory; -import org.apache.http.impl.client.BasicCookieStore; -import org.apache.http.impl.client.DefaultHttpClient; -import org.apache.http.impl.conn.PoolingClientConnectionManager; +import org.apache.http.client.entity.GzipDecompressingEntity; +import org.apache.http.config.Registry; +import org.apache.http.config.RegistryBuilder; +import org.apache.http.conn.socket.ConnectionSocketFactory; +import org.apache.http.conn.socket.PlainConnectionSocketFactory; +import org.apache.http.conn.ssl.SSLConnectionSocketFactory; +import org.apache.http.impl.client.*; +import org.apache.http.impl.conn.PoolingHttpClientConnectionManager; import org.apache.http.impl.cookie.BasicClientCookie; -import org.apache.http.params.*; +import org.apache.http.protocol.HttpContext; import us.codecraft.webmagic.Site; +import java.io.IOException; import java.util.Map; /** * @author code4crafter@gmail.com
- * @since 0.1.0 + * @since 0.3.3 */ public class HttpClientPool { - private int poolSize; - - private PoolingClientConnectionManager connectionManager; + private PoolingHttpClientConnectionManager connectionManager; public HttpClientPool(int poolSize) { - this.poolSize = poolSize; - SchemeRegistry schemeRegistry = new SchemeRegistry(); - schemeRegistry.register(new Scheme("http", 80, PlainSocketFactory.getSocketFactory())); - schemeRegistry.register(new Scheme("https", 443, SSLSocketFactory.getSocketFactory())); - - connectionManager = new PoolingClientConnectionManager(schemeRegistry); + Registry reg = RegistryBuilder.create() + .register("http", PlainConnectionSocketFactory.INSTANCE) + .register("https", SSLConnectionSocketFactory.getSocketFactory()) + .build(); + PoolingHttpClientConnectionManager connectionManager = new PoolingHttpClientConnectionManager(reg); connectionManager.setMaxTotal(poolSize); connectionManager.setDefaultMaxPerRoute(100); } - public HttpClient getClient(Site site) { + public CloseableHttpClient getClient(Site site) { return generateClient(site); } - private HttpClient generateClient(Site site) { - HttpParams params = new BasicHttpParams(); + private CloseableHttpClient generateClient(Site site) { + HttpClientBuilder httpClientBuilder = HttpClients.custom().setConnectionManager(connectionManager); if (site != null && site.getUserAgent() != null) { - params.setParameter(CoreProtocolPNames.USER_AGENT, site.getUserAgent()); - params.setIntParameter(CoreConnectionPNames.SO_TIMEOUT, site.getTimeOut()); - params.setIntParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, site.getTimeOut()); + httpClientBuilder.setUserAgent(site.getUserAgent()); } else { - params.setIntParameter(CoreConnectionPNames.SO_TIMEOUT, 3000); - params.setIntParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, 3000); + httpClientBuilder.setUserAgent(""); } + httpClientBuilder.addInterceptorFirst(new HttpRequestInterceptor() { - params.setParameter(ClientPNames.COOKIE_POLICY, CookiePolicy.BEST_MATCH); - HttpProtocolParamBean paramsBean = new HttpProtocolParamBean(params); - paramsBean.setVersion(HttpVersion.HTTP_1_1); - if (site != null && site.getCharset() != null) { - paramsBean.setContentCharset(site.getCharset()); - } - paramsBean.setUseExpectContinue(false); + public void process( + final HttpRequest request, + final HttpContext context) throws HttpException, IOException { + if (!request.containsHeader("Accept-Encoding")) { + request.addHeader("Accept-Encoding", "gzip"); + } - DefaultHttpClient httpClient = new DefaultHttpClient(connectionManager, params); - if (site != null) { - generateCookie(httpClient, site); - } - return httpClient; + } + }).addInterceptorFirst(new HttpResponseInterceptor() { + + public void process( + final HttpResponse response, + final HttpContext context) throws HttpException, IOException { + HttpEntity entity = response.getEntity(); + if (entity != null) { + Header ceheader = entity.getContentEncoding(); + if (ceheader != null) { + HeaderElement[] codecs = ceheader.getElements(); + for (int i = 0; i < codecs.length; i++) { + if (codecs[i].getName().equalsIgnoreCase("gzip")) { + response.setEntity( + new GzipDecompressingEntity(response.getEntity())); + return; + } + } + } + } + } + + }); + httpClientBuilder.setRetryHandler(new DefaultHttpRequestRetryHandler(site.getRetryTimes(),true)); + return httpClientBuilder.build(); } private void generateCookie(DefaultHttpClient httpClient, Site site) {