update httpclient to 4.3.1
parent
160a149b05
commit
edfc319c45
2
pom.xml
2
pom.xml
|
@ -61,7 +61,7 @@
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.httpcomponents</groupId>
|
<groupId>org.apache.httpcomponents</groupId>
|
||||||
<artifactId>httpclient</artifactId>
|
<artifactId>httpclient</artifactId>
|
||||||
<version>4.2.4</version>
|
<version>4.3.1</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>com.google.guava</groupId>
|
<groupId>com.google.guava</groupId>
|
||||||
|
|
|
@ -1,13 +1,11 @@
|
||||||
package us.codecraft.webmagic.downloader;
|
package us.codecraft.webmagic.downloader;
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
|
||||||
import org.apache.http.Header;
|
|
||||||
import org.apache.http.HeaderElement;
|
|
||||||
import org.apache.http.HttpResponse;
|
import org.apache.http.HttpResponse;
|
||||||
import org.apache.http.annotation.ThreadSafe;
|
import org.apache.http.annotation.ThreadSafe;
|
||||||
import org.apache.http.client.HttpClient;
|
import org.apache.http.client.methods.CloseableHttpResponse;
|
||||||
import org.apache.http.client.entity.GzipDecompressingEntity;
|
|
||||||
import org.apache.http.client.methods.HttpGet;
|
import org.apache.http.client.methods.HttpGet;
|
||||||
|
import org.apache.http.impl.client.CloseableHttpClient;
|
||||||
|
import org.apache.http.util.EntityUtils;
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
import us.codecraft.webmagic.Page;
|
import us.codecraft.webmagic.Page;
|
||||||
import us.codecraft.webmagic.Request;
|
import us.codecraft.webmagic.Request;
|
||||||
|
@ -34,7 +32,7 @@ public class HttpClientDownloader implements Downloader {
|
||||||
|
|
||||||
private Logger logger = Logger.getLogger(getClass());
|
private Logger logger = Logger.getLogger(getClass());
|
||||||
|
|
||||||
private HttpClientPool httpClientPool;
|
private volatile CloseableHttpClient httpClient;
|
||||||
|
|
||||||
private int poolSize = 1;
|
private int poolSize = 1;
|
||||||
|
|
||||||
|
@ -60,11 +58,15 @@ public class HttpClientDownloader implements Downloader {
|
||||||
return (Html) page.getHtml();
|
return (Html) page.getHtml();
|
||||||
}
|
}
|
||||||
|
|
||||||
private HttpClientPool getHttpClientPool(){
|
private CloseableHttpClient getHttpClient(Site site) {
|
||||||
if (httpClientPool==null){
|
if (httpClient == null) {
|
||||||
httpClientPool = new HttpClientPool(poolSize);
|
synchronized (this) {
|
||||||
|
if (httpClient == null) {
|
||||||
|
httpClient = new HttpClientPool(poolSize).getClient(site);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return httpClientPool;
|
return httpClient;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -73,12 +75,10 @@ public class HttpClientDownloader implements Downloader {
|
||||||
if (task != null) {
|
if (task != null) {
|
||||||
site = task.getSite();
|
site = task.getSite();
|
||||||
}
|
}
|
||||||
int retryTimes = 0;
|
|
||||||
Set<Integer> acceptStatCode;
|
Set<Integer> acceptStatCode;
|
||||||
String charset = null;
|
String charset = null;
|
||||||
Map<String,String> headers = null;
|
Map<String, String> headers = null;
|
||||||
if (site != null) {
|
if (site != null) {
|
||||||
retryTimes = site.getRetryTimes();
|
|
||||||
acceptStatCode = site.getAcceptStatCode();
|
acceptStatCode = site.getAcceptStatCode();
|
||||||
charset = site.getCharset();
|
charset = site.getCharset();
|
||||||
headers = site.getHeaders();
|
headers = site.getHeaders();
|
||||||
|
@ -87,54 +87,17 @@ public class HttpClientDownloader implements Downloader {
|
||||||
acceptStatCode.add(200);
|
acceptStatCode.add(200);
|
||||||
}
|
}
|
||||||
logger.info("downloading page " + request.getUrl());
|
logger.info("downloading page " + request.getUrl());
|
||||||
HttpClient httpClient = getHttpClientPool().getClient(site);
|
HttpGet httpGet = new HttpGet(request.getUrl());
|
||||||
|
if (headers != null) {
|
||||||
|
for (Map.Entry<String, String> headerEntry : headers.entrySet()) {
|
||||||
|
httpGet.addHeader(headerEntry.getKey(), headerEntry.getValue());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
CloseableHttpResponse httpResponse = null;
|
||||||
try {
|
try {
|
||||||
HttpGet httpGet = new HttpGet(request.getUrl());
|
httpResponse = getHttpClient(site).execute(httpGet);
|
||||||
|
|
||||||
if (headers!=null){
|
|
||||||
for (Map.Entry<String, String> headerEntry : headers.entrySet()) {
|
|
||||||
httpGet.addHeader(headerEntry.getKey(),headerEntry.getValue());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (!httpGet.containsHeader("Accept-Encoding")) {
|
|
||||||
httpGet.addHeader("Accept-Encoding", "gzip");
|
|
||||||
}
|
|
||||||
HttpResponse httpResponse = null;
|
|
||||||
int tried = 0;
|
|
||||||
boolean retry;
|
|
||||||
do {
|
|
||||||
try {
|
|
||||||
httpResponse = httpClient.execute(httpGet);
|
|
||||||
retry = false;
|
|
||||||
} catch (IOException e) {
|
|
||||||
tried++;
|
|
||||||
|
|
||||||
if (tried > retryTimes) {
|
|
||||||
logger.warn("download page " + request.getUrl() + " error", e);
|
|
||||||
if (site.getCycleRetryTimes() > 0) {
|
|
||||||
Page page = new Page();
|
|
||||||
Object cycleTriedTimesObject = request.getExtra(Request.CYCLE_TRIED_TIMES);
|
|
||||||
if (cycleTriedTimesObject == null) {
|
|
||||||
page.addTargetRequest(request.setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, 1));
|
|
||||||
} else {
|
|
||||||
int cycleTriedTimes = (Integer) cycleTriedTimesObject;
|
|
||||||
cycleTriedTimes++;
|
|
||||||
if (cycleTriedTimes >= site.getCycleRetryTimes()) {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
page.addTargetRequest(request.setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, 1));
|
|
||||||
}
|
|
||||||
return page;
|
|
||||||
}
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
logger.info("download page " + request.getUrl() + " error, retry the " + tried + " time!");
|
|
||||||
retry = true;
|
|
||||||
}
|
|
||||||
} while (retry);
|
|
||||||
int statusCode = httpResponse.getStatusLine().getStatusCode();
|
int statusCode = httpResponse.getStatusLine().getStatusCode();
|
||||||
if (acceptStatCode.contains(statusCode)) {
|
if (acceptStatCode.contains(statusCode)) {
|
||||||
handleGzip(httpResponse);
|
|
||||||
//charset
|
//charset
|
||||||
if (charset == null) {
|
if (charset == null) {
|
||||||
String value = httpResponse.getEntity().getContentType().getValue();
|
String value = httpResponse.getEntity().getContentType().getValue();
|
||||||
|
@ -143,16 +106,43 @@ public class HttpClientDownloader implements Downloader {
|
||||||
return handleResponse(request, charset, httpResponse, task);
|
return handleResponse(request, charset, httpResponse, task);
|
||||||
} else {
|
} else {
|
||||||
logger.warn("code error " + statusCode + "\t" + request.getUrl());
|
logger.warn("code error " + statusCode + "\t" + request.getUrl());
|
||||||
|
return null;
|
||||||
}
|
}
|
||||||
} catch (Exception e) {
|
} catch (IOException e) {
|
||||||
logger.warn("download page " + request.getUrl() + " error", e);
|
logger.warn("download page " + request.getUrl() + " error", e);
|
||||||
|
if (site.getCycleRetryTimes() > 0) {
|
||||||
|
return addToCycleRetry(request, site);
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
} finally {
|
||||||
|
try {
|
||||||
|
if (httpResponse != null) {
|
||||||
|
httpResponse.close();
|
||||||
|
}
|
||||||
|
} catch (IOException e) {
|
||||||
|
logger.warn("close response fail", e);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return null;
|
}
|
||||||
|
|
||||||
|
private Page addToCycleRetry(Request request, Site site) {
|
||||||
|
Page page = new Page();
|
||||||
|
Object cycleTriedTimesObject = request.getExtra(Request.CYCLE_TRIED_TIMES);
|
||||||
|
if (cycleTriedTimesObject == null) {
|
||||||
|
page.addTargetRequest(request.setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, 1));
|
||||||
|
} else {
|
||||||
|
int cycleTriedTimes = (Integer) cycleTriedTimesObject;
|
||||||
|
cycleTriedTimes++;
|
||||||
|
if (cycleTriedTimes >= site.getCycleRetryTimes()) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
page.addTargetRequest(request.setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, 1));
|
||||||
|
}
|
||||||
|
return page;
|
||||||
}
|
}
|
||||||
|
|
||||||
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
|
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
|
||||||
String content = IOUtils.toString(httpResponse.getEntity().getContent(),
|
String content = EntityUtils.toString(httpResponse.getEntity(), charset);
|
||||||
charset);
|
|
||||||
Page page = new Page();
|
Page page = new Page();
|
||||||
page.setHtml(new Html(UrlUtils.fixAllRelativeHrefs(content, request.getUrl())));
|
page.setHtml(new Html(UrlUtils.fixAllRelativeHrefs(content, request.getUrl())));
|
||||||
page.setUrl(new PlainText(request.getUrl()));
|
page.setUrl(new PlainText(request.getUrl()));
|
||||||
|
@ -163,20 +153,5 @@ public class HttpClientDownloader implements Downloader {
|
||||||
@Override
|
@Override
|
||||||
public void setThread(int thread) {
|
public void setThread(int thread) {
|
||||||
poolSize = thread;
|
poolSize = thread;
|
||||||
httpClientPool = new HttpClientPool(thread);
|
|
||||||
}
|
|
||||||
|
|
||||||
private void handleGzip(HttpResponse httpResponse) {
|
|
||||||
Header ceheader = httpResponse.getEntity().getContentEncoding();
|
|
||||||
if (ceheader != null) {
|
|
||||||
HeaderElement[] codecs = ceheader.getElements();
|
|
||||||
for (HeaderElement codec : codecs) {
|
|
||||||
if (codec.getName().equalsIgnoreCase("gzip")) {
|
|
||||||
//todo bugfix
|
|
||||||
httpResponse.setEntity(
|
|
||||||
new GzipDecompressingEntity(httpResponse.getEntity()));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,72 +1,85 @@
|
||||||
package us.codecraft.webmagic.downloader;
|
package us.codecraft.webmagic.downloader;
|
||||||
|
|
||||||
import org.apache.http.HttpVersion;
|
import org.apache.http.*;
|
||||||
import org.apache.http.client.CookieStore;
|
import org.apache.http.client.CookieStore;
|
||||||
import org.apache.http.client.HttpClient;
|
import org.apache.http.client.entity.GzipDecompressingEntity;
|
||||||
import org.apache.http.client.params.ClientPNames;
|
import org.apache.http.config.Registry;
|
||||||
import org.apache.http.client.params.CookiePolicy;
|
import org.apache.http.config.RegistryBuilder;
|
||||||
import org.apache.http.conn.scheme.PlainSocketFactory;
|
import org.apache.http.conn.socket.ConnectionSocketFactory;
|
||||||
import org.apache.http.conn.scheme.Scheme;
|
import org.apache.http.conn.socket.PlainConnectionSocketFactory;
|
||||||
import org.apache.http.conn.scheme.SchemeRegistry;
|
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
|
||||||
import org.apache.http.conn.ssl.SSLSocketFactory;
|
import org.apache.http.impl.client.*;
|
||||||
import org.apache.http.impl.client.BasicCookieStore;
|
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
|
||||||
import org.apache.http.impl.client.DefaultHttpClient;
|
|
||||||
import org.apache.http.impl.conn.PoolingClientConnectionManager;
|
|
||||||
import org.apache.http.impl.cookie.BasicClientCookie;
|
import org.apache.http.impl.cookie.BasicClientCookie;
|
||||||
import org.apache.http.params.*;
|
import org.apache.http.protocol.HttpContext;
|
||||||
import us.codecraft.webmagic.Site;
|
import us.codecraft.webmagic.Site;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
* @since 0.1.0
|
* @since 0.3.3
|
||||||
*/
|
*/
|
||||||
public class HttpClientPool {
|
public class HttpClientPool {
|
||||||
|
|
||||||
private int poolSize;
|
private PoolingHttpClientConnectionManager connectionManager;
|
||||||
|
|
||||||
private PoolingClientConnectionManager connectionManager;
|
|
||||||
|
|
||||||
public HttpClientPool(int poolSize) {
|
public HttpClientPool(int poolSize) {
|
||||||
this.poolSize = poolSize;
|
Registry<ConnectionSocketFactory> reg = RegistryBuilder.<ConnectionSocketFactory>create()
|
||||||
SchemeRegistry schemeRegistry = new SchemeRegistry();
|
.register("http", PlainConnectionSocketFactory.INSTANCE)
|
||||||
schemeRegistry.register(new Scheme("http", 80, PlainSocketFactory.getSocketFactory()));
|
.register("https", SSLConnectionSocketFactory.getSocketFactory())
|
||||||
schemeRegistry.register(new Scheme("https", 443, SSLSocketFactory.getSocketFactory()));
|
.build();
|
||||||
|
PoolingHttpClientConnectionManager connectionManager = new PoolingHttpClientConnectionManager(reg);
|
||||||
connectionManager = new PoolingClientConnectionManager(schemeRegistry);
|
|
||||||
connectionManager.setMaxTotal(poolSize);
|
connectionManager.setMaxTotal(poolSize);
|
||||||
connectionManager.setDefaultMaxPerRoute(100);
|
connectionManager.setDefaultMaxPerRoute(100);
|
||||||
}
|
}
|
||||||
|
|
||||||
public HttpClient getClient(Site site) {
|
public CloseableHttpClient getClient(Site site) {
|
||||||
return generateClient(site);
|
return generateClient(site);
|
||||||
}
|
}
|
||||||
|
|
||||||
private HttpClient generateClient(Site site) {
|
private CloseableHttpClient generateClient(Site site) {
|
||||||
HttpParams params = new BasicHttpParams();
|
HttpClientBuilder httpClientBuilder = HttpClients.custom().setConnectionManager(connectionManager);
|
||||||
if (site != null && site.getUserAgent() != null) {
|
if (site != null && site.getUserAgent() != null) {
|
||||||
params.setParameter(CoreProtocolPNames.USER_AGENT, site.getUserAgent());
|
httpClientBuilder.setUserAgent(site.getUserAgent());
|
||||||
params.setIntParameter(CoreConnectionPNames.SO_TIMEOUT, site.getTimeOut());
|
|
||||||
params.setIntParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, site.getTimeOut());
|
|
||||||
} else {
|
} else {
|
||||||
params.setIntParameter(CoreConnectionPNames.SO_TIMEOUT, 3000);
|
httpClientBuilder.setUserAgent("");
|
||||||
params.setIntParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, 3000);
|
|
||||||
}
|
}
|
||||||
|
httpClientBuilder.addInterceptorFirst(new HttpRequestInterceptor() {
|
||||||
|
|
||||||
params.setParameter(ClientPNames.COOKIE_POLICY, CookiePolicy.BEST_MATCH);
|
public void process(
|
||||||
HttpProtocolParamBean paramsBean = new HttpProtocolParamBean(params);
|
final HttpRequest request,
|
||||||
paramsBean.setVersion(HttpVersion.HTTP_1_1);
|
final HttpContext context) throws HttpException, IOException {
|
||||||
if (site != null && site.getCharset() != null) {
|
if (!request.containsHeader("Accept-Encoding")) {
|
||||||
paramsBean.setContentCharset(site.getCharset());
|
request.addHeader("Accept-Encoding", "gzip");
|
||||||
}
|
}
|
||||||
paramsBean.setUseExpectContinue(false);
|
|
||||||
|
|
||||||
DefaultHttpClient httpClient = new DefaultHttpClient(connectionManager, params);
|
}
|
||||||
if (site != null) {
|
}).addInterceptorFirst(new HttpResponseInterceptor() {
|
||||||
generateCookie(httpClient, site);
|
|
||||||
}
|
public void process(
|
||||||
return httpClient;
|
final HttpResponse response,
|
||||||
|
final HttpContext context) throws HttpException, IOException {
|
||||||
|
HttpEntity entity = response.getEntity();
|
||||||
|
if (entity != null) {
|
||||||
|
Header ceheader = entity.getContentEncoding();
|
||||||
|
if (ceheader != null) {
|
||||||
|
HeaderElement[] codecs = ceheader.getElements();
|
||||||
|
for (int i = 0; i < codecs.length; i++) {
|
||||||
|
if (codecs[i].getName().equalsIgnoreCase("gzip")) {
|
||||||
|
response.setEntity(
|
||||||
|
new GzipDecompressingEntity(response.getEntity()));
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
});
|
||||||
|
httpClientBuilder.setRetryHandler(new DefaultHttpRequestRetryHandler(site.getRetryTimes(),true));
|
||||||
|
return httpClientBuilder.build();
|
||||||
}
|
}
|
||||||
|
|
||||||
private void generateCookie(DefaultHttpClient httpClient, Site site) {
|
private void generateCookie(DefaultHttpClient httpClient, Site site) {
|
||||||
|
|
Loading…
Reference in New Issue