update httpclient to 4.3.1
parent
160a149b05
commit
edfc319c45
2
pom.xml
2
pom.xml
|
@ -61,7 +61,7 @@
|
|||
<dependency>
|
||||
<groupId>org.apache.httpcomponents</groupId>
|
||||
<artifactId>httpclient</artifactId>
|
||||
<version>4.2.4</version>
|
||||
<version>4.3.1</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.google.guava</groupId>
|
||||
|
|
|
@ -1,13 +1,11 @@
|
|||
package us.codecraft.webmagic.downloader;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.http.Header;
|
||||
import org.apache.http.HeaderElement;
|
||||
import org.apache.http.HttpResponse;
|
||||
import org.apache.http.annotation.ThreadSafe;
|
||||
import org.apache.http.client.HttpClient;
|
||||
import org.apache.http.client.entity.GzipDecompressingEntity;
|
||||
import org.apache.http.client.methods.CloseableHttpResponse;
|
||||
import org.apache.http.client.methods.HttpGet;
|
||||
import org.apache.http.impl.client.CloseableHttpClient;
|
||||
import org.apache.http.util.EntityUtils;
|
||||
import org.apache.log4j.Logger;
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.Request;
|
||||
|
@ -34,7 +32,7 @@ public class HttpClientDownloader implements Downloader {
|
|||
|
||||
private Logger logger = Logger.getLogger(getClass());
|
||||
|
||||
private HttpClientPool httpClientPool;
|
||||
private volatile CloseableHttpClient httpClient;
|
||||
|
||||
private int poolSize = 1;
|
||||
|
||||
|
@ -60,11 +58,15 @@ public class HttpClientDownloader implements Downloader {
|
|||
return (Html) page.getHtml();
|
||||
}
|
||||
|
||||
private HttpClientPool getHttpClientPool(){
|
||||
if (httpClientPool==null){
|
||||
httpClientPool = new HttpClientPool(poolSize);
|
||||
private CloseableHttpClient getHttpClient(Site site) {
|
||||
if (httpClient == null) {
|
||||
synchronized (this) {
|
||||
if (httpClient == null) {
|
||||
httpClient = new HttpClientPool(poolSize).getClient(site);
|
||||
}
|
||||
}
|
||||
}
|
||||
return httpClientPool;
|
||||
return httpClient;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -73,12 +75,10 @@ public class HttpClientDownloader implements Downloader {
|
|||
if (task != null) {
|
||||
site = task.getSite();
|
||||
}
|
||||
int retryTimes = 0;
|
||||
Set<Integer> acceptStatCode;
|
||||
String charset = null;
|
||||
Map<String,String> headers = null;
|
||||
Map<String, String> headers = null;
|
||||
if (site != null) {
|
||||
retryTimes = site.getRetryTimes();
|
||||
acceptStatCode = site.getAcceptStatCode();
|
||||
charset = site.getCharset();
|
||||
headers = site.getHeaders();
|
||||
|
@ -87,54 +87,17 @@ public class HttpClientDownloader implements Downloader {
|
|||
acceptStatCode.add(200);
|
||||
}
|
||||
logger.info("downloading page " + request.getUrl());
|
||||
HttpClient httpClient = getHttpClientPool().getClient(site);
|
||||
HttpGet httpGet = new HttpGet(request.getUrl());
|
||||
if (headers != null) {
|
||||
for (Map.Entry<String, String> headerEntry : headers.entrySet()) {
|
||||
httpGet.addHeader(headerEntry.getKey(), headerEntry.getValue());
|
||||
}
|
||||
}
|
||||
CloseableHttpResponse httpResponse = null;
|
||||
try {
|
||||
HttpGet httpGet = new HttpGet(request.getUrl());
|
||||
|
||||
if (headers!=null){
|
||||
for (Map.Entry<String, String> headerEntry : headers.entrySet()) {
|
||||
httpGet.addHeader(headerEntry.getKey(),headerEntry.getValue());
|
||||
}
|
||||
}
|
||||
if (!httpGet.containsHeader("Accept-Encoding")) {
|
||||
httpGet.addHeader("Accept-Encoding", "gzip");
|
||||
}
|
||||
HttpResponse httpResponse = null;
|
||||
int tried = 0;
|
||||
boolean retry;
|
||||
do {
|
||||
try {
|
||||
httpResponse = httpClient.execute(httpGet);
|
||||
retry = false;
|
||||
} catch (IOException e) {
|
||||
tried++;
|
||||
|
||||
if (tried > retryTimes) {
|
||||
logger.warn("download page " + request.getUrl() + " error", e);
|
||||
if (site.getCycleRetryTimes() > 0) {
|
||||
Page page = new Page();
|
||||
Object cycleTriedTimesObject = request.getExtra(Request.CYCLE_TRIED_TIMES);
|
||||
if (cycleTriedTimesObject == null) {
|
||||
page.addTargetRequest(request.setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, 1));
|
||||
} else {
|
||||
int cycleTriedTimes = (Integer) cycleTriedTimesObject;
|
||||
cycleTriedTimes++;
|
||||
if (cycleTriedTimes >= site.getCycleRetryTimes()) {
|
||||
return null;
|
||||
}
|
||||
page.addTargetRequest(request.setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, 1));
|
||||
}
|
||||
return page;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
logger.info("download page " + request.getUrl() + " error, retry the " + tried + " time!");
|
||||
retry = true;
|
||||
}
|
||||
} while (retry);
|
||||
httpResponse = getHttpClient(site).execute(httpGet);
|
||||
int statusCode = httpResponse.getStatusLine().getStatusCode();
|
||||
if (acceptStatCode.contains(statusCode)) {
|
||||
handleGzip(httpResponse);
|
||||
//charset
|
||||
if (charset == null) {
|
||||
String value = httpResponse.getEntity().getContentType().getValue();
|
||||
|
@ -143,16 +106,43 @@ public class HttpClientDownloader implements Downloader {
|
|||
return handleResponse(request, charset, httpResponse, task);
|
||||
} else {
|
||||
logger.warn("code error " + statusCode + "\t" + request.getUrl());
|
||||
return null;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
} catch (IOException e) {
|
||||
logger.warn("download page " + request.getUrl() + " error", e);
|
||||
if (site.getCycleRetryTimes() > 0) {
|
||||
return addToCycleRetry(request, site);
|
||||
}
|
||||
return null;
|
||||
} finally {
|
||||
try {
|
||||
if (httpResponse != null) {
|
||||
httpResponse.close();
|
||||
}
|
||||
} catch (IOException e) {
|
||||
logger.warn("close response fail", e);
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
private Page addToCycleRetry(Request request, Site site) {
|
||||
Page page = new Page();
|
||||
Object cycleTriedTimesObject = request.getExtra(Request.CYCLE_TRIED_TIMES);
|
||||
if (cycleTriedTimesObject == null) {
|
||||
page.addTargetRequest(request.setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, 1));
|
||||
} else {
|
||||
int cycleTriedTimes = (Integer) cycleTriedTimesObject;
|
||||
cycleTriedTimes++;
|
||||
if (cycleTriedTimes >= site.getCycleRetryTimes()) {
|
||||
return null;
|
||||
}
|
||||
page.addTargetRequest(request.setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, 1));
|
||||
}
|
||||
return page;
|
||||
}
|
||||
|
||||
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
|
||||
String content = IOUtils.toString(httpResponse.getEntity().getContent(),
|
||||
charset);
|
||||
String content = EntityUtils.toString(httpResponse.getEntity(), charset);
|
||||
Page page = new Page();
|
||||
page.setHtml(new Html(UrlUtils.fixAllRelativeHrefs(content, request.getUrl())));
|
||||
page.setUrl(new PlainText(request.getUrl()));
|
||||
|
@ -163,20 +153,5 @@ public class HttpClientDownloader implements Downloader {
|
|||
@Override
|
||||
public void setThread(int thread) {
|
||||
poolSize = thread;
|
||||
httpClientPool = new HttpClientPool(thread);
|
||||
}
|
||||
|
||||
private void handleGzip(HttpResponse httpResponse) {
|
||||
Header ceheader = httpResponse.getEntity().getContentEncoding();
|
||||
if (ceheader != null) {
|
||||
HeaderElement[] codecs = ceheader.getElements();
|
||||
for (HeaderElement codec : codecs) {
|
||||
if (codec.getName().equalsIgnoreCase("gzip")) {
|
||||
//todo bugfix
|
||||
httpResponse.setEntity(
|
||||
new GzipDecompressingEntity(httpResponse.getEntity()));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,72 +1,85 @@
|
|||
package us.codecraft.webmagic.downloader;
|
||||
|
||||
import org.apache.http.HttpVersion;
|
||||
import org.apache.http.*;
|
||||
import org.apache.http.client.CookieStore;
|
||||
import org.apache.http.client.HttpClient;
|
||||
import org.apache.http.client.params.ClientPNames;
|
||||
import org.apache.http.client.params.CookiePolicy;
|
||||
import org.apache.http.conn.scheme.PlainSocketFactory;
|
||||
import org.apache.http.conn.scheme.Scheme;
|
||||
import org.apache.http.conn.scheme.SchemeRegistry;
|
||||
import org.apache.http.conn.ssl.SSLSocketFactory;
|
||||
import org.apache.http.impl.client.BasicCookieStore;
|
||||
import org.apache.http.impl.client.DefaultHttpClient;
|
||||
import org.apache.http.impl.conn.PoolingClientConnectionManager;
|
||||
import org.apache.http.client.entity.GzipDecompressingEntity;
|
||||
import org.apache.http.config.Registry;
|
||||
import org.apache.http.config.RegistryBuilder;
|
||||
import org.apache.http.conn.socket.ConnectionSocketFactory;
|
||||
import org.apache.http.conn.socket.PlainConnectionSocketFactory;
|
||||
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
|
||||
import org.apache.http.impl.client.*;
|
||||
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
|
||||
import org.apache.http.impl.cookie.BasicClientCookie;
|
||||
import org.apache.http.params.*;
|
||||
import org.apache.http.protocol.HttpContext;
|
||||
import us.codecraft.webmagic.Site;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* @since 0.1.0
|
||||
* @since 0.3.3
|
||||
*/
|
||||
public class HttpClientPool {
|
||||
|
||||
private int poolSize;
|
||||
|
||||
private PoolingClientConnectionManager connectionManager;
|
||||
private PoolingHttpClientConnectionManager connectionManager;
|
||||
|
||||
public HttpClientPool(int poolSize) {
|
||||
this.poolSize = poolSize;
|
||||
SchemeRegistry schemeRegistry = new SchemeRegistry();
|
||||
schemeRegistry.register(new Scheme("http", 80, PlainSocketFactory.getSocketFactory()));
|
||||
schemeRegistry.register(new Scheme("https", 443, SSLSocketFactory.getSocketFactory()));
|
||||
|
||||
connectionManager = new PoolingClientConnectionManager(schemeRegistry);
|
||||
Registry<ConnectionSocketFactory> reg = RegistryBuilder.<ConnectionSocketFactory>create()
|
||||
.register("http", PlainConnectionSocketFactory.INSTANCE)
|
||||
.register("https", SSLConnectionSocketFactory.getSocketFactory())
|
||||
.build();
|
||||
PoolingHttpClientConnectionManager connectionManager = new PoolingHttpClientConnectionManager(reg);
|
||||
connectionManager.setMaxTotal(poolSize);
|
||||
connectionManager.setDefaultMaxPerRoute(100);
|
||||
}
|
||||
|
||||
public HttpClient getClient(Site site) {
|
||||
public CloseableHttpClient getClient(Site site) {
|
||||
return generateClient(site);
|
||||
}
|
||||
|
||||
private HttpClient generateClient(Site site) {
|
||||
HttpParams params = new BasicHttpParams();
|
||||
private CloseableHttpClient generateClient(Site site) {
|
||||
HttpClientBuilder httpClientBuilder = HttpClients.custom().setConnectionManager(connectionManager);
|
||||
if (site != null && site.getUserAgent() != null) {
|
||||
params.setParameter(CoreProtocolPNames.USER_AGENT, site.getUserAgent());
|
||||
params.setIntParameter(CoreConnectionPNames.SO_TIMEOUT, site.getTimeOut());
|
||||
params.setIntParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, site.getTimeOut());
|
||||
httpClientBuilder.setUserAgent(site.getUserAgent());
|
||||
} else {
|
||||
params.setIntParameter(CoreConnectionPNames.SO_TIMEOUT, 3000);
|
||||
params.setIntParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, 3000);
|
||||
httpClientBuilder.setUserAgent("");
|
||||
}
|
||||
httpClientBuilder.addInterceptorFirst(new HttpRequestInterceptor() {
|
||||
|
||||
params.setParameter(ClientPNames.COOKIE_POLICY, CookiePolicy.BEST_MATCH);
|
||||
HttpProtocolParamBean paramsBean = new HttpProtocolParamBean(params);
|
||||
paramsBean.setVersion(HttpVersion.HTTP_1_1);
|
||||
if (site != null && site.getCharset() != null) {
|
||||
paramsBean.setContentCharset(site.getCharset());
|
||||
}
|
||||
paramsBean.setUseExpectContinue(false);
|
||||
public void process(
|
||||
final HttpRequest request,
|
||||
final HttpContext context) throws HttpException, IOException {
|
||||
if (!request.containsHeader("Accept-Encoding")) {
|
||||
request.addHeader("Accept-Encoding", "gzip");
|
||||
}
|
||||
|
||||
DefaultHttpClient httpClient = new DefaultHttpClient(connectionManager, params);
|
||||
if (site != null) {
|
||||
generateCookie(httpClient, site);
|
||||
}
|
||||
return httpClient;
|
||||
}
|
||||
}).addInterceptorFirst(new HttpResponseInterceptor() {
|
||||
|
||||
public void process(
|
||||
final HttpResponse response,
|
||||
final HttpContext context) throws HttpException, IOException {
|
||||
HttpEntity entity = response.getEntity();
|
||||
if (entity != null) {
|
||||
Header ceheader = entity.getContentEncoding();
|
||||
if (ceheader != null) {
|
||||
HeaderElement[] codecs = ceheader.getElements();
|
||||
for (int i = 0; i < codecs.length; i++) {
|
||||
if (codecs[i].getName().equalsIgnoreCase("gzip")) {
|
||||
response.setEntity(
|
||||
new GzipDecompressingEntity(response.getEntity()));
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
});
|
||||
httpClientBuilder.setRetryHandler(new DefaultHttpRequestRetryHandler(site.getRetryTimes(),true));
|
||||
return httpClientBuilder.build();
|
||||
}
|
||||
|
||||
private void generateCookie(DefaultHttpClient httpClient, Site site) {
|
||||
|
|
Loading…
Reference in New Issue