Some refactoring in HttpClientDownloader
parent
4a035e729a
commit
ec446277b1
|
@ -7,6 +7,7 @@ import org.apache.http.annotation.ThreadSafe;
|
||||||
import org.apache.http.client.config.CookieSpecs;
|
import org.apache.http.client.config.CookieSpecs;
|
||||||
import org.apache.http.client.config.RequestConfig;
|
import org.apache.http.client.config.RequestConfig;
|
||||||
import org.apache.http.client.methods.CloseableHttpResponse;
|
import org.apache.http.client.methods.CloseableHttpResponse;
|
||||||
|
import org.apache.http.client.methods.HttpUriRequest;
|
||||||
import org.apache.http.client.methods.RequestBuilder;
|
import org.apache.http.client.methods.RequestBuilder;
|
||||||
import org.apache.http.impl.client.CloseableHttpClient;
|
import org.apache.http.impl.client.CloseableHttpClient;
|
||||||
import org.apache.http.util.EntityUtils;
|
import org.apache.http.util.EntityUtils;
|
||||||
|
@ -75,26 +76,12 @@ public class HttpClientDownloader extends AbstractDownloader {
|
||||||
acceptStatCode = Sets.newHashSet(200);
|
acceptStatCode = Sets.newHashSet(200);
|
||||||
}
|
}
|
||||||
logger.info("downloading page {}" , request.getUrl());
|
logger.info("downloading page {}" , request.getUrl());
|
||||||
RequestBuilder requestBuilder = RequestBuilder.get().setUri(request.getUrl());
|
|
||||||
if (headers != null) {
|
|
||||||
for (Map.Entry<String, String> headerEntry : headers.entrySet()) {
|
|
||||||
requestBuilder.addHeader(headerEntry.getKey(), headerEntry.getValue());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
RequestConfig.Builder requestConfigBuilder = RequestConfig.custom()
|
|
||||||
.setConnectionRequestTimeout(site.getTimeOut())
|
|
||||||
.setSocketTimeout(site.getTimeOut())
|
|
||||||
.setConnectTimeout(site.getTimeOut())
|
|
||||||
.setCookieSpec(CookieSpecs.BEST_MATCH);
|
|
||||||
if (site != null && site.getHttpProxy() != null) {
|
|
||||||
requestConfigBuilder.setProxy(site.getHttpProxy());
|
|
||||||
}
|
|
||||||
requestBuilder.setConfig(requestConfigBuilder.build());
|
|
||||||
CloseableHttpResponse httpResponse = null;
|
CloseableHttpResponse httpResponse = null;
|
||||||
try {
|
try {
|
||||||
httpResponse = getHttpClient(site).execute(requestBuilder.build());
|
HttpUriRequest httpUriRequest = getHttpUriRequest(request, site, headers);
|
||||||
|
httpResponse = getHttpClient(site).execute(httpUriRequest);
|
||||||
int statusCode = httpResponse.getStatusLine().getStatusCode();
|
int statusCode = httpResponse.getStatusLine().getStatusCode();
|
||||||
if (acceptStatCode.contains(statusCode)) {
|
if (statusAccept(acceptStatCode, statusCode)) {
|
||||||
//charset
|
//charset
|
||||||
if (charset == null) {
|
if (charset == null) {
|
||||||
String value = httpResponse.getEntity().getContentType().getValue();
|
String value = httpResponse.getEntity().getContentType().getValue();
|
||||||
|
@ -123,6 +110,34 @@ public class HttpClientDownloader extends AbstractDownloader {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void setThread(int thread) {
|
||||||
|
httpClientGenerator.setPoolSize(thread);
|
||||||
|
}
|
||||||
|
|
||||||
|
protected boolean statusAccept(Set<Integer> acceptStatCode, int statusCode) {
|
||||||
|
return acceptStatCode.contains(statusCode);
|
||||||
|
}
|
||||||
|
|
||||||
|
protected HttpUriRequest getHttpUriRequest(Request request, Site site, Map<String, String> headers) {
|
||||||
|
RequestBuilder requestBuilder = RequestBuilder.get().setUri(request.getUrl());
|
||||||
|
if (headers != null) {
|
||||||
|
for (Map.Entry<String, String> headerEntry : headers.entrySet()) {
|
||||||
|
requestBuilder.addHeader(headerEntry.getKey(), headerEntry.getValue());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
RequestConfig.Builder requestConfigBuilder = RequestConfig.custom()
|
||||||
|
.setConnectionRequestTimeout(site.getTimeOut())
|
||||||
|
.setSocketTimeout(site.getTimeOut())
|
||||||
|
.setConnectTimeout(site.getTimeOut())
|
||||||
|
.setCookieSpec(CookieSpecs.BEST_MATCH);
|
||||||
|
if (site != null && site.getHttpProxy() != null) {
|
||||||
|
requestConfigBuilder.setProxy(site.getHttpProxy());
|
||||||
|
}
|
||||||
|
requestBuilder.setConfig(requestConfigBuilder.build());
|
||||||
|
return requestBuilder.build();
|
||||||
|
}
|
||||||
|
|
||||||
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
|
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
|
||||||
String content = IOUtils.toString(httpResponse.getEntity().getContent(), charset);
|
String content = IOUtils.toString(httpResponse.getEntity().getContent(), charset);
|
||||||
Page page = new Page();
|
Page page = new Page();
|
||||||
|
@ -132,9 +147,4 @@ public class HttpClientDownloader extends AbstractDownloader {
|
||||||
page.setStatusCode(httpResponse.getStatusLine().getStatusCode());
|
page.setStatusCode(httpResponse.getStatusLine().getStatusCode());
|
||||||
return page;
|
return page;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
|
||||||
public void setThread(int thread) {
|
|
||||||
httpClientGenerator.setPoolSize(thread);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -3,9 +3,11 @@ package us.codecraft.webmagic.model.samples;
|
||||||
import us.codecraft.webmagic.Site;
|
import us.codecraft.webmagic.Site;
|
||||||
import us.codecraft.webmagic.model.HasKey;
|
import us.codecraft.webmagic.model.HasKey;
|
||||||
import us.codecraft.webmagic.model.OOSpider;
|
import us.codecraft.webmagic.model.OOSpider;
|
||||||
import us.codecraft.webmagic.model.annotation.*;
|
import us.codecraft.webmagic.model.annotation.ExtractBy;
|
||||||
|
import us.codecraft.webmagic.model.annotation.ExtractByUrl;
|
||||||
|
import us.codecraft.webmagic.model.annotation.HelpUrl;
|
||||||
|
import us.codecraft.webmagic.model.annotation.TargetUrl;
|
||||||
import us.codecraft.webmagic.pipeline.JsonFilePageModelPipeline;
|
import us.codecraft.webmagic.pipeline.JsonFilePageModelPipeline;
|
||||||
import us.codecraft.webmagic.samples.formatter.StringTemplateFormatter;
|
|
||||||
import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler;
|
import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler;
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
@ -20,7 +22,6 @@ public class GithubRepo implements HasKey {
|
||||||
@ExtractBy(value = "//h1[@class='entry-title public']/strong/a/text()", notNull = true)
|
@ExtractBy(value = "//h1[@class='entry-title public']/strong/a/text()", notNull = true)
|
||||||
private String name;
|
private String name;
|
||||||
|
|
||||||
@Formatter(value = "author%s",formatter = StringTemplateFormatter.class)
|
|
||||||
@ExtractByUrl("https://github\\.com/(\\w+)/.*")
|
@ExtractByUrl("https://github\\.com/(\\w+)/.*")
|
||||||
private String author;
|
private String author;
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue