diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java index b5f8865..33e9b8f 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java @@ -1,5 +1,6 @@ package us.codecraft.webmagic; +import org.apache.http.HttpHost; import us.codecraft.webmagic.utils.UrlUtils; import java.util.*; @@ -40,6 +41,8 @@ public class Site { private Map headers = new HashMap(); + private HttpHost httpProxy; + public static interface HeaderConst { public static final String REFERER = "Referer"; @@ -295,6 +298,20 @@ public class Site { return this; } + public HttpHost getHttpProxy() { + return httpProxy; + } + + /** + * set up httpProxy for this site + * @param httpProxy + * @return + */ + public Site setHttpProxy(HttpHost httpProxy) { + this.httpProxy = httpProxy; + return this; + } + public Task toTask() { return new Task() { @Override diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index ce4f8cb..2da585f 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -1,9 +1,11 @@ package us.codecraft.webmagic.downloader; +import com.google.common.collect.Sets; import org.apache.http.HttpResponse; import org.apache.http.annotation.ThreadSafe; +import org.apache.http.client.config.RequestConfig; import org.apache.http.client.methods.CloseableHttpResponse; -import org.apache.http.client.methods.HttpGet; +import org.apache.http.client.methods.RequestBuilder; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.util.EntityUtils; import org.apache.log4j.Logger; @@ -16,7 +18,7 @@ import us.codecraft.webmagic.selector.PlainText; import us.codecraft.webmagic.utils.UrlUtils; import java.io.IOException; -import java.util.HashSet; +import java.util.HashMap; import java.util.Map; import java.util.Set; @@ -32,7 +34,7 @@ public class HttpClientDownloader implements Downloader { private Logger logger = Logger.getLogger(getClass()); - private volatile CloseableHttpClient httpClient; + private final Map httpClients = new HashMap(); private int poolSize = 1; @@ -59,10 +61,16 @@ public class HttpClientDownloader implements Downloader { } private CloseableHttpClient getHttpClient(Site site) { + if (site == null) { + return new HttpClientPool(poolSize).getClient(null); + } + String domain = site.getDomain(); + CloseableHttpClient httpClient = httpClients.get(domain); if (httpClient == null) { synchronized (this) { if (httpClient == null) { httpClient = new HttpClientPool(poolSize).getClient(site); + httpClients.put(domain, httpClient); } } } @@ -83,19 +91,25 @@ public class HttpClientDownloader implements Downloader { charset = site.getCharset(); headers = site.getHeaders(); } else { - acceptStatCode = new HashSet(); - acceptStatCode.add(200); + acceptStatCode = Sets.newHashSet(200); } logger.info("downloading page " + request.getUrl()); - HttpGet httpGet = new HttpGet(request.getUrl()); + RequestBuilder requestBuilder = RequestBuilder.get().setUri(request.getUrl()); if (headers != null) { for (Map.Entry headerEntry : headers.entrySet()) { - httpGet.addHeader(headerEntry.getKey(), headerEntry.getValue()); + requestBuilder.addHeader(headerEntry.getKey(), headerEntry.getValue()); } } + RequestConfig.Builder requestConfigBuilder = RequestConfig.custom() + .setConnectionRequestTimeout(site.getTimeOut()) + .setConnectTimeout(site.getTimeOut()); + if (site.getHttpProxy()!=null){ + requestConfigBuilder.setProxy(site.getHttpProxy()); + } + requestBuilder.setConfig(requestConfigBuilder.build()); CloseableHttpResponse httpResponse = null; try { - httpResponse = getHttpClient(site).execute(httpGet); + httpResponse = getHttpClient(site).execute(requestBuilder.build()); int statusCode = httpResponse.getStatusLine().getStatusCode(); if (acceptStatCode.contains(statusCode)) { //charset diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java index 43ee94d..62d8718 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java @@ -78,7 +78,9 @@ public class HttpClientPool { } }); - httpClientBuilder.setRetryHandler(new DefaultHttpRequestRetryHandler(site.getRetryTimes(),true)); + if (site!=null){ + httpClientBuilder.setRetryHandler(new DefaultHttpRequestRetryHandler(site.getRetryTimes(),true)); + } return httpClientBuilder.build(); } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java index a7f51ad..d6b9c9d 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java @@ -1,10 +1,12 @@ package us.codecraft.webmagic.model.samples; +import org.apache.http.HttpHost; import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Task; import us.codecraft.webmagic.model.OOSpider; +import us.codecraft.webmagic.model.PageModelPipeline; import us.codecraft.webmagic.model.annotation.ExtractBy; import us.codecraft.webmagic.model.annotation.TargetUrl; -import us.codecraft.webmagic.pipeline.JsonFilePageModelPipeline; import java.util.List; @@ -24,8 +26,13 @@ public class OschinaBlog{ private List tags; public static void main(String[] args) { - OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog") - ,new JsonFilePageModelPipeline(), OschinaBlog.class).run(); + OOSpider.create(Site.me().setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36").addStartUrl("http://my.oschina.net/flashsword/blog").setSleepTime(0).setHttpProxy(new HttpHost("127.0.0.1",8888)) + ,new PageModelPipeline() { + @Override + public void process(Object o, Task task) { + + } + }, OschinaBlog.class).thread(10).run(); } public String getTitle() {