parent
edfc319c45
commit
09153ff715
|
@ -1,5 +1,6 @@
|
|||
package us.codecraft.webmagic;
|
||||
|
||||
import org.apache.http.HttpHost;
|
||||
import us.codecraft.webmagic.utils.UrlUtils;
|
||||
|
||||
import java.util.*;
|
||||
|
@ -40,6 +41,8 @@ public class Site {
|
|||
|
||||
private Map<String, String> headers = new HashMap<String, String>();
|
||||
|
||||
private HttpHost httpProxy;
|
||||
|
||||
public static interface HeaderConst {
|
||||
|
||||
public static final String REFERER = "Referer";
|
||||
|
@ -295,6 +298,20 @@ public class Site {
|
|||
return this;
|
||||
}
|
||||
|
||||
/**
 * Returns the HTTP proxy stored on this site.
 *
 * @return the current value of the {@code httpProxy} field; {@code null} when no proxy has been set
 */
public HttpHost getHttpProxy() {
|
||||
return httpProxy;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the HTTP proxy for this site.
|
||||
* @param httpProxy the proxy host to store; assigned as-is, without validation (may be {@code null})
|
||||
* @return this {@code Site} instance, so that setter calls can be chained
|
||||
*/
|
||||
public Site setHttpProxy(HttpHost httpProxy) {
|
||||
this.httpProxy = httpProxy;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Task toTask() {
|
||||
return new Task() {
|
||||
@Override
|
||||
|
|
|
@ -1,9 +1,11 @@
|
|||
package us.codecraft.webmagic.downloader;
|
||||
|
||||
import com.google.common.collect.Sets;
|
||||
import org.apache.http.HttpResponse;
|
||||
import org.apache.http.annotation.ThreadSafe;
|
||||
import org.apache.http.client.config.RequestConfig;
|
||||
import org.apache.http.client.methods.CloseableHttpResponse;
|
||||
import org.apache.http.client.methods.HttpGet;
|
||||
import org.apache.http.client.methods.RequestBuilder;
|
||||
import org.apache.http.impl.client.CloseableHttpClient;
|
||||
import org.apache.http.util.EntityUtils;
|
||||
import org.apache.log4j.Logger;
|
||||
|
@ -16,7 +18,7 @@ import us.codecraft.webmagic.selector.PlainText;
|
|||
import us.codecraft.webmagic.utils.UrlUtils;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashSet;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
|
@ -32,7 +34,7 @@ public class HttpClientDownloader implements Downloader {
|
|||
|
||||
private Logger logger = Logger.getLogger(getClass());
|
||||
|
||||
private volatile CloseableHttpClient httpClient;
|
||||
private final Map<String, CloseableHttpClient> httpClients = new HashMap<String, CloseableHttpClient>();
|
||||
|
||||
private int poolSize = 1;
|
||||
|
||||
|
@ -59,10 +61,16 @@ public class HttpClientDownloader implements Downloader {
|
|||
}
|
||||
|
||||
private CloseableHttpClient getHttpClient(Site site) {
|
||||
if (site == null) {
|
||||
return new HttpClientPool(poolSize).getClient(null);
|
||||
}
|
||||
String domain = site.getDomain();
|
||||
CloseableHttpClient httpClient = httpClients.get(domain);
|
||||
if (httpClient == null) {
|
||||
synchronized (this) {
|
||||
if (httpClient == null) {
|
||||
httpClient = new HttpClientPool(poolSize).getClient(site);
|
||||
httpClients.put(domain, httpClient);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -83,19 +91,25 @@ public class HttpClientDownloader implements Downloader {
|
|||
charset = site.getCharset();
|
||||
headers = site.getHeaders();
|
||||
} else {
|
||||
acceptStatCode = new HashSet<Integer>();
|
||||
acceptStatCode.add(200);
|
||||
acceptStatCode = Sets.newHashSet(200);
|
||||
}
|
||||
logger.info("downloading page " + request.getUrl());
|
||||
HttpGet httpGet = new HttpGet(request.getUrl());
|
||||
RequestBuilder requestBuilder = RequestBuilder.get().setUri(request.getUrl());
|
||||
if (headers != null) {
|
||||
for (Map.Entry<String, String> headerEntry : headers.entrySet()) {
|
||||
httpGet.addHeader(headerEntry.getKey(), headerEntry.getValue());
|
||||
requestBuilder.addHeader(headerEntry.getKey(), headerEntry.getValue());
|
||||
}
|
||||
}
|
||||
RequestConfig.Builder requestConfigBuilder = RequestConfig.custom()
|
||||
.setConnectionRequestTimeout(site.getTimeOut())
|
||||
.setConnectTimeout(site.getTimeOut());
|
||||
if (site.getHttpProxy()!=null){
|
||||
requestConfigBuilder.setProxy(site.getHttpProxy());
|
||||
}
|
||||
requestBuilder.setConfig(requestConfigBuilder.build());
|
||||
CloseableHttpResponse httpResponse = null;
|
||||
try {
|
||||
httpResponse = getHttpClient(site).execute(httpGet);
|
||||
httpResponse = getHttpClient(site).execute(requestBuilder.build());
|
||||
int statusCode = httpResponse.getStatusLine().getStatusCode();
|
||||
if (acceptStatCode.contains(statusCode)) {
|
||||
//charset
|
||||
|
|
|
@ -78,7 +78,9 @@ public class HttpClientPool {
|
|||
}
|
||||
|
||||
});
|
||||
httpClientBuilder.setRetryHandler(new DefaultHttpRequestRetryHandler(site.getRetryTimes(),true));
|
||||
if (site!=null){
|
||||
httpClientBuilder.setRetryHandler(new DefaultHttpRequestRetryHandler(site.getRetryTimes(),true));
|
||||
}
|
||||
return httpClientBuilder.build();
|
||||
}
|
||||
|
||||
|
|
|
@ -1,10 +1,12 @@
|
|||
package us.codecraft.webmagic.model.samples;
|
||||
|
||||
import org.apache.http.HttpHost;
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.Task;
|
||||
import us.codecraft.webmagic.model.OOSpider;
|
||||
import us.codecraft.webmagic.model.PageModelPipeline;
|
||||
import us.codecraft.webmagic.model.annotation.ExtractBy;
|
||||
import us.codecraft.webmagic.model.annotation.TargetUrl;
|
||||
import us.codecraft.webmagic.pipeline.JsonFilePageModelPipeline;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
|
@ -24,8 +26,13 @@ public class OschinaBlog{
|
|||
private List<String> tags;
|
||||
|
||||
/**
 * Sample entry point: builds an {@code OOSpider} for the {@code OschinaBlog} model with
 * http://my.oschina.net/flashsword/blog as the start URL and runs it.
 *
 * NOTE(review): this span comes from a diff hunk, so two {@code create(...)} calls are
 * visible. The first looks like the pre-change call and the second like its replacement;
 * confirm against the actual file.
 *
 * @param args command-line arguments; not read by this method
 */
public static void main(String[] args) {
|
||||
OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog")
|
||||
,new JsonFilePageModelPipeline(), OschinaBlog.class).run();
|
||||
// Configures a browser user agent, a sleep time of 0 and an HTTP proxy at 127.0.0.1:8888,
// passes a pipeline whose process(...) body is empty, and calls thread(10) before run().
// NOTE(review): the hard-coded local proxy looks like a debugging setup; confirm it is intended for the sample.
OOSpider.create(Site.me().setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36").addStartUrl("http://my.oschina.net/flashsword/blog").setSleepTime(0).setHttpProxy(new HttpHost("127.0.0.1",8888))
|
||||
,new PageModelPipeline() {
|
||||
@Override
|
||||
public void process(Object o, Task task) {
|
||||
|
||||
}
|
||||
}, OschinaBlog.class).thread(10).run();
|
||||
}
|
||||
|
||||
public String getTitle() {
|
||||
|
|
Loading…
Reference in New Issue