parent
edfc319c45
commit
09153ff715
|
@ -1,5 +1,6 @@
|
||||||
package us.codecraft.webmagic;
|
package us.codecraft.webmagic;
|
||||||
|
|
||||||
|
import org.apache.http.HttpHost;
|
||||||
import us.codecraft.webmagic.utils.UrlUtils;
|
import us.codecraft.webmagic.utils.UrlUtils;
|
||||||
|
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
@ -40,6 +41,8 @@ public class Site {
|
||||||
|
|
||||||
private Map<String, String> headers = new HashMap<String, String>();
|
private Map<String, String> headers = new HashMap<String, String>();
|
||||||
|
|
||||||
|
private HttpHost httpProxy;
|
||||||
|
|
||||||
public static interface HeaderConst {
|
public static interface HeaderConst {
|
||||||
|
|
||||||
public static final String REFERER = "Referer";
|
public static final String REFERER = "Referer";
|
||||||
|
@ -295,6 +298,20 @@ public class Site {
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public HttpHost getHttpProxy() {
|
||||||
|
return httpProxy;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* set up httpProxy for this site
|
||||||
|
* @param httpProxy
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public Site setHttpProxy(HttpHost httpProxy) {
|
||||||
|
this.httpProxy = httpProxy;
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
public Task toTask() {
|
public Task toTask() {
|
||||||
return new Task() {
|
return new Task() {
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -1,9 +1,11 @@
|
||||||
package us.codecraft.webmagic.downloader;
|
package us.codecraft.webmagic.downloader;
|
||||||
|
|
||||||
|
import com.google.common.collect.Sets;
|
||||||
import org.apache.http.HttpResponse;
|
import org.apache.http.HttpResponse;
|
||||||
import org.apache.http.annotation.ThreadSafe;
|
import org.apache.http.annotation.ThreadSafe;
|
||||||
|
import org.apache.http.client.config.RequestConfig;
|
||||||
import org.apache.http.client.methods.CloseableHttpResponse;
|
import org.apache.http.client.methods.CloseableHttpResponse;
|
||||||
import org.apache.http.client.methods.HttpGet;
|
import org.apache.http.client.methods.RequestBuilder;
|
||||||
import org.apache.http.impl.client.CloseableHttpClient;
|
import org.apache.http.impl.client.CloseableHttpClient;
|
||||||
import org.apache.http.util.EntityUtils;
|
import org.apache.http.util.EntityUtils;
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
|
@ -16,7 +18,7 @@ import us.codecraft.webmagic.selector.PlainText;
|
||||||
import us.codecraft.webmagic.utils.UrlUtils;
|
import us.codecraft.webmagic.utils.UrlUtils;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.HashSet;
|
import java.util.HashMap;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
|
||||||
|
@ -32,7 +34,7 @@ public class HttpClientDownloader implements Downloader {
|
||||||
|
|
||||||
private Logger logger = Logger.getLogger(getClass());
|
private Logger logger = Logger.getLogger(getClass());
|
||||||
|
|
||||||
private volatile CloseableHttpClient httpClient;
|
private final Map<String, CloseableHttpClient> httpClients = new HashMap<String, CloseableHttpClient>();
|
||||||
|
|
||||||
private int poolSize = 1;
|
private int poolSize = 1;
|
||||||
|
|
||||||
|
@ -59,10 +61,16 @@ public class HttpClientDownloader implements Downloader {
|
||||||
}
|
}
|
||||||
|
|
||||||
private CloseableHttpClient getHttpClient(Site site) {
|
private CloseableHttpClient getHttpClient(Site site) {
|
||||||
|
if (site == null) {
|
||||||
|
return new HttpClientPool(poolSize).getClient(null);
|
||||||
|
}
|
||||||
|
String domain = site.getDomain();
|
||||||
|
CloseableHttpClient httpClient = httpClients.get(domain);
|
||||||
if (httpClient == null) {
|
if (httpClient == null) {
|
||||||
synchronized (this) {
|
synchronized (this) {
|
||||||
if (httpClient == null) {
|
if (httpClient == null) {
|
||||||
httpClient = new HttpClientPool(poolSize).getClient(site);
|
httpClient = new HttpClientPool(poolSize).getClient(site);
|
||||||
|
httpClients.put(domain, httpClient);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -83,19 +91,25 @@ public class HttpClientDownloader implements Downloader {
|
||||||
charset = site.getCharset();
|
charset = site.getCharset();
|
||||||
headers = site.getHeaders();
|
headers = site.getHeaders();
|
||||||
} else {
|
} else {
|
||||||
acceptStatCode = new HashSet<Integer>();
|
acceptStatCode = Sets.newHashSet(200);
|
||||||
acceptStatCode.add(200);
|
|
||||||
}
|
}
|
||||||
logger.info("downloading page " + request.getUrl());
|
logger.info("downloading page " + request.getUrl());
|
||||||
HttpGet httpGet = new HttpGet(request.getUrl());
|
RequestBuilder requestBuilder = RequestBuilder.get().setUri(request.getUrl());
|
||||||
if (headers != null) {
|
if (headers != null) {
|
||||||
for (Map.Entry<String, String> headerEntry : headers.entrySet()) {
|
for (Map.Entry<String, String> headerEntry : headers.entrySet()) {
|
||||||
httpGet.addHeader(headerEntry.getKey(), headerEntry.getValue());
|
requestBuilder.addHeader(headerEntry.getKey(), headerEntry.getValue());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
RequestConfig.Builder requestConfigBuilder = RequestConfig.custom()
|
||||||
|
.setConnectionRequestTimeout(site.getTimeOut())
|
||||||
|
.setConnectTimeout(site.getTimeOut());
|
||||||
|
if (site.getHttpProxy()!=null){
|
||||||
|
requestConfigBuilder.setProxy(site.getHttpProxy());
|
||||||
|
}
|
||||||
|
requestBuilder.setConfig(requestConfigBuilder.build());
|
||||||
CloseableHttpResponse httpResponse = null;
|
CloseableHttpResponse httpResponse = null;
|
||||||
try {
|
try {
|
||||||
httpResponse = getHttpClient(site).execute(httpGet);
|
httpResponse = getHttpClient(site).execute(requestBuilder.build());
|
||||||
int statusCode = httpResponse.getStatusLine().getStatusCode();
|
int statusCode = httpResponse.getStatusLine().getStatusCode();
|
||||||
if (acceptStatCode.contains(statusCode)) {
|
if (acceptStatCode.contains(statusCode)) {
|
||||||
//charset
|
//charset
|
||||||
|
|
|
@ -78,7 +78,9 @@ public class HttpClientPool {
|
||||||
}
|
}
|
||||||
|
|
||||||
});
|
});
|
||||||
httpClientBuilder.setRetryHandler(new DefaultHttpRequestRetryHandler(site.getRetryTimes(),true));
|
if (site!=null){
|
||||||
|
httpClientBuilder.setRetryHandler(new DefaultHttpRequestRetryHandler(site.getRetryTimes(),true));
|
||||||
|
}
|
||||||
return httpClientBuilder.build();
|
return httpClientBuilder.build();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1,10 +1,12 @@
|
||||||
package us.codecraft.webmagic.model.samples;
|
package us.codecraft.webmagic.model.samples;
|
||||||
|
|
||||||
|
import org.apache.http.HttpHost;
|
||||||
import us.codecraft.webmagic.Site;
|
import us.codecraft.webmagic.Site;
|
||||||
|
import us.codecraft.webmagic.Task;
|
||||||
import us.codecraft.webmagic.model.OOSpider;
|
import us.codecraft.webmagic.model.OOSpider;
|
||||||
|
import us.codecraft.webmagic.model.PageModelPipeline;
|
||||||
import us.codecraft.webmagic.model.annotation.ExtractBy;
|
import us.codecraft.webmagic.model.annotation.ExtractBy;
|
||||||
import us.codecraft.webmagic.model.annotation.TargetUrl;
|
import us.codecraft.webmagic.model.annotation.TargetUrl;
|
||||||
import us.codecraft.webmagic.pipeline.JsonFilePageModelPipeline;
|
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
|
@ -24,8 +26,13 @@ public class OschinaBlog{
|
||||||
private List<String> tags;
|
private List<String> tags;
|
||||||
|
|
||||||
public static void main(String[] args) {
|
public static void main(String[] args) {
|
||||||
OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog")
|
OOSpider.create(Site.me().setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36").addStartUrl("http://my.oschina.net/flashsword/blog").setSleepTime(0).setHttpProxy(new HttpHost("127.0.0.1",8888))
|
||||||
,new JsonFilePageModelPipeline(), OschinaBlog.class).run();
|
,new PageModelPipeline() {
|
||||||
|
@Override
|
||||||
|
public void process(Object o, Task task) {
|
||||||
|
|
||||||
|
}
|
||||||
|
}, OschinaBlog.class).thread(10).run();
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getTitle() {
|
public String getTitle() {
|
||||||
|
|
Loading…
Reference in New Issue