Merge branch 'oxf1-master'
commit
a8c9005381
|
@ -4,6 +4,7 @@ import com.google.common.collect.HashBasedTable;
|
|||
import com.google.common.collect.Table;
|
||||
import org.apache.http.HttpHost;
|
||||
|
||||
import us.codecraft.webmagic.proxy.Proxy;
|
||||
import us.codecraft.webmagic.proxy.ProxyPool;
|
||||
import us.codecraft.webmagic.utils.UrlUtils;
|
||||
|
||||
|
@ -474,6 +475,11 @@ public class Site {
|
|||
return this;
|
||||
}
|
||||
|
||||
/**
 * Replaces this site's proxy pool with a new {@link ProxyPool} built from the given entries.
 *
 * @param httpProxyList  proxy entries, one {@code String[]} per proxy.
 *                       NOTE(review): the pool-loading loop touched by this change reads each
 *                       entry as {user, password, host, port}; confirm that the constructor
 *                       called here expects the same layout.
 * @param isUseLastProxy forwarded unchanged to the {@link ProxyPool} constructor; its meaning
 *                       is defined there.
 * @return this {@code Site}, so calls can be chained
 */
public Site setHttpProxyPool(List<String[]> httpProxyList, boolean isUseLastProxy) {
|
||||
this.httpProxyPool=new ProxyPool(httpProxyList, isUseLastProxy);
|
||||
return this;
|
||||
}
|
||||
|
||||
public Site enableHttpProxyPool() {
|
||||
this.httpProxyPool=new ProxyPool();
|
||||
return this;
|
||||
|
@ -483,7 +489,7 @@ public class Site {
|
|||
return httpProxyPool;
|
||||
}
|
||||
|
||||
/**
 * Obtains a proxy from the configured pool by delegating to {@code httpProxyPool.getProxy()}.
 *
 * <p>The two signature lines below are the rendered before/after of this change: the return
 * type moves from {@code HttpHost} to {@code Proxy}.
 *
 * <p>{@code httpProxyPool} is dereferenced without a null check, so this throws
 * {@code NullPointerException} when no pool has been set. The downloader call site in this
 * change guards with {@code getHttpProxyPool() != null} before calling.
 *
 * @return whatever {@code httpProxyPool.getProxy()} returns
 */
public HttpHost getHttpProxyFromPool() {
|
||||
public Proxy getHttpProxyFromPool() {
|
||||
return httpProxyPool.getProxy();
|
||||
}
|
||||
|
||||
|
|
|
@ -24,6 +24,7 @@ import us.codecraft.webmagic.Page;
|
|||
import us.codecraft.webmagic.Request;
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.Task;
|
||||
import us.codecraft.webmagic.proxy.Proxy;
|
||||
import us.codecraft.webmagic.selector.PlainText;
|
||||
import us.codecraft.webmagic.utils.HttpConstant;
|
||||
import us.codecraft.webmagic.utils.UrlUtils;
|
||||
|
@ -50,9 +51,9 @@ public class HttpClientDownloader extends AbstractDownloader {
|
|||
|
||||
private HttpClientGenerator httpClientGenerator = new HttpClientGenerator();
|
||||
|
||||
private CloseableHttpClient getHttpClient(Site site) {
|
||||
private CloseableHttpClient getHttpClient(Site site, Proxy proxy) {
|
||||
if (site == null) {
|
||||
return httpClientGenerator.getClient(null);
|
||||
return httpClientGenerator.getClient(null, proxy);
|
||||
}
|
||||
String domain = site.getDomain();
|
||||
CloseableHttpClient httpClient = httpClients.get(domain);
|
||||
|
@ -60,7 +61,7 @@ public class HttpClientDownloader extends AbstractDownloader {
|
|||
synchronized (this) {
|
||||
httpClient = httpClients.get(domain);
|
||||
if (httpClient == null) {
|
||||
httpClient = httpClientGenerator.getClient(site);
|
||||
httpClient = httpClientGenerator.getClient(site, proxy);
|
||||
httpClients.put(domain, httpClient);
|
||||
}
|
||||
}
|
||||
|
@ -88,8 +89,17 @@ public class HttpClientDownloader extends AbstractDownloader {
|
|||
CloseableHttpResponse httpResponse = null;
|
||||
int statusCode=0;
|
||||
try {
|
||||
HttpUriRequest httpUriRequest = getHttpUriRequest(request, site, headers);
|
||||
httpResponse = getHttpClient(site).execute(httpUriRequest);
|
||||
HttpHost proxyHost = null;
|
||||
Proxy proxy = null; //TODO
|
||||
if (site.getHttpProxyPool() != null && site.getHttpProxyPool().isEnable()) {
|
||||
proxy = site.getHttpProxyFromPool();
|
||||
proxyHost = proxy.getHttpHost();
|
||||
} else if(site.getHttpProxy()!= null){
|
||||
proxyHost = site.getHttpProxy();
|
||||
}
|
||||
|
||||
HttpUriRequest httpUriRequest = getHttpUriRequest(request, site, headers, proxyHost);// NOTE(review): original comment was mis-encoded and is unrecoverable; the resolved proxyHost is now passed in explicitly
|
||||
httpResponse = getHttpClient(site, proxy).execute(httpUriRequest);// NOTE(review): original comment was mis-encoded and is unrecoverable; getHttpClient now also receives the selected proxy
|
||||
statusCode = httpResponse.getStatusLine().getStatusCode();
|
||||
request.putExtra(Request.STATUS_CODE, statusCode);
|
||||
if (statusAccept(acceptStatCode, statusCode)) {
|
||||
|
@ -129,7 +139,7 @@ public class HttpClientDownloader extends AbstractDownloader {
|
|||
return acceptStatCode.contains(statusCode);
|
||||
}
|
||||
|
||||
protected HttpUriRequest getHttpUriRequest(Request request, Site site, Map<String, String> headers) {
|
||||
protected HttpUriRequest getHttpUriRequest(Request request, Site site, Map<String, String> headers,HttpHost proxy) {
|
||||
RequestBuilder requestBuilder = selectRequestMethod(request).setUri(request.getUrl());
|
||||
if (headers != null) {
|
||||
for (Map.Entry<String, String> headerEntry : headers.entrySet()) {
|
||||
|
@ -141,14 +151,9 @@ public class HttpClientDownloader extends AbstractDownloader {
|
|||
.setSocketTimeout(site.getTimeOut())
|
||||
.setConnectTimeout(site.getTimeOut())
|
||||
.setCookieSpec(CookieSpecs.BEST_MATCH);
|
||||
if (site.getHttpProxyPool() != null && site.getHttpProxyPool().isEnable()) {
|
||||
HttpHost host = site.getHttpProxyFromPool();
|
||||
requestConfigBuilder.setProxy(host);
|
||||
request.putExtra(Request.PROXY, host);
|
||||
}else if(site.getHttpProxy()!= null){
|
||||
HttpHost host = site.getHttpProxy();
|
||||
requestConfigBuilder.setProxy(host);
|
||||
request.putExtra(Request.PROXY, host);
|
||||
if (proxy !=null) {
|
||||
requestConfigBuilder.setProxy(proxy);
|
||||
request.putExtra(Request.PROXY, proxy);
|
||||
}
|
||||
requestBuilder.setConfig(requestConfigBuilder.build());
|
||||
return requestBuilder.build();
|
||||
|
|
|
@ -1,9 +1,13 @@
|
|||
package us.codecraft.webmagic.downloader;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.http.HttpException;
|
||||
import org.apache.http.HttpRequest;
|
||||
import org.apache.http.HttpRequestInterceptor;
|
||||
import org.apache.http.auth.AuthScope;
|
||||
import org.apache.http.auth.UsernamePasswordCredentials;
|
||||
import org.apache.http.client.CookieStore;
|
||||
import org.apache.http.client.CredentialsProvider;
|
||||
import org.apache.http.config.Registry;
|
||||
import org.apache.http.config.RegistryBuilder;
|
||||
import org.apache.http.config.SocketConfig;
|
||||
|
@ -15,6 +19,7 @@ import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
|
|||
import org.apache.http.impl.cookie.BasicClientCookie;
|
||||
import org.apache.http.protocol.HttpContext;
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.proxy.Proxy;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Map;
|
||||
|
@ -41,12 +46,24 @@ public class HttpClientGenerator {
|
|||
return this;
|
||||
}
|
||||
|
||||
/**
 * Returns an HTTP client for the given site by delegating to {@code generateClient}.
 *
 * <p>The lines below interleave the before/after of this change: the single-argument form
 * {@code getClient(Site)} is replaced by {@code getClient(Site, Proxy)}.
 *
 * @param site  site configuration, forwarded unchanged to {@code generateClient}
 * @param proxy proxy forwarded unchanged to {@code generateClient}; callers in this change
 *              also pass {@code null}
 * @return the client produced by {@code generateClient}
 */
public CloseableHttpClient getClient(Site site) {
|
||||
return generateClient(site);
|
||||
public CloseableHttpClient getClient(Site site, Proxy proxy) {
|
||||
return generateClient(site, proxy);
|
||||
}
|
||||
|
||||
private CloseableHttpClient generateClient(Site site) {
|
||||
HttpClientBuilder httpClientBuilder = HttpClients.custom().setConnectionManager(connectionManager);
|
||||
private CloseableHttpClient generateClient(Site site, Proxy proxy) {
|
||||
CredentialsProvider credsProvider = null;
|
||||
HttpClientBuilder httpClientBuilder = HttpClients.custom();
|
||||
|
||||
if(proxy!=null && StringUtils.isNotBlank(proxy.getUser()) && StringUtils.isNotBlank(proxy.getPassword()))
|
||||
{
|
||||
credsProvider= new BasicCredentialsProvider();
|
||||
credsProvider.setCredentials(
|
||||
new AuthScope(proxy.getHttpHost().getAddress().getHostAddress(), proxy.getHttpHost().getPort()),
|
||||
new UsernamePasswordCredentials(proxy.getUser(), proxy.getPassword()));
|
||||
httpClientBuilder.setDefaultCredentialsProvider(credsProvider);
|
||||
}
|
||||
|
||||
httpClientBuilder.setConnectionManager(connectionManager);
|
||||
if (site != null && site.getUserAgent() != null) {
|
||||
httpClientBuilder.setUserAgent(site.getUserAgent());
|
||||
} else {
|
||||
|
@ -61,7 +78,6 @@ public class HttpClientGenerator {
|
|||
if (!request.containsHeader("Accept-Encoding")) {
|
||||
request.addHeader("Accept-Encoding", "gzip");
|
||||
}
|
||||
|
||||
}
|
||||
});
|
||||
}
|
||||
|
|
|
@ -1,13 +1,13 @@
|
|||
package us.codecraft.webmagic.proxy;
|
||||
|
||||
import org.apache.http.HttpHost;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.Delayed;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
import org.apache.http.HttpHost;
|
||||
|
||||
/**
|
||||
* >>>> Proxy lifecycle
|
||||
|
||||
|
@ -64,6 +64,9 @@ public class Proxy implements Delayed, Serializable {
|
|||
public static final int SUCCESS = 200;
|
||||
|
||||
private final HttpHost httpHost;
|
||||
private String user;
|
||||
private String password;
|
||||
|
||||
|
||||
private int reuseTimeInterval = 1500;// ms
|
||||
private Long canReuseTime = 0L;
|
||||
|
@ -76,13 +79,17 @@ public class Proxy implements Delayed, Serializable {
|
|||
|
||||
private List<Integer> failedErrorType = new ArrayList<Integer>();
|
||||
|
||||
/**
 * Package-private constructor that uses the default reuse interval.
 *
 * <p>The two signature lines below are the rendered before/after of this change: the new
 * form additionally takes the proxy credentials.
 *
 * @param httpHost proxy endpoint
 * @param user     proxy user name, stored as given
 * @param password proxy password, stored as given
 */
Proxy(HttpHost httpHost) {
|
||||
Proxy(HttpHost httpHost, String user, String password) {
|
||||
this.httpHost = httpHost;
|
||||
this.user = user;
|
||||
this.password = password;
|
||||
// Earliest reuse instant on the System.nanoTime() clock: now plus the
// reuseTimeInterval field (milliseconds) converted to nanoseconds.
this.canReuseTime = System.nanoTime() + TimeUnit.NANOSECONDS.convert(reuseTimeInterval, TimeUnit.MILLISECONDS);
|
||||
}
|
||||
|
||||
/**
 * Package-private constructor that takes an explicit reuse interval.
 *
 * <p>The two signature lines below are the rendered before/after of this change: the new
 * form additionally takes the proxy credentials.
 *
 * <p>NOTE(review): {@code reuseInterval} is only used to compute the initial
 * {@code canReuseTime}; the {@code reuseTimeInterval} field is not assigned in this
 * constructor. Confirm whether that is intended.
 *
 * @param httpHost      proxy endpoint
 * @param reuseInterval interval in milliseconds, converted to nanoseconds below
 * @param user          proxy user name, stored as given
 * @param password      proxy password, stored as given
 */
Proxy(HttpHost httpHost, int reuseInterval) {
|
||||
Proxy(HttpHost httpHost, int reuseInterval, String user, String password) {
|
||||
this.httpHost = httpHost;
|
||||
this.user = user;
|
||||
this.password = password;
|
||||
// Earliest reuse instant on the System.nanoTime() clock: now plus reuseInterval
// (milliseconds) converted to nanoseconds.
this.canReuseTime = System.nanoTime() + TimeUnit.NANOSECONDS.convert(reuseInterval, TimeUnit.MILLISECONDS);
|
||||
}
|
||||
|
||||
|
@ -170,6 +177,17 @@ public class Proxy implements Delayed, Serializable {
|
|||
return re;
|
||||
|
||||
}
|
||||
|
||||
/**
 * Returns the proxy user name exactly as it was passed to the constructor.
 *
 * @return the stored user name; presumably may be null or blank for proxies without
 *         authentication (the client generator in this change checks for blank) — confirm
 */
public String getUser()
|
||||
{
|
||||
return user;
|
||||
|
||||
}
|
||||
/**
 * Returns the proxy password exactly as it was passed to the constructor.
 *
 * @return the stored password; presumably may be null or blank for proxies without
 *         authentication (the client generator in this change checks for blank) — confirm
 */
public String getPassword()
|
||||
{
|
||||
return password;
|
||||
|
||||
}
|
||||
|
||||
public void borrowNumIncrement(int increment) {
|
||||
this.borrowNum += increment;
|
||||
|
|
|
@ -156,14 +156,14 @@ public class ProxyPool {
|
|||
isEnable = true;
|
||||
for (String[] s : httpProxyList) {
|
||||
try {
|
||||
if (allProxy.containsKey(s[0])) {
|
||||
if (allProxy.containsKey(s[2])) {
|
||||
continue;
|
||||
}
|
||||
HttpHost item = new HttpHost(InetAddress.getByName(s[0]), Integer.valueOf(s[1]));
|
||||
HttpHost item = new HttpHost(InetAddress.getByName(s[2]), Integer.valueOf(s[3]));
|
||||
if (!validateWhenInit || ProxyUtils.validateProxy(item)) {
|
||||
Proxy p = new Proxy(item, reuseInterval);
|
||||
Proxy p = new Proxy(item, reuseInterval, s[0], s[1]);
|
||||
proxyQueue.add(p);
|
||||
allProxy.put(s[0], p);
|
||||
allProxy.put(s[2], p);
|
||||
}
|
||||
} catch (NumberFormatException e) {
|
||||
logger.error("HttpHost init error:", e);
|
||||
|
@ -174,7 +174,7 @@ public class ProxyPool {
|
|||
logger.info("proxy pool size>>>>" + allProxy.size());
|
||||
}
|
||||
|
||||
public HttpHost getProxy() {
|
||||
public Proxy getProxy() {
|
||||
Proxy proxy = null;
|
||||
try {
|
||||
Long time = System.currentTimeMillis();
|
||||
|
@ -192,7 +192,7 @@ public class ProxyPool {
|
|||
if (proxy == null) {
|
||||
throw new NoSuchElementException();
|
||||
}
|
||||
return proxy.getHttpHost();
|
||||
return proxy;
|
||||
}
|
||||
|
||||
public void returnProxy(HttpHost host, int statusCode) {
|
||||
|
|
|
@ -90,12 +90,12 @@ public class HttpClientDownloaderTest {
|
|||
private String getCharsetByUrl(String url) {
|
||||
HttpClientDownloader downloader = new HttpClientDownloader();
|
||||
Site site = Site.me();
|
||||
CloseableHttpClient httpClient = new HttpClientGenerator().getClient(site);
|
||||
CloseableHttpClient httpClient = new HttpClientGenerator().getClient(site, null);
|
||||
// encoding in http header Content-Type
|
||||
Request requestGBK = new Request(url);
|
||||
CloseableHttpResponse httpResponse = null;
|
||||
try {
|
||||
httpResponse = httpClient.execute(downloader.getHttpUriRequest(requestGBK, site, null));
|
||||
httpResponse = httpClient.execute(downloader.getHttpUriRequest(requestGBK, site, null,null));
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
|
|
|
@ -22,9 +22,9 @@ public class ProxyTest {
|
|||
/**
 * Fills {@code httpProxyList} with the fixture proxies used by this test class.
 *
 * <p>The lines below interleave the before/after of this change. The new fixture strings
 * start with {@code "::"}, so {@code split(":")} yields four parts:
 * {@code ["", "", host, port]} — two empty leading fields followed by the address and port.
 * NOTE(review): the empty fields are presumably user and password, matching the
 * {user, password, host, port} layout read by the pool-loading loop — confirm.
 */
public static void before() {
|
||||
// String[] source = { "0.0.0.1:0", "0.0.0.2:0", "0.0.0.3:0",
|
||||
// "0.0.0.4:0" };
|
||||
String[] source = { "0.0.0.1:0", "0.0.0.2:0", "0.0.0.3:0", "0.0.0.4:0" };
|
||||
String[] source = { "::0.0.0.1:0", "::0.0.0.2:0", "::0.0.0.3:0", "::0.0.0.4:0" };
|
||||
for (String line : source) {
|
||||
httpProxyList.add(new String[] { line.split(":")[0], line.split(":")[1] });
|
||||
// New form: copies all four colon-separated fields of each fixture string.
httpProxyList.add(new String[] {line.split(":")[0], line.split(":")[1], line.split(":")[2], line.split(":")[3] });
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -37,7 +37,8 @@ public class ProxyTest {
|
|||
for (int i = 0; i < 2; i++) {
|
||||
List<Fetch> fetchList = new ArrayList<Fetch>();
|
||||
while (proxyPool.getIdleNum() != 0) {
|
||||
HttpHost httphost = proxyPool.getProxy();
|
||||
Proxy proxy = proxyPool.getProxy();
|
||||
HttpHost httphost = proxy.getHttpHost();
|
||||
// httphostList.add(httphost);
|
||||
System.out.println(httphost.getHostName() + ":" + httphost.getPort());
|
||||
Fetch tmp = new Fetch(httphost);
|
||||
|
@ -69,4 +70,5 @@ public class ProxyTest {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue