diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java index bc786fc..16c0729 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java @@ -4,6 +4,7 @@ import com.google.common.collect.HashBasedTable; import com.google.common.collect.Table; import org.apache.http.HttpHost; +import us.codecraft.webmagic.proxy.Proxy; import us.codecraft.webmagic.proxy.ProxyPool; import us.codecraft.webmagic.utils.UrlUtils; @@ -474,6 +475,11 @@ public class Site { return this; } + public Site setHttpProxyPool(List httpProxyList, boolean isUseLastProxy) { + this.httpProxyPool=new ProxyPool(httpProxyList, isUseLastProxy); + return this; + } + public Site enableHttpProxyPool() { this.httpProxyPool=new ProxyPool(); return this; @@ -483,7 +489,7 @@ public class Site { return httpProxyPool; } - public HttpHost getHttpProxyFromPool() { + public Proxy getHttpProxyFromPool() { return httpProxyPool.getProxy(); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 95d99ce..733d79c 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -24,6 +24,7 @@ import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.proxy.Proxy; import us.codecraft.webmagic.selector.PlainText; import us.codecraft.webmagic.utils.HttpConstant; import us.codecraft.webmagic.utils.UrlUtils; @@ -50,9 +51,9 @@ public class HttpClientDownloader extends AbstractDownloader { private HttpClientGenerator httpClientGenerator = new HttpClientGenerator(); - private CloseableHttpClient getHttpClient(Site site) { + private CloseableHttpClient getHttpClient(Site site, Proxy proxy) { if (site == null) { - return httpClientGenerator.getClient(null); + return httpClientGenerator.getClient(null, proxy); } String domain = site.getDomain(); CloseableHttpClient httpClient = httpClients.get(domain); @@ -60,7 +61,7 @@ public class HttpClientDownloader extends AbstractDownloader { synchronized (this) { httpClient = httpClients.get(domain); if (httpClient == null) { - httpClient = httpClientGenerator.getClient(site); + httpClient = httpClientGenerator.getClient(site, proxy); httpClients.put(domain, httpClient); } } @@ -88,8 +89,17 @@ public class HttpClientDownloader extends AbstractDownloader { CloseableHttpResponse httpResponse = null; int statusCode=0; try { - HttpUriRequest httpUriRequest = getHttpUriRequest(request, site, headers); - httpResponse = getHttpClient(site).execute(httpUriRequest); + HttpHost proxyHost = null; + Proxy proxy = null; //TODO + if (site.getHttpProxyPool() != null && site.getHttpProxyPool().isEnable()) { + proxy = site.getHttpProxyFromPool(); + proxyHost = proxy.getHttpHost(); + } else if(site.getHttpProxy()!= null){ + proxyHost = site.getHttpProxy(); + } + + HttpUriRequest httpUriRequest = getHttpUriRequest(request, site, headers, proxyHost);//���������˴��� + httpResponse = getHttpClient(site, proxy).execute(httpUriRequest);//getHttpClient�������˴�����֤ statusCode = httpResponse.getStatusLine().getStatusCode(); request.putExtra(Request.STATUS_CODE, statusCode); if (statusAccept(acceptStatCode, statusCode)) { @@ -129,7 +139,7 @@ public class HttpClientDownloader extends AbstractDownloader { return acceptStatCode.contains(statusCode); } - protected HttpUriRequest getHttpUriRequest(Request request, Site site, Map headers) { + protected HttpUriRequest getHttpUriRequest(Request request, Site site, Map headers,HttpHost proxy) { RequestBuilder requestBuilder = selectRequestMethod(request).setUri(request.getUrl()); if (headers != null) { for (Map.Entry headerEntry : headers.entrySet()) { @@ -141,14 +151,9 @@ public class HttpClientDownloader extends AbstractDownloader { .setSocketTimeout(site.getTimeOut()) .setConnectTimeout(site.getTimeOut()) .setCookieSpec(CookieSpecs.BEST_MATCH); - if (site.getHttpProxyPool() != null && site.getHttpProxyPool().isEnable()) { - HttpHost host = site.getHttpProxyFromPool(); - requestConfigBuilder.setProxy(host); - request.putExtra(Request.PROXY, host); - }else if(site.getHttpProxy()!= null){ - HttpHost host = site.getHttpProxy(); - requestConfigBuilder.setProxy(host); - request.putExtra(Request.PROXY, host); + if (proxy !=null) { + requestConfigBuilder.setProxy(proxy); + request.putExtra(Request.PROXY, proxy); } requestBuilder.setConfig(requestConfigBuilder.build()); return requestBuilder.build(); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java index 136d9c5..0befdd6 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java @@ -1,9 +1,13 @@ package us.codecraft.webmagic.downloader; +import org.apache.commons.lang3.StringUtils; import org.apache.http.HttpException; import org.apache.http.HttpRequest; import org.apache.http.HttpRequestInterceptor; +import org.apache.http.auth.AuthScope; +import org.apache.http.auth.UsernamePasswordCredentials; import org.apache.http.client.CookieStore; +import org.apache.http.client.CredentialsProvider; import org.apache.http.config.Registry; import org.apache.http.config.RegistryBuilder; import org.apache.http.config.SocketConfig; @@ -15,6 +19,7 @@ import org.apache.http.impl.conn.PoolingHttpClientConnectionManager; import org.apache.http.impl.cookie.BasicClientCookie; import org.apache.http.protocol.HttpContext; import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.proxy.Proxy; import java.io.IOException; import java.util.Map; @@ -41,12 +46,24 @@ public class HttpClientGenerator { return this; } - public CloseableHttpClient getClient(Site site) { - return generateClient(site); + public CloseableHttpClient getClient(Site site, Proxy proxy) { + return generateClient(site, proxy); } - private CloseableHttpClient generateClient(Site site) { - HttpClientBuilder httpClientBuilder = HttpClients.custom().setConnectionManager(connectionManager); + private CloseableHttpClient generateClient(Site site, Proxy proxy) { + CredentialsProvider credsProvider = null; + HttpClientBuilder httpClientBuilder = HttpClients.custom(); + + if(proxy!=null && StringUtils.isNotBlank(proxy.getUser()) && StringUtils.isNotBlank(proxy.getPassword())) + { + credsProvider= new BasicCredentialsProvider(); + credsProvider.setCredentials( + new AuthScope(proxy.getHttpHost().getAddress().getHostAddress(), proxy.getHttpHost().getPort()), + new UsernamePasswordCredentials(proxy.getUser(), proxy.getPassword())); + httpClientBuilder.setDefaultCredentialsProvider(credsProvider); + } + + httpClientBuilder.setConnectionManager(connectionManager); if (site != null && site.getUserAgent() != null) { httpClientBuilder.setUserAgent(site.getUserAgent()); } else { @@ -61,7 +78,6 @@ public class HttpClientGenerator { if (!request.containsHeader("Accept-Encoding")) { request.addHeader("Accept-Encoding", "gzip"); } - } }); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java index 5ae9ffd..14bf66c 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java @@ -1,13 +1,13 @@ package us.codecraft.webmagic.proxy; +import org.apache.http.HttpHost; + import java.io.Serializable; import java.util.ArrayList; import java.util.List; import java.util.concurrent.Delayed; import java.util.concurrent.TimeUnit; -import org.apache.http.HttpHost; - /** * >>>> Proxy lifecycle @@ -64,6 +64,9 @@ public class Proxy implements Delayed, Serializable { public static final int SUCCESS = 200; private final HttpHost httpHost; + private String user; + private String password; + private int reuseTimeInterval = 1500;// ms private Long canReuseTime = 0L; @@ -76,13 +79,17 @@ public class Proxy implements Delayed, Serializable { private List failedErrorType = new ArrayList(); - Proxy(HttpHost httpHost) { + Proxy(HttpHost httpHost, String user, String password) { this.httpHost = httpHost; + this.user = user; + this.password = password; this.canReuseTime = System.nanoTime() + TimeUnit.NANOSECONDS.convert(reuseTimeInterval, TimeUnit.MILLISECONDS); } - Proxy(HttpHost httpHost, int reuseInterval) { + Proxy(HttpHost httpHost, int reuseInterval, String user, String password) { this.httpHost = httpHost; + this.user = user; + this.password = password; this.canReuseTime = System.nanoTime() + TimeUnit.NANOSECONDS.convert(reuseInterval, TimeUnit.MILLISECONDS); } @@ -170,6 +177,17 @@ public class Proxy implements Delayed, Serializable { return re; } + + public String getUser() + { + return user; + + } + public String getPassword() + { + return password; + + } public void borrowNumIncrement(int increment) { this.borrowNum += increment; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyPool.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyPool.java index c9f27d5..5524141 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyPool.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyPool.java @@ -156,14 +156,14 @@ public class ProxyPool { isEnable = true; for (String[] s : httpProxyList) { try { - if (allProxy.containsKey(s[0])) { + if (allProxy.containsKey(s[2])) { continue; } - HttpHost item = new HttpHost(InetAddress.getByName(s[0]), Integer.valueOf(s[1])); + HttpHost item = new HttpHost(InetAddress.getByName(s[2]), Integer.valueOf(s[3])); if (!validateWhenInit || ProxyUtils.validateProxy(item)) { - Proxy p = new Proxy(item, reuseInterval); + Proxy p = new Proxy(item, reuseInterval, s[0], s[1]); proxyQueue.add(p); - allProxy.put(s[0], p); + allProxy.put(s[2], p); } } catch (NumberFormatException e) { logger.error("HttpHost init error:", e); @@ -174,7 +174,7 @@ public class ProxyPool { logger.info("proxy pool size>>>>" + allProxy.size()); } - public HttpHost getProxy() { + public Proxy getProxy() { Proxy proxy = null; try { Long time = System.currentTimeMillis(); @@ -192,7 +192,7 @@ public class ProxyPool { if (proxy == null) { throw new NoSuchElementException(); } - return proxy.getHttpHost(); + return proxy; } public void returnProxy(HttpHost host, int statusCode) { diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java index 352e49c..6379797 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java @@ -90,12 +90,12 @@ public class HttpClientDownloaderTest { private String getCharsetByUrl(String url) { HttpClientDownloader downloader = new HttpClientDownloader(); Site site = Site.me(); - CloseableHttpClient httpClient = new HttpClientGenerator().getClient(site); + CloseableHttpClient httpClient = new HttpClientGenerator().getClient(site, null); // encoding in http header Content-Type Request requestGBK = new Request(url); CloseableHttpResponse httpResponse = null; try { - httpResponse = httpClient.execute(downloader.getHttpUriRequest(requestGBK, site, null)); + httpResponse = httpClient.execute(downloader.getHttpUriRequest(requestGBK, site, null,null)); } catch (IOException e) { e.printStackTrace(); } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java index fd5827f..c9315b9 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java @@ -22,9 +22,9 @@ public class ProxyTest { public static void before() { // String[] source = { "0.0.0.1:0", "0.0.0.2:0", "0.0.0.3:0", // "0.0.0.4:0" }; - String[] source = { "0.0.0.1:0", "0.0.0.2:0", "0.0.0.3:0", "0.0.0.4:0" }; + String[] source = { "::0.0.0.1:0", "::0.0.0.2:0", "::0.0.0.3:0", "::0.0.0.4:0" }; for (String line : source) { - httpProxyList.add(new String[] { line.split(":")[0], line.split(":")[1] }); + httpProxyList.add(new String[] {line.split(":")[0], line.split(":")[1], line.split(":")[2], line.split(":")[3] }); } } @@ -37,7 +37,8 @@ public class ProxyTest { for (int i = 0; i < 2; i++) { List fetchList = new ArrayList(); while (proxyPool.getIdleNum() != 0) { - HttpHost httphost = proxyPool.getProxy(); + Proxy proxy = proxyPool.getProxy(); + HttpHost httphost = proxy.getHttpHost(); // httphostList.add(httphost); System.out.println(httphost.getHostName() + ":" + httphost.getPort()); Fetch tmp = new Fetch(httphost); @@ -69,4 +70,5 @@ public class ProxyTest { } } } + }