From 1d86f7c048b27cf15f84d1690740f0b338026137 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Mon, 20 Mar 2017 22:40:14 +0800 Subject: [PATCH] compile passed in httpclientDownloader --- .../java/us/codecraft/webmagic/Request.java | 1 - .../java/us/codecraft/webmagic/Spider.java | 2 - .../downloader/HttpClientDownloader.java | 41 ++++++------------- .../downloader/HttpUriRequestConverter.java | 3 +- .../us/codecraft/webmagic/proxy/Proxy.java | 14 +++---- .../codecraft/webmagic/proxy/ProxyPool.java | 4 +- .../webmagic/proxy/TimerReuseProxyPool.java | 18 ++++---- 7 files changed, 30 insertions(+), 53 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java index c8c5978..0a38fcc 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java @@ -18,7 +18,6 @@ public class Request implements Serializable { private static final long serialVersionUID = 2062192774891352043L; public static final String CYCLE_TRIED_TIMES = "_cycle_tried_times"; - public static final String STATUS_CODE = "statusCode"; public static final String PROXY = "proxy"; private String url; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 49734b7..213cf3f 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -419,8 +419,6 @@ public class Spider implements Runnable, Task { pipeline.process(page.getResultItems(), this); } } - //for proxy status management - request.putExtra(Request.STATUS_CODE, page.getStatusCode()); sleep(site.getSleepTime()); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 816e6c5..052c6fa 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -1,7 +1,6 @@ package us.codecraft.webmagic.downloader; import org.apache.commons.io.IOUtils; -import org.apache.http.HttpHost; import org.apache.http.HttpResponse; import org.apache.http.annotation.ThreadSafe; import org.apache.http.auth.AuthState; @@ -23,13 +22,11 @@ import us.codecraft.webmagic.Task; import us.codecraft.webmagic.proxy.Proxy; import us.codecraft.webmagic.selector.PlainText; import us.codecraft.webmagic.utils.CharsetUtils; -import us.codecraft.webmagic.utils.WMCollections; import java.io.IOException; import java.nio.charset.Charset; import java.util.HashMap; import java.util.Map; -import java.util.Set; /** @@ -80,28 +77,22 @@ public class HttpClientDownloader extends AbstractDownloader { CloseableHttpResponse httpResponse = null; int statusCode = 0; Site site = task.getSite(); - try { - Proxy proxy = null; - if (site.getHttpProxyPool() != null && site.getHttpProxyPool().isEnable()) { - proxy = site.getHttpProxyFromPool(); - } else if (site != null && site.getHttpProxy() != null){ - proxy = site.getHttpProxy(); - request.putExtra(Request.PROXY, site.getHttpProxy()); - } + Proxy proxy = null; + HttpContext httpContext = new BasicHttpContext(); + if (site.getHttpProxyPool() != null && site.getHttpProxyPool().isEnable()) { + proxy = site.getHttpProxyFromPool(); request.putExtra(Request.PROXY, proxy); - - HttpContext httpContext = new BasicHttpContext(); - - HttpUriRequest httpUriRequest = httpUriRequestConverter.convert(request,site); AuthState authState = new AuthState(); - authState.update(new BasicScheme(), new UsernamePasswordCredentials("userName", "password")); + authState.update(new BasicScheme(), new UsernamePasswordCredentials(proxy.getUsername(), proxy.getPassword())); httpContext.setAttribute(HttpClientContext.PROXY_AUTH_STATE, authState); - CloseableHttpClient httpClient = getHttpClient(site, proxy); + } + HttpUriRequest httpUriRequest = httpUriRequestConverter.convert(request, site); + CloseableHttpClient httpClient = getHttpClient(site); + try { httpResponse = httpClient.execute(httpUriRequest, httpContext); statusCode = httpResponse.getStatusLine().getStatusCode(); - request.putExtra(Request.STATUS_CODE, statusCode); - if (statusAccept(acceptStatCode, statusCode)) { - Page page = handleResponse(request, charset, httpResponse, task); + if (site.getAcceptStatCode().contains(statusCode)) { + Page page = handleResponse(request, site.getCharset(), httpResponse, task); onSuccess(request); return page; } else { @@ -120,10 +111,8 @@ public class HttpClientDownloader extends AbstractDownloader { //ensure the connection is released back to pool EntityUtils.consumeQuietly(httpResponse.getEntity()); } - request.putExtra(Request.STATUS_CODE, statusCode); - if (site != null && site.getHttpProxyPool() != null && site.getHttpProxyPool().isEnable()) { - site.returnHttpProxyToPool((HttpHost) request.getExtra(Request.PROXY), (Integer) request - .getExtra(Request.STATUS_CODE)); + if (proxy != null) { + site.getHttpProxyPool().returnProxy(proxy, statusCode); } } } @@ -133,10 +122,6 @@ public class HttpClientDownloader extends AbstractDownloader { httpClientGenerator.setPoolSize(thread); } - protected boolean statusAccept(Set acceptStatCode, int statusCode) { - return acceptStatCode.contains(statusCode); - } - protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException { String content = getContent(charset, httpResponse); Page page = new Page(); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java index 7e77676..beda2e6 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java @@ -10,7 +10,6 @@ import org.apache.http.client.methods.RequestBuilder; import org.apache.http.message.BasicNameValuePair; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.proxy.Proxy; import us.codecraft.webmagic.utils.HttpConstant; import java.nio.charset.Charset; @@ -26,7 +25,7 @@ import java.util.Map; */ public class HttpUriRequestConverter { - public HttpUriRequest convert(Request request, Site site, Proxy proxy) { + public HttpUriRequest convert(Request request, Site site) { return null; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java index b078fd9..1d872d4 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java @@ -7,12 +7,12 @@ package us.codecraft.webmagic.proxy; public class Proxy { private ProxyHost proxyHost; - private String user; + private String username; private String password; - public Proxy(ProxyHost proxyHost, String user, String password) { + public Proxy(ProxyHost proxyHost, String username, String password) { this.proxyHost = proxyHost; - this.user = user; + this.username = username; this.password = password; } @@ -28,12 +28,12 @@ public class Proxy { this.proxyHost = proxyHost; } - public String getUser() { - return user; + public String getUsername() { + return username; } - public void setUser(String user) { - this.user = user; + public void setUsername(String username) { + this.username = username; } public String getPassword() { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyPool.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyPool.java index 418b445..50e4029 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyPool.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyPool.java @@ -1,13 +1,11 @@ package us.codecraft.webmagic.proxy; -import org.apache.http.HttpHost; - /** * Created by edwardsbean on 15-2-28. */ public interface ProxyPool { - void returnProxy(HttpHost host, int statusCode); + void returnProxy(Proxy proxy, int statusCode); Proxy getProxy(); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/TimerReuseProxyPool.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/TimerReuseProxyPool.java index b375fae..a336c71 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/TimerReuseProxyPool.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/TimerReuseProxyPool.java @@ -7,8 +7,6 @@ import us.codecraft.webmagic.utils.FilePersistentBase; import us.codecraft.webmagic.utils.ProxyUtils; import java.io.*; -import java.net.InetAddress; -import java.net.UnknownHostException; import java.util.*; import java.util.Map.Entry; import java.util.concurrent.BlockingQueue; @@ -156,7 +154,7 @@ public class TimerReuseProxyPool implements ProxyPool { isEnable = true; for (Proxy proxy : httpProxyList) { if (!validateWhenInit || ProxyUtils.validateProxy(proxy.getProxyHost())) { - TimerReuseProxy p = new TimerReuseProxy(proxy.getProxyHost(), proxy.getUser(), proxy.getPassword(), reuseInterval); + TimerReuseProxy p = new TimerReuseProxy(proxy.getProxyHost(), proxy.getUsername(), proxy.getPassword(), reuseInterval); proxyQueue.add(p); allProxy.put(p.getProxyHost().getHost(), p); } @@ -185,8 +183,8 @@ public class TimerReuseProxyPool implements ProxyPool { return proxy; } - public void returnProxy(HttpHost host, int statusCode) { - TimerReuseProxy p = allProxy.get(host.getAddress().getHostAddress()); + public void returnProxy(Proxy proxy, int statusCode) { + TimerReuseProxy p = allProxy.get(proxy.getProxyHost()); if (p == null) { return; } @@ -202,13 +200,13 @@ public class TimerReuseProxyPool implements ProxyPool { // banned,try longer interval p.fail(TimerReuseProxy.ERROR_403); p.setReuseTimeInterval(reuseInterval * p.getFailedNum()); - logger.info(host + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0); + logger.info(proxy + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0); break; case TimerReuseProxy.ERROR_BANNED: p.fail(TimerReuseProxy.ERROR_BANNED); p.setReuseTimeInterval(10 * 60 * 1000 * p.getFailedNum()); logger.warn("this proxy is banned >>>> " + p.getHttpHost()); - logger.info(host + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0); + logger.info(proxy + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0); break; case TimerReuseProxy.ERROR_404: // p.fail(Proxy.ERROR_404); @@ -220,13 +218,13 @@ public class TimerReuseProxyPool implements ProxyPool { } if (p.getFailedNum() > 20) { p.setReuseTimeInterval(reviveTime); - logger.error("remove proxy >>>> " + host + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size()); + logger.error("remove proxy >>>> " + proxy + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size()); return; } if (p.getFailedNum() > 0 && p.getFailedNum() % 5 == 0) { - if (!ProxyUtils.validateProxy(host)) { + if (!ProxyUtils.validateProxy(proxy)) { p.setReuseTimeInterval(reviveTime); - logger.error("remove proxy >>>> " + host + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size()); + logger.error("remove proxy >>>> " + proxy + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size()); return; } }