diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java index ac9f9ce..146bb0d 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java @@ -4,7 +4,7 @@ import org.apache.http.HttpHost; import org.apache.http.auth.UsernamePasswordCredentials; import us.codecraft.webmagic.proxy.Proxy; import us.codecraft.webmagic.proxy.ProxyPool; -import us.codecraft.webmagic.proxy.SimpleProxyPool; +import us.codecraft.webmagic.proxy.TimerReuseProxyPool; import us.codecraft.webmagic.utils.UrlUtils; import java.util.*; @@ -487,12 +487,12 @@ public class Site { * @return this */ public Site setHttpProxyPool(List httpProxyList, boolean isUseLastProxy) { - this.httpProxyPool=new SimpleProxyPool(httpProxyList, isUseLastProxy); + this.httpProxyPool=new TimerReuseProxyPool(httpProxyList, isUseLastProxy); return this; } public Site enableHttpProxyPool() { - this.httpProxyPool=new SimpleProxyPool(); + this.httpProxyPool=new TimerReuseProxyPool(); return this; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index fa907a1..816e6c5 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -3,16 +3,16 @@ package us.codecraft.webmagic.downloader; import org.apache.commons.io.IOUtils; import org.apache.http.HttpHost; import org.apache.http.HttpResponse; -import org.apache.http.NameValuePair; import org.apache.http.annotation.ThreadSafe; -import org.apache.http.client.config.CookieSpecs; -import org.apache.http.client.config.RequestConfig; -import org.apache.http.client.entity.UrlEncodedFormEntity; +import org.apache.http.auth.AuthState; +import org.apache.http.auth.UsernamePasswordCredentials; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpUriRequest; -import org.apache.http.client.methods.RequestBuilder; +import org.apache.http.client.protocol.HttpClientContext; +import org.apache.http.impl.auth.BasicScheme; import org.apache.http.impl.client.CloseableHttpClient; -import org.apache.http.message.BasicNameValuePair; +import org.apache.http.protocol.BasicHttpContext; +import org.apache.http.protocol.HttpContext; import org.apache.http.util.EntityUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -23,12 +23,13 @@ import us.codecraft.webmagic.Task; import us.codecraft.webmagic.proxy.Proxy; import us.codecraft.webmagic.selector.PlainText; import us.codecraft.webmagic.utils.CharsetUtils; -import us.codecraft.webmagic.utils.HttpConstant; import us.codecraft.webmagic.utils.WMCollections; import java.io.IOException; import java.nio.charset.Charset; -import java.util.*; +import java.util.HashMap; +import java.util.Map; +import java.util.Set; /** @@ -46,9 +47,15 @@ public class HttpClientDownloader extends AbstractDownloader { private HttpClientGenerator httpClientGenerator = new HttpClientGenerator(); - private CloseableHttpClient getHttpClient(Site site, Proxy proxy) { + private HttpUriRequestConverter httpUriRequestConverter = new HttpUriRequestConverter(); + + public void setHttpUriRequestConverter(HttpUriRequestConverter httpUriRequestConverter) { + this.httpUriRequestConverter = httpUriRequestConverter; + } + + private CloseableHttpClient getHttpClient(Site site) { if (site == null) { - return httpClientGenerator.getClient(null, proxy); + return httpClientGenerator.getClient(null); } String domain = site.getDomain(); CloseableHttpClient httpClient = httpClients.get(domain); @@ -56,7 +63,7 @@ public class HttpClientDownloader extends AbstractDownloader { synchronized (this) { httpClient = httpClients.get(domain); if (httpClient == null) { - httpClient = httpClientGenerator.getClient(site, proxy); + httpClient = httpClientGenerator.getClient(site); httpClients.put(domain, httpClient); } } @@ -66,35 +73,31 @@ public class HttpClientDownloader extends AbstractDownloader { @Override public Page download(Request request, Task task) { - Site site = null; - if (task != null) { - site = task.getSite(); + if (task == null || task.getSite() == null) { + throw new NullPointerException("task or site can not be null"); } - Set acceptStatCode; - String charset = null; - Map headers = null; - if (site != null) { - acceptStatCode = site.getAcceptStatCode(); - charset = site.getCharset(); - headers = site.getHeaders(); - } else { - acceptStatCode = WMCollections.newHashSet(200); - } - logger.info("downloading page {}", request.getUrl()); + logger.debug("downloading page {}", request.getUrl()); CloseableHttpResponse httpResponse = null; int statusCode = 0; + Site site = task.getSite(); try { - HttpHost proxyHost = null; - Proxy proxy = null; //TODO - if (site != null && site.getHttpProxyPool() != null && site.getHttpProxyPool().isEnable()) { + Proxy proxy = null; + if (site.getHttpProxyPool() != null && site.getHttpProxyPool().isEnable()) { proxy = site.getHttpProxyFromPool(); - proxyHost = proxy.getHttpHost(); } else if (site != null && site.getHttpProxy() != null){ - proxyHost = site.getHttpProxy(); + proxy = site.getHttpProxy(); + request.putExtra(Request.PROXY, site.getHttpProxy()); } - - HttpUriRequest httpUriRequest = getHttpUriRequest(request, site, headers, proxyHost); - httpResponse = getHttpClient(site, proxy).execute(httpUriRequest); + request.putExtra(Request.PROXY, proxy); + + HttpContext httpContext = new BasicHttpContext(); + + HttpUriRequest httpUriRequest = httpUriRequestConverter.convert(request,site); + AuthState authState = new AuthState(); + authState.update(new BasicScheme(), new UsernamePasswordCredentials("userName", "password")); + httpContext.setAttribute(HttpClientContext.PROXY_AUTH_STATE, authState); + CloseableHttpClient httpClient = getHttpClient(site, proxy); + httpResponse = httpClient.execute(httpUriRequest, httpContext); statusCode = httpResponse.getStatusLine().getStatusCode(); request.putExtra(Request.STATUS_CODE, statusCode); if (statusAccept(acceptStatCode, statusCode)) { @@ -134,72 +137,6 @@ public class HttpClientDownloader extends AbstractDownloader { return acceptStatCode.contains(statusCode); } - protected HttpUriRequest getHttpUriRequest(Request request, Site site, Map headers, HttpHost proxy) { - RequestBuilder requestBuilder = selectRequestMethod(request).setUri(request.getUrl()); - if (headers != null) { - for (Map.Entry headerEntry : headers.entrySet()) { - requestBuilder.addHeader(headerEntry.getKey(), headerEntry.getValue()); - } - } - - RequestConfig.Builder requestConfigBuilder = RequestConfig.custom(); - if (site != null) { - requestConfigBuilder.setConnectionRequestTimeout(site.getTimeOut()) - .setSocketTimeout(site.getTimeOut()) - .setConnectTimeout(site.getTimeOut()) - .setCookieSpec(CookieSpecs.BEST_MATCH); - } - - if (proxy != null) { - requestConfigBuilder.setProxy(proxy); - request.putExtra(Request.PROXY, proxy); - } - requestBuilder.setConfig(requestConfigBuilder.build()); - return requestBuilder.build(); - } - - protected RequestBuilder selectRequestMethod(Request request) { - String method = request.getMethod(); - if (method == null || method.equalsIgnoreCase(HttpConstant.Method.GET)) { - //default get - return addQueryParams(RequestBuilder.get(),request.getParams()); - } else if (method.equalsIgnoreCase(HttpConstant.Method.POST)) { - return addFormParams(RequestBuilder.post(), (NameValuePair[]) request.getExtra("nameValuePair"), request.getParams()); - } else if (method.equalsIgnoreCase(HttpConstant.Method.HEAD)) { - return addQueryParams(RequestBuilder.head(),request.getParams()); - } else if (method.equalsIgnoreCase(HttpConstant.Method.PUT)) { - return addFormParams(RequestBuilder.put(), (NameValuePair[]) request.getExtra("nameValuePair"), request.getParams()); - } else if (method.equalsIgnoreCase(HttpConstant.Method.DELETE)) { - return addQueryParams(RequestBuilder.delete(),request.getParams()); - } else if (method.equalsIgnoreCase(HttpConstant.Method.TRACE)) { - return addQueryParams(RequestBuilder.trace(),request.getParams()); - } - throw new IllegalArgumentException("Illegal HTTP Method " + method); - } - - private RequestBuilder addFormParams(RequestBuilder requestBuilder, NameValuePair[] nameValuePair, Map params) { - List allNameValuePair=new ArrayList(); - if (nameValuePair != null && nameValuePair.length > 0) { - allNameValuePair= Arrays.asList(nameValuePair); - } - if (params != null) { - for (String key : params.keySet()) { - allNameValuePair.add(new BasicNameValuePair(key, params.get(key))); - } - } - requestBuilder.setEntity(new UrlEncodedFormEntity(allNameValuePair, Charset.forName("utf8"))); - return requestBuilder; - } - - private RequestBuilder addQueryParams(RequestBuilder requestBuilder, Map params) { - if (params != null) { - for (Map.Entry entry : params.entrySet()) { - requestBuilder.addParameter(entry.getKey(), entry.getValue()); - } - } - return requestBuilder; - } - protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException { String content = getContent(charset, httpResponse); Page page = new Page(); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java index aec5309..1da64e7 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java @@ -1,13 +1,9 @@ package us.codecraft.webmagic.downloader; -import org.apache.commons.lang3.StringUtils; import org.apache.http.HttpException; import org.apache.http.HttpRequest; import org.apache.http.HttpRequestInterceptor; -import org.apache.http.auth.AuthScope; -import org.apache.http.auth.UsernamePasswordCredentials; import org.apache.http.client.CookieStore; -import org.apache.http.client.CredentialsProvider; import org.apache.http.config.Registry; import org.apache.http.config.RegistryBuilder; import org.apache.http.config.SocketConfig; @@ -21,7 +17,6 @@ import org.apache.http.protocol.HttpContext; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.proxy.Proxy; import javax.net.ssl.SSLContext; import javax.net.ssl.TrustManager; @@ -92,31 +87,13 @@ public class HttpClientGenerator { return this; } - public CloseableHttpClient getClient(Site site, Proxy proxy) { - return generateClient(site, proxy); + public CloseableHttpClient getClient(Site site) { + return generateClient(site); } - private CloseableHttpClient generateClient(Site site, Proxy proxy) { - CredentialsProvider credsProvider = null; + private CloseableHttpClient generateClient(Site site) { HttpClientBuilder httpClientBuilder = HttpClients.custom(); - if (proxy != null && StringUtils.isNotBlank(proxy.getUser()) && StringUtils.isNotBlank(proxy.getPassword())) - { - credsProvider= new BasicCredentialsProvider(); - credsProvider.setCredentials( - new AuthScope(proxy.getHttpHost().getAddress().getHostAddress(), proxy.getHttpHost().getPort()), - new UsernamePasswordCredentials(proxy.getUser(), proxy.getPassword())); - httpClientBuilder.setDefaultCredentialsProvider(credsProvider); - } - - if (site != null && site.getHttpProxy()!= null && site.getUsernamePasswordCredentials() != null){ - credsProvider = new BasicCredentialsProvider(); - credsProvider.setCredentials( - new AuthScope(site.getHttpProxy()),//可以访问的范围 - site.getUsernamePasswordCredentials());//用户名和密码 - httpClientBuilder.setDefaultCredentialsProvider(credsProvider); - } - httpClientBuilder.setConnectionManager(connectionManager); if (site != null && site.getUserAgent() != null) { httpClientBuilder.setUserAgent(site.getUserAgent()); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java new file mode 100644 index 0000000..7e77676 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java @@ -0,0 +1,98 @@ +package us.codecraft.webmagic.downloader; + +import org.apache.http.HttpHost; +import org.apache.http.NameValuePair; +import org.apache.http.client.config.CookieSpecs; +import org.apache.http.client.config.RequestConfig; +import org.apache.http.client.entity.UrlEncodedFormEntity; +import org.apache.http.client.methods.HttpUriRequest; +import org.apache.http.client.methods.RequestBuilder; +import org.apache.http.message.BasicNameValuePair; +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.proxy.Proxy; +import us.codecraft.webmagic.utils.HttpConstant; + +import java.nio.charset.Charset; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Map; + +/** + * @author code4crafter@gmail.com + * Date: 17/3/18 + * Time: 上午11:28 + */ +public class HttpUriRequestConverter { + + public HttpUriRequest convert(Request request, Site site, Proxy proxy) { + return null; + } + + private HttpUriRequest getHttpUriRequest(Request request, Site site, Map headers, HttpHost proxy) { + RequestBuilder requestBuilder = selectRequestMethod(request).setUri(request.getUrl()); + if (headers != null) { + for (Map.Entry headerEntry : headers.entrySet()) { + requestBuilder.addHeader(headerEntry.getKey(), headerEntry.getValue()); + } + } + + RequestConfig.Builder requestConfigBuilder = RequestConfig.custom(); + if (site != null) { + requestConfigBuilder.setConnectionRequestTimeout(site.getTimeOut()) + .setSocketTimeout(site.getTimeOut()) + .setConnectTimeout(site.getTimeOut()) + .setCookieSpec(CookieSpecs.BEST_MATCH); + } + + if (proxy != null) { + requestConfigBuilder.setProxy(proxy); + } + requestBuilder.setConfig(requestConfigBuilder.build()); + return requestBuilder.build(); + } + + private RequestBuilder selectRequestMethod(Request request) { + String method = request.getMethod(); + if (method == null || method.equalsIgnoreCase(HttpConstant.Method.GET)) { + //default get + return addQueryParams(RequestBuilder.get(),request.getParams()); + } else if (method.equalsIgnoreCase(HttpConstant.Method.POST)) { + return addFormParams(RequestBuilder.post(), (NameValuePair[]) request.getExtra("nameValuePair"), request.getParams()); + } else if (method.equalsIgnoreCase(HttpConstant.Method.HEAD)) { + return addQueryParams(RequestBuilder.head(),request.getParams()); + } else if (method.equalsIgnoreCase(HttpConstant.Method.PUT)) { + return addFormParams(RequestBuilder.put(), (NameValuePair[]) request.getExtra("nameValuePair"), request.getParams()); + } else if (method.equalsIgnoreCase(HttpConstant.Method.DELETE)) { + return addQueryParams(RequestBuilder.delete(),request.getParams()); + } else if (method.equalsIgnoreCase(HttpConstant.Method.TRACE)) { + return addQueryParams(RequestBuilder.trace(),request.getParams()); + } + throw new IllegalArgumentException("Illegal HTTP Method " + method); + } + + private RequestBuilder addFormParams(RequestBuilder requestBuilder, NameValuePair[] nameValuePair, Map params) { + List allNameValuePair=new ArrayList(); + if (nameValuePair != null && nameValuePair.length > 0) { + allNameValuePair= Arrays.asList(nameValuePair); + } + if (params != null) { + for (String key : params.keySet()) { + allNameValuePair.add(new BasicNameValuePair(key, params.get(key))); + } + } + requestBuilder.setEntity(new UrlEncodedFormEntity(allNameValuePair, Charset.forName("utf8"))); + return requestBuilder; + } + + private RequestBuilder addQueryParams(RequestBuilder requestBuilder, Map params) { + if (params != null) { + for (Map.Entry entry : params.entrySet()) { + requestBuilder.addParameter(entry.getKey(), entry.getValue()); + } + } + return requestBuilder; + } + +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java index dbe3a18..b078fd9 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java @@ -1,199 +1,47 @@ package us.codecraft.webmagic.proxy; -import org.apache.http.HttpHost; - -import java.io.Serializable; -import java.util.ArrayList; -import java.util.List; -import java.util.concurrent.Delayed; -import java.util.concurrent.TimeUnit; - -/** - * >>>> Proxy lifecycle - - +----------+ +-----+ - | last use | | new | - +-----+----+ +---+-+ - | +------+ | - +->| init |<--+ - +--+---+ - | - v - +--------+ - +--->| borrow | - | +---+----+ - | |+------------------+ - | v - | +--------+ - | | in use | Respone Time - | +---+----+ - | |+------------------+ - | v - | +--------+ - | | return | - | +---+----+ - | |+-------------------+ - | v - | +-------+ reuse interval - | | delay | (delay time) - | +---+---+ - | |+-------------------+ - | v - | +------+ - | | idle | idle time - | +---+--+ - | |+-------------------+ - +--------+ - */ - /** - * Object has these status of lifecycle above.
* - * @author yxssfxwzy@sina.com
- * @since 0.5.1 - * @see SimpleProxyPool */ -public class Proxy implements Delayed, Serializable { +public class Proxy { - private static final long serialVersionUID = 228939737383625551L; - public static final int ERROR_403 = 403; - public static final int ERROR_404 = 404; - public static final int ERROR_BANNED = 10000;// banned by website - public static final int ERROR_Proxy = 10001;// the proxy itself failed - public static final int SUCCESS = 200; - - private final HttpHost httpHost; + private ProxyHost proxyHost; private String user; private String password; - - private int reuseTimeInterval = 1500;// ms - private Long canReuseTime = 0L; - private Long lastBorrowTime = System.currentTimeMillis(); - private Long responseTime = 0L; - - private int failedNum = 0; - private int successNum = 0; - private int borrowNum = 0; - - private List failedErrorType = new ArrayList(); - - public Proxy(HttpHost httpHost, String user, String password) { - this.httpHost = httpHost; + public Proxy(ProxyHost proxyHost, String user, String password) { + this.proxyHost = proxyHost; this.user = user; this.password = password; - this.canReuseTime = System.nanoTime() + TimeUnit.NANOSECONDS.convert(reuseTimeInterval, TimeUnit.MILLISECONDS); } - public Proxy(HttpHost httpHost, int reuseInterval, String user, String password) { - this.httpHost = httpHost; - this.user = user; - this.password = password; - this.canReuseTime = System.nanoTime() + TimeUnit.NANOSECONDS.convert(reuseInterval, TimeUnit.MILLISECONDS); + public Proxy(ProxyHost proxyHost) { + this.proxyHost = proxyHost; } - public int getSuccessNum() { - return successNum; + public ProxyHost getProxyHost() { + return proxyHost; } - public void successNumIncrement(int increment) { - this.successNum += increment; + public void setProxyHost(ProxyHost proxyHost) { + this.proxyHost = proxyHost; } - public Long getLastUseTime() { - return lastBorrowTime; - } - - public void setLastBorrowTime(Long lastBorrowTime) { - this.lastBorrowTime = lastBorrowTime; - } - - public void recordResponse() { - this.responseTime = (System.currentTimeMillis() - lastBorrowTime + responseTime) / 2; - this.lastBorrowTime = System.currentTimeMillis(); - } - - public List getFailedErrorType() { - return failedErrorType; - } - - public void setFailedErrorType(List failedErrorType) { - this.failedErrorType = failedErrorType; - } - - public void fail(int failedErrorType) { - this.failedNum++; - this.failedErrorType.add(failedErrorType); - } - - public void setFailedNum(int failedNum) { - this.failedNum = failedNum; - } - - public int getFailedNum() { - return failedNum; - } - - public String getFailedType() { - String re = ""; - for (Integer i : this.failedErrorType) { - re += i + " . "; - } - return re; - } - - public HttpHost getHttpHost() { - return httpHost; - } - - public int getReuseTimeInterval() { - return reuseTimeInterval; - } - - public void setReuseTimeInterval(int reuseTimeInterval) { - this.reuseTimeInterval = reuseTimeInterval; - this.canReuseTime = System.nanoTime() + TimeUnit.NANOSECONDS.convert(reuseTimeInterval, TimeUnit.MILLISECONDS); - - } - - @Override - public long getDelay(TimeUnit unit) { - return unit.convert(canReuseTime - System.nanoTime(), TimeUnit.NANOSECONDS); - } - - @Override - public int compareTo(Delayed o) { - Proxy that = (Proxy) o; - return canReuseTime > that.canReuseTime ? 1 : (canReuseTime < that.canReuseTime ? -1 : 0); - - } - - @Override - public String toString() { - - String re = String.format("host: %15s >> %5dms >> success: %-3.2f%% >> borrow: %d", httpHost.getAddress().getHostAddress(), responseTime, - successNum * 100.0 / borrowNum, borrowNum); - return re; - - } - - public String getUser() - { + public String getUser() { return user; - } - public String getPassword() - { + + public void setUser(String user) { + this.user = user; + } + + public String getPassword() { return password; - } - public void borrowNumIncrement(int increment) { - this.borrowNum += increment; + public void setPassword(String password) { + this.password = password; } - public int getBorrowNum() { - return borrowNum; - } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyHost.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyHost.java new file mode 100644 index 0000000..11e8c87 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyHost.java @@ -0,0 +1,34 @@ +package us.codecraft.webmagic.proxy; + +/** + * @author code4crafter@gmail.com + * Date: 17/3/18 + * Time: 下午12:04 + */ +public class ProxyHost { + + private String host; + + private int port; + + public String getHost() { + return host; + } + + public ProxyHost(String host, int port) { + this.host = host; + this.port = port; + } + + public void setHost(String host) { + this.host = host; + } + + public int getPort() { + return port; + } + + public void setPort(int port) { + this.port = port; + } +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyPool.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyPool.java index 40b1913..418b445 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyPool.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyPool.java @@ -6,7 +6,10 @@ import org.apache.http.HttpHost; * Created by edwardsbean on 15-2-28. */ public interface ProxyPool { - public void returnProxy(HttpHost host, int statusCode); - public Proxy getProxy(); - public boolean isEnable(); + + void returnProxy(HttpHost host, int statusCode); + + Proxy getProxy(); + + boolean isEnable(); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/TimerReuseProxy.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/TimerReuseProxy.java new file mode 100644 index 0000000..8f59252 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/TimerReuseProxy.java @@ -0,0 +1,163 @@ +package us.codecraft.webmagic.proxy; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.Delayed; +import java.util.concurrent.TimeUnit; + +/** + * >>>> Proxy lifecycle + + +----------+ +-----+ + | last use | | new | + +-----+----+ +---+-+ + | +------+ | + +->| init |<--+ + +--+---+ + | + v + +--------+ + +--->| borrow | + | +---+----+ + | |+------------------+ + | v + | +--------+ + | | in use | Respone Time + | +---+----+ + | |+------------------+ + | v + | +--------+ + | | return | + | +---+----+ + | |+-------------------+ + | v + | +-------+ reuse interval + | | delay | (delay time) + | +---+---+ + | |+-------------------+ + | v + | +------+ + | | idle | idle time + | +---+--+ + | |+-------------------+ + +--------+ + */ + +/** + * Object has these status of lifecycle above.
+ * + * @author yxssfxwzy@sina.com
+ * @since 0.5.1 + * @see TimerReuseProxyPool + */ + +public class TimerReuseProxy extends Proxy implements Delayed, Serializable { + + private static final long serialVersionUID = 228939737383625551L; + public static final int ERROR_403 = 403; + public static final int ERROR_404 = 404; + public static final int ERROR_BANNED = 10000;// banned by website + public static final int ERROR_Proxy = 10001;// the proxy itself failed + public static final int SUCCESS = 200; + + private int reuseTimeInterval = 1500;// ms + private Long canReuseTime = 0L; + private Long lastBorrowTime = System.currentTimeMillis(); + private Long responseTime = 0L; + + private int failedNum = 0; + private int successNum = 0; + private int borrowNum = 0; + + private List failedErrorType = new ArrayList(); + + public TimerReuseProxy(ProxyHost proxyHost, String user, String password) { + super(proxyHost, user, password); + } + + public TimerReuseProxy(ProxyHost proxyHost, String user, String password, int reuseTimeInterval) { + super(proxyHost, user, password); + this.reuseTimeInterval = reuseTimeInterval; + } + + public int getSuccessNum() { + return successNum; + } + + public void successNumIncrement(int increment) { + this.successNum += increment; + } + + public Long getLastUseTime() { + return lastBorrowTime; + } + + public void setLastBorrowTime(Long lastBorrowTime) { + this.lastBorrowTime = lastBorrowTime; + } + + public void recordResponse() { + this.responseTime = (System.currentTimeMillis() - lastBorrowTime + responseTime) / 2; + this.lastBorrowTime = System.currentTimeMillis(); + } + + public List getFailedErrorType() { + return failedErrorType; + } + + public void setFailedErrorType(List failedErrorType) { + this.failedErrorType = failedErrorType; + } + + public void fail(int failedErrorType) { + this.failedNum++; + this.failedErrorType.add(failedErrorType); + } + + public void setFailedNum(int failedNum) { + this.failedNum = failedNum; + } + + public int getFailedNum() { + return failedNum; + } + + public String getFailedType() { + String re = ""; + for (Integer i : this.failedErrorType) { + re += i + " . "; + } + return re; + } + + public int getReuseTimeInterval() { + return reuseTimeInterval; + } + + public void setReuseTimeInterval(int reuseTimeInterval) { + this.reuseTimeInterval = reuseTimeInterval; + this.canReuseTime = System.nanoTime() + TimeUnit.NANOSECONDS.convert(reuseTimeInterval, TimeUnit.MILLISECONDS); + + } + + @Override + public long getDelay(TimeUnit unit) { + return unit.convert(canReuseTime - System.nanoTime(), TimeUnit.NANOSECONDS); + } + + @Override + public int compareTo(Delayed o) { + TimerReuseProxy that = (TimerReuseProxy) o; + return canReuseTime > that.canReuseTime ? 1 : (canReuseTime < that.canReuseTime ? -1 : 0); + + } + + public void borrowNumIncrement(int increment) { + this.borrowNum += increment; + } + + public int getBorrowNum() { + return borrowNum; + } +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyPool.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/TimerReuseProxyPool.java similarity index 85% rename from webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyPool.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/proxy/TimerReuseProxyPool.java index f7cd049..4752fee 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyPool.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/TimerReuseProxyPool.java @@ -22,12 +22,12 @@ import java.util.concurrent.DelayQueue; * @see Proxy * @since 0.5.1 */ -public class SimpleProxyPool implements ProxyPool { +public class TimerReuseProxyPool implements ProxyPool { private Logger logger = LoggerFactory.getLogger(getClass()); - private BlockingQueue proxyQueue = new DelayQueue(); - private Map allProxy = new ConcurrentHashMap(); + private BlockingQueue proxyQueue = new DelayQueue(); + private Map allProxy = new ConcurrentHashMap(); private int reuseInterval = 1500;// ms private int reviveTime = 2 * 60 * 60 * 1000;// ms @@ -50,15 +50,15 @@ public class SimpleProxyPool implements ProxyPool { } }; - public SimpleProxyPool() { + public TimerReuseProxyPool() { this(null, true); } - public SimpleProxyPool(List httpProxyList) { + public TimerReuseProxyPool(List httpProxyList) { this(httpProxyList, true); } - public SimpleProxyPool(List httpProxyList, boolean isUseLastProxy) { + public TimerReuseProxyPool(List httpProxyList, boolean isUseLastProxy) { if (httpProxyList != null) { addProxy(httpProxyList.toArray(new String[httpProxyList.size()][])); } @@ -109,9 +109,9 @@ public class SimpleProxyPool implements ProxyPool { } private Map prepareForSaving() { - Map tmp = new HashMap(); - for (Entry e : allProxy.entrySet()) { - Proxy p = e.getValue(); + Map tmp = new HashMap(); + for (Entry e : allProxy.entrySet()) { + TimerReuseProxy p = e.getValue(); p.setFailedNum(0); tmp.put(e.getKey(), p); } @@ -152,30 +152,20 @@ public class SimpleProxyPool implements ProxyPool { logger.info("proxy pool size>>>>" + allProxy.size()); } - public void addProxy(String[]... httpProxyList) { + public void addProxy(Proxy... httpProxyList) { isEnable = true; - for (String[] s : httpProxyList) { - try { - if (allProxy.containsKey(s[2])) { - continue; - } - HttpHost item = new HttpHost(InetAddress.getByName(s[2]), Integer.valueOf(s[3])); - if (!validateWhenInit || ProxyUtils.validateProxy(item)) { - Proxy p = new Proxy(item, reuseInterval, s[0], s[1]); - proxyQueue.add(p); - allProxy.put(s[2], p); - } - } catch (NumberFormatException e) { - logger.error("HttpHost init error:", e); - } catch (UnknownHostException e) { - logger.error("HttpHost init error:", e); + for (Proxy proxy : httpProxyList) { + if (!validateWhenInit || ProxyUtils.validateProxy(proxy.getProxyHost())) { + TimerReuseProxy p = new TimerReuseProxy(proxy.getProxyHost(), proxy.getUser(), proxy.getPassword(), reuseInterval); + proxyQueue.add(p); + allProxy.put(p.getProxyHost().getHost(), p); } } logger.info("proxy pool size>>>>" + allProxy.size()); } - public Proxy getProxy() { - Proxy proxy = null; + public TimerReuseProxy getProxy() { + TimerReuseProxy proxy = null; try { Long time = System.currentTimeMillis(); proxy = proxyQueue.take(); @@ -183,7 +173,7 @@ public class SimpleProxyPool implements ProxyPool { if (costTime > reuseInterval) { logger.info("get proxy time >>>> " + costTime); } - Proxy p = allProxy.get(proxy.getHttpHost().getAddress().getHostAddress()); + TimerReuseProxy p = allProxy.get(proxy.getProxyHost().getHost()); p.setLastBorrowTime(System.currentTimeMillis()); p.borrowNumIncrement(1); } catch (InterruptedException e) { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/ProxyUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/ProxyUtils.java index f44c2ac..f9f9a8c 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/ProxyUtils.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/ProxyUtils.java @@ -1,19 +1,14 @@ package us.codecraft.webmagic.utils; -import java.io.IOException; -import java.net.Inet6Address; -import java.net.InetAddress; -import java.net.InetSocketAddress; -import java.net.NetworkInterface; -import java.net.Socket; -import java.net.SocketException; -import java.net.UnknownHostException; -import java.util.Enumeration; -import java.util.regex.Pattern; - import org.apache.http.HttpHost; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import us.codecraft.webmagic.proxy.ProxyHost; + +import java.io.IOException; +import java.net.*; +import java.util.Enumeration; +import java.util.regex.Pattern; /** * Pooled Proxy Object @@ -69,7 +64,11 @@ public class ProxyUtils { } } - public static boolean validateProxy(HttpHost p) { + public static HttpHost convert(ProxyHost p){ + return new HttpHost(p.getHost(),p.getPort()); + } + + public static boolean validateProxy(ProxyHost p) { if (localAddr == null) { logger.error("cannot get local IP"); return false; @@ -79,7 +78,7 @@ public class ProxyUtils { try { socket = new Socket(); socket.bind(new InetSocketAddress(localAddr, 0)); - InetSocketAddress endpointSocketAddr = new InetSocketAddress(p.getAddress().getHostAddress(), p.getPort()); + InetSocketAddress endpointSocketAddr = new InetSocketAddress(p.getHost(), p.getPort()); socket.connect(endpointSocketAddr, 3000); logger.debug("SUCCESS - connection established! Local: " + localAddr.getHostAddress() + " remote: " + p); isReachable = true; diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java index f218356..6477323 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java @@ -29,7 +29,7 @@ public class ProxyTest { @Test public void testProxy() { - SimpleProxyPool proxyPool = new SimpleProxyPool(httpProxyList,false); + TimerReuseProxyPool proxyPool = new TimerReuseProxyPool(httpProxyList,false); proxyPool.setReuseInterval(500); assertThat(proxyPool.getIdleNum()).isEqualTo(4); for (int i = 0; i < 2; i++) {