From 19474e47169c24c1639c8ebbb671dec0937d54f2 Mon Sep 17 00:00:00 2001
From: edwardsbean
Date: Sat, 28 Feb 2015 17:50:10 +0800
Subject: [PATCH] add SimpleProxyPool and IProxyPool

---
Reviewer notes. Everything between the separator above and the first
"diff --git" line is outside the commit message.

What this patch changes
  * Adds the IProxyPool interface (returnProxy, getProxy, isEnable).
  * Makes the existing ProxyPool implement IProxyPool.
  * Adds SimpleProxyPool, an IProxyPool implementation backed by a DelayQueue
    plus a ConcurrentHashMap keyed by the proxy host string.
  * Site now stores an IProxyPool. setHttpProxyPool(...) takes a pool instance
    instead of a list, enableHttpProxyPool() creates a SimpleProxyPool, and
    setProxyReuseInterval(int) is removed. Both of those are
    source-incompatible for existing callers of Site.

Reconstruction caveats
  * This file had been flattened onto a few lines; the line breaks were
    restored so that every hunk matches the counts in its "@@" header.
  * Text in angle brackets had been stripped. Type arguments were restored in
    SimpleProxyPool only where the code cannot compile without them
    (BlockingQueue<Proxy>, Map<String, Proxy>, List<String[]>). The element
    type of the ArrayList created in returnProxy() could not be determined
    and is left raw.
  * Assumed, TODO confirm against the pre-image: 4-space indentation on all
    Site.java/ProxyPool.java context and removed lines, and List<String[]> on
    the removed Site.setHttpProxyPool line. If indentation differs,
    "git apply --ignore-whitespace" may still apply the hunks.
  * The author e-mail address is missing from the From: header; "git am"
    expects one there, plain "git apply" does not read it.

Follow-ups spotted in SimpleProxyPool (left as committed)
  * getProxy() and returnProxy() log InterruptedException but do not restore
    the thread's interrupt status.
  * returnProxy() has no @Override, unlike getProxy()/isEnable(), and calls
    host.getAddress().getHostAddress() without a null check (getAddress() is
    presumably null for hosts built from a plain hostname; verify).
  * The reviveTime field is declared but never read in the class.
  * addProxy() indexes s[0]/s[1] without checking the entry length.

 .../main/java/us/codecraft/webmagic/Site.java |  18 ++-
 .../codecraft/webmagic/proxy/IProxyPool.java  |  12 ++
 .../codecraft/webmagic/proxy/ProxyPool.java   |   2 +-
 .../webmagic/proxy/SimpleProxyPool.java       | 116 ++++++++++++++++++
 4 files changed, 136 insertions(+), 12 deletions(-)
 create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/proxy/IProxyPool.java
 create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyPool.java

diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java
index 32118ab..9769410 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java
@@ -4,7 +4,8 @@ import com.google.common.collect.HashBasedTable;
 import com.google.common.collect.Table;
 import org.apache.http.HttpHost;
-import us.codecraft.webmagic.proxy.ProxyPool;
+import us.codecraft.webmagic.proxy.IProxyPool;
+import us.codecraft.webmagic.proxy.SimpleProxyPool;
 import us.codecraft.webmagic.utils.UrlUtils;
 
 import java.util.*;
 
@@ -51,7 +52,7 @@ public class Site {
 
     private HttpHost httpProxy;
 
-    private ProxyPool httpProxyPool;
+    private IProxyPool httpProxyPool;
 
     private boolean useGzip = true;
 
@@ -464,17 +465,17 @@ public class Site {
      *
      * @return this
      */
-    public Site setHttpProxyPool(List<String[]> httpProxyList) {
-        this.httpProxyPool=new ProxyPool(httpProxyList);
+    public Site setHttpProxyPool(IProxyPool proxyPool) {
+        this.httpProxyPool = proxyPool;
         return this;
     }
 
     public Site enableHttpProxyPool() {
-        this.httpProxyPool=new ProxyPool();
+        this.httpProxyPool=new SimpleProxyPool();
         return this;
     }
 
-    public ProxyPool getHttpProxyPool() {
+    public IProxyPool getHttpProxyPool() {
         return httpProxyPool;
     }
 
@@ -486,9 +487,4 @@ public class Site {
         httpProxyPool.returnProxy(proxy,statusCode);
     }
 
-    public Site setProxyReuseInterval(int reuseInterval) {
-        this.httpProxyPool.setReuseInterval(reuseInterval);
-        return this;
-    }
-
 }
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/IProxyPool.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/IProxyPool.java
new file mode 100644
index 0000000..de7737e
--- /dev/null
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/IProxyPool.java
@@ -0,0 +1,12 @@
+package us.codecraft.webmagic.proxy;
+
+import org.apache.http.HttpHost;
+
+/**
+ * Created by edwardsbean on 15-2-28.
+ */
+public interface IProxyPool {
+    public void returnProxy(HttpHost host, int statusCode);
+    public HttpHost getProxy();
+    public boolean isEnable();
+}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyPool.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyPool.java
index c9f27d5..dcecb26 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyPool.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyPool.java
@@ -22,7 +22,7 @@ import java.util.concurrent.DelayQueue;
  * @see Proxy
  * @since 0.5.1
  */
-public class ProxyPool {
+public class ProxyPool implements IProxyPool{
 
     private Logger logger = LoggerFactory.getLogger(getClass());
 
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyPool.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyPool.java
new file mode 100644
index 0000000..e56b2cb
--- /dev/null
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyPool.java
@@ -0,0 +1,116 @@
+package us.codecraft.webmagic.proxy;
+
+import org.apache.http.HttpHost;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.net.InetAddress;
+import java.net.UnknownHostException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.NoSuchElementException;
+import java.util.concurrent.BlockingQueue;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.DelayQueue;
+
+/**
+ * Created by edwardsbean on 15-2-28.
+ */
+public class SimpleProxyPool implements IProxyPool{
+    private Logger logger = LoggerFactory.getLogger(getClass());
+
+    private BlockingQueue<Proxy> proxyQueue = new DelayQueue<Proxy>();
+    private Map<String, Proxy> allProxy = new ConcurrentHashMap<String, Proxy>();
+    private boolean isEnable = false;
+    private int reuseInterval = 1500;// ms
+    private int reviveTime = 2 * 60 * 60 * 1000;// ms
+
+    public SimpleProxyPool() {
+        this(null);
+    }
+
+    public SimpleProxyPool(List<String[]> httpProxyList) {
+        if (httpProxyList != null) {
+            addProxy(httpProxyList.toArray(new String[httpProxyList.size()][]));
+        }
+    }
+
+    public void addProxy(String[]... httpProxyList) {
+        isEnable = true;
+        for (String[] s : httpProxyList) {
+            try {
+                if (allProxy.containsKey(s[0])) {
+                    continue;
+                }
+                HttpHost item = new HttpHost(InetAddress.getByName(s[0]), Integer.valueOf(s[1]));
+                Proxy p = new Proxy(item, reuseInterval);
+                proxyQueue.add(p);
+                allProxy.put(s[0], p);
+            } catch (NumberFormatException e) {
+                logger.error("HttpHost init error:", e);
+            } catch (UnknownHostException e) {
+                logger.error("HttpHost init error:", e);
+            }
+        }
+        logger.info("proxy pool size>>>>" + allProxy.size());
+    }
+
+    public void returnProxy(HttpHost host, int statusCode) {
+        Proxy p = allProxy.get(host.getAddress().getHostAddress());
+        if (p == null) {
+            return;
+        }
+        switch (statusCode) {
+            case Proxy.SUCCESS:
+                p.setFailedNum(0);
+                p.setFailedErrorType(new ArrayList());
+                p.recordResponse();
+                p.successNumIncrement(1);
+                break;
+            case Proxy.ERROR_403:
+                // banned,try longer interval
+                p.fail(Proxy.ERROR_403);
+                break;
+            case Proxy.ERROR_BANNED:
+                p.fail(Proxy.ERROR_BANNED);
+                logger.warn("this proxy is banned >>>> " + p.getHttpHost());
+                break;
+            case Proxy.ERROR_404:
+                // p.fail(Proxy.ERROR_404);
+                // p.setReuseTimeInterval(reuseInterval * p.getFailedNum());
+                break;
+            default:
+                p.fail(statusCode);
+                break;
+        }
+        if (p.getFailedNum() > 3) {
+            logger.error("remove proxy >>>> " + host + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size());
+            return;
+        }
+        try {
+            proxyQueue.put(p);
+        } catch (InterruptedException e) {
+            logger.warn("proxyQueue return proxy error", e);
+        }
+    }
+
+    @Override
+    public HttpHost getProxy() {
+        Proxy proxy = null;
+        try {
+            proxy = proxyQueue.take();
+        } catch (InterruptedException e) {
+            logger.error("get proxy error", e);
+        }
+        if (proxy == null) {
+            throw new NoSuchElementException();
+        }
+        return proxy.getHttpHost();
+    }
+
+    @Override
+    public boolean isEnable() {
+        return isEnable;
+    }
+}