diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java
index 01a4c75..3d38333 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java
@@ -49,7 +49,7 @@ public class Site {
private HttpHost httpProxy;
- private ProxyPool httpProxyPool=new ProxyPool();
+ private ProxyPool httpProxyPool;
private boolean useGzip = true;
@@ -453,6 +453,11 @@ public class Site {
return this;
}
+ /**
+  * Enables proxy pooling for this site by installing a default {@link ProxyPool}.
+  * <p>
+  * The call is idempotent: a pool that is already present is kept. A pool created
+  * with the no-arg constructor schedules its own periodic save task, so replacing
+  * it on every call would leave the task of the discarded pool running.
+  *
+  * @return this site, for call chaining
+  */
+ public Site enableHttpProxyPool() {
+     if (this.httpProxyPool == null) {
+         this.httpProxyPool = new ProxyPool();
+     }
+     return this;
+ }
+
public ProxyPool getHttpProxyPool() {
return httpProxyPool;
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
index bdafea7..1891ef5 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
@@ -141,8 +141,8 @@ public class HttpClientDownloader extends AbstractDownloader {
.setSocketTimeout(site.getTimeOut())
.setConnectTimeout(site.getTimeOut())
.setCookieSpec(CookieSpecs.BEST_MATCH);
- if (site.getHttpProxyPool().isEnable()) {
- HttpHost host = site.getHttpProxyFromPool();
+ if (site.getHttpProxyPool() != null && site.getHttpProxyPool().isEnable()) {
+ HttpHost host = site.getHttpProxyFromPool();
requestConfigBuilder.setProxy(host);
request.putExtra(Request.PROXY, host);
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyPool.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyPool.java
index d7ad275..3996ffb 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyPool.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyPool.java
@@ -1,324 +1,311 @@
package us.codecraft.webmagic.proxy;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileNotFoundException;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.ObjectInputStream;
-import java.io.ObjectOutputStream;
+import org.apache.http.HttpHost;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import us.codecraft.webmagic.utils.FilePersistentBase;
+import us.codecraft.webmagic.utils.ProxyUtils;
+
+import java.io.*;
import java.net.InetAddress;
import java.net.UnknownHostException;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.NoSuchElementException;
-import java.util.Timer;
-import java.util.TimerTask;
+import java.util.*;
import java.util.Map.Entry;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.DelayQueue;
-import org.apache.http.HttpHost;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import us.codecraft.webmagic.utils.FilePersistentBase;
-import us.codecraft.webmagic.utils.ProxyUtils;
-
/**
* Pooled Proxy Object
- *
+ *
* @author yxssfxwzy@sina.com
- * @since 0.5.1
* @see Proxy
+ * @since 0.5.1
*/
public class ProxyPool {
- private Logger logger = LoggerFactory.getLogger(getClass());
+ private Logger logger = LoggerFactory.getLogger(getClass());
- private BlockingQueue proxyQueue = new DelayQueue();
- private Map allProxy = new ConcurrentHashMap();
+ private BlockingQueue proxyQueue = new DelayQueue();
+ private Map allProxy = new ConcurrentHashMap();
- private int reuseInterval = 1500;// ms
- private int reviveTime = 2 * 60 * 60 * 1000;// ms
- private int saveProxyInterval = 10 * 60 * 1000;// ms
+ private int reuseInterval = 1500;// ms
+ private int reviveTime = 2 * 60 * 60 * 1000;// ms
+ private int saveProxyInterval = 10 * 60 * 1000;// ms
- private boolean isEnable = false;
- private boolean validateWhenInit = false;
- // private boolean isUseLastProxy = true;
- private String proxyFilePath = "/data/webmagic/lastUse.proxy";
+ private boolean isEnable = false;
+ private boolean validateWhenInit = false;
+ // private boolean isUseLastProxy = true;
+ private String proxyFilePath = "/data/webmagic/lastUse.proxy";
- private FilePersistentBase fBase = new FilePersistentBase();
+ private FilePersistentBase fBase = new FilePersistentBase();
- private Timer timer = new Timer(true);
- private TimerTask saveProxyTask = new TimerTask() {
+ private Timer timer = new Timer(true);
+ private TimerTask saveProxyTask = new TimerTask() {
- @Override
- public void run() {
- saveProxyList();
- logger.info(allProxyStatus());
- }
- };
+ @Override
+ public void run() {
+ saveProxyList();
+ logger.info(allProxyStatus());
+ }
+ };
- public ProxyPool() {
- this(null, true);
- }
+ /**
+  * Creates a pool without initial proxies. Equivalent to
+  * {@code new ProxyPool(null, true)}: proxies saved by a previous run are loaded
+  * and the periodic save task is scheduled.
+  */
+ public ProxyPool() {
+     this(null, true);
+ }
- public ProxyPool(List httpProxyList) {
- this(httpProxyList, true);
- }
+ /**
+  * Creates a pool seeded with the given proxies. Equivalent to
+  * {@code new ProxyPool(httpProxyList, true)}.
+  *
+  * @param httpProxyList proxies as {@code {host, port}} string pairs; may be {@code null}
+  */
+ public ProxyPool(List httpProxyList) {
+     this(httpProxyList, true);
+ }
- public ProxyPool(List httpProxyList, boolean isUseLastProxy) {
- if (httpProxyList != null) {
- addProxy(httpProxyList.toArray(new String[httpProxyList.size()][]));
- }
- if (isUseLastProxy) {
- if (!new File(proxyFilePath).exists()) {
- setFilePath();
- }
- setFilePath();
- readProxyList();
- timer.schedule(saveProxyTask, 0, saveProxyInterval);
- }
- }
+ /**
+  * Creates a pool seeded with the given proxies.
+  *
+  * @param httpProxyList  proxies as {@code {host, port}} string pairs; {@code null} adds nothing
+  * @param isUseLastProxy when {@code true}, the persistence file is resolved, proxies saved
+  *                       by a previous run are loaded from it, and a task that saves the pool
+  *                       every {@code saveProxyInterval} milliseconds is scheduled
+  */
+ public ProxyPool(List httpProxyList, boolean isUseLastProxy) {
+     if (httpProxyList != null) {
+         addProxy(httpProxyList.toArray(new String[httpProxyList.size()][]));
+     }
+     if (isUseLastProxy) {
+         // A single call is enough: the former exists()-guarded call was always
+         // followed by this unconditional one, so it never changed the outcome.
+         setFilePath();
+         readProxyList();
+         timer.schedule(saveProxyTask, 0, saveProxyInterval);
+     }
+ }
- private void setFilePath() {
- String tmpDir = System.getProperty("java.io.tmpdir");
- String path = tmpDir + "webmagic\\lastUse.proxy";
- if (tmpDir != null && new File(tmpDir).isDirectory()) {
- fBase.setPath(tmpDir + "webmagic");
- File f = fBase.getFile(path);
- if (!f.exists()) {
- try {
- f.createNewFile();
- } catch (IOException e) {
- logger.error("proxy file create error", e);
- }
- }
- } else {
- logger.error("java tmp dir not exists");
- }
- this.proxyFilePath = path;
- }
+ /**
+  * Points the pool at {@code <java.io.tmpdir>/webmagic/lastUse.proxy} and creates
+  * that file when it does not exist yet.
+  * <p>
+  * When the temp directory property is unset or does not name a directory, an
+  * error is logged and {@code proxyFilePath} keeps its current value, so that a
+  * path built from an unusable directory is never stored.
+  */
+ private void setFilePath() {
+     String tmpDir = System.getProperty("java.io.tmpdir");
+     if (tmpDir == null || !new File(tmpDir).isDirectory()) {
+         logger.error("java tmp dir not exists");
+         return;
+     }
+     String dir = tmpDir + FilePersistentBase.PATH_SEPERATOR + "webmagic";
+     String path = dir + FilePersistentBase.PATH_SEPERATOR + "lastUse.proxy";
+     fBase.setPath(dir);
+     File f = fBase.getFile(path);
+     if (!f.exists()) {
+         try {
+             f.createNewFile();
+         } catch (IOException e) {
+             // Best effort: the path is still recorded so a later save can retry.
+             logger.error("proxy file create error", e);
+         }
+     }
+     this.proxyFilePath = path;
+ }
- private void saveProxyList() {
- if (allProxy.size() == 0) {
- return;
- }
- try {
- ObjectOutputStream os = new ObjectOutputStream(new FileOutputStream(fBase.getFile(proxyFilePath)));
- os.writeObject(prepareForSaving());
- os.close();
- logger.info("save proxy");
- } catch (FileNotFoundException e) {
- logger.error("proxy file not found", e);
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
+ /**
+  * Serializes the current proxies to the persistence file. Does nothing while
+  * the pool is empty. Failures are logged and otherwise ignored, so a failed
+  * save never propagates to the caller.
+  */
+ private void saveProxyList() {
+     if (allProxy.size() == 0) {
+         return;
+     }
+     FileOutputStream fileOut = null;
+     try {
+         fileOut = new FileOutputStream(fBase.getFile(proxyFilePath));
+         ObjectOutputStream os = new ObjectOutputStream(fileOut);
+         os.writeObject(prepareForSaving());
+         os.close();
+         logger.info("save proxy");
+     } catch (FileNotFoundException e) {
+         logger.error("proxy file not found", e);
+     } catch (IOException e) {
+         logger.error("save proxy error", e);
+     } finally {
+         // Releases the file handle when writing failed before os.close() ran;
+         // closing an already closed stream is harmless.
+         if (fileOut != null) {
+             try {
+                 fileOut.close();
+             } catch (IOException e) {
+                 logger.warn("proxy file close error", e);
+             }
+         }
+     }
+ }
- private Map prepareForSaving() {
- Map tmp = new HashMap();
- for (Entry e : allProxy.entrySet()) {
- Proxy p = e.getValue();
- p.setFailedNum(0);
- tmp.put(e.getKey(), p);
- }
- return tmp;
- }
+ /**
+  * Builds the map that {@code saveProxyList()} writes to disk: a {@code HashMap}
+  * holding every entry of {@code allProxy}.
+  * <p>
+  * Note that the failure counter is reset on the pooled {@code Proxy} instances
+  * themselves, not on copies.
+  *
+  * @return a new map with the same keys and {@code Proxy} instances as {@code allProxy}
+  */
+ private Map prepareForSaving() {
+     Map snapshot = new HashMap();
+     for (Entry entry : allProxy.entrySet()) {
+         Proxy proxy = entry.getValue();
+         proxy.setFailedNum(0);
+         snapshot.put(entry.getKey(), proxy);
+     }
+     return snapshot;
+ }
- private void readProxyList() {
- try {
- ObjectInputStream is = new ObjectInputStream(new FileInputStream(fBase.getFile(proxyFilePath)));
- addProxy((Map) is.readObject());
- is.close();
- } catch (FileNotFoundException e) {
- logger.info("last use proxy file not found", e);
- } catch (IOException e) {
- // e.printStackTrace();
- } catch (ClassNotFoundException e) {
- // e.printStackTrace();
- }
- }
+ /**
+  * Loads the proxies saved by a previous run from the persistence file and adds
+  * them to the pool. Loading is best effort: a missing or unreadable file is
+  * logged and the pool simply starts without the saved proxies.
+  */
+ private void readProxyList() {
+     FileInputStream fileIn = null;
+     try {
+         fileIn = new FileInputStream(fBase.getFile(proxyFilePath));
+         ObjectInputStream is = new ObjectInputStream(fileIn);
+         // NOTE(review): this is native Java deserialization of a file on disk;
+         // confirm that nothing untrusted can write to the proxy file location.
+         addProxy((Map) is.readObject());
+         is.close();
+     } catch (FileNotFoundException e) {
+         logger.info("last use proxy file not found", e);
+     } catch (IOException e) {
+         // Expected e.g. when the file was just created and is still empty.
+         logger.info("last use proxy file not readable >>>> " + e);
+     } catch (ClassNotFoundException e) {
+         logger.info("last use proxy file not readable >>>> " + e);
+     } finally {
+         // Releases the file handle when reading failed before is.close() ran.
+         if (fileIn != null) {
+             try {
+                 fileIn.close();
+             } catch (IOException e) {
+                 logger.warn("proxy file close error", e);
+             }
+         }
+     }
+ }
- private void addProxy(Map httpProxyMap) {
- isEnable = true;
- for (Entry entry : httpProxyMap.entrySet()) {
- try {
- if (allProxy.containsKey(entry.getKey())) {
- continue;
- }
- if (!validateWhenInit || ProxyUtils.validateProxy(entry.getValue().getHttpHost())) {
- entry.getValue().setFailedNum(0);
- entry.getValue().setReuseTimeInterval(reuseInterval);
- proxyQueue.add(entry.getValue());
- allProxy.put(entry.getKey(), entry.getValue());
- }
- } catch (NumberFormatException e) {
- logger.error("HttpHost init error:", e);
- }
- }
- logger.info("proxy pool size>>>>" + allProxy.size());
- }
+ /**
+  * Adds previously saved proxies to the pool and marks the pool as enabled.
+  * Entries whose key is already present are skipped. When
+  * {@code validateWhenInit} is set, entries failing
+  * {@code ProxyUtils.validateProxy} are skipped as well. Accepted proxies get
+  * their failure counter reset and the pool's current reuse interval.
+  *
+  * @param httpProxyMap proxies keyed the same way as {@code allProxy}
+  */
+ private void addProxy(Map httpProxyMap) {
+     isEnable = true;
+     for (Entry entry : httpProxyMap.entrySet()) {
+         try {
+             if (allProxy.containsKey(entry.getKey())) {
+                 continue;
+             }
+             if (validateWhenInit && !ProxyUtils.validateProxy(entry.getValue().getHttpHost())) {
+                 continue;
+             }
+             Proxy proxy = entry.getValue();
+             proxy.setFailedNum(0);
+             proxy.setReuseTimeInterval(reuseInterval);
+             proxyQueue.add(proxy);
+             allProxy.put(entry.getKey(), proxy);
+         } catch (NumberFormatException e) {
+             logger.error("HttpHost init error:", e);
+         }
+     }
+     logger.info("proxy pool size>>>>" + allProxy.size());
+ }
- public void addProxy(String[]... httpProxyList) {
- isEnable = true;
- for (String[] s : httpProxyList) {
- try {
- if (allProxy.containsKey(s[0])) {
- continue;
- }
- HttpHost item = new HttpHost(InetAddress.getByName(s[0]), Integer.valueOf(s[1]));
- if (!validateWhenInit || ProxyUtils.validateProxy(item)) {
- Proxy p = new Proxy(item, reuseInterval);
- proxyQueue.add(p);
- allProxy.put(s[0], p);
- }
- } catch (NumberFormatException e) {
- logger.error("HttpHost init error:", e);
- } catch (UnknownHostException e) {
- logger.error("HttpHost init error:", e);
- }
- }
- logger.info("proxy pool size>>>>" + allProxy.size());
- }
+ /**
+  * Adds proxies to the pool and marks the pool as enabled.
+  * <p>
+  * Each proxy is registered under its resolved IP address, which is the key
+  * {@link #getProxy()} and {@link #returnProxy(HttpHost, int)} use for lookups;
+  * a proxy given by host name is therefore found again later. Proxies whose
+  * address is already registered are skipped. When {@code validateWhenInit} is
+  * set, proxies failing {@code ProxyUtils.validateProxy} are skipped as well.
+  * Malformed entries, unparsable ports and unresolvable hosts are logged and skipped.
+  *
+  * @param httpProxyList proxies as {@code {host, port}} string pairs
+  */
+ public void addProxy(String[]... httpProxyList) {
+     isEnable = true;
+     for (String[] s : httpProxyList) {
+         if (s == null || s.length < 2) {
+             logger.error("HttpHost init error: expected {host, port} but got " + Arrays.toString(s));
+             continue;
+         }
+         try {
+             InetAddress address = InetAddress.getByName(s[0]);
+             String key = address.getHostAddress();
+             if (allProxy.containsKey(key)) {
+                 continue;
+             }
+             HttpHost item = new HttpHost(address, Integer.valueOf(s[1]));
+             if (!validateWhenInit || ProxyUtils.validateProxy(item)) {
+                 Proxy p = new Proxy(item, reuseInterval);
+                 proxyQueue.add(p);
+                 allProxy.put(key, p);
+             }
+         } catch (NumberFormatException e) {
+             logger.error("HttpHost init error:", e);
+         } catch (UnknownHostException e) {
+             logger.error("HttpHost init error:", e);
+         }
+     }
+     logger.info("proxy pool size>>>>" + allProxy.size());
+ }
- public HttpHost getProxy() {
- Proxy proxy = null;
- try {
- Long time = System.currentTimeMillis();
- proxy = proxyQueue.take();
- double costTime = (System.currentTimeMillis() - time) / 1000.0;
- if (costTime > reuseInterval) {
- logger.info("get proxy time >>>> " + costTime);
- }
- Proxy p = allProxy.get(proxy.getHttpHost().getAddress().getHostAddress());
- p.setLastBorrowTime(System.currentTimeMillis());
- p.borrowNumIncrement(1);
- } catch (InterruptedException e) {
- logger.error("get proxy error", e);
- }
- if (proxy == null) {
- throw new NoSuchElementException();
- }
- return proxy.getHttpHost();
- }
+ /**
+  * Borrows the next available proxy, blocking until the queue can hand one out,
+  * and records the borrow on the proxy's statistics.
+  * <p>
+  * A wait longer than {@code reuseInterval} milliseconds is logged (in seconds).
+  *
+  * @return the host of the borrowed proxy
+  * @throws NoSuchElementException if the calling thread is interrupted while
+  *                                waiting; the interrupt flag is restored first
+  */
+ public HttpHost getProxy() {
+     Proxy proxy = null;
+     try {
+         long start = System.currentTimeMillis();
+         proxy = proxyQueue.take();
+         // Compare in milliseconds: reuseInterval is a millisecond value.
+         long waitedMillis = System.currentTimeMillis() - start;
+         if (waitedMillis > reuseInterval) {
+             logger.info("get proxy time >>>> " + waitedMillis / 1000.0);
+         }
+         Proxy p = allProxy.get(proxy.getHttpHost().getAddress().getHostAddress());
+         if (p == null) {
+             // Not registered under its IP address: update the borrowed instance itself.
+             p = proxy;
+         }
+         p.setLastBorrowTime(System.currentTimeMillis());
+         p.borrowNumIncrement(1);
+     } catch (InterruptedException e) {
+         Thread.currentThread().interrupt();
+         logger.error("get proxy error", e);
+     }
+     if (proxy == null) {
+         throw new NoSuchElementException();
+     }
+     return proxy.getHttpHost();
+ }
- public void returnProxy(HttpHost host, int statusCode) {
- Proxy p = allProxy.get(host.getAddress().getHostAddress());
- if (p == null) {
- return;
- }
- switch (statusCode) {
- case Proxy.SUCCESS:
- p.setReuseTimeInterval(reuseInterval);
- p.setFailedNum(0);
- p.setFailedErrorType(new ArrayList());
- p.recordResponse();
- p.successNumIncrement(1);
- break;
- case Proxy.ERROR_403:
- // banned,try longer interval
- p.fail(Proxy.ERROR_403);
- p.setReuseTimeInterval(reuseInterval * p.getFailedNum());
- logger.info(host + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0);
- break;
- case Proxy.ERROR_BANNED:
- p.fail(Proxy.ERROR_BANNED);
- p.setReuseTimeInterval(10 * 60 * 1000 * p.getFailedNum());
- logger.warn("this proxy is banned >>>> " + p.getHttpHost());
- logger.info(host + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0);
- break;
- case Proxy.ERROR_404:
- // p.fail(Proxy.ERROR_404);
- // p.setReuseTimeInterval(reuseInterval * p.getFailedNum());
- break;
- default:
- p.fail(statusCode);
- break;
- }
- if (p.getFailedNum() > 20) {
- p.setReuseTimeInterval(reviveTime);
- logger.error("remove proxy >>>> " + host + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size());
- return;
- }
- if (p.getFailedNum() > 0 && p.getFailedNum() % 5 == 0) {
- if (!ProxyUtils.validateProxy(host)) {
- p.setReuseTimeInterval(reviveTime);
- logger.error("remove proxy >>>> " + host + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size());
- return;
- }
- }
- try {
- proxyQueue.put(p);
- } catch (InterruptedException e) {
- logger.warn("proxyQueue return proxy error", e);
- }
- }
+ /**
+  * Hands a borrowed proxy back to the pool and updates its bookkeeping according
+  * to the outcome of the request it was used for.
+  * <p>
+  * A host that is not registered in the pool is ignored. A proxy with more than
+  * 20 recorded failures, or one that fails {@code ProxyUtils.validateProxy} at
+  * every fifth failure, is retired instead of being put back into the queue.
+  *
+  * @param host       the proxy host previously obtained from {@link #getProxy()}
+  * @param statusCode one of the {@code Proxy} status constants; any other value
+  *                   is recorded as a failure of that type
+  */
+ public void returnProxy(HttpHost host, int statusCode) {
+     Proxy p = allProxy.get(host.getAddress().getHostAddress());
+     if (p == null) {
+         return;
+     }
+     switch (statusCode) {
+         case Proxy.SUCCESS:
+             p.setReuseTimeInterval(reuseInterval);
+             p.setFailedNum(0);
+             p.setFailedErrorType(new ArrayList());
+             p.recordResponse();
+             p.successNumIncrement(1);
+             break;
+         case Proxy.ERROR_403:
+             // banned,try longer interval
+             p.fail(Proxy.ERROR_403);
+             p.setReuseTimeInterval(reuseInterval * p.getFailedNum());
+             logger.info(host + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0);
+             break;
+         case Proxy.ERROR_BANNED:
+             p.fail(Proxy.ERROR_BANNED);
+             p.setReuseTimeInterval(10 * 60 * 1000 * p.getFailedNum());
+             logger.warn("this proxy is banned >>>> " + p.getHttpHost());
+             logger.info(host + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0);
+             break;
+         case Proxy.ERROR_404:
+             // A 404 is not counted against the proxy.
+             break;
+         default:
+             p.fail(statusCode);
+             break;
+     }
+     if (p.getFailedNum() > 20) {
+         retireProxy(p, host);
+         return;
+     }
+     if (p.getFailedNum() > 0 && p.getFailedNum() % 5 == 0) {
+         if (!ProxyUtils.validateProxy(host)) {
+             retireProxy(p, host);
+             return;
+         }
+     }
+     try {
+         proxyQueue.put(p);
+     } catch (InterruptedException e) {
+         Thread.currentThread().interrupt();
+         logger.warn("proxyQueue return proxy error", e);
+     }
+ }
+
+ /**
+  * Sets the proxy's reuse interval to {@code reviveTime} and logs its removal.
+  * The proxy is not put back into the queue here.
+  * NOTE(review): nothing in the code shown re-queues a retired proxy later - confirm
+  * whether revival after {@code reviveTime} is intended.
+  */
+ private void retireProxy(Proxy p, HttpHost host) {
+     p.setReuseTimeInterval(reviveTime);
+     logger.error("remove proxy >>>> " + host + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size());
+ }
- public String allProxyStatus() {
- String re = "all proxy info >>>> \n";
- for (Entry entry : allProxy.entrySet()) {
- re += entry.getValue().toString() + "\n";
- }
- return re;
- }
+ /**
+  * Renders a status report of the pool: a header line followed by the
+  * {@code toString()} of every registered proxy, one per line.
+  *
+  * @return the multi-line status text
+  */
+ public String allProxyStatus() {
+     StringBuilder re = new StringBuilder("all proxy info >>>> \n");
+     for (Entry entry : allProxy.entrySet()) {
+         re.append(entry.getValue().toString()).append("\n");
+     }
+     return re.toString();
+ }
- public int getIdleNum() {
- return proxyQueue.size();
- }
+ /**
+  * @return the number of proxies currently held in the queue, i.e. not borrowed
+  */
+ public int getIdleNum() {
+     final int queued = proxyQueue.size();
+     return queued;
+ }
- public int getReuseInterval() {
- return reuseInterval;
- }
+ /**
+  * @return the base reuse interval, in milliseconds, assigned to proxies when
+  *         they are added or returned successfully
+  */
+ public int getReuseInterval() {
+     return this.reuseInterval;
+ }
- public void setReuseInterval(int reuseInterval) {
- this.reuseInterval = reuseInterval;
- }
+ /**
+  * Sets the base reuse interval. The value is read when a proxy is added or
+  * returned, so proxies already waiting in the queue keep their current interval.
+  *
+  * @param reuseInterval the interval in milliseconds
+  */
+ public void setReuseInterval(int reuseInterval) {
+     this.reuseInterval = reuseInterval;
+ }
- public void enable(boolean isEnable) {
- this.isEnable = isEnable;
- }
+ /**
+  * Switches the pool on or off. Adding proxies also switches the pool on.
+  *
+  * @param isEnable {@code true} to report the pool as enabled
+  */
+ public void enable(boolean isEnable) {
+     this.isEnable = isEnable;
+ }
- public boolean isEnable() {
- return isEnable;
- }
+ /**
+  * @return {@code true} when the pool has been enabled, either explicitly via
+  *         {@link #enable(boolean)} or by adding proxies
+  */
+ public boolean isEnable() {
+     return this.isEnable;
+ }
- public int getReviveTime() {
- return reviveTime;
- }
+ /**
+  * @return the reuse interval, in milliseconds, assigned to a proxy when it is
+  *         retired after repeated failures
+  */
+ public int getReviveTime() {
+     return this.reviveTime;
+ }
- public void setReviveTime(int reviveTime) {
- this.reviveTime = reviveTime;
- }
+ /**
+  * Sets the reuse interval assigned to a proxy when it is retired after
+  * repeated failures.
+  *
+  * @param reviveTime the interval in milliseconds
+  */
+ public void setReviveTime(int reviveTime) {
+     this.reviveTime = reviveTime;
+ }
- public boolean isValidateWhenInit() {
- return validateWhenInit;
- }
+ /**
+  * @return {@code true} when proxies are checked with
+  *         {@code ProxyUtils.validateProxy} before being added to the pool
+  */
+ public boolean isValidateWhenInit() {
+     return this.validateWhenInit;
+ }
- public void validateWhenInit(boolean validateWhenInit) {
- this.validateWhenInit = validateWhenInit;
- }
+ /**
+  * Controls whether proxies are validated before being added. The flag is read
+  * each time proxies are added, so it does not affect proxies that were already
+  * added, including those passed to the constructor.
+  *
+  * @param validateWhenInit {@code true} to validate proxies on add
+  */
+ public void validateWhenInit(boolean validateWhenInit) {
+     this.validateWhenInit = validateWhenInit;
+ }
- public int getSaveProxyInterval() {
- return saveProxyInterval;
- }
+ /**
+  * @return the period, in milliseconds, used when the save task is scheduled
+  */
+ public int getSaveProxyInterval() {
+     return this.saveProxyInterval;
+ }
- public void setSaveProxyInterval(int saveProxyInterval) {
- this.saveProxyInterval = saveProxyInterval;
- }
+ /**
+  * Sets the save period. The value is read when the constructor schedules the
+  * save task.
+  * NOTE(review): nothing in the code shown reschedules the task after a change - confirm.
+  *
+  * @param saveProxyInterval the period in milliseconds
+  */
+ public void setSaveProxyInterval(int saveProxyInterval) {
+     this.saveProxyInterval = saveProxyInterval;
+ }
- public String getProxyFilePath() {
- return proxyFilePath;
- }
+ /**
+  * @return the path of the file used to save and load the proxies
+  */
+ public String getProxyFilePath() {
+     return this.proxyFilePath;
+ }
- public void setProxyFilePath(String proxyFilePath) {
- this.proxyFilePath = proxyFilePath;
- }
+ /**
+  * Sets the path of the file used to save and load the proxies.
+  *
+  * @param proxyFilePath the file path handed to later save and load operations
+  */
+ public void setProxyFilePath(String proxyFilePath) {
+     this.proxyFilePath = proxyFilePath;
+ }
}
diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java
index 9d3d420..fd5827f 100644
--- a/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java
+++ b/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java
@@ -1,16 +1,14 @@
package us.codecraft.webmagic.proxy;
-import static org.assertj.core.api.Assertions.assertThat;
-
-import java.io.File;
-import java.util.ArrayList;
-import java.util.List;
-
import org.apache.http.HttpHost;
import org.junit.BeforeClass;
import org.junit.Test;
-import us.codecraft.webmagic.Request;
+import java.io.File;
+import java.util.ArrayList;
+import java.util.List;
+
+import static org.assertj.core.api.Assertions.assertThat;
/**
* @author yxssfxwzy@sina.com May 30, 2014
@@ -30,11 +28,6 @@ public class ProxyTest {
}
}
- @Test
- public void testAddProxy() {
-
- }
-
@Test
public void testProxy() {
ProxyPool proxyPool = new ProxyPool(httpProxyList);