add test and fix bug of proxy module
parent
2a15bc0289
commit
2f89cfc31a
|
@ -424,6 +424,8 @@ public class Spider implements Runnable, Task {
|
|||
pipeline.process(page.getResultItems(), this);
|
||||
}
|
||||
}
|
||||
//for proxy status management
|
||||
request.putExtra(Request.STATUS_CODE, page.getStatusCode());
|
||||
sleep(site.getSleepTime());
|
||||
}
|
||||
|
||||
|
|
|
@ -9,7 +9,8 @@ import java.util.concurrent.TimeUnit;
|
|||
import org.apache.http.HttpHost;
|
||||
|
||||
/**
|
||||
* >>>>Proxy Status
|
||||
* >>>> Proxy lifecycle
|
||||
|
||||
+----------+ +-----+
|
||||
| last use | | new |
|
||||
+-----+----+ +---+-+
|
||||
|
@ -44,13 +45,22 @@ import org.apache.http.HttpHost;
|
|||
| |+-------------------+
|
||||
+--------+
|
||||
*/
|
||||
|
||||
/**
|
||||
* A proxy object moves through the lifecycle states shown in the diagram above.<br>
|
||||
*
|
||||
* @author yxssfxwzy@sina.com <br>
|
||||
* @since 0.5.1
|
||||
* @see ProxyPool
|
||||
*/
|
||||
|
||||
public class Proxy implements Delayed, Serializable {
|
||||
|
||||
private static final long serialVersionUID = 228939737383625551L;
|
||||
public static final int ERROR_403 = 403;
|
||||
public static final int ERROR_404 = 404;
|
||||
public static final int ERROR_BANNED = 10000;
|
||||
public static final int ERROR_Proxy = 10001;
|
||||
public static final int ERROR_BANNED = 10000;// banned by website
|
||||
public static final int ERROR_Proxy = 10001;// the proxy itself failed
|
||||
public static final int SUCCESS = 200;
|
||||
|
||||
private final HttpHost httpHost;
|
||||
|
@ -59,7 +69,6 @@ public class Proxy implements Delayed, Serializable {
|
|||
private Long canReuseTime = 0L;
|
||||
private Long lastBorrowTime = System.currentTimeMillis();
|
||||
private Long responseTime = 0L;
|
||||
private Long idleTime = 0L;
|
||||
|
||||
private int failedNum = 0;
|
||||
private int successNum = 0;
|
||||
|
@ -143,7 +152,7 @@ public class Proxy implements Delayed, Serializable {
|
|||
|
||||
@Override
|
||||
public long getDelay(TimeUnit unit) {
|
||||
return unit.convert(canReuseTime - System.nanoTime(), unit.NANOSECONDS);
|
||||
return unit.convert(canReuseTime - System.nanoTime(), TimeUnit.NANOSECONDS);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -1,26 +1,39 @@
|
|||
package us.codecraft.webmagic.proxy;
|
||||
|
||||
import org.apache.http.HttpHost;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.*;
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.ObjectInputStream;
|
||||
import java.io.ObjectOutputStream;
|
||||
import java.net.InetAddress;
|
||||
import java.net.UnknownHostException;
|
||||
import java.util.*;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.NoSuchElementException;
|
||||
import java.util.Timer;
|
||||
import java.util.TimerTask;
|
||||
import java.util.Map.Entry;
|
||||
import java.util.concurrent.BlockingQueue;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.concurrent.DelayQueue;
|
||||
|
||||
import org.apache.http.HttpHost;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import us.codecraft.webmagic.utils.FilePersistentBase;
|
||||
import us.codecraft.webmagic.utils.ProxyUtils;
|
||||
|
||||
/**
|
||||
* ClassName:ProxyPool
|
||||
* Pooled Proxy Object
|
||||
*
|
||||
* @see
|
||||
* @Function: TODO ADD FUNCTION
|
||||
* @author ch
|
||||
* @version Ver 1.0
|
||||
* @Date 2014-2-14 下午01:10:04
|
||||
* @author yxssfxwzy@sina.com <br>
|
||||
* @since 0.5.1
|
||||
* @see Proxy
|
||||
*/
|
||||
public class ProxyPool {
|
||||
|
||||
|
@ -31,10 +44,14 @@ public class ProxyPool {
|
|||
|
||||
private int reuseInterval = 1500;// ms
|
||||
private int reviveTime = 2 * 60 * 60 * 1000;// ms
|
||||
private int saveProxyInterval = 10 * 60 * 1000;// ms
|
||||
|
||||
private boolean isEnable = false;
|
||||
private boolean validateWhenInit = false;
|
||||
private String proxyFile = "data/lastUse.proxy";
|
||||
// private boolean isUseLastProxy = true;
|
||||
private String proxyFilePath = "/data/webmagic/lastUse.proxy";
|
||||
|
||||
private FilePersistentBase fBase = new FilePersistentBase();
|
||||
|
||||
private Timer timer = new Timer(true);
|
||||
private TimerTask saveProxyTask = new TimerTask() {
|
||||
|
@ -47,13 +64,46 @@ public class ProxyPool {
|
|||
};
|
||||
|
||||
/**
 * Creates a pool with no initial proxies that loads the last-use proxy file
 * and schedules the periodic save task.
 */
public ProxyPool() {
    this(null, true);
}

/**
 * Creates a pool from the given proxies that also loads the last-use proxy
 * file and schedules the periodic save task.
 *
 * @param httpProxyList proxies to add, one {@code String[]} per proxy; may be
 *                      {@code null} to add none
 */
public ProxyPool(List<String[]> httpProxyList) {
    // An explicit constructor call must be the first statement, so all work is delegated.
    this(httpProxyList, true);
}

/**
 * Creates a pool from the given proxies.
 *
 * @param httpProxyList  proxies to add, one {@code String[]} per proxy; may be
 *                       {@code null} to add none
 * @param isUseLastProxy when {@code true}, the proxy file location is set up,
 *                       the proxies stored there are read into the pool, and
 *                       {@code saveProxyTask} is scheduled to run immediately
 *                       and then every {@code saveProxyInterval} milliseconds
 */
public ProxyPool(List<String[]> httpProxyList, boolean isUseLastProxy) {
    if (httpProxyList != null) {
        addProxy(httpProxyList.toArray(new String[httpProxyList.size()][]));
    }
    if (isUseLastProxy) {
        // Previously this was called once behind an "if file is missing" guard and then
        // again unconditionally; a single unconditional call has the same effect.
        setFilePath();
        readProxyList();
        timer.schedule(saveProxyTask, 0, saveProxyInterval);
    }
}
|
||||
|
||||
private void setFilePath() {
|
||||
String tmpDir = System.getProperty("java.io.tmpdir");
|
||||
String path = tmpDir + "webmagic\\lastUse.proxy";
|
||||
if (tmpDir != null && new File(tmpDir).isDirectory()) {
|
||||
fBase.setPath(tmpDir + "webmagic");
|
||||
File f = fBase.getFile(path);
|
||||
if (!f.exists()) {
|
||||
try {
|
||||
f.createNewFile();
|
||||
|
||||
} catch (IOException e) {
|
||||
logger.error("proxy file create error", e);
|
||||
}
|
||||
}
|
||||
|
||||
} else {
|
||||
logger.error("java tmp dir not exists");
|
||||
}
|
||||
this.proxyFilePath = path;
|
||||
}
|
||||
|
||||
private void saveProxyList() {
|
||||
|
@ -61,7 +111,7 @@ public class ProxyPool {
|
|||
return;
|
||||
}
|
||||
try {
|
||||
ObjectOutputStream os = new ObjectOutputStream(new FileOutputStream(proxyFile));
|
||||
ObjectOutputStream os = new ObjectOutputStream(new FileOutputStream(fBase.getFile(proxyFilePath)));
|
||||
os.writeObject(prepareForSaving());
|
||||
os.close();
|
||||
logger.info("save proxy");
|
||||
|
@ -84,15 +134,15 @@ public class ProxyPool {
|
|||
|
||||
private void readProxyList() {
|
||||
try {
|
||||
ObjectInputStream is = new ObjectInputStream(new FileInputStream(proxyFile));
|
||||
ObjectInputStream is = new ObjectInputStream(new FileInputStream(fBase.getFile(proxyFilePath)));
|
||||
addProxy((Map<String, Proxy>) is.readObject());
|
||||
is.close();
|
||||
} catch (FileNotFoundException e) {
|
||||
logger.error("proxy file not found", e);
|
||||
logger.info("last use proxy file not found", e);
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
// e.printStackTrace();
|
||||
} catch (ClassNotFoundException e) {
|
||||
e.printStackTrace();
|
||||
// e.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -103,7 +153,7 @@ public class ProxyPool {
|
|||
if (allProxy.containsKey(entry.getKey())) {
|
||||
continue;
|
||||
}
|
||||
if (!validateWhenInit || ProxyUtil.validateProxy(entry.getValue().getHttpHost())) {
|
||||
if (!validateWhenInit || ProxyUtils.validateProxy(entry.getValue().getHttpHost())) {
|
||||
entry.getValue().setFailedNum(0);
|
||||
entry.getValue().setReuseTimeInterval(reuseInterval);
|
||||
proxyQueue.add(entry.getValue());
|
||||
|
@ -124,7 +174,7 @@ public class ProxyPool {
|
|||
continue;
|
||||
}
|
||||
HttpHost item = new HttpHost(InetAddress.getByName(s[0]), Integer.valueOf(s[1]));
|
||||
if (!validateWhenInit || ProxyUtil.validateProxy(item)) {
|
||||
if (!validateWhenInit || ProxyUtils.validateProxy(item)) {
|
||||
Proxy p = new Proxy(item, reuseInterval);
|
||||
proxyQueue.add(p);
|
||||
allProxy.put(s[0], p);
|
||||
|
@ -173,7 +223,7 @@ public class ProxyPool {
|
|||
p.successNumIncrement(1);
|
||||
break;
|
||||
case Proxy.ERROR_403:
|
||||
// banned,try larger interval
|
||||
// banned,try longer interval
|
||||
p.fail(Proxy.ERROR_403);
|
||||
p.setReuseTimeInterval(reuseInterval * p.getFailedNum());
|
||||
logger.info(host + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0);
|
||||
|
@ -185,7 +235,7 @@ public class ProxyPool {
|
|||
logger.info(host + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0);
|
||||
break;
|
||||
case Proxy.ERROR_404:
|
||||
//p.fail(Proxy.ERROR_404);
|
||||
// p.fail(Proxy.ERROR_404);
|
||||
// p.setReuseTimeInterval(reuseInterval * p.getFailedNum());
|
||||
break;
|
||||
default:
|
||||
|
@ -193,14 +243,12 @@ public class ProxyPool {
|
|||
break;
|
||||
}
|
||||
if (p.getFailedNum() > 20) {
|
||||
// allProxy.remove(host.getAddress().getHostAddress());
|
||||
p.setReuseTimeInterval(reviveTime);
|
||||
logger.error("remove proxy >>>> " + host + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size());
|
||||
return;
|
||||
}
|
||||
if (p.getFailedNum()%5==0) {
|
||||
if (!ProxyUtil.validateProxy(host)) {
|
||||
// allProxy.remove(host.getAddress().getHostAddress());
|
||||
if (p.getFailedNum() > 0 && p.getFailedNum() % 5 == 0) {
|
||||
if (!ProxyUtils.validateProxy(host)) {
|
||||
p.setReuseTimeInterval(reviveTime);
|
||||
logger.error("remove proxy >>>> " + host + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size());
|
||||
return;
|
||||
|
@ -219,7 +267,6 @@ public class ProxyPool {
|
|||
re += entry.getValue().toString() + "\n";
|
||||
}
|
||||
return re;
|
||||
|
||||
}
|
||||
|
||||
public int getIdleNum() {
|
||||
|
@ -234,52 +281,6 @@ public class ProxyPool {
|
|||
this.reuseInterval = reuseInterval;
|
||||
}
|
||||
|
||||
public static List<String[]> getProxyList() {
|
||||
List<String[]> proxyList = new ArrayList<String[]>();
|
||||
BufferedReader br = null;
|
||||
try {
|
||||
br = new BufferedReader(new FileReader(new File("proxy.txt")));
|
||||
|
||||
String line = "";
|
||||
while ((line = br.readLine()) != null) {
|
||||
proxyList.add(new String[] { line.split(":")[0], line.split(":")[1] });
|
||||
}
|
||||
} catch (FileNotFoundException e) {
|
||||
e.printStackTrace();
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
return proxyList;
|
||||
}
|
||||
|
||||
public static void main(String[] args) throws IOException {
|
||||
ProxyPool proxyPool = new ProxyPool(getProxyList());
|
||||
proxyPool.setReuseInterval(10000);
|
||||
// proxyPool.saveProxyList();
|
||||
|
||||
while (true) {
|
||||
List<HttpHost> httphostList = new ArrayList<HttpHost>();
|
||||
System.in.read();
|
||||
int i = 0;
|
||||
while (proxyPool.getIdleNum() > 2) {
|
||||
HttpHost httphost = proxyPool.getProxy();
|
||||
httphostList.add(httphost);
|
||||
// proxyPool.proxyPool.use(httphost);
|
||||
proxyPool.logger.info("borrow object>>>>" + i + ">>>>" + httphostList.get(i).toString());
|
||||
i++;
|
||||
}
|
||||
System.out.println(proxyPool.allProxyStatus());
|
||||
System.in.read();
|
||||
for (i = 0; i < httphostList.size(); i++) {
|
||||
proxyPool.returnProxy(httphostList.get(i), 200);
|
||||
proxyPool.logger.info("return object>>>>" + i + ">>>>" + httphostList.get(i).toString());
|
||||
}
|
||||
System.out.println(proxyPool.allProxyStatus());
|
||||
System.in.read();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public void enable(boolean isEnable) {
|
||||
this.isEnable = isEnable;
|
||||
}
|
||||
|
@ -287,4 +288,37 @@ public class ProxyPool {
|
|||
public boolean isEnable() {
|
||||
return isEnable;
|
||||
}
|
||||
|
||||
public int getReviveTime() {
|
||||
return reviveTime;
|
||||
}
|
||||
|
||||
public void setReviveTime(int reviveTime) {
|
||||
this.reviveTime = reviveTime;
|
||||
}
|
||||
|
||||
public boolean isValidateWhenInit() {
|
||||
return validateWhenInit;
|
||||
}
|
||||
|
||||
public void validateWhenInit(boolean validateWhenInit) {
|
||||
this.validateWhenInit = validateWhenInit;
|
||||
}
|
||||
|
||||
public int getSaveProxyInterval() {
|
||||
return saveProxyInterval;
|
||||
}
|
||||
|
||||
public void setSaveProxyInterval(int saveProxyInterval) {
|
||||
this.saveProxyInterval = saveProxyInterval;
|
||||
}
|
||||
|
||||
public String getProxyFilePath() {
|
||||
return proxyFilePath;
|
||||
}
|
||||
|
||||
public void setProxyFilePath(String proxyFilePath) {
|
||||
this.proxyFilePath = proxyFilePath;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
package us.codecraft.webmagic.proxy;
|
||||
package us.codecraft.webmagic.utils;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.Inet6Address;
|
||||
|
@ -7,36 +7,54 @@ import java.net.InetSocketAddress;
|
|||
import java.net.NetworkInterface;
|
||||
import java.net.Socket;
|
||||
import java.net.SocketException;
|
||||
import java.net.UnknownHostException;
|
||||
import java.util.Enumeration;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.apache.http.HttpHost;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
/**
|
||||
* ClassName:ProxyUtil
|
||||
* Pooled Proxy Object
|
||||
*
|
||||
* @see
|
||||
* @author ch
|
||||
* @version Ver 1.0
|
||||
* @Date 2014-2-16 下午04:20:07
|
||||
* @author yxssfxwzy@sina.com <br>
|
||||
* @since 0.5.1
|
||||
*/
|
||||
public class ProxyUtil {
|
||||
// TODO: make this a singleton
|
||||
|
||||
public class ProxyUtils {
|
||||
private static InetAddress localAddr;
|
||||
private static final Logger logger = LoggerFactory.getLogger(ProxyUtil.class);
|
||||
private static String networkInterface = "eth7";
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(ProxyUtils.class);
|
||||
static {
|
||||
init();
|
||||
}
|
||||
|
||||
private static void init() {
|
||||
// first way to get local IP
|
||||
try {
|
||||
localAddr = InetAddress.getLocalHost();
|
||||
logger.info("local IP:" + localAddr.getHostAddress());
|
||||
} catch (UnknownHostException e) {
|
||||
logger.info("try again\n");
|
||||
}
|
||||
if (localAddr != null) {
|
||||
return;
|
||||
}
|
||||
// other way to get local IP
|
||||
Enumeration<InetAddress> localAddrs;
|
||||
try {
|
||||
NetworkInterface ni = NetworkInterface.getByName("eth7");
|
||||
// modify your network interface name
|
||||
NetworkInterface ni = NetworkInterface.getByName(networkInterface);
|
||||
if (ni == null) {
|
||||
logger.error("choose NetworkInterface\n" + getNetworkInterface());
|
||||
return;
|
||||
}
|
||||
localAddrs = ni.getInetAddresses();
|
||||
if (localAddrs == null || !localAddrs.hasMoreElements()) {
|
||||
logger.error("choose NetworkInterface\n" + getNetworkInterface());
|
||||
return;
|
||||
}
|
||||
while (localAddrs.hasMoreElements()) {
|
||||
InetAddress tmp = localAddrs.nextElement();
|
||||
if (!tmp.isLoopbackAddress() && !tmp.isLinkLocalAddress() && !(tmp instanceof Inet6Address)) {
|
||||
|
@ -49,12 +67,11 @@ public class ProxyUtil {
|
|||
logger.error("Failure when init ProxyUtil", e);
|
||||
logger.error("choose NetworkInterface\n" + getNetworkInterface());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public static boolean validateProxy(HttpHost p) {
|
||||
if (localAddr == null) {
|
||||
logger.error("cannot get local ip");
|
||||
logger.error("cannot get local IP");
|
||||
return false;
|
||||
}
|
||||
boolean isReachable = false;
|
||||
|
@ -81,7 +98,8 @@ public class ProxyUtil {
|
|||
}
|
||||
|
||||
private static String getNetworkInterface() {
|
||||
String networkInterfaceName = "";
|
||||
|
||||
String networkInterfaceName = ">>>> modify networkInterface in us.codecraft.webmagic.utils.ProxyUtils";
|
||||
Enumeration<NetworkInterface> enumeration = null;
|
||||
try {
|
||||
enumeration = NetworkInterface.getNetworkInterfaces();
|
||||
|
@ -90,10 +108,14 @@ public class ProxyUtil {
|
|||
}
|
||||
while (enumeration.hasMoreElements()) {
|
||||
NetworkInterface networkInterface = enumeration.nextElement();
|
||||
networkInterfaceName += networkInterface.toString() + '\n';
|
||||
|
||||
Enumeration<InetAddress> addr = networkInterface.getInetAddresses();
|
||||
while (addr.hasMoreElements()) {
|
||||
networkInterfaceName += "\tip:" + addr.nextElement().getHostAddress() + "\n";
|
||||
String s = addr.nextElement().getHostAddress();
|
||||
Pattern IPV4_PATTERN = Pattern.compile("^(25[0-5]|2[0-4]\\d|[0-1]?\\d?\\d)(\\.(25[0-5]|2[0-4]\\d|[0-1]?\\d?\\d)){3}$");
|
||||
if (s != null && IPV4_PATTERN.matcher(s).matches()) {
|
||||
networkInterfaceName += networkInterface.toString() + "IP:" + s + "\n\n";
|
||||
}
|
||||
}
|
||||
}
|
||||
return networkInterfaceName;
|
|
@ -0,0 +1,79 @@
|
|||
package us.codecraft.webmagic.proxy;
|
||||
|
||||
import static org.assertj.core.api.Assertions.assertThat;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.http.HttpHost;
|
||||
import org.junit.BeforeClass;
|
||||
import org.junit.Test;
|
||||
|
||||
import us.codecraft.webmagic.Request;
|
||||
|
||||
/**
|
||||
* @author yxssfxwzy@sina.com May 30, 2014
|
||||
*
|
||||
*/
|
||||
public class ProxyTest {
|
||||
|
||||
private static List<String[]> httpProxyList = new ArrayList<String[]>();
|
||||
|
||||
@BeforeClass
|
||||
public static void before() {
|
||||
// String[] source = { "0.0.0.1:0", "0.0.0.2:0", "0.0.0.3:0",
|
||||
// "0.0.0.4:0" };
|
||||
String[] source = { "0.0.0.1:0", "0.0.0.2:0", "0.0.0.3:0", "0.0.0.4:0" };
|
||||
for (String line : source) {
|
||||
httpProxyList.add(new String[] { line.split(":")[0], line.split(":")[1] });
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testAddProxy() {
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testProxy() {
|
||||
ProxyPool proxyPool = new ProxyPool(httpProxyList);
|
||||
proxyPool.setReuseInterval(500);
|
||||
assertThat(proxyPool.getIdleNum()).isEqualTo(4);
|
||||
assertThat(new File(proxyPool.getProxyFilePath()).exists()).isEqualTo(true);
|
||||
for (int i = 0; i < 2; i++) {
|
||||
List<Fetch> fetchList = new ArrayList<Fetch>();
|
||||
while (proxyPool.getIdleNum() != 0) {
|
||||
HttpHost httphost = proxyPool.getProxy();
|
||||
// httphostList.add(httphost);
|
||||
System.out.println(httphost.getHostName() + ":" + httphost.getPort());
|
||||
Fetch tmp = new Fetch(httphost);
|
||||
tmp.start();
|
||||
fetchList.add(tmp);
|
||||
}
|
||||
for (Fetch fetch : fetchList) {
|
||||
proxyPool.returnProxy(fetch.hp, Proxy.SUCCESS);
|
||||
}
|
||||
System.out.println(proxyPool.allProxyStatus());
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
class Fetch extends Thread {
|
||||
HttpHost hp;
|
||||
|
||||
public Fetch(HttpHost hp) {
|
||||
this.hp = hp;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void run() {
|
||||
try {
|
||||
System.out.println("fetch web page use proxy: " + hp.getHostName() + ":" + hp.getPort());
|
||||
sleep(500);
|
||||
} catch (InterruptedException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue