test pass
parent
474b7c9d57
commit
68050fc88e
|
@ -2,8 +2,7 @@ package us.codecraft.webmagic;
|
|||
|
||||
import org.apache.http.HttpHost;
|
||||
import org.apache.http.auth.UsernamePasswordCredentials;
|
||||
import us.codecraft.webmagic.proxy.ProxyPool;
|
||||
import us.codecraft.webmagic.proxy.TimerReuseProxyPool;
|
||||
import us.codecraft.webmagic.proxy.ProxyProvider;
|
||||
import us.codecraft.webmagic.utils.UrlUtils;
|
||||
|
||||
import java.util.*;
|
||||
|
@ -52,7 +51,7 @@ public class Site {
|
|||
|
||||
private UsernamePasswordCredentials usernamePasswordCredentials; //代理用户名密码设置
|
||||
|
||||
private ProxyPool httpProxyPool;
|
||||
private ProxyProvider httpProxyPool;
|
||||
|
||||
private boolean useGzip = true;
|
||||
|
||||
|
@ -399,7 +398,11 @@ public class Site {
|
|||
return new Task() {
|
||||
@Override
|
||||
public String getUUID() {
|
||||
return Site.this.getDomain();
|
||||
String uuid = Site.this.getDomain();
|
||||
if (uuid == null) {
|
||||
uuid = UUID.randomUUID().toString();
|
||||
}
|
||||
return uuid;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -467,45 +470,4 @@ public class Site {
|
|||
'}';
|
||||
}
|
||||
|
||||
/**
|
||||
* Set httpProxyPool, String[0]:ip, String[1]:port <br>
|
||||
*
|
||||
* @param proxyPool proxyPool
|
||||
* @return this
|
||||
*/
|
||||
public Site setHttpProxyPool(ProxyPool proxyPool) {
|
||||
this.httpProxyPool = proxyPool;
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set httpProxyPool, String[0]:ip, String[1]:port <br>
|
||||
*
|
||||
* @param httpProxyList httpProxyList
|
||||
* @param isUseLastProxy isUseLastProxy
|
||||
* @return this
|
||||
*/
|
||||
public Site setHttpProxyPool(List<String[]> httpProxyList, boolean isUseLastProxy) {
|
||||
this.httpProxyPool=new TimerReuseProxyPool(httpProxyList, isUseLastProxy);
|
||||
return this;
|
||||
}
|
||||
|
||||
public Site enableHttpProxyPool() {
|
||||
this.httpProxyPool=new TimerReuseProxyPool();
|
||||
return this;
|
||||
}
|
||||
|
||||
public UsernamePasswordCredentials getUsernamePasswordCredentials() {
|
||||
return usernamePasswordCredentials;
|
||||
}
|
||||
|
||||
public Site setUsernamePasswordCredentials(UsernamePasswordCredentials usernamePasswordCredentials) {
|
||||
this.usernamePasswordCredentials = usernamePasswordCredentials;
|
||||
return this;
|
||||
}
|
||||
|
||||
public ProxyPool getHttpProxyPool() {
|
||||
return httpProxyPool;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -20,6 +20,7 @@ import us.codecraft.webmagic.Request;
|
|||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.Task;
|
||||
import us.codecraft.webmagic.proxy.Proxy;
|
||||
import us.codecraft.webmagic.proxy.ProxyProvider;
|
||||
import us.codecraft.webmagic.selector.PlainText;
|
||||
import us.codecraft.webmagic.utils.CharsetUtils;
|
||||
|
||||
|
@ -46,10 +47,16 @@ public class HttpClientDownloader extends AbstractDownloader {
|
|||
|
||||
private HttpUriRequestConverter httpUriRequestConverter = new HttpUriRequestConverter();
|
||||
|
||||
private ProxyProvider proxyProvider;
|
||||
|
||||
public void setHttpUriRequestConverter(HttpUriRequestConverter httpUriRequestConverter) {
|
||||
this.httpUriRequestConverter = httpUriRequestConverter;
|
||||
}
|
||||
|
||||
public void setProxyProvider(ProxyProvider proxyProvider) {
|
||||
this.proxyProvider = proxyProvider;
|
||||
}
|
||||
|
||||
private CloseableHttpClient getHttpClient(Site site) {
|
||||
if (site == null) {
|
||||
return httpClientGenerator.getClient(null);
|
||||
|
@ -79,8 +86,8 @@ public class HttpClientDownloader extends AbstractDownloader {
|
|||
Site site = task.getSite();
|
||||
Proxy proxy = null;
|
||||
HttpContext httpContext = new BasicHttpContext();
|
||||
if (site.getHttpProxyPool() != null) {
|
||||
proxy = site.getHttpProxyPool().getProxy(task);
|
||||
if (proxyProvider != null) {
|
||||
proxy = proxyProvider.getProxy(task);
|
||||
request.putExtra(Request.PROXY, proxy);
|
||||
AuthState authState = new AuthState();
|
||||
authState.update(new BasicScheme(), new UsernamePasswordCredentials(proxy.getUsername(), proxy.getPassword()));
|
||||
|
@ -111,9 +118,6 @@ public class HttpClientDownloader extends AbstractDownloader {
|
|||
//ensure the connection is released back to pool
|
||||
EntityUtils.consumeQuietly(httpResponse.getEntity());
|
||||
}
|
||||
if (proxy != null) {
|
||||
site.getHttpProxyPool().returnProxy(proxy, statusCode, task);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -43,7 +43,7 @@ public class HttpUriRequestConverter {
|
|||
}
|
||||
|
||||
if (proxy != null) {
|
||||
requestConfigBuilder.setProxy(new HttpHost(proxy.getProxyHost().getHost(), proxy.getProxyHost().getPort()));
|
||||
requestConfigBuilder.setProxy(new HttpHost(proxy.getHost(), proxy.getPort()));
|
||||
}
|
||||
requestBuilder.setConfig(requestConfigBuilder.build());
|
||||
return requestBuilder.build();
|
||||
|
|
|
@ -6,42 +6,36 @@ package us.codecraft.webmagic.proxy;
|
|||
|
||||
public class Proxy {
|
||||
|
||||
private ProxyHost proxyHost;
|
||||
private String host;
|
||||
private int port;
|
||||
private String username;
|
||||
private String password;
|
||||
|
||||
public Proxy(ProxyHost proxyHost, String username, String password) {
|
||||
this.proxyHost = proxyHost;
|
||||
public Proxy(String host, int port) {
|
||||
this.host = host;
|
||||
this.port = port;
|
||||
}
|
||||
|
||||
public Proxy(String host, int port, String username, String password) {
|
||||
this.host = host;
|
||||
this.port = port;
|
||||
this.username = username;
|
||||
this.password = password;
|
||||
}
|
||||
|
||||
public Proxy(ProxyHost proxyHost) {
|
||||
this.proxyHost = proxyHost;
|
||||
public String getHost() {
|
||||
return host;
|
||||
}
|
||||
|
||||
public ProxyHost getProxyHost() {
|
||||
return proxyHost;
|
||||
}
|
||||
|
||||
public void setProxyHost(ProxyHost proxyHost) {
|
||||
this.proxyHost = proxyHost;
|
||||
public int getPort() {
|
||||
return port;
|
||||
}
|
||||
|
||||
public String getUsername() {
|
||||
return username;
|
||||
}
|
||||
|
||||
public void setUsername(String username) {
|
||||
this.username = username;
|
||||
}
|
||||
|
||||
public String getPassword() {
|
||||
return password;
|
||||
}
|
||||
|
||||
public void setPassword(String password) {
|
||||
this.password = password;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,34 +0,0 @@
|
|||
package us.codecraft.webmagic.proxy;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com
|
||||
* Date: 17/3/18
|
||||
* Time: 下午12:04
|
||||
*/
|
||||
public class ProxyHost {
|
||||
|
||||
private String host;
|
||||
|
||||
private int port;
|
||||
|
||||
public String getHost() {
|
||||
return host;
|
||||
}
|
||||
|
||||
public ProxyHost(String host, int port) {
|
||||
this.host = host;
|
||||
this.port = port;
|
||||
}
|
||||
|
||||
public void setHost(String host) {
|
||||
this.host = host;
|
||||
}
|
||||
|
||||
public int getPort() {
|
||||
return port;
|
||||
}
|
||||
|
||||
public void setPort(int port) {
|
||||
this.port = port;
|
||||
}
|
||||
}
|
|
@ -5,7 +5,7 @@ import us.codecraft.webmagic.Task;
|
|||
/**
|
||||
* Created by edwardsbean on 15-2-28.
|
||||
*/
|
||||
public interface ProxyPool {
|
||||
public interface ProxyProvider {
|
||||
|
||||
void returnProxy(Proxy proxy, boolean banned, Task task);
|
||||
|
|
@ -72,14 +72,10 @@ public class TimerReuseProxy extends Proxy implements Delayed, Serializable {
|
|||
|
||||
private List<Integer> failedErrorType = new ArrayList<Integer>();
|
||||
|
||||
public TimerReuseProxy(ProxyHost proxyHost, String user, String password) {
|
||||
super(proxyHost, user, password);
|
||||
public TimerReuseProxy(String host, int port, String username, String password) {
|
||||
super(host, port, username, password);
|
||||
}
|
||||
|
||||
public TimerReuseProxy(ProxyHost proxyHost, String user, String password, int reuseTimeInterval) {
|
||||
super(proxyHost, user, password);
|
||||
this.reuseTimeInterval = reuseTimeInterval;
|
||||
}
|
||||
|
||||
public int getSuccessNum() {
|
||||
return successNum;
|
||||
|
|
|
@ -1,17 +1,6 @@
|
|||
package us.codecraft.webmagic.proxy;
|
||||
|
||||
import org.apache.http.HttpHost;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import us.codecraft.webmagic.utils.FilePersistentBase;
|
||||
import us.codecraft.webmagic.utils.ProxyUtils;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.*;
|
||||
import java.util.Map.Entry;
|
||||
import java.util.concurrent.BlockingQueue;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.concurrent.DelayQueue;
|
||||
import us.codecraft.webmagic.Task;
|
||||
|
||||
/**
|
||||
* Pooled Proxy Object
|
||||
|
@ -20,187 +9,196 @@ import java.util.concurrent.DelayQueue;
|
|||
* @see Proxy
|
||||
* @since 0.5.1
|
||||
*/
|
||||
public class TimerReuseProxyPool implements ProxyPool {
|
||||
public class TimerReuseProxyPool implements ProxyProvider {
|
||||
@Override
|
||||
public void returnProxy(Proxy proxy, boolean banned, Task task) {
|
||||
|
||||
private Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
private BlockingQueue<TimerReuseProxy> proxyQueue = new DelayQueue<TimerReuseProxy>();
|
||||
private Map<String, TimerReuseProxy> allProxy = new ConcurrentHashMap<String, TimerReuseProxy>();
|
||||
|
||||
private int reuseInterval = 1500;// ms
|
||||
private int reviveTime = 2 * 60 * 60 * 1000;// ms
|
||||
private int saveProxyInterval = 10 * 60 * 1000;// ms
|
||||
|
||||
private boolean isEnable = false;
|
||||
private boolean validateWhenInit = false;
|
||||
// private boolean isUseLastProxy = true;
|
||||
|
||||
public TimerReuseProxyPool(List<String[]> httpProxyList) {
|
||||
this(httpProxyList, true);
|
||||
}
|
||||
|
||||
private void addProxy(Map<String, Proxy> httpProxyMap) {
|
||||
isEnable = true;
|
||||
for (Entry<String, Proxy> entry : httpProxyMap.entrySet()) {
|
||||
try {
|
||||
if (allProxy.containsKey(entry.getKey())) {
|
||||
continue;
|
||||
}
|
||||
if (!validateWhenInit || ProxyUtils.validateProxy(entry.getValue().getHttpHost())) {
|
||||
entry.getValue().setFailedNum(0);
|
||||
entry.getValue().setReuseTimeInterval(reuseInterval);
|
||||
proxyQueue.add(entry.getValue());
|
||||
allProxy.put(entry.getKey(), entry.getValue());
|
||||
}
|
||||
} catch (NumberFormatException e) {
|
||||
logger.error("HttpHost init error:", e);
|
||||
}
|
||||
}
|
||||
logger.info("proxy pool size>>>>" + allProxy.size());
|
||||
@Override
|
||||
public Proxy getProxy(Task task) {
|
||||
return null;
|
||||
}
|
||||
|
||||
public void addProxy(Proxy... httpProxyList) {
|
||||
isEnable = true;
|
||||
for (Proxy proxy : httpProxyList) {
|
||||
if (!validateWhenInit || ProxyUtils.validateProxy(proxy.getProxyHost())) {
|
||||
TimerReuseProxy p = new TimerReuseProxy(proxy.getProxyHost(), proxy.getUsername(), proxy.getPassword(), reuseInterval);
|
||||
proxyQueue.add(p);
|
||||
allProxy.put(p.getProxyHost().getHost(), p);
|
||||
}
|
||||
}
|
||||
logger.info("proxy pool size>>>>" + allProxy.size());
|
||||
}
|
||||
|
||||
public TimerReuseProxy getProxy() {
|
||||
TimerReuseProxy proxy = null;
|
||||
try {
|
||||
Long time = System.currentTimeMillis();
|
||||
proxy = proxyQueue.take();
|
||||
double costTime = (System.currentTimeMillis() - time) / 1000.0;
|
||||
if (costTime > reuseInterval) {
|
||||
logger.info("get proxy time >>>> " + costTime);
|
||||
}
|
||||
TimerReuseProxy p = allProxy.get(proxy.getProxyHost().getHost());
|
||||
p.setLastBorrowTime(System.currentTimeMillis());
|
||||
p.borrowNumIncrement(1);
|
||||
} catch (InterruptedException e) {
|
||||
logger.error("get proxy error", e);
|
||||
}
|
||||
if (proxy == null) {
|
||||
throw new NoSuchElementException();
|
||||
}
|
||||
return proxy;
|
||||
}
|
||||
|
||||
public void returnProxy(Proxy proxy, int statusCode) {
|
||||
TimerReuseProxy p = allProxy.get(proxy.getProxyHost());
|
||||
if (p == null) {
|
||||
return;
|
||||
}
|
||||
switch (statusCode) {
|
||||
case TimerReuseProxy.SUCCESS:
|
||||
p.setReuseTimeInterval(reuseInterval);
|
||||
p.setFailedNum(0);
|
||||
p.setFailedErrorType(new ArrayList<Integer>());
|
||||
p.recordResponse();
|
||||
p.successNumIncrement(1);
|
||||
break;
|
||||
case TimerReuseProxy.ERROR_403:
|
||||
// banned,try longer interval
|
||||
p.fail(TimerReuseProxy.ERROR_403);
|
||||
p.setReuseTimeInterval(reuseInterval * p.getFailedNum());
|
||||
logger.info(proxy + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0);
|
||||
break;
|
||||
case TimerReuseProxy.ERROR_BANNED:
|
||||
p.fail(TimerReuseProxy.ERROR_BANNED);
|
||||
p.setReuseTimeInterval(10 * 60 * 1000 * p.getFailedNum());
|
||||
logger.info(proxy + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0);
|
||||
break;
|
||||
case TimerReuseProxy.ERROR_404:
|
||||
// p.fail(Proxy.ERROR_404);
|
||||
// private Logger logger = LoggerFactory.getLogger(getClass());
|
||||
//
|
||||
// private BlockingQueue<TimerReuseProxy> proxyQueue = new DelayQueue<TimerReuseProxy>();
|
||||
// private Map<String, TimerReuseProxy> allProxy = new ConcurrentHashMap<String, TimerReuseProxy>();
|
||||
//
|
||||
// private int reuseInterval = 1500;// ms
|
||||
// private int reviveTime = 2 * 60 * 60 * 1000;// ms
|
||||
// private int saveProxyInterval = 10 * 60 * 1000;// ms
|
||||
//
|
||||
// private boolean isEnable = false;
|
||||
// private boolean validateWhenInit = false;
|
||||
// // private boolean isUseLastProxy = true;
|
||||
//
|
||||
// public TimerReuseProxyPool(List<String[]> httpProxyList) {
|
||||
// this(httpProxyList, true);
|
||||
// }
|
||||
//
|
||||
// private void addProxy(Map<String, Proxy> httpProxyMap) {
|
||||
// isEnable = true;
|
||||
// for (Entry<String, Proxy> entry : httpProxyMap.entrySet()) {
|
||||
// try {
|
||||
// if (allProxy.containsKey(entry.getKey())) {
|
||||
// continue;
|
||||
// }
|
||||
// if (!validateWhenInit || ProxyUtils.validateProxy(entry.getValue().getHttpHost())) {
|
||||
// entry.getValue().setFailedNum(0);
|
||||
// entry.getValue().setReuseTimeInterval(reuseInterval);
|
||||
// proxyQueue.add(entry.getValue());
|
||||
// allProxy.put(entry.getKey(), entry.getValue());
|
||||
// }
|
||||
// } catch (NumberFormatException e) {
|
||||
// logger.error("HttpHost init error:", e);
|
||||
// }
|
||||
// }
|
||||
// logger.info("proxy pool size>>>>" + allProxy.size());
|
||||
// }
|
||||
//
|
||||
// public void addProxy(Proxy... httpProxyList) {
|
||||
// isEnable = true;
|
||||
// for (Proxy proxy : httpProxyList) {
|
||||
// if (!validateWhenInit || ProxyUtils.validateProxy(proxy.getProxyHost())) {
|
||||
// TimerReuseProxy p = new TimerReuseProxy(proxy.getProxyHost(), proxy.getUsername(), proxy.getPassword(), reuseInterval);
|
||||
// proxyQueue.add(p);
|
||||
// allProxy.put(p.getProxyHost().getHost(), p);
|
||||
// }
|
||||
// }
|
||||
// logger.info("proxy pool size>>>>" + allProxy.size());
|
||||
// }
|
||||
//
|
||||
// public TimerReuseProxy getProxy() {
|
||||
// TimerReuseProxy proxy = null;
|
||||
// try {
|
||||
// Long time = System.currentTimeMillis();
|
||||
// proxy = proxyQueue.take();
|
||||
// double costTime = (System.currentTimeMillis() - time) / 1000.0;
|
||||
// if (costTime > reuseInterval) {
|
||||
// logger.info("get proxy time >>>> " + costTime);
|
||||
// }
|
||||
// TimerReuseProxy p = allProxy.get(proxy.getProxyHost().getHost());
|
||||
// p.setLastBorrowTime(System.currentTimeMillis());
|
||||
// p.borrowNumIncrement(1);
|
||||
// } catch (InterruptedException e) {
|
||||
// logger.error("get proxy error", e);
|
||||
// }
|
||||
// if (proxy == null) {
|
||||
// throw new NoSuchElementException();
|
||||
// }
|
||||
// return proxy;
|
||||
// }
|
||||
//
|
||||
// public void returnProxy(Proxy proxy, int statusCode) {
|
||||
// TimerReuseProxy p = allProxy.get(proxy.getProxyHost());
|
||||
// if (p == null) {
|
||||
// return;
|
||||
// }
|
||||
// switch (statusCode) {
|
||||
// case TimerReuseProxy.SUCCESS:
|
||||
// p.setReuseTimeInterval(reuseInterval);
|
||||
// p.setFailedNum(0);
|
||||
// p.setFailedErrorType(new ArrayList<Integer>());
|
||||
// p.recordResponse();
|
||||
// p.successNumIncrement(1);
|
||||
// break;
|
||||
// case TimerReuseProxy.ERROR_403:
|
||||
// // banned,try longer interval
|
||||
// p.fail(TimerReuseProxy.ERROR_403);
|
||||
// p.setReuseTimeInterval(reuseInterval * p.getFailedNum());
|
||||
break;
|
||||
default:
|
||||
p.fail(statusCode);
|
||||
break;
|
||||
}
|
||||
if (p.getFailedNum() > 20) {
|
||||
p.setReuseTimeInterval(reviveTime);
|
||||
logger.error("remove proxy >>>> " + proxy + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size());
|
||||
return;
|
||||
}
|
||||
if (p.getFailedNum() > 0 && p.getFailedNum() % 5 == 0) {
|
||||
if (!ProxyUtils.validateProxy(proxy)) {
|
||||
p.setReuseTimeInterval(reviveTime);
|
||||
logger.error("remove proxy >>>> " + proxy + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size());
|
||||
return;
|
||||
}
|
||||
}
|
||||
try {
|
||||
proxyQueue.put(p);
|
||||
} catch (InterruptedException e) {
|
||||
logger.warn("proxyQueue return proxy error", e);
|
||||
}
|
||||
}
|
||||
|
||||
public String allProxyStatus() {
|
||||
String re = "all proxy info >>>> \n";
|
||||
for (Entry<String, Proxy> entry : allProxy.entrySet()) {
|
||||
re += entry.getValue().toString() + "\n";
|
||||
}
|
||||
return re;
|
||||
}
|
||||
|
||||
public int getIdleNum() {
|
||||
return proxyQueue.size();
|
||||
}
|
||||
|
||||
public int getReuseInterval() {
|
||||
return reuseInterval;
|
||||
}
|
||||
|
||||
public void setReuseInterval(int reuseInterval) {
|
||||
this.reuseInterval = reuseInterval;
|
||||
}
|
||||
|
||||
public void enable(boolean isEnable) {
|
||||
this.isEnable = isEnable;
|
||||
}
|
||||
|
||||
public boolean isEnable() {
|
||||
return isEnable;
|
||||
}
|
||||
|
||||
public int getReviveTime() {
|
||||
return reviveTime;
|
||||
}
|
||||
|
||||
public void setReviveTime(int reviveTime) {
|
||||
this.reviveTime = reviveTime;
|
||||
}
|
||||
|
||||
public boolean isValidateWhenInit() {
|
||||
return validateWhenInit;
|
||||
}
|
||||
|
||||
public void validateWhenInit(boolean validateWhenInit) {
|
||||
this.validateWhenInit = validateWhenInit;
|
||||
}
|
||||
|
||||
public int getSaveProxyInterval() {
|
||||
return saveProxyInterval;
|
||||
}
|
||||
|
||||
public void setSaveProxyInterval(int saveProxyInterval) {
|
||||
this.saveProxyInterval = saveProxyInterval;
|
||||
}
|
||||
|
||||
public String getProxyFilePath() {
|
||||
return proxyFilePath;
|
||||
}
|
||||
|
||||
public void setProxyFilePath(String proxyFilePath) {
|
||||
this.proxyFilePath = proxyFilePath;
|
||||
}
|
||||
// logger.info(proxy + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0);
|
||||
// break;
|
||||
// case TimerReuseProxy.ERROR_BANNED:
|
||||
// p.fail(TimerReuseProxy.ERROR_BANNED);
|
||||
// p.setReuseTimeInterval(10 * 60 * 1000 * p.getFailedNum());
|
||||
// logger.info(proxy + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0);
|
||||
// break;
|
||||
// case TimerReuseProxy.ERROR_404:
|
||||
// // p.fail(Proxy.ERROR_404);
|
||||
// // p.setReuseTimeInterval(reuseInterval * p.getFailedNum());
|
||||
// break;
|
||||
// default:
|
||||
// p.fail(statusCode);
|
||||
// break;
|
||||
// }
|
||||
// if (p.getFailedNum() > 20) {
|
||||
// p.setReuseTimeInterval(reviveTime);
|
||||
// logger.error("remove proxy >>>> " + proxy + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size());
|
||||
// return;
|
||||
// }
|
||||
// if (p.getFailedNum() > 0 && p.getFailedNum() % 5 == 0) {
|
||||
// if (!ProxyUtils.validateProxy(proxy)) {
|
||||
// p.setReuseTimeInterval(reviveTime);
|
||||
// logger.error("remove proxy >>>> " + proxy + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size());
|
||||
// return;
|
||||
// }
|
||||
// }
|
||||
// try {
|
||||
// proxyQueue.put(p);
|
||||
// } catch (InterruptedException e) {
|
||||
// logger.warn("proxyQueue return proxy error", e);
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// public String allProxyStatus() {
|
||||
// String re = "all proxy info >>>> \n";
|
||||
// for (Entry<String, Proxy> entry : allProxy.entrySet()) {
|
||||
// re += entry.getValue().toString() + "\n";
|
||||
// }
|
||||
// return re;
|
||||
// }
|
||||
//
|
||||
// public int getIdleNum() {
|
||||
// return proxyQueue.size();
|
||||
// }
|
||||
//
|
||||
// public int getReuseInterval() {
|
||||
// return reuseInterval;
|
||||
// }
|
||||
//
|
||||
// public void setReuseInterval(int reuseInterval) {
|
||||
// this.reuseInterval = reuseInterval;
|
||||
// }
|
||||
//
|
||||
// public void enable(boolean isEnable) {
|
||||
// this.isEnable = isEnable;
|
||||
// }
|
||||
//
|
||||
// public boolean isEnable() {
|
||||
// return isEnable;
|
||||
// }
|
||||
//
|
||||
// public int getReviveTime() {
|
||||
// return reviveTime;
|
||||
// }
|
||||
//
|
||||
// public void setReviveTime(int reviveTime) {
|
||||
// this.reviveTime = reviveTime;
|
||||
// }
|
||||
//
|
||||
// public boolean isValidateWhenInit() {
|
||||
// return validateWhenInit;
|
||||
// }
|
||||
//
|
||||
// public void validateWhenInit(boolean validateWhenInit) {
|
||||
// this.validateWhenInit = validateWhenInit;
|
||||
// }
|
||||
//
|
||||
// public int getSaveProxyInterval() {
|
||||
// return saveProxyInterval;
|
||||
// }
|
||||
//
|
||||
// public void setSaveProxyInterval(int saveProxyInterval) {
|
||||
// this.saveProxyInterval = saveProxyInterval;
|
||||
// }
|
||||
//
|
||||
// public String getProxyFilePath() {
|
||||
// return proxyFilePath;
|
||||
// }
|
||||
//
|
||||
// public void setProxyFilePath(String proxyFilePath) {
|
||||
// this.proxyFilePath = proxyFilePath;
|
||||
// }
|
||||
|
||||
}
|
||||
|
|
|
@ -1,14 +1,12 @@
|
|||
package us.codecraft.webmagic.utils;
|
||||
|
||||
import org.apache.http.HttpHost;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import us.codecraft.webmagic.proxy.ProxyHost;
|
||||
import us.codecraft.webmagic.proxy.Proxy;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.*;
|
||||
import java.util.Enumeration;
|
||||
import java.util.regex.Pattern;
|
||||
import java.net.InetSocketAddress;
|
||||
import java.net.Socket;
|
||||
|
||||
/**
|
||||
* Pooled Proxy Object
|
||||
|
@ -18,72 +16,19 @@ import java.util.regex.Pattern;
|
|||
*/
|
||||
|
||||
public class ProxyUtils {
|
||||
private static InetAddress localAddr;
|
||||
private static String networkInterface = "eth7";
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(ProxyUtils.class);
|
||||
static {
|
||||
init();
|
||||
}
|
||||
|
||||
private static void init() {
|
||||
// first way to get local IP
|
||||
try {
|
||||
localAddr = InetAddress.getLocalHost();
|
||||
logger.info("local IP:" + localAddr.getHostAddress());
|
||||
} catch (UnknownHostException e) {
|
||||
logger.info("try again\n");
|
||||
}
|
||||
if (localAddr != null) {
|
||||
return;
|
||||
}
|
||||
// other way to get local IP
|
||||
Enumeration<InetAddress> localAddrs;
|
||||
try {
|
||||
// modify your network interface name
|
||||
NetworkInterface ni = NetworkInterface.getByName(networkInterface);
|
||||
if (ni == null) {
|
||||
return;
|
||||
}
|
||||
localAddrs = ni.getInetAddresses();
|
||||
if (localAddrs == null || !localAddrs.hasMoreElements()) {
|
||||
logger.error("choose NetworkInterface\n" + getNetworkInterface());
|
||||
return;
|
||||
}
|
||||
while (localAddrs.hasMoreElements()) {
|
||||
InetAddress tmp = localAddrs.nextElement();
|
||||
if (!tmp.isLoopbackAddress() && !tmp.isLinkLocalAddress() && !(tmp instanceof Inet6Address)) {
|
||||
localAddr = tmp;
|
||||
logger.info("local IP:" + localAddr.getHostAddress());
|
||||
break;
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
logger.error("Failure when init ProxyUtil", e);
|
||||
logger.error("choose NetworkInterface\n" + getNetworkInterface());
|
||||
}
|
||||
}
|
||||
|
||||
public static HttpHost convert(ProxyHost p){
|
||||
return new HttpHost(p.getHost(),p.getPort());
|
||||
}
|
||||
|
||||
public static boolean validateProxy(ProxyHost p) {
|
||||
if (localAddr == null) {
|
||||
logger.error("cannot get local IP");
|
||||
return false;
|
||||
}
|
||||
boolean isReachable = false;
|
||||
public static boolean validateProxy(Proxy p) {
|
||||
Socket socket = null;
|
||||
try {
|
||||
socket = new Socket();
|
||||
socket.bind(new InetSocketAddress(localAddr, 0));
|
||||
InetSocketAddress endpointSocketAddr = new InetSocketAddress(p.getHost(), p.getPort());
|
||||
socket.connect(endpointSocketAddr, 3000);
|
||||
logger.debug("SUCCESS - connection established! Local: " + localAddr.getHostAddress() + " remote: " + p);
|
||||
isReachable = true;
|
||||
return true;
|
||||
} catch (IOException e) {
|
||||
logger.warn("FAILRE - CAN not connect! Local: " + localAddr.getHostAddress() + " remote: " + p);
|
||||
logger.warn("FAILRE - CAN not connect! remote: " + p);
|
||||
return false;
|
||||
} finally {
|
||||
if (socket != null) {
|
||||
try {
|
||||
|
@ -93,30 +38,7 @@ public class ProxyUtils {
|
|||
}
|
||||
}
|
||||
}
|
||||
return isReachable;
|
||||
|
||||
}
|
||||
|
||||
private static String getNetworkInterface() {
|
||||
|
||||
String networkInterfaceName = ">>>> modify networkInterface in us.codecraft.webmagic.utils.ProxyUtils";
|
||||
Enumeration<NetworkInterface> enumeration = null;
|
||||
try {
|
||||
enumeration = NetworkInterface.getNetworkInterfaces();
|
||||
} catch (SocketException e1) {
|
||||
e1.printStackTrace();
|
||||
}
|
||||
while (enumeration.hasMoreElements()) {
|
||||
NetworkInterface networkInterface = enumeration.nextElement();
|
||||
|
||||
Enumeration<InetAddress> addr = networkInterface.getInetAddresses();
|
||||
while (addr.hasMoreElements()) {
|
||||
String s = addr.nextElement().getHostAddress();
|
||||
Pattern IPV4_PATTERN = Pattern.compile("^(25[0-5]|2[0-4]\\d|[0-1]?\\d?\\d)(\\.(25[0-5]|2[0-4]\\d|[0-1]?\\d?\\d)){3}$");
|
||||
if (s != null && IPV4_PATTERN.matcher(s).matches()) {
|
||||
networkInterfaceName += networkInterface.toString() + "IP:" + s + "\n\n";
|
||||
}
|
||||
}
|
||||
}
|
||||
return networkInterfaceName;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -5,7 +5,7 @@ import com.github.dreamhead.moco.Runnable;
|
|||
import com.github.dreamhead.moco.Runner;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.http.client.methods.CloseableHttpResponse;
|
||||
import org.apache.http.client.methods.RequestBuilder;
|
||||
import org.apache.http.client.methods.HttpUriRequest;
|
||||
import org.apache.http.impl.client.CloseableHttpClient;
|
||||
import org.apache.http.impl.client.HttpClients;
|
||||
import org.apache.http.util.EntityUtils;
|
||||
|
@ -87,12 +87,12 @@ public class HttpClientDownloaderTest {
|
|||
private String getCharsetByUrl(String url) {
|
||||
HttpClientDownloader downloader = new HttpClientDownloader();
|
||||
Site site = Site.me();
|
||||
CloseableHttpClient httpClient = new HttpClientGenerator().getClient(site, null);
|
||||
CloseableHttpClient httpClient = new HttpClientGenerator().getClient(site);
|
||||
// encoding in http header Content-Type
|
||||
Request requestGBK = new Request(url);
|
||||
CloseableHttpResponse httpResponse = null;
|
||||
try {
|
||||
httpResponse = httpClient.execute(downloader.getHttpUriRequest(requestGBK, site, null,null));
|
||||
httpResponse = httpClient.execute(new HttpUriRequestConverter().convert(requestGBK, site, null));
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
|
@ -117,31 +117,32 @@ public class HttpClientDownloaderTest {
|
|||
server.delete(eq(query("q"), "webmagic")).response("delete");
|
||||
server.request(and(by(method("HEAD")),eq(query("q"), "webmagic"))).response(header("method","head"));
|
||||
server.request(and(by(method("TRACE")),eq(query("q"), "webmagic"))).response("trace");
|
||||
final HttpUriRequestConverter httpUriRequestConverter = new HttpUriRequestConverter();
|
||||
final Site site = Site.me();
|
||||
Runner.running(server, new Runnable() {
|
||||
@Override
|
||||
public void run() throws Exception {
|
||||
HttpClientDownloader httpClientDownloader = new HttpClientDownloader();
|
||||
Request request = new Request();
|
||||
request.setUrl("http://127.0.0.1:12306/search");
|
||||
request.putParams("q", "webmagic");
|
||||
request.setMethod(HttpConstant.Method.GET);
|
||||
RequestBuilder requestBuilder = httpClientDownloader.selectRequestMethod(request).setUri(request.getUrl());
|
||||
assertThat(EntityUtils.toString(HttpClients.custom().build().execute(requestBuilder.build()).getEntity())).isEqualTo("get");
|
||||
HttpUriRequest httpUriRequest = httpUriRequestConverter.convert(request,site,null);
|
||||
assertThat(EntityUtils.toString(HttpClients.custom().build().execute(httpUriRequest).getEntity())).isEqualTo("get");
|
||||
request.setMethod(HttpConstant.Method.POST);
|
||||
requestBuilder = httpClientDownloader.selectRequestMethod(request).setUri(request.getUrl());
|
||||
assertThat(EntityUtils.toString(HttpClients.custom().build().execute(requestBuilder.build()).getEntity())).isEqualTo("post");
|
||||
httpUriRequest = httpUriRequestConverter.convert(request, site, null);
|
||||
assertThat(EntityUtils.toString(HttpClients.custom().build().execute(httpUriRequest).getEntity())).isEqualTo("post");
|
||||
request.setMethod(HttpConstant.Method.PUT);
|
||||
requestBuilder = httpClientDownloader.selectRequestMethod(request).setUri(request.getUrl());
|
||||
assertThat(EntityUtils.toString(HttpClients.custom().build().execute(requestBuilder.build()).getEntity())).isEqualTo("put");
|
||||
httpUriRequest = httpUriRequestConverter.convert(request, site, null);
|
||||
assertThat(EntityUtils.toString(HttpClients.custom().build().execute(httpUriRequest).getEntity())).isEqualTo("put");
|
||||
request.setMethod(HttpConstant.Method.DELETE);
|
||||
requestBuilder = httpClientDownloader.selectRequestMethod(request).setUri(request.getUrl());
|
||||
assertThat(EntityUtils.toString(HttpClients.custom().build().execute(requestBuilder.build()).getEntity())).isEqualTo("delete");
|
||||
httpUriRequest = httpUriRequestConverter.convert(request, site, null);
|
||||
assertThat(EntityUtils.toString(HttpClients.custom().build().execute(httpUriRequest).getEntity())).isEqualTo("delete");
|
||||
request.setMethod(HttpConstant.Method.HEAD);
|
||||
requestBuilder = httpClientDownloader.selectRequestMethod(request).setUri(request.getUrl());
|
||||
assertThat(HttpClients.custom().build().execute(requestBuilder.build()).getFirstHeader("method").getValue()).isEqualTo("head");
|
||||
httpUriRequest = httpUriRequestConverter.convert(request, site, null);
|
||||
assertThat(HttpClients.custom().build().execute(httpUriRequest).getFirstHeader("method").getValue()).isEqualTo("head");
|
||||
request.setMethod(HttpConstant.Method.TRACE);
|
||||
requestBuilder = httpClientDownloader.selectRequestMethod(request).setUri(request.getUrl());
|
||||
assertThat(EntityUtils.toString(HttpClients.custom().build().execute(requestBuilder.build()).getEntity())).isEqualTo("trace");
|
||||
httpUriRequest = httpUriRequestConverter.convert(request, site, null);
|
||||
assertThat(EntityUtils.toString(HttpClients.custom().build().execute(httpUriRequest).getEntity())).isEqualTo("trace");
|
||||
}
|
||||
});
|
||||
}
|
||||
|
@ -156,7 +157,7 @@ public class HttpClientDownloaderTest {
|
|||
final HttpClientDownloader httpClientDownloader = new HttpClientDownloader();
|
||||
Request request = new Request();
|
||||
request.setUrl("http://127.0.0.1:12306/");
|
||||
Page page = httpClientDownloader.download(request, null);
|
||||
Page page = httpClientDownloader.download(request, Site.me().toTask());
|
||||
assertThat(page.getRawText()).isEqualTo("foo");
|
||||
}
|
||||
});
|
||||
|
|
|
@ -2,13 +2,10 @@ package us.codecraft.webmagic.proxy;
|
|||
|
||||
import org.apache.http.HttpHost;
|
||||
import org.junit.BeforeClass;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import static org.assertj.core.api.Assertions.assertThat;
|
||||
|
||||
/**
|
||||
* @author yxssfxwzy@sina.com May 30, 2014
|
||||
*
|
||||
|
@ -27,30 +24,6 @@ public class ProxyTest {
|
|||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testProxy() {
|
||||
TimerReuseProxyPool proxyPool = new TimerReuseProxyPool(httpProxyList,false);
|
||||
proxyPool.setReuseInterval(500);
|
||||
assertThat(proxyPool.getIdleNum()).isEqualTo(4);
|
||||
for (int i = 0; i < 2; i++) {
|
||||
List<Fetch> fetchList = new ArrayList<Fetch>();
|
||||
while (proxyPool.getIdleNum() != 0) {
|
||||
Proxy proxy = proxyPool.getProxy();
|
||||
HttpHost httphost = proxy.getHttpHost();
|
||||
// httphostList.add(httphost);
|
||||
System.out.println(httphost.getHostName() + ":" + httphost.getPort());
|
||||
Fetch tmp = new Fetch(httphost);
|
||||
tmp.start();
|
||||
fetchList.add(tmp);
|
||||
}
|
||||
for (Fetch fetch : fetchList) {
|
||||
proxyPool.returnProxy(fetch.hp, Proxy.SUCCESS);
|
||||
}
|
||||
System.out.println(proxyPool.allProxyStatus());
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
class Fetch extends Thread {
|
||||
HttpHost hp;
|
||||
|
||||
|
|
Loading…
Reference in New Issue