compile passed in httpclientDownloader
parent
b71f379512
commit
1d86f7c048
|
@ -18,7 +18,6 @@ public class Request implements Serializable {
|
||||||
private static final long serialVersionUID = 2062192774891352043L;
|
private static final long serialVersionUID = 2062192774891352043L;
|
||||||
|
|
||||||
public static final String CYCLE_TRIED_TIMES = "_cycle_tried_times";
|
public static final String CYCLE_TRIED_TIMES = "_cycle_tried_times";
|
||||||
public static final String STATUS_CODE = "statusCode";
|
|
||||||
public static final String PROXY = "proxy";
|
public static final String PROXY = "proxy";
|
||||||
|
|
||||||
private String url;
|
private String url;
|
||||||
|
|
|
@ -419,8 +419,6 @@ public class Spider implements Runnable, Task {
|
||||||
pipeline.process(page.getResultItems(), this);
|
pipeline.process(page.getResultItems(), this);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
//for proxy status management
|
|
||||||
request.putExtra(Request.STATUS_CODE, page.getStatusCode());
|
|
||||||
sleep(site.getSleepTime());
|
sleep(site.getSleepTime());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1,7 +1,6 @@
|
||||||
package us.codecraft.webmagic.downloader;
|
package us.codecraft.webmagic.downloader;
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.http.HttpHost;
|
|
||||||
import org.apache.http.HttpResponse;
|
import org.apache.http.HttpResponse;
|
||||||
import org.apache.http.annotation.ThreadSafe;
|
import org.apache.http.annotation.ThreadSafe;
|
||||||
import org.apache.http.auth.AuthState;
|
import org.apache.http.auth.AuthState;
|
||||||
|
@ -23,13 +22,11 @@ import us.codecraft.webmagic.Task;
|
||||||
import us.codecraft.webmagic.proxy.Proxy;
|
import us.codecraft.webmagic.proxy.Proxy;
|
||||||
import us.codecraft.webmagic.selector.PlainText;
|
import us.codecraft.webmagic.selector.PlainText;
|
||||||
import us.codecraft.webmagic.utils.CharsetUtils;
|
import us.codecraft.webmagic.utils.CharsetUtils;
|
||||||
import us.codecraft.webmagic.utils.WMCollections;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.nio.charset.Charset;
|
import java.nio.charset.Charset;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -80,28 +77,22 @@ public class HttpClientDownloader extends AbstractDownloader {
|
||||||
CloseableHttpResponse httpResponse = null;
|
CloseableHttpResponse httpResponse = null;
|
||||||
int statusCode = 0;
|
int statusCode = 0;
|
||||||
Site site = task.getSite();
|
Site site = task.getSite();
|
||||||
try {
|
Proxy proxy = null;
|
||||||
Proxy proxy = null;
|
HttpContext httpContext = new BasicHttpContext();
|
||||||
if (site.getHttpProxyPool() != null && site.getHttpProxyPool().isEnable()) {
|
if (site.getHttpProxyPool() != null && site.getHttpProxyPool().isEnable()) {
|
||||||
proxy = site.getHttpProxyFromPool();
|
proxy = site.getHttpProxyFromPool();
|
||||||
} else if (site != null && site.getHttpProxy() != null){
|
|
||||||
proxy = site.getHttpProxy();
|
|
||||||
request.putExtra(Request.PROXY, site.getHttpProxy());
|
|
||||||
}
|
|
||||||
request.putExtra(Request.PROXY, proxy);
|
request.putExtra(Request.PROXY, proxy);
|
||||||
|
|
||||||
HttpContext httpContext = new BasicHttpContext();
|
|
||||||
|
|
||||||
HttpUriRequest httpUriRequest = httpUriRequestConverter.convert(request,site);
|
|
||||||
AuthState authState = new AuthState();
|
AuthState authState = new AuthState();
|
||||||
authState.update(new BasicScheme(), new UsernamePasswordCredentials("userName", "password"));
|
authState.update(new BasicScheme(), new UsernamePasswordCredentials(proxy.getUsername(), proxy.getPassword()));
|
||||||
httpContext.setAttribute(HttpClientContext.PROXY_AUTH_STATE, authState);
|
httpContext.setAttribute(HttpClientContext.PROXY_AUTH_STATE, authState);
|
||||||
CloseableHttpClient httpClient = getHttpClient(site, proxy);
|
}
|
||||||
|
HttpUriRequest httpUriRequest = httpUriRequestConverter.convert(request, site);
|
||||||
|
CloseableHttpClient httpClient = getHttpClient(site);
|
||||||
|
try {
|
||||||
httpResponse = httpClient.execute(httpUriRequest, httpContext);
|
httpResponse = httpClient.execute(httpUriRequest, httpContext);
|
||||||
statusCode = httpResponse.getStatusLine().getStatusCode();
|
statusCode = httpResponse.getStatusLine().getStatusCode();
|
||||||
request.putExtra(Request.STATUS_CODE, statusCode);
|
if (site.getAcceptStatCode().contains(statusCode)) {
|
||||||
if (statusAccept(acceptStatCode, statusCode)) {
|
Page page = handleResponse(request, site.getCharset(), httpResponse, task);
|
||||||
Page page = handleResponse(request, charset, httpResponse, task);
|
|
||||||
onSuccess(request);
|
onSuccess(request);
|
||||||
return page;
|
return page;
|
||||||
} else {
|
} else {
|
||||||
|
@ -120,10 +111,8 @@ public class HttpClientDownloader extends AbstractDownloader {
|
||||||
//ensure the connection is released back to pool
|
//ensure the connection is released back to pool
|
||||||
EntityUtils.consumeQuietly(httpResponse.getEntity());
|
EntityUtils.consumeQuietly(httpResponse.getEntity());
|
||||||
}
|
}
|
||||||
request.putExtra(Request.STATUS_CODE, statusCode);
|
if (proxy != null) {
|
||||||
if (site != null && site.getHttpProxyPool() != null && site.getHttpProxyPool().isEnable()) {
|
site.getHttpProxyPool().returnProxy(proxy, statusCode);
|
||||||
site.returnHttpProxyToPool((HttpHost) request.getExtra(Request.PROXY), (Integer) request
|
|
||||||
.getExtra(Request.STATUS_CODE));
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -133,10 +122,6 @@ public class HttpClientDownloader extends AbstractDownloader {
|
||||||
httpClientGenerator.setPoolSize(thread);
|
httpClientGenerator.setPoolSize(thread);
|
||||||
}
|
}
|
||||||
|
|
||||||
protected boolean statusAccept(Set<Integer> acceptStatCode, int statusCode) {
|
|
||||||
return acceptStatCode.contains(statusCode);
|
|
||||||
}
|
|
||||||
|
|
||||||
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
|
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
|
||||||
String content = getContent(charset, httpResponse);
|
String content = getContent(charset, httpResponse);
|
||||||
Page page = new Page();
|
Page page = new Page();
|
||||||
|
|
|
@ -10,7 +10,6 @@ import org.apache.http.client.methods.RequestBuilder;
|
||||||
import org.apache.http.message.BasicNameValuePair;
|
import org.apache.http.message.BasicNameValuePair;
|
||||||
import us.codecraft.webmagic.Request;
|
import us.codecraft.webmagic.Request;
|
||||||
import us.codecraft.webmagic.Site;
|
import us.codecraft.webmagic.Site;
|
||||||
import us.codecraft.webmagic.proxy.Proxy;
|
|
||||||
import us.codecraft.webmagic.utils.HttpConstant;
|
import us.codecraft.webmagic.utils.HttpConstant;
|
||||||
|
|
||||||
import java.nio.charset.Charset;
|
import java.nio.charset.Charset;
|
||||||
|
@ -26,7 +25,7 @@ import java.util.Map;
|
||||||
*/
|
*/
|
||||||
public class HttpUriRequestConverter {
|
public class HttpUriRequestConverter {
|
||||||
|
|
||||||
public HttpUriRequest convert(Request request, Site site, Proxy proxy) {
|
public HttpUriRequest convert(Request request, Site site) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -7,12 +7,12 @@ package us.codecraft.webmagic.proxy;
|
||||||
public class Proxy {
|
public class Proxy {
|
||||||
|
|
||||||
private ProxyHost proxyHost;
|
private ProxyHost proxyHost;
|
||||||
private String user;
|
private String username;
|
||||||
private String password;
|
private String password;
|
||||||
|
|
||||||
public Proxy(ProxyHost proxyHost, String user, String password) {
|
public Proxy(ProxyHost proxyHost, String username, String password) {
|
||||||
this.proxyHost = proxyHost;
|
this.proxyHost = proxyHost;
|
||||||
this.user = user;
|
this.username = username;
|
||||||
this.password = password;
|
this.password = password;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -28,12 +28,12 @@ public class Proxy {
|
||||||
this.proxyHost = proxyHost;
|
this.proxyHost = proxyHost;
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getUser() {
|
public String getUsername() {
|
||||||
return user;
|
return username;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setUser(String user) {
|
public void setUsername(String username) {
|
||||||
this.user = user;
|
this.username = username;
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getPassword() {
|
public String getPassword() {
|
||||||
|
|
|
@ -1,13 +1,11 @@
|
||||||
package us.codecraft.webmagic.proxy;
|
package us.codecraft.webmagic.proxy;
|
||||||
|
|
||||||
import org.apache.http.HttpHost;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Created by edwardsbean on 15-2-28.
|
* Created by edwardsbean on 15-2-28.
|
||||||
*/
|
*/
|
||||||
public interface ProxyPool {
|
public interface ProxyPool {
|
||||||
|
|
||||||
void returnProxy(HttpHost host, int statusCode);
|
void returnProxy(Proxy proxy, int statusCode);
|
||||||
|
|
||||||
Proxy getProxy();
|
Proxy getProxy();
|
||||||
|
|
||||||
|
|
|
@ -7,8 +7,6 @@ import us.codecraft.webmagic.utils.FilePersistentBase;
|
||||||
import us.codecraft.webmagic.utils.ProxyUtils;
|
import us.codecraft.webmagic.utils.ProxyUtils;
|
||||||
|
|
||||||
import java.io.*;
|
import java.io.*;
|
||||||
import java.net.InetAddress;
|
|
||||||
import java.net.UnknownHostException;
|
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.util.Map.Entry;
|
import java.util.Map.Entry;
|
||||||
import java.util.concurrent.BlockingQueue;
|
import java.util.concurrent.BlockingQueue;
|
||||||
|
@ -156,7 +154,7 @@ public class TimerReuseProxyPool implements ProxyPool {
|
||||||
isEnable = true;
|
isEnable = true;
|
||||||
for (Proxy proxy : httpProxyList) {
|
for (Proxy proxy : httpProxyList) {
|
||||||
if (!validateWhenInit || ProxyUtils.validateProxy(proxy.getProxyHost())) {
|
if (!validateWhenInit || ProxyUtils.validateProxy(proxy.getProxyHost())) {
|
||||||
TimerReuseProxy p = new TimerReuseProxy(proxy.getProxyHost(), proxy.getUser(), proxy.getPassword(), reuseInterval);
|
TimerReuseProxy p = new TimerReuseProxy(proxy.getProxyHost(), proxy.getUsername(), proxy.getPassword(), reuseInterval);
|
||||||
proxyQueue.add(p);
|
proxyQueue.add(p);
|
||||||
allProxy.put(p.getProxyHost().getHost(), p);
|
allProxy.put(p.getProxyHost().getHost(), p);
|
||||||
}
|
}
|
||||||
|
@ -185,8 +183,8 @@ public class TimerReuseProxyPool implements ProxyPool {
|
||||||
return proxy;
|
return proxy;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void returnProxy(HttpHost host, int statusCode) {
|
public void returnProxy(Proxy proxy, int statusCode) {
|
||||||
TimerReuseProxy p = allProxy.get(host.getAddress().getHostAddress());
|
TimerReuseProxy p = allProxy.get(proxy.getProxyHost());
|
||||||
if (p == null) {
|
if (p == null) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
@ -202,13 +200,13 @@ public class TimerReuseProxyPool implements ProxyPool {
|
||||||
// banned,try longer interval
|
// banned,try longer interval
|
||||||
p.fail(TimerReuseProxy.ERROR_403);
|
p.fail(TimerReuseProxy.ERROR_403);
|
||||||
p.setReuseTimeInterval(reuseInterval * p.getFailedNum());
|
p.setReuseTimeInterval(reuseInterval * p.getFailedNum());
|
||||||
logger.info(host + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0);
|
logger.info(proxy + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0);
|
||||||
break;
|
break;
|
||||||
case TimerReuseProxy.ERROR_BANNED:
|
case TimerReuseProxy.ERROR_BANNED:
|
||||||
p.fail(TimerReuseProxy.ERROR_BANNED);
|
p.fail(TimerReuseProxy.ERROR_BANNED);
|
||||||
p.setReuseTimeInterval(10 * 60 * 1000 * p.getFailedNum());
|
p.setReuseTimeInterval(10 * 60 * 1000 * p.getFailedNum());
|
||||||
logger.warn("this proxy is banned >>>> " + p.getHttpHost());
|
logger.warn("this proxy is banned >>>> " + p.getHttpHost());
|
||||||
logger.info(host + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0);
|
logger.info(proxy + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0);
|
||||||
break;
|
break;
|
||||||
case TimerReuseProxy.ERROR_404:
|
case TimerReuseProxy.ERROR_404:
|
||||||
// p.fail(Proxy.ERROR_404);
|
// p.fail(Proxy.ERROR_404);
|
||||||
|
@ -220,13 +218,13 @@ public class TimerReuseProxyPool implements ProxyPool {
|
||||||
}
|
}
|
||||||
if (p.getFailedNum() > 20) {
|
if (p.getFailedNum() > 20) {
|
||||||
p.setReuseTimeInterval(reviveTime);
|
p.setReuseTimeInterval(reviveTime);
|
||||||
logger.error("remove proxy >>>> " + host + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size());
|
logger.error("remove proxy >>>> " + proxy + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size());
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (p.getFailedNum() > 0 && p.getFailedNum() % 5 == 0) {
|
if (p.getFailedNum() > 0 && p.getFailedNum() % 5 == 0) {
|
||||||
if (!ProxyUtils.validateProxy(host)) {
|
if (!ProxyUtils.validateProxy(proxy)) {
|
||||||
p.setReuseTimeInterval(reviveTime);
|
p.setReuseTimeInterval(reviveTime);
|
||||||
logger.error("remove proxy >>>> " + host + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size());
|
logger.error("remove proxy >>>> " + proxy + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size());
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue