compile passed in httpclientDownloader
parent
b71f379512
commit
1d86f7c048
|
@ -18,7 +18,6 @@ public class Request implements Serializable {
|
|||
private static final long serialVersionUID = 2062192774891352043L;
|
||||
|
||||
public static final String CYCLE_TRIED_TIMES = "_cycle_tried_times";
|
||||
public static final String STATUS_CODE = "statusCode";
|
||||
public static final String PROXY = "proxy";
|
||||
|
||||
private String url;
|
||||
|
|
|
@ -419,8 +419,6 @@ public class Spider implements Runnable, Task {
|
|||
pipeline.process(page.getResultItems(), this);
|
||||
}
|
||||
}
|
||||
//for proxy status management
|
||||
request.putExtra(Request.STATUS_CODE, page.getStatusCode());
|
||||
sleep(site.getSleepTime());
|
||||
}
|
||||
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
package us.codecraft.webmagic.downloader;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.http.HttpHost;
|
||||
import org.apache.http.HttpResponse;
|
||||
import org.apache.http.annotation.ThreadSafe;
|
||||
import org.apache.http.auth.AuthState;
|
||||
|
@ -23,13 +22,11 @@ import us.codecraft.webmagic.Task;
|
|||
import us.codecraft.webmagic.proxy.Proxy;
|
||||
import us.codecraft.webmagic.selector.PlainText;
|
||||
import us.codecraft.webmagic.utils.CharsetUtils;
|
||||
import us.codecraft.webmagic.utils.WMCollections;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.charset.Charset;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
|
||||
/**
|
||||
|
@ -80,28 +77,22 @@ public class HttpClientDownloader extends AbstractDownloader {
|
|||
CloseableHttpResponse httpResponse = null;
|
||||
int statusCode = 0;
|
||||
Site site = task.getSite();
|
||||
try {
|
||||
Proxy proxy = null;
|
||||
if (site.getHttpProxyPool() != null && site.getHttpProxyPool().isEnable()) {
|
||||
proxy = site.getHttpProxyFromPool();
|
||||
} else if (site != null && site.getHttpProxy() != null){
|
||||
proxy = site.getHttpProxy();
|
||||
request.putExtra(Request.PROXY, site.getHttpProxy());
|
||||
}
|
||||
Proxy proxy = null;
|
||||
HttpContext httpContext = new BasicHttpContext();
|
||||
if (site.getHttpProxyPool() != null && site.getHttpProxyPool().isEnable()) {
|
||||
proxy = site.getHttpProxyFromPool();
|
||||
request.putExtra(Request.PROXY, proxy);
|
||||
|
||||
HttpContext httpContext = new BasicHttpContext();
|
||||
|
||||
HttpUriRequest httpUriRequest = httpUriRequestConverter.convert(request,site);
|
||||
AuthState authState = new AuthState();
|
||||
authState.update(new BasicScheme(), new UsernamePasswordCredentials("userName", "password"));
|
||||
authState.update(new BasicScheme(), new UsernamePasswordCredentials(proxy.getUsername(), proxy.getPassword()));
|
||||
httpContext.setAttribute(HttpClientContext.PROXY_AUTH_STATE, authState);
|
||||
CloseableHttpClient httpClient = getHttpClient(site, proxy);
|
||||
}
|
||||
HttpUriRequest httpUriRequest = httpUriRequestConverter.convert(request, site);
|
||||
CloseableHttpClient httpClient = getHttpClient(site);
|
||||
try {
|
||||
httpResponse = httpClient.execute(httpUriRequest, httpContext);
|
||||
statusCode = httpResponse.getStatusLine().getStatusCode();
|
||||
request.putExtra(Request.STATUS_CODE, statusCode);
|
||||
if (statusAccept(acceptStatCode, statusCode)) {
|
||||
Page page = handleResponse(request, charset, httpResponse, task);
|
||||
if (site.getAcceptStatCode().contains(statusCode)) {
|
||||
Page page = handleResponse(request, site.getCharset(), httpResponse, task);
|
||||
onSuccess(request);
|
||||
return page;
|
||||
} else {
|
||||
|
@ -120,10 +111,8 @@ public class HttpClientDownloader extends AbstractDownloader {
|
|||
//ensure the connection is released back to pool
|
||||
EntityUtils.consumeQuietly(httpResponse.getEntity());
|
||||
}
|
||||
request.putExtra(Request.STATUS_CODE, statusCode);
|
||||
if (site != null && site.getHttpProxyPool() != null && site.getHttpProxyPool().isEnable()) {
|
||||
site.returnHttpProxyToPool((HttpHost) request.getExtra(Request.PROXY), (Integer) request
|
||||
.getExtra(Request.STATUS_CODE));
|
||||
if (proxy != null) {
|
||||
site.getHttpProxyPool().returnProxy(proxy, statusCode);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -133,10 +122,6 @@ public class HttpClientDownloader extends AbstractDownloader {
|
|||
httpClientGenerator.setPoolSize(thread);
|
||||
}
|
||||
|
||||
protected boolean statusAccept(Set<Integer> acceptStatCode, int statusCode) {
|
||||
return acceptStatCode.contains(statusCode);
|
||||
}
|
||||
|
||||
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
|
||||
String content = getContent(charset, httpResponse);
|
||||
Page page = new Page();
|
||||
|
|
|
@ -10,7 +10,6 @@ import org.apache.http.client.methods.RequestBuilder;
|
|||
import org.apache.http.message.BasicNameValuePair;
|
||||
import us.codecraft.webmagic.Request;
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.proxy.Proxy;
|
||||
import us.codecraft.webmagic.utils.HttpConstant;
|
||||
|
||||
import java.nio.charset.Charset;
|
||||
|
@ -26,7 +25,7 @@ import java.util.Map;
|
|||
*/
|
||||
public class HttpUriRequestConverter {
|
||||
|
||||
public HttpUriRequest convert(Request request, Site site, Proxy proxy) {
|
||||
public HttpUriRequest convert(Request request, Site site) {
|
||||
return null;
|
||||
}
|
||||
|
||||
|
|
|
@ -7,12 +7,12 @@ package us.codecraft.webmagic.proxy;
|
|||
public class Proxy {
|
||||
|
||||
private ProxyHost proxyHost;
|
||||
private String user;
|
||||
private String username;
|
||||
private String password;
|
||||
|
||||
public Proxy(ProxyHost proxyHost, String user, String password) {
|
||||
public Proxy(ProxyHost proxyHost, String username, String password) {
|
||||
this.proxyHost = proxyHost;
|
||||
this.user = user;
|
||||
this.username = username;
|
||||
this.password = password;
|
||||
}
|
||||
|
||||
|
@ -28,12 +28,12 @@ public class Proxy {
|
|||
this.proxyHost = proxyHost;
|
||||
}
|
||||
|
||||
public String getUser() {
|
||||
return user;
|
||||
public String getUsername() {
|
||||
return username;
|
||||
}
|
||||
|
||||
public void setUser(String user) {
|
||||
this.user = user;
|
||||
public void setUsername(String username) {
|
||||
this.username = username;
|
||||
}
|
||||
|
||||
public String getPassword() {
|
||||
|
|
|
@ -1,13 +1,11 @@
|
|||
package us.codecraft.webmagic.proxy;
|
||||
|
||||
import org.apache.http.HttpHost;
|
||||
|
||||
/**
|
||||
* Created by edwardsbean on 15-2-28.
|
||||
*/
|
||||
public interface ProxyPool {
|
||||
|
||||
void returnProxy(HttpHost host, int statusCode);
|
||||
void returnProxy(Proxy proxy, int statusCode);
|
||||
|
||||
Proxy getProxy();
|
||||
|
||||
|
|
|
@ -7,8 +7,6 @@ import us.codecraft.webmagic.utils.FilePersistentBase;
|
|||
import us.codecraft.webmagic.utils.ProxyUtils;
|
||||
|
||||
import java.io.*;
|
||||
import java.net.InetAddress;
|
||||
import java.net.UnknownHostException;
|
||||
import java.util.*;
|
||||
import java.util.Map.Entry;
|
||||
import java.util.concurrent.BlockingQueue;
|
||||
|
@ -156,7 +154,7 @@ public class TimerReuseProxyPool implements ProxyPool {
|
|||
isEnable = true;
|
||||
for (Proxy proxy : httpProxyList) {
|
||||
if (!validateWhenInit || ProxyUtils.validateProxy(proxy.getProxyHost())) {
|
||||
TimerReuseProxy p = new TimerReuseProxy(proxy.getProxyHost(), proxy.getUser(), proxy.getPassword(), reuseInterval);
|
||||
TimerReuseProxy p = new TimerReuseProxy(proxy.getProxyHost(), proxy.getUsername(), proxy.getPassword(), reuseInterval);
|
||||
proxyQueue.add(p);
|
||||
allProxy.put(p.getProxyHost().getHost(), p);
|
||||
}
|
||||
|
@ -185,8 +183,8 @@ public class TimerReuseProxyPool implements ProxyPool {
|
|||
return proxy;
|
||||
}
|
||||
|
||||
public void returnProxy(HttpHost host, int statusCode) {
|
||||
TimerReuseProxy p = allProxy.get(host.getAddress().getHostAddress());
|
||||
public void returnProxy(Proxy proxy, int statusCode) {
|
||||
TimerReuseProxy p = allProxy.get(proxy.getProxyHost());
|
||||
if (p == null) {
|
||||
return;
|
||||
}
|
||||
|
@ -202,13 +200,13 @@ public class TimerReuseProxyPool implements ProxyPool {
|
|||
// banned,try longer interval
|
||||
p.fail(TimerReuseProxy.ERROR_403);
|
||||
p.setReuseTimeInterval(reuseInterval * p.getFailedNum());
|
||||
logger.info(host + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0);
|
||||
logger.info(proxy + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0);
|
||||
break;
|
||||
case TimerReuseProxy.ERROR_BANNED:
|
||||
p.fail(TimerReuseProxy.ERROR_BANNED);
|
||||
p.setReuseTimeInterval(10 * 60 * 1000 * p.getFailedNum());
|
||||
logger.warn("this proxy is banned >>>> " + p.getHttpHost());
|
||||
logger.info(host + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0);
|
||||
logger.info(proxy + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0);
|
||||
break;
|
||||
case TimerReuseProxy.ERROR_404:
|
||||
// p.fail(Proxy.ERROR_404);
|
||||
|
@ -220,13 +218,13 @@ public class TimerReuseProxyPool implements ProxyPool {
|
|||
}
|
||||
if (p.getFailedNum() > 20) {
|
||||
p.setReuseTimeInterval(reviveTime);
|
||||
logger.error("remove proxy >>>> " + host + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size());
|
||||
logger.error("remove proxy >>>> " + proxy + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size());
|
||||
return;
|
||||
}
|
||||
if (p.getFailedNum() > 0 && p.getFailedNum() % 5 == 0) {
|
||||
if (!ProxyUtils.validateProxy(host)) {
|
||||
if (!ProxyUtils.validateProxy(proxy)) {
|
||||
p.setReuseTimeInterval(reviveTime);
|
||||
logger.error("remove proxy >>>> " + host + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size());
|
||||
logger.error("remove proxy >>>> " + proxy + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size());
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue