diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index d1ad6a6..bc07651 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -421,7 +421,7 @@ public class Spider implements Runnable, Task { } } else if(site.getRefreshCode().contains(page.getStatusCode())) { logger.info("page status code error, page {} , code: {}, start refresh downloader", request.getUrl(), page.getStatusCode()); - downloader.refreshComponent(this); + failHandler(request); }else { logger.info("page status code error, page {} , code: {}", request.getUrl(), page.getStatusCode()); } @@ -430,6 +430,11 @@ public class Spider implements Runnable, Task { } private void onDownloaderFail(Request request) { + failHandler(request); + } + + private void failHandler(Request request){ + downloader.refreshComponent(this); if (site.getCycleRetryTimes() == 0) { sleep(site.getSleepTime()); } else { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index eed49fb..ace8175 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -54,7 +54,7 @@ public class HttpClientDownloader extends AbstractDownloader { this.refreshClientOnError = clientOnError; } public void setRefreshProxyOnError(Predicate proxyOnError) { - this.refreshProxyOnError = refreshProxyOnError; + this.refreshProxyOnError = proxyOnError; } public void setHttpUriRequestConverter(HttpUriRequestConverter httpUriRequestConverter) { @@ -94,7 +94,7 @@ public class HttpClientDownloader extends AbstractDownloader { logger.warn("download page {} error", request.getUrl(), e); onError(request, e, proxyProvider); if 
(proxyProvider != null && refreshProxyOnError.test(e)) { - proxyProvider.refreshProxy(task); + proxyProvider.refreshProxy(task,proxy); } if(refreshClientOnError.test(e)) { httpClients.remove(task.getSite().getDomain()); @@ -115,7 +115,7 @@ public class HttpClientDownloader extends AbstractDownloader { @Override public void refreshComponent(Task task) { if (proxyProvider != null ) { - proxyProvider.refreshProxy(task); + proxyProvider.refreshProxy(task,proxyProvider.getCurrentProxy(task)); } httpClients.remove(task.getSite().getDomain()); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java index 1f20c5a..2d27b79 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java @@ -143,6 +143,7 @@ public class HttpClientGenerator { SocketConfig.Builder socketConfigBuilder = SocketConfig.custom(); socketConfigBuilder.setSoKeepAlive(true).setTcpNoDelay(true); socketConfigBuilder.setSoTimeout(site.getTimeOut()); + SocketConfig socketConfig = socketConfigBuilder.build(); httpClientBuilder.setDefaultSocketConfig(socketConfig); connectionManager.setDefaultSocketConfig(socketConfig); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java index da3bec9..b4e7b48 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java @@ -23,8 +23,19 @@ public interface ProxyProvider { * 代理IP是珍贵资源,有可能代理提供者内部代理没有过期,就一直提供某个IP,但这个IP又不可以使用,所以提供一种方式通知提供者,这个代理该刷新了 * * @param task 下载任务 + * @param proxy 需要对代理进行验证,如果确实持有的是错误代理,则刷新,否则,继续执行 */ - void refreshProxy(Task task); + void refreshProxy(Task task,Proxy proxy); + + + /** + 
+ * 获取当前正在提供的代理 + * + * @param task + * @return + */ + Proxy getCurrentProxy(Task task); /** * Get a proxy for task by some strategy. diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java index fd80b30..8ad9ce7 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java @@ -31,7 +31,12 @@ public class SimpleProxyProvider implements ProxyProvider { } @Override - public void refreshProxy(Task task) { + public Proxy getCurrentProxy(Task task) { + return null; + } + + @Override + public void refreshProxy(Task task,Proxy proxy) { }