From 0aa2c3949d29e4c02c199eb30c7adae8f244e1ee Mon Sep 17 00:00:00 2001 From: yao Date: Tue, 22 Dec 2020 18:19:37 +0800 Subject: [PATCH] =?UTF-8?q?=20=E5=88=B7=E6=96=B0=E4=BB=A3=E7=90=86api?= =?UTF-8?q?=E9=87=8D=E6=9E=84=EF=BC=8C=E9=9C=80=E8=A6=81=E6=8F=90=E4=BE=9B?= =?UTF-8?q?=E6=97=A7=E4=BB=A3=E7=90=86=EF=BC=8C=E5=A6=82=E6=9E=9C=E4=BE=9D?= =?UTF-8?q?=E7=84=B6=E6=98=AF=E6=97=A7=E4=BB=A3=E7=90=86=EF=BC=8C=E6=89=8D?= =?UTF-8?q?=E8=BF=9B=E8=A1=8C=E5=88=B7=E6=96=B0=EF=BC=8C=E9=98=B2=E6=AD=A2?= =?UTF-8?q?=E5=BA=94=E5=BB=B6=E8=BF=9F=E5=93=8D=E5=BA=94=E9=80=A0=E6=88=90?= =?UTF-8?q?=E7=9A=84=E8=BF=87=E5=BA=A6=E5=88=B7=E6=96=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../src/main/java/us/codecraft/webmagic/Spider.java | 7 ++++++- .../webmagic/downloader/HttpClientDownloader.java | 6 +++--- .../webmagic/downloader/HttpClientGenerator.java | 1 + .../us/codecraft/webmagic/proxy/ProxyProvider.java | 13 ++++++++++++- .../webmagic/proxy/SimpleProxyProvider.java | 7 ++++++- 5 files changed, 28 insertions(+), 6 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index d1ad6a6..bc07651 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -421,7 +421,7 @@ public class Spider implements Runnable, Task { } } else if(site.getRefreshCode().contains(page.getStatusCode())) { logger.info("page status code error, page {} , code: {}, start refresh downloader", request.getUrl(), page.getStatusCode()); - downloader.refreshComponent(this); + failHandler(request); }else { logger.info("page status code error, page {} , code: {}", request.getUrl(), page.getStatusCode()); } @@ -430,6 +430,11 @@ public class Spider implements Runnable, Task { } private void onDownloaderFail(Request request) { + failHandler(request); + } + + private void failHandler(Request 
request){ + downloader.refreshComponent(this); if (site.getCycleRetryTimes() == 0) { sleep(site.getSleepTime()); } else { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index eed49fb..ace8175 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -54,7 +54,7 @@ public class HttpClientDownloader extends AbstractDownloader { this.refreshClientOnError = clientOnError; } public void setRefreshProxyOnError(Predicate proxyOnError) { - this.refreshProxyOnError = refreshProxyOnError; + this.refreshProxyOnError = proxyOnError; } public void setHttpUriRequestConverter(HttpUriRequestConverter httpUriRequestConverter) { @@ -94,7 +94,7 @@ public class HttpClientDownloader extends AbstractDownloader { logger.warn("download page {} error", request.getUrl(), e); onError(request, e, proxyProvider); if (proxyProvider != null && refreshProxyOnError.test(e)) { - proxyProvider.refreshProxy(task); + proxyProvider.refreshProxy(task,proxy); } if(refreshClientOnError.test(e)) { httpClients.remove(task.getSite().getDomain()); @@ -115,7 +115,7 @@ public class HttpClientDownloader extends AbstractDownloader { @Override public void refreshComponent(Task task) { if (proxyProvider != null ) { - proxyProvider.refreshProxy(task); + proxyProvider.refreshProxy(task,proxyProvider.getCurrentProxy(task)); } httpClients.remove(task.getSite().getDomain()); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java index 1f20c5a..2d27b79 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java +++ 
b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java @@ -143,6 +143,7 @@ public class HttpClientGenerator { SocketConfig.Builder socketConfigBuilder = SocketConfig.custom(); socketConfigBuilder.setSoKeepAlive(true).setTcpNoDelay(true); socketConfigBuilder.setSoTimeout(site.getTimeOut()); + SocketConfig socketConfig = socketConfigBuilder.build(); httpClientBuilder.setDefaultSocketConfig(socketConfig); connectionManager.setDefaultSocketConfig(socketConfig); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java index da3bec9..b4e7b48 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java @@ -23,8 +23,19 @@ public interface ProxyProvider { * 代理IP是珍贵资源,有可能代理提供者内部代理没有过期,就一直提供某个IP,但这个IP又不可以使用,所以提供一种方式通知提供者,这个代理该刷新了 * * @param task 下载任务 + * @param proxy 需要对代理进行验证,如果确实持有的是错误代理,则刷新,否则,继续执行 */ - void refreshProxy(Task task); + void refreshProxy(Task task,Proxy proxy); + + + /** + * + * 获取当前正在提供的代理 + * + * @param task + * @return + */ + Proxy getCurrentProxy(Task task); /** * Get a proxy for task by some strategy. diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java index fd80b30..8ad9ce7 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java @@ -31,7 +31,12 @@ public class SimpleProxyProvider implements ProxyProvider { } @Override - public void refreshProxy(Task task) { + public Proxy getCurrentProxy(Task task) { + return null; + } + + @Override + public void refreshProxy(Task task,Proxy proxy) { }