From 9a71f0ac924615d21882e1faa4bbda0c2e5eb7d7 Mon Sep 17 00:00:00 2001 From: yao Date: Tue, 15 Dec 2020 17:05:16 +0800 Subject: [PATCH 1/9] =?UTF-8?q?pageCount=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../src/main/java/us/codecraft/webmagic/Spider.java | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 62c989f..1a03bbf 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -24,6 +24,7 @@ import java.util.concurrent.ExecutorService; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicLong; +import java.util.concurrent.atomic.LongAdder; import java.util.concurrent.locks.Condition; import java.util.concurrent.locks.ReentrantLock; @@ -102,7 +103,7 @@ public class Spider implements Runnable, Task { private List spiderListeners; - private final AtomicLong pageCount = new AtomicLong(0); + private final LongAdder pageCount = new LongAdder(); private Date startTime; @@ -323,7 +324,7 @@ public class Spider implements Runnable, Task { onError(request); logger.error("process request " + request + " error", e); } finally { - pageCount.incrementAndGet(); + pageCount.increment(); signalNewUrl(); } } @@ -335,7 +336,7 @@ public class Spider implements Runnable, Task { if (destroyWhenExit) { close(); } - logger.info("Spider {} closed! {} pages downloaded.", getUUID(), pageCount.get()); + logger.info("Spider {} closed! {} pages downloaded.", getUUID(), pageCount.sumThenReset()); } protected void onError(Request request) { @@ -645,7 +646,7 @@ public class Spider implements Runnable, Task { * @since 0.4.1 */ public long getPageCount() { - return pageCount.get(); + return pageCount.sum(); } /** From ba69eba669d32fadbbe8b021b85b9b458d2db6aa Mon Sep 17 00:00:00 2001 From: yao Date: Mon, 21 Dec 2020 14:36:44 +0800 Subject: [PATCH 2/9] =?UTF-8?q?=E4=BB=A3=E7=90=86=E6=8E=A5=E5=8F=A3?= =?UTF-8?q?=E7=9A=84=E4=BF=AE=E6=94=B9=EF=BC=8C=E6=8F=90=E4=BE=9B=E5=88=B7?= =?UTF-8?q?=E6=98=9F=E4=BB=A3=E7=90=86API=E3=80=82downloader=20=E4=B8=8B?= =?UTF-8?q?=E8=BD=BD=E9=94=99=E8=AF=AF=E6=97=B6=EF=BC=8C=E6=8F=90=E4=BE=9B?= =?UTF-8?q?request,exception,proxyProvider=E4=B8=89=E4=B8=AA=E5=8F=82?= =?UTF-8?q?=E6=95=B0=EF=BC=8C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../codecraft/webmagic/downloader/AbstractDownloader.java | 3 ++- .../webmagic/downloader/HttpClientDownloader.java | 2 +- .../java/us/codecraft/webmagic/proxy/ProxyProvider.java | 7 +++++++ .../us/codecraft/webmagic/proxy/SimpleProxyProvider.java | 5 +++++ 4 files changed, 15 insertions(+), 2 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java index c27292d..05f5686 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java @@ -3,6 +3,7 @@ package us.codecraft.webmagic.downloader; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.proxy.ProxyProvider; import us.codecraft.webmagic.selector.Html; /** @@ -38,7 +39,7 @@ public abstract class AbstractDownloader implements Downloader { protected void onSuccess(Request request) { } - protected void onError(Request request) { + protected void onError(Request request, Throwable throwable, ProxyProvider proxyProvider) { } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 24889c8..757cdd3 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -87,7 +87,7 @@ public class HttpClientDownloader extends AbstractDownloader { return page; } catch (IOException e) { logger.warn("download page {} error", request.getUrl(), e); - onError(request); + onError(request,e,proxyProvider); return page; } finally { if (httpResponse != null) { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java index 0cef4ed..da3bec9 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java @@ -19,6 +19,13 @@ public interface ProxyProvider { */ void returnProxy(Proxy proxy, Page page, Task task); + /** + * 代理IP是珍贵资源,有可能代理提供者内部代理没有过期,就一直提供某个IP,但这个IP又不可以使用,所以提供一种方式通知提供者,这个代理该刷新了 + * + * @param task 下载任务 + */ + void refreshProxy(Task task); + /** * Get a proxy for task by some strategy. * @param task the download task diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java index ddef6a8..fd80b30 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java @@ -30,6 +30,11 @@ public class SimpleProxyProvider implements ProxyProvider { this.pointer = pointer; } + @Override + public void refreshProxy(Task task) { + + } + public static SimpleProxyProvider from(Proxy... proxies) { List proxiesTemp = new ArrayList(proxies.length); for (Proxy proxy : proxies) { From 4a6441e7c5923c14d889c7f54af0ef15e5a05cb9 Mon Sep 17 00:00:00 2001 From: yao Date: Mon, 21 Dec 2020 14:52:25 +0800 Subject: [PATCH 3/9] =?UTF-8?q?=E6=8F=90=E4=BE=9B=E5=87=BA=E7=8E=B0?= =?UTF-8?q?=E6=9F=90=E7=A7=8D=E5=BC=82=E5=B8=B8=E5=88=B7=E6=96=B0=E4=BB=A3?= =?UTF-8?q?=E7=90=86=EF=BC=8C=E5=BC=82=E5=B8=B8=E5=8F=AF=E9=85=8D=E7=BD=AE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../webmagic/downloader/HttpClientDownloader.java | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 757cdd3..2dd340f 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -21,6 +21,7 @@ import java.io.IOException; import java.nio.charset.Charset; import java.util.HashMap; import java.util.Map; +import java.util.function.Predicate; /** @@ -43,6 +44,14 @@ public class HttpClientDownloader extends AbstractDownloader { private boolean responseHeader = true; + private volatile boolean refreshProxyOnError = false; + + private Predicate throwablePredicate = t->false; + + public void setThrowablePredicate(Predicate predicate){ + this.throwablePredicate = predicate; + } + public void setHttpUriRequestConverter(HttpUriRequestConverter httpUriRequestConverter) { this.httpUriRequestConverter = httpUriRequestConverter; } @@ -88,6 +97,9 @@ public class HttpClientDownloader extends AbstractDownloader { } catch (IOException e) { logger.warn("download page {} error", request.getUrl(), e); onError(request,e,proxyProvider); + if(proxyProvider != null && refreshProxyOnError && throwablePredicate.test(e)){ + proxyProvider.refreshProxy(task); + } return page; } finally { if (httpResponse != null) { From 9cc5287743de9715ec3ac10a20636377be41d060 Mon Sep 17 00:00:00 2001 From: yao Date: Mon, 21 Dec 2020 14:58:01 +0800 Subject: [PATCH 4/9] =?UTF-8?q?=E7=AE=80=E5=8C=96=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../downloader/HttpClientDownloader.java | 23 ++++++++----------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 2dd340f..5684114 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -32,24 +32,21 @@ import java.util.function.Predicate; */ public class HttpClientDownloader extends AbstractDownloader { - private Logger logger = LoggerFactory.getLogger(getClass()); - private final Map httpClients = new HashMap(); - - private HttpClientGenerator httpClientGenerator = new HttpClientGenerator(); + private final Logger logger = LoggerFactory.getLogger(getClass()); + private final HttpClientGenerator httpClientGenerator = new HttpClientGenerator(); private HttpUriRequestConverter httpUriRequestConverter = new HttpUriRequestConverter(); - + private ProxyProvider proxyProvider; - private boolean responseHeader = true; + private final boolean responseHeader = true; - private volatile boolean refreshProxyOnError = false; - private Predicate throwablePredicate = t->false; + private Predicate refreshProxyOnError = t -> false; - public void setThrowablePredicate(Predicate predicate){ - this.throwablePredicate = predicate; + public void setRefreshProxyOnError(Predicate proxyOnError) { + this.refreshProxyOnError = refreshProxyOnError; } public void setHttpUriRequestConverter(HttpUriRequestConverter httpUriRequestConverter) { @@ -96,8 +93,8 @@ public class HttpClientDownloader extends AbstractDownloader { return page; } catch (IOException e) { logger.warn("download page {} error", request.getUrl(), e); - onError(request,e,proxyProvider); - if(proxyProvider != null && refreshProxyOnError && throwablePredicate.test(e)){ + onError(request, e, proxyProvider); + if (proxyProvider != null && refreshProxyOnError.test(e)) { proxyProvider.refreshProxy(task); } return page; @@ -122,7 +119,7 @@ public class HttpClientDownloader extends AbstractDownloader { String contentType = httpResponse.getEntity().getContentType() == null ? "" : httpResponse.getEntity().getContentType().getValue(); Page page = new Page(); page.setBytes(bytes); - if (!request.isBinaryContent()){ + if (!request.isBinaryContent()) { if (charset == null) { charset = getHtmlCharset(contentType, bytes); } From 19465089c3ad254e6f35b96cbe707bc6dd33ec62 Mon Sep 17 00:00:00 2001 From: yao Date: Mon, 21 Dec 2020 16:02:35 +0800 Subject: [PATCH 5/9] =?UTF-8?q?=E6=8F=90=E4=BE=9B=E5=BC=82=E5=B8=B8?= =?UTF-8?q?=E5=88=B7=E6=96=B0httpClient=EF=BC=8C=E5=BC=82=E5=B8=B8?= =?UTF-8?q?=E5=8F=AF=E9=85=8D=E7=BD=AE=EF=BC=8C=E9=87=8D=E5=86=99getHttpCl?= =?UTF-8?q?ient=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../downloader/HttpClientDownloader.java | 26 ++++++++++--------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 5684114..f9f8c82 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -21,6 +21,7 @@ import java.io.IOException; import java.nio.charset.Charset; import java.util.HashMap; import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; import java.util.function.Predicate; @@ -32,7 +33,7 @@ import java.util.function.Predicate; */ public class HttpClientDownloader extends AbstractDownloader { - private final Map httpClients = new HashMap(); + private final Map httpClients = new ConcurrentHashMap<>(); private final Logger logger = LoggerFactory.getLogger(getClass()); private final HttpClientGenerator httpClientGenerator = new HttpClientGenerator(); @@ -45,6 +46,13 @@ public class HttpClientDownloader extends AbstractDownloader { private Predicate refreshProxyOnError = t -> false; + + private Predicate refreshClientOnError = t -> false; + + + public void setRefreshClientOnError(Predicate clientOnError){ + this.refreshClientOnError = clientOnError; + } public void setRefreshProxyOnError(Predicate proxyOnError) { this.refreshProxyOnError = refreshProxyOnError; } @@ -62,17 +70,8 @@ public class HttpClientDownloader extends AbstractDownloader { return httpClientGenerator.getClient(null); } String domain = site.getDomain(); - CloseableHttpClient httpClient = httpClients.get(domain); - if (httpClient == null) { - synchronized (this) { - httpClient = httpClients.get(domain); - if (httpClient == null) { - httpClient = httpClientGenerator.getClient(site); - httpClients.put(domain, httpClient); - } - } - } - return httpClient; + return httpClients.computeIfAbsent(domain,k->httpClientGenerator.getClient(site)); + } @Override @@ -97,6 +96,9 @@ public class HttpClientDownloader extends AbstractDownloader { if (proxyProvider != null && refreshProxyOnError.test(e)) { proxyProvider.refreshProxy(task); } + if(refreshClientOnError.test(e)) { + httpClients.remove(task.getSite().getDomain()); + } return page; } finally { if (httpResponse != null) { From 2e2a0fdf3e8e614d9a3af146dfa462d0e299ceb5 Mon Sep 17 00:00:00 2001 From: yao Date: Mon, 21 Dec 2020 18:08:55 +0800 Subject: [PATCH 6/9] =?UTF-8?q?=20Downloader=20=E6=8F=90=E4=BE=9B=E5=88=B7?= =?UTF-8?q?=E6=96=B0=E7=BB=84=E4=BB=B6=E7=9A=84api,=E6=96=B9=E4=BE=BF?= =?UTF-8?q?=E5=9C=A8spider=E4=B8=AD=E6=93=8D=E4=BD=9C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../main/java/us/codecraft/webmagic/Site.java | 15 +++++++++++++++ .../main/java/us/codecraft/webmagic/Spider.java | 5 ++++- .../webmagic/downloader/Downloader.java | 10 +++++++--- .../downloader/HttpClientDownloader.java | 11 +++++++++++ .../webmagic/downloader/HttpClientGenerator.java | 16 +++++++++++----- .../codecraft/webmagic/utils/HttpConstant.java | 1 + .../java/us/codecraft/webmagic/SpiderTest.java | 5 +++++ .../downloader/MockGithubDownloader.java | 5 +++++ .../webmagic/downloader/PhantomJSDownloader.java | 7 ++++++- .../downloader/MockGithubDownloader.java | 4 ++++ .../downloader/selenium/SeleniumDownloader.java | 5 +++++ 11 files changed, 74 insertions(+), 10 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java index 72cc7d0..bf603b3 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java @@ -1,5 +1,6 @@ package us.codecraft.webmagic; +import com.sun.org.apache.regexp.internal.RE; import us.codecraft.webmagic.utils.HttpConstant; import java.util.*; @@ -35,8 +36,12 @@ public class Site { private static final Set DEFAULT_STATUS_CODE_SET = new HashSet(); + private static final Set DEFAULT_REFRESH_CODE_SET = new HashSet<>(); + + private Set refreshCode = DEFAULT_REFRESH_CODE_SET; private Set acceptStatCode = DEFAULT_STATUS_CODE_SET; + private Map headers = new HashMap(); private boolean useGzip = true; @@ -44,6 +49,7 @@ public class Site { private boolean disableCookieManagement = false; static { + DEFAULT_REFRESH_CODE_SET.add(HttpConstant.StatusCode.FORBIDDEN); DEFAULT_STATUS_CODE_SET.add(HttpConstant.StatusCode.CODE_200); } @@ -192,6 +198,15 @@ public class Site { return this; } + public Site setRefreshCode(Set refreshCode){ + this.refreshCode = refreshCode; + return this; + } + public Set getRefreshCode(){ + return refreshCode; + + } + /** * get acceptStatCode * diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 1a03bbf..d1ad6a6 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -419,7 +419,10 @@ public class Spider implements Runnable, Task { pipeline.process(page.getResultItems(), this); } } - } else { + } else if(site.getRefreshCode().contains(page.getStatusCode())) { + logger.info("page status code error, page {} , code: {}, start refresh downloader", request.getUrl(), page.getStatusCode()); + downloader.refreshComponent(this); + }else { logger.info("page status code error, page {} , code: {}", request.getUrl(), page.getStatusCode()); } sleep(site.getSleepTime()); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java index f7ced49..5095501 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java @@ -18,14 +18,18 @@ public interface Downloader { * Downloads web pages and store in Page object. * * @param request request - * @param task task + * @param task task * @return page */ - public Page download(Request request, Task task); + Page download(Request request, Task task); /** * Tell the downloader how many threads the spider used. + * * @param threadNum number of threads */ - public void setThread(int threadNum); + void setThread(int threadNum); + + + void refreshComponent(Task task); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index f9f8c82..eed49fb 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -111,6 +111,17 @@ public class HttpClientDownloader extends AbstractDownloader { } } + + @Override + public void refreshComponent(Task task) { + if (proxyProvider != null ) { + proxyProvider.refreshProxy(task); + } + + httpClients.remove(task.getSite().getDomain()); + + } + @Override public void setThread(int thread) { httpClientGenerator.setPoolSize(thread); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java index 80e0f10..1f20c5a 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java @@ -1,13 +1,17 @@ package us.codecraft.webmagic.downloader; +import java.io.File; import java.io.IOException; import java.security.KeyManagementException; +import java.security.KeyStore; +import java.security.KeyStoreException; import java.security.NoSuchAlgorithmException; import java.security.cert.CertificateException; import java.security.cert.X509Certificate; import java.util.Map; import javax.net.ssl.SSLContext; +import javax.net.ssl.SSLContextSpi; import javax.net.ssl.TrustManager; import javax.net.ssl.X509TrustManager; @@ -24,6 +28,7 @@ import org.apache.http.conn.socket.ConnectionSocketFactory; import org.apache.http.conn.socket.PlainConnectionSocketFactory; import org.apache.http.conn.ssl.DefaultHostnameVerifier; import org.apache.http.conn.ssl.SSLConnectionSocketFactory; +import org.apache.http.conn.ssl.TrustSelfSignedStrategy; import org.apache.http.impl.client.BasicCookieStore; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.DefaultHttpRequestRetryHandler; @@ -32,6 +37,7 @@ import org.apache.http.impl.client.HttpClients; import org.apache.http.impl.conn.PoolingHttpClientConnectionManager; import org.apache.http.impl.cookie.BasicClientCookie; import org.apache.http.protocol.HttpContext; +import org.apache.http.ssl.SSLContexts; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -69,7 +75,7 @@ public class HttpClientGenerator { return new SSLConnectionSocketFactory(sslContext, supportedProtocols, null, new DefaultHostnameVerifier()); // 优先绕过安全证书 - } catch (KeyManagementException e) { + } catch (KeyManagementException | CertificateException | KeyStoreException | IOException e) { logger.error("ssl connection fail", e); } catch (NoSuchAlgorithmException e) { logger.error("ssl connection fail", e); @@ -77,8 +83,8 @@ public class HttpClientGenerator { return SSLConnectionSocketFactory.getSocketFactory(); } - private SSLContext createIgnoreVerifySSL() throws NoSuchAlgorithmException, KeyManagementException { - // 实现一个X509TrustManager接口,用于绕过验证,不用修改里面的方法 + private SSLContext createIgnoreVerifySSL() throws NoSuchAlgorithmException, KeyManagementException, CertificateException, KeyStoreException, IOException { +// 实现一个X509TrustManager接口,用于绕过验证,不用修改里面的方法 X509TrustManager trustManager = new X509TrustManager() { @Override @@ -96,10 +102,10 @@ public class HttpClientGenerator { }; - SSLContext sc = SSLContext.getInstance("TLS"); + SSLContext sc = SSLContext.getInstance("SSLv3"); sc.init(null, new TrustManager[] { trustManager }, null); return sc; - } + } public HttpClientGenerator setPoolSize(int poolSize) { connectionManager.setMaxTotal(poolSize); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/HttpConstant.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/HttpConstant.java index 2d6b8fe..bfacec3 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/HttpConstant.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/HttpConstant.java @@ -28,6 +28,7 @@ public abstract class HttpConstant { public static abstract class StatusCode { public static final int CODE_200 = 200; + public static final int FORBIDDEN = 403; } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/SpiderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/SpiderTest.java index 4f4a280..6b9c423 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/SpiderTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/SpiderTest.java @@ -57,6 +57,11 @@ public class SpiderTest { return Site.me().setSleepTime(0); } }).setDownloader(new Downloader() { + @Override + public void refreshComponent(Task task) { + + } + @Override public Page download(Request request, Task task) { return new Page().setRawText(""); diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java index 3aa742c..6d764a5 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java @@ -28,6 +28,11 @@ public class MockGithubDownloader implements Downloader { return page; } + @Override + public void refreshComponent(Task task) { + + } + @Override public void setThread(int threadNum) { } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java index 6055bdb..f3751d6 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java @@ -42,7 +42,12 @@ public class PhantomJSDownloader extends AbstractDownloader { this.initPhantomjsCrawlPath(); PhantomJSDownloader.phantomJsCommand = phantomJsCommand; } - + + @Override + public void refreshComponent(Task task) { + + } + /** * 新增构造函数,支持crawl.js路径自定义,因为当其他项目依赖此jar包时,runtime.exec()执行phantomjs命令时无使用法jar包中的crawl.js *
diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java
index 91e3698..7744692 100644
--- a/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java
+++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java
@@ -9,6 +9,10 @@ import us.codecraft.webmagic.selector.PlainText;
  * @author code4crafter@gmail.com
  */
 public class MockGithubDownloader implements Downloader{
+    @Override
+    public void refreshComponent(Task task) {
+
+    }
 
     private String html = "\n" +
             "\n" +
diff --git a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java
index cce293f..11b2356 100644
--- a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java
+++ b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java
@@ -59,6 +59,11 @@ public class SeleniumDownloader implements Downloader, Closeable {
 		// "/Users/Bingo/Downloads/phantomjs-1.9.7-macosx/bin/phantomjs");
 	}
 
+	@Override
+	public void refreshComponent(Task task) {
+
+	}
+
 	/**
 	 * set sleep time to wait until load success
 	 *

From 0aa2c3949d29e4c02c199eb30c7adae8f244e1ee Mon Sep 17 00:00:00 2001
From: yao 
Date: Tue, 22 Dec 2020 18:19:37 +0800
Subject: [PATCH 7/9] =?UTF-8?q?=20=E5=88=B7=E6=96=B0=E4=BB=A3=E7=90=86api?=
 =?UTF-8?q?=E9=87=8D=E6=9E=84=EF=BC=8C=E9=9C=80=E8=A6=81=E6=8F=90=E4=BE=9B?=
 =?UTF-8?q?=E6=97=A7=E4=BB=A3=E7=90=86=EF=BC=8C=E5=A6=82=E6=9E=9C=E4=BE=9D?=
 =?UTF-8?q?=E7=84=B6=E6=98=AF=E6=97=A7=E4=BB=A3=E7=90=86=EF=BC=8C=E6=89=8D?=
 =?UTF-8?q?=E8=BF=9B=E8=A1=8C=E5=88=B7=E6=96=B0=EF=BC=8C=E9=98=B2=E6=AD=A2?=
 =?UTF-8?q?=E5=BA=94=E5=BB=B6=E8=BF=9F=E5=93=8D=E5=BA=94=E9=80=A0=E6=88=90?=
 =?UTF-8?q?=E7=9A=84=E8=BF=87=E5=BA=A6=E5=88=B7=E6=96=B0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../src/main/java/us/codecraft/webmagic/Spider.java |  7 ++++++-
 .../webmagic/downloader/HttpClientDownloader.java   |  6 +++---
 .../webmagic/downloader/HttpClientGenerator.java    |  1 +
 .../us/codecraft/webmagic/proxy/ProxyProvider.java  | 13 ++++++++++++-
 .../webmagic/proxy/SimpleProxyProvider.java         |  7 ++++++-
 5 files changed, 28 insertions(+), 6 deletions(-)

diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
index d1ad6a6..bc07651 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
@@ -421,7 +421,7 @@ public class Spider implements Runnable, Task {
             }
         } else if(site.getRefreshCode().contains(page.getStatusCode())) {
             logger.info("page status code error, page {} , code: {}, start refresh downloader", request.getUrl(), page.getStatusCode());
-            downloader.refreshComponent(this);
+            failHandler(request);
         }else {
             logger.info("page status code error, page {} , code: {}", request.getUrl(), page.getStatusCode());
         }
@@ -430,6 +430,11 @@ public class Spider implements Runnable, Task {
     }
 
     private void onDownloaderFail(Request request) {
+       failHandler(request);
+    }
+
+    private void failHandler(Request request){
+        downloader.refreshComponent(this);
         if (site.getCycleRetryTimes() == 0) {
             sleep(site.getSleepTime());
         } else {
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
index eed49fb..ace8175 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
@@ -54,7 +54,7 @@ public class HttpClientDownloader extends AbstractDownloader {
         this.refreshClientOnError = clientOnError;
     }
     public void setRefreshProxyOnError(Predicate proxyOnError) {
-        this.refreshProxyOnError = refreshProxyOnError;
+        this.refreshProxyOnError = proxyOnError;
     }
 
     public void setHttpUriRequestConverter(HttpUriRequestConverter httpUriRequestConverter) {
@@ -94,7 +94,7 @@ public class HttpClientDownloader extends AbstractDownloader {
             logger.warn("download page {} error", request.getUrl(), e);
             onError(request, e, proxyProvider);
             if (proxyProvider != null  && refreshProxyOnError.test(e)) {
-                proxyProvider.refreshProxy(task);
+                proxyProvider.refreshProxy(task,proxy);
             }
             if(refreshClientOnError.test(e)) {
                 httpClients.remove(task.getSite().getDomain());
@@ -115,7 +115,7 @@ public class HttpClientDownloader extends AbstractDownloader {
     @Override
     public void refreshComponent(Task task) {
         if (proxyProvider != null ) {
-            proxyProvider.refreshProxy(task);
+            proxyProvider.refreshProxy(task,proxyProvider.getCurrentProxy(task));
         }
 
             httpClients.remove(task.getSite().getDomain());
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java
index 1f20c5a..2d27b79 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java
@@ -143,6 +143,7 @@ public class HttpClientGenerator {
         SocketConfig.Builder socketConfigBuilder = SocketConfig.custom();
         socketConfigBuilder.setSoKeepAlive(true).setTcpNoDelay(true);
         socketConfigBuilder.setSoTimeout(site.getTimeOut());
+
         SocketConfig socketConfig = socketConfigBuilder.build();
         httpClientBuilder.setDefaultSocketConfig(socketConfig);
         connectionManager.setDefaultSocketConfig(socketConfig);
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java
index da3bec9..b4e7b48 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java
@@ -23,8 +23,19 @@ public interface ProxyProvider {
      *  代理IP是珍贵资源,有可能代理提供者内部代理没有过期,就一直提供某个IP,但这个IP又不可以使用,所以提供一种方式通知提供者,这个代理该刷新了
      *
      * @param task  下载任务
+     * @param proxy 需要对代理进行验证,如果确实持有的时错误代理,则刷新,否则,继续执行
      */
-    void refreshProxy(Task task);
+    void refreshProxy(Task task,Proxy proxy);
+
+
+    /**
+     *
+     * 获取当前正在提供的代理
+     *
+     * @param task
+     * @return
+     */
+    Proxy getCurrentProxy(Task task);
 
     /**
      * Get a proxy for task by some strategy.
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java
index fd80b30..8ad9ce7 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java
@@ -31,7 +31,12 @@ public class SimpleProxyProvider implements ProxyProvider {
     }
 
     @Override
-    public void refreshProxy(Task task) {
+    public Proxy getCurrentProxy(Task task) {
+        return null;
+    }
+
+    @Override
+    public void refreshProxy(Task task,Proxy proxy) {
 
     }
 

From 33906e36f48588f8d1a44331d1a21fbcd3a5f9d7 Mon Sep 17 00:00:00 2001
From: yao 
Date: Tue, 29 Dec 2020 16:18:43 +0800
Subject: [PATCH 8/9] =?UTF-8?q?=20=E4=BB=A3=E7=90=86=E5=8A=9F=E8=83=BD?=
 =?UTF-8?q?=E6=89=A9=E5=B1=95=EF=BC=8C=E5=AF=B9=E5=8E=9F=E4=BB=A3=E7=90=86?=
 =?UTF-8?q?=E6=8F=90=E4=BE=9B=E5=95=86=E8=BF=9B=E8=A1=8C=E6=8B=86=E5=88=86?=
 =?UTF-8?q?=EF=BC=8C=E5=8A=A0=E5=85=A5lombok?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 webmagic-core/pom.xml                         |   9 +-
 .../java/us/codecraft/webmagic/Spider.java    |   2 +-
 .../downloader/HttpClientDownloader.java      |  15 +-
 .../AbstractRefreshableProxyProvider.java     | 135 ++++++++++++++++++
 .../webmagic/proxy/ExpirableProxy.java        |  34 +++++
 .../us/codecraft/webmagic/proxy/Proxy.java    |  66 ++++-----
 .../webmagic/proxy/ProxyProvider.java         |  27 ----
 .../proxy/RefreshableProxyProvider.java       |  30 ++++
 .../proxy/ReturnableProxyProvider.java        |  22 +++
 .../webmagic/proxy/SimpleProxyProvider.java   |  15 --
 10 files changed, 273 insertions(+), 82 deletions(-)
 create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/proxy/AbstractRefreshableProxyProvider.java
 create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ExpirableProxy.java
 create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/proxy/RefreshableProxyProvider.java
 create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ReturnableProxyProvider.java

diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml
index 4b89cac..0cea05f 100644
--- a/webmagic-core/pom.xml
+++ b/webmagic-core/pom.xml
@@ -1,5 +1,6 @@
 
-
+
     
         us.codecraft
         webmagic-parent
@@ -24,6 +25,12 @@
             org.apache.commons
             commons-lang3
         
+        
+            org.projectlombok
+            lombok
+            1.18.10
+            provided
+        
 
         
             us.codecraft
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
index bc07651..dfca9dd 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
@@ -421,6 +421,7 @@ public class Spider implements Runnable, Task {
             }
         } else if(site.getRefreshCode().contains(page.getStatusCode())) {
             logger.info("page status code error, page {} , code: {}, start refresh downloader", request.getUrl(), page.getStatusCode());
+            downloader.refreshComponent(this);
             failHandler(request);
         }else {
             logger.info("page status code error, page {} , code: {}", request.getUrl(), page.getStatusCode());
@@ -434,7 +435,6 @@ public class Spider implements Runnable, Task {
     }
 
     private void failHandler(Request request){
-        downloader.refreshComponent(this);
         if (site.getCycleRetryTimes() == 0) {
             sleep(site.getSleepTime());
         } else {
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
index ace8175..8e8676d 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
@@ -13,6 +13,8 @@ import us.codecraft.webmagic.Site;
 import us.codecraft.webmagic.Task;
 import us.codecraft.webmagic.proxy.Proxy;
 import us.codecraft.webmagic.proxy.ProxyProvider;
+import us.codecraft.webmagic.proxy.RefreshableProxyProvider;
+import us.codecraft.webmagic.proxy.ReturnableProxyProvider;
 import us.codecraft.webmagic.selector.PlainText;
 import us.codecraft.webmagic.utils.CharsetUtils;
 import us.codecraft.webmagic.utils.HttpClientUtils;
@@ -93,8 +95,8 @@ public class HttpClientDownloader extends AbstractDownloader {
         } catch (IOException e) {
             logger.warn("download page {} error", request.getUrl(), e);
             onError(request, e, proxyProvider);
-            if (proxyProvider != null  && refreshProxyOnError.test(e)) {
-                proxyProvider.refreshProxy(task,proxy);
+            if (proxyProvider != null && proxy != null && proxyProvider instanceof RefreshableProxyProvider && refreshProxyOnError.test(e)) {
+                ((RefreshableProxyProvider)proxyProvider).refreshProxy(task,proxy);
             }
             if(refreshClientOnError.test(e)) {
                 httpClients.remove(task.getSite().getDomain());
@@ -105,8 +107,9 @@ public class HttpClientDownloader extends AbstractDownloader {
                 //ensure the connection is released back to pool
                 EntityUtils.consumeQuietly(httpResponse.getEntity());
             }
-            if (proxyProvider != null && proxy != null) {
-                proxyProvider.returnProxy(proxy, page, task);
+            if (proxyProvider != null && proxy != null && proxyProvider instanceof ReturnableProxyProvider) {
+                ((ReturnableProxyProvider) proxyProvider).returnProxy(proxy, page, task);
+
             }
         }
     }
@@ -114,8 +117,8 @@ public class HttpClientDownloader extends AbstractDownloader {
 
     @Override
     public void refreshComponent(Task task) {
-        if (proxyProvider != null ) {
-            proxyProvider.refreshProxy(task,proxyProvider.getCurrentProxy(task));
+        if (proxyProvider != null && proxyProvider instanceof RefreshableProxyProvider) {
+            ((RefreshableProxyProvider) proxyProvider).refreshProxy(task, ((RefreshableProxyProvider) proxyProvider).getCurrentProxy(task));
         }
 
             httpClients.remove(task.getSite().getDomain());
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/AbstractRefreshableProxyProvider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/AbstractRefreshableProxyProvider.java
new file mode 100644
index 0000000..8e7cb08
--- /dev/null
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/AbstractRefreshableProxyProvider.java
@@ -0,0 +1,135 @@
+package us.codecraft.webmagic.proxy;
+
+import lombok.extern.slf4j.Slf4j;
+import us.codecraft.webmagic.Task;
+
+import java.math.BigDecimal;
+import java.math.RoundingMode;
+import java.util.Comparator;
+import java.util.concurrent.*;
+import java.util.concurrent.atomic.AtomicReference;
+import java.util.concurrent.atomic.LongAdder;
+
+/**
+ * @author yaoqiang
+ * 可刷新的代理提供商抽象实现
+ */
+@Slf4j
+public abstract class AbstractRefreshableProxyProvider implements RefreshableProxyProvider {
+
+    private final LongAdder totalGet = new LongAdder();
+
+    private final LongAdder canUse = new LongAdder();
+
+    private final AtomicReference> usedProxyCache = new AtomicReference<>();
+
+    private final PriorityBlockingQueue ipQueue = new PriorityBlockingQueue<>(1000, Comparator.comparing(ExpirableProxy::getExpireTime));
+
+    private final int maxHostNum;
+
+    public AbstractRefreshableProxyProvider(int maxHostNum) {
+        this.maxHostNum = maxHostNum;
+    }
+
+    protected void doPut(ExpirableProxy expirableProxy) {
+        synchronized (ipQueue) {
+            if (ipQueue.size() <= maxHostNum) {
+                ipQueue.put(expirableProxy);
+            }
+        }
+    }
+
+    @Override
+    public void refreshProxy(Task task, Proxy proxy) {
+        if (proxy != null) {
+            FutureTask proxyFutureTask = usedProxyCache.get();
+            Proxy currentProxy = getCurrentProxy(task);
+            // 如果在出错到这里的过程中,usedProxyCache被更新过,proxy 就不可能相等,如果依然相等,说明没有更新过
+            // 可能没有使用代理的情况
+            if (proxy.equals(currentProxy)) {
+                // 如果此时依然没有更新过,就设置为空
+                usedProxyCache.compareAndSet(proxyFutureTask, null);
+            }
+        }
+    }
+
+    @Override
+    public Proxy getCurrentProxy(Task task) {
+        FutureTask cache = usedProxyCache.get();
+        Proxy currentProxy = null;
+        try {
+            if (cache != null)
+                currentProxy = cache.get(5, TimeUnit.SECONDS);
+        } catch (InterruptedException e) {
+            e.printStackTrace();
+            log.error(e.getMessage(), e);
+            Thread.currentThread().interrupt();
+        } catch (ExecutionException e) {
+            e.printStackTrace();
+            log.error(e.getCause().getMessage(), e);
+        } catch (TimeoutException e) {
+            log.error(e.getMessage(), e);
+            e.printStackTrace();
+        }
+        return currentProxy;
+    }
+
+
+    private FutureTask buildCacheTask() {
+        return new FutureTask<>(this::doGet);
+    }
+
+
+    /**
+     * 特别注意,防止活锁,集cache中总是抛出异常,那么将无限循环,无限报错
+     *
+     * @param task 下载任务
+     * @return 返回代理
+     */
+    @Override
+    public Proxy getProxy(Task task) {
+        while (!Thread.currentThread().isInterrupted()) {
+            FutureTask cache = usedProxyCache.get();
+            if (cache == null) {
+                FutureTask futureTask = buildCacheTask();
+                if (usedProxyCache.compareAndSet(null, futureTask)) {
+                    cache = futureTask;
+                    futureTask.run();
+                } else {
+                    // 交换失败,需要更新到最新数据
+                    cache = usedProxyCache.get();
+                }
+            }
+            try {
+                if (cache != null) {
+
+                    ExpirableProxy proxy = (ExpirableProxy) cache.get(5, TimeUnit.SECONDS);
+                    if (!proxy.isExpire())
+                        return proxy;
+                }
+                usedProxyCache.compareAndSet(cache, null);
+            } catch (InterruptedException e) {
+                Thread.currentThread().interrupt();
+                log.error(e.getMessage(), e);
+                usedProxyCache.compareAndSet(cache, null);
+            } catch (ExecutionException e) {
+                log.error(e.getMessage(), e);
+                usedProxyCache.compareAndSet(cache, null);
+            } catch (TimeoutException e) {
+                log.error(e.getMessage(), e);
+            }
+        }
+        return null;
+    }
+
+    private Proxy doGet() throws InterruptedException {
+        ExpirableProxy proxy;
+        do {
+            proxy = ipQueue.take();
+        } while (proxy.isExpire());
+        log.info("切换到proxy:ip:{},port:{},ip可用率:{}", proxy.getHost(), proxy.getPort(), BigDecimal.valueOf(canUse.sum()).divide(BigDecimal.valueOf(totalGet.sum()), 2, RoundingMode.HALF_DOWN).doubleValue());
+        return proxy;
+    }
+
+
+}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ExpirableProxy.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ExpirableProxy.java
new file mode 100644
index 0000000..f23caaf
--- /dev/null
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ExpirableProxy.java
@@ -0,0 +1,34 @@
+package us.codecraft.webmagic.proxy;
+
+import org.apache.http.annotation.Contract;
+import org.apache.http.annotation.ThreadingBehavior;
+
+import java.time.LocalDateTime;
+import java.time.temporal.ChronoUnit;
+
+/**
+ * @author yaoqiang
+ *
+ * 可以过期的代理
+ */
+@Contract(threading = ThreadingBehavior.IMMUTABLE_CONDITIONAL)
+public class ExpirableProxy extends Proxy {
+    private final int ttl;
+    private final LocalDateTime expireTime;
+
+
+    public ExpirableProxy(String host, int port, int ttl, ChronoUnit chronoUnit) {
+        super(host, port);
+        this.ttl = ttl;
+        this.expireTime = LocalDateTime.now().plus(ttl, chronoUnit);
+
+    }
+
+    public boolean isExpire() {
+        return LocalDateTime.now().isAfter(expireTime);
+    }
+    public LocalDateTime getExpireTime(){
+        return expireTime;
+    }
+
+}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java
index 6554fab..ffae4be 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java
@@ -6,33 +6,30 @@ import java.net.URISyntaxException;
 import java.net.URLEncoder;
 import java.nio.charset.StandardCharsets;
 
+import jdk.nashorn.internal.ir.annotations.Immutable;
 import org.apache.commons.lang3.StringUtils;
+import org.apache.http.annotation.Contract;
+import org.apache.http.annotation.ThreadingBehavior;
 
+@Contract(threading = ThreadingBehavior.IMMUTABLE)
 public class Proxy {
 
-    private String scheme;
+    private final String scheme;
 
-    private String host;
+    private final String host;
 
-    private int port;
+    private final int port;
 
-    private String username;
+    private final String username;
 
-    private String password;
+    private final String password;
 
-    public static Proxy create(final URI uri) {
-        Proxy proxy = new Proxy(uri.getHost(), uri.getPort(), uri.getScheme());
-        String userInfo = uri.getUserInfo();
-        if (userInfo != null) {
-            String[] up = userInfo.split(":");
-            if (up.length == 1) {
-                proxy.username = up[0].isEmpty() ? null : up[0];
-            } else {
-                proxy.username = up[0].isEmpty() ? null : up[0];
-                proxy.password = up[1].isEmpty() ? null : up[1];
-            }
-        }
-        return proxy;
+    public Proxy(String host, int port, String scheme, String username, String password) {
+        this.scheme = scheme;
+        this.host = host;
+        this.port = port;
+        this.username = username;
+        this.password = password;
     }
 
     public Proxy(String host, int port) {
@@ -40,27 +37,30 @@ public class Proxy {
     }
 
     public Proxy(String host, int port, String scheme) {
-        this.host = host;
-        this.port = port;
-        this.scheme = scheme;
+        this(host, port, scheme, null, null);
     }
 
     public Proxy(String host, int port, String username, String password) {
-        this.host = host;
-        this.port = port;
-        this.username = username;
-        this.password = password;
+        this(host, port, null, username, password);
     }
 
-    public String getScheme() {
-        return scheme;
+    public static Proxy create(final URI uri) {
+        String userInfo = uri.getUserInfo();
+        String username = null;
+        String password = null;
+        if (userInfo != null) {
+            String[] up = userInfo.split(":");
+            if (up.length == 1) {
+                username = up[0].isEmpty() ? null : up[0];
+            } else {
+                username = up[0].isEmpty() ? null : up[0];
+                password = up[1].isEmpty() ? null : up[1];
+            }
+        }
+        return new Proxy(uri.getHost(), uri.getPort(), uri.getScheme(), username, password);
     }
 
-    public void setScheme(String scheme) {
-        this.scheme = scheme;
-    }
-
-	public String getHost() {
+    public String getHost() {
         return host;
     }
 
@@ -68,6 +68,8 @@ public class Proxy {
         return port;
     }
 
+    public String getScheme(){return scheme;}
+
     public String getUsername() {
         return username;
     }
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java
index b4e7b48..b567d58 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java
@@ -1,6 +1,5 @@
 package us.codecraft.webmagic.proxy;
 
-import us.codecraft.webmagic.Page;
 import us.codecraft.webmagic.Task;
 
 /**
@@ -10,32 +9,6 @@ import us.codecraft.webmagic.Task;
  */
 public interface ProxyProvider {
 
-    /**
-     *
-     * Return proxy to Provider when complete a download.
-     * @param proxy the proxy config contains host,port and identify info
-     * @param page the download result
-     * @param task the download task
-     */
-    void returnProxy(Proxy proxy, Page page, Task task);
-
-    /**
-     *  代理IP是珍贵资源,有可能代理提供者内部代理没有过期,就一直提供某个IP,但这个IP又不可以使用,所以提供一种方式通知提供者,这个代理该刷新了
-     *
-     * @param task  下载任务
-     * @param proxy 需要对代理进行验证,如果确实持有的时错误代理,则刷新,否则,继续执行
-     */
-    void refreshProxy(Task task,Proxy proxy);
-
-
-    /**
-     *
-     * 获取当前正在提供的代理
-     *
-     * @param task
-     * @return
-     */
-    Proxy getCurrentProxy(Task task);
 
     /**
      * Get a proxy for task by some strategy.
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/RefreshableProxyProvider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/RefreshableProxyProvider.java
new file mode 100644
index 0000000..77e1ce2
--- /dev/null
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/RefreshableProxyProvider.java
@@ -0,0 +1,30 @@
+package us.codecraft.webmagic.proxy;
+
+import us.codecraft.webmagic.Task;
+
+/**
+ * @author yaoqiang
+ *
+ * 可以手动刷新的代理供应商
+ */
+public interface RefreshableProxyProvider extends ProxyProvider{
+
+    /**
+     *  代理IP是珍贵资源,有可能代理提供者内部代理没有过期,就一直提供某个IP,但这个IP又不可以使用,所以提供一种方式通知提供者,这个代理该刷新了
+     *
+     * @param task  爬虫任务
+     * @param proxy 需要对代理进行验证,如果确实持有的时错误代理,则刷新,否则,继续执行
+     */
+    void refreshProxy(Task task,Proxy proxy);
+
+
+    /**
+     *
+     * 获取当前正在提供的代理
+     *
+     * @param task 工作中的爬虫任务
+     * @return 获取当前正在使用的代理
+     */
+    Proxy getCurrentProxy(Task task);
+
+}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ReturnableProxyProvider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ReturnableProxyProvider.java
new file mode 100644
index 0000000..43b49fc
--- /dev/null
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ReturnableProxyProvider.java
@@ -0,0 +1,22 @@
+package us.codecraft.webmagic.proxy;
+
+import us.codecraft.webmagic.Page;
+import us.codecraft.webmagic.Task;
+
+/**
+ * @author yaoqiang
+ *
+ * 可归还的代理提供商,代理被取出后,实用完成,可以归还给代理提供商
+ */
+public interface ReturnableProxyProvider {
+
+    /**
+     *
+     * Return proxy to Provider when complete a download.
+     * @param proxy the proxy config contains host,port and identify info
+     * @param page the download result
+     * @param task the download task
+     */
+    void returnProxy(Proxy proxy, Page page, Task task);
+
+}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java
index 8ad9ce7..fda3e23 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java
@@ -1,6 +1,5 @@
 package us.codecraft.webmagic.proxy;
 
-import us.codecraft.webmagic.Page;
 import us.codecraft.webmagic.Task;
 
 import java.util.ArrayList;
@@ -30,15 +29,6 @@ public class SimpleProxyProvider implements ProxyProvider {
         this.pointer = pointer;
     }
 
-    @Override
-    public Proxy getCurrentProxy(Task task) {
-        return null;
-    }
-
-    @Override
-    public void refreshProxy(Task task,Proxy proxy) {
-
-    }
 
     public static SimpleProxyProvider from(Proxy... proxies) {
         List proxiesTemp = new ArrayList(proxies.length);
@@ -48,11 +38,6 @@ public class SimpleProxyProvider implements ProxyProvider {
         return new SimpleProxyProvider(Collections.unmodifiableList(proxiesTemp));
     }
 
-    @Override
-    public void returnProxy(Proxy proxy, Page page, Task task) {
-        //Donothing
-    }
-
     @Override
     public Proxy getProxy(Task task) {
         return proxies.get(incrForLoop());

From f68795d7dd1ad3202a59ab9d49030065992001b1 Mon Sep 17 00:00:00 2001
From: yao 
Date: Tue, 29 Dec 2020 16:54:38 +0800
Subject: [PATCH 9/9] =?UTF-8?q?=20bug=E4=BF=AE=E6=94=B9=EF=BC=8C=E5=AF=B9?=
 =?UTF-8?q?=E7=BB=93=E6=9E=9C=E6=8F=90=E4=BE=9B=E7=BC=93=E5=AD=98=E8=83=BD?=
 =?UTF-8?q?=E5=8A=9B?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../webmagic/pipeline/CachePipeline.java      | 18 ++++
 .../pipeline/CloseableCachePipeline.java      | 87 +++++++++++++++++++
 .../AbstractRefreshableProxyProvider.java     | 22 ++---
 .../webmagic/proxy/ExpirableProxy.java        |  2 +
 4 files changed, 112 insertions(+), 17 deletions(-)
 create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/CachePipeline.java
 create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/CloseableCachePipeline.java

diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/CachePipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/CachePipeline.java
new file mode 100644
index 0000000..e0acd90
--- /dev/null
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/CachePipeline.java
@@ -0,0 +1,18 @@
+package us.codecraft.webmagic.pipeline;
+
+import java.util.Collection;
+
+/**
+ * @author yaoqiang
+ *
+ * 为pipeline提供缓存能力
+ * 在某个时机执行批处理任务
+ */
+public interface CachePipeline extends Pipeline{
+
+    /**
+     * @param collection  缓存批处理
+     *
+     */
+    void process(Collection collection);
+}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/CloseableCachePipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/CloseableCachePipeline.java
new file mode 100644
index 0000000..4ba433e
--- /dev/null
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/CloseableCachePipeline.java
@@ -0,0 +1,87 @@
+package us.codecraft.webmagic.pipeline;
+
+import lombok.extern.slf4j.Slf4j;
+import us.codecraft.webmagic.ResultItems;
+import us.codecraft.webmagic.Task;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.concurrent.ArrayBlockingQueue;
+import java.util.concurrent.BlockingQueue;
+import java.util.concurrent.ExecutorService;
+
+/**
+ * @author yaoqiang
+ * 提供关闭时刷新能力
+ * 

+ *

+ * 不负责创建 {@link ExecutorService},如果需要异步执行,那么需要从外界传入,由外界自己管理 {@link ExecutorService}生命周期 + * @see ExecutorService + */ +@Slf4j +public abstract class CloseableCachePipeline implements CachePipeline, Closeable { + + + private final BlockingQueue cache; + + private final ExecutorService executorService; + + public CloseableCachePipeline(int max, ExecutorService executorService) { + this.cache = new ArrayBlockingQueue<>(max, false); + this.executorService = executorService; + } + + public CloseableCachePipeline(int max) { + this(max, null); + } + + /** + * @param resultItems 接收到的信息 + * @param task 执行的任务 + */ + @Override + public final void process(ResultItems resultItems, Task task) { + try { + cache.put(resultItems); + } catch (InterruptedException e) { + e.printStackTrace(); + Thread.currentThread().interrupt(); + log.error(e.getMessage(), e); + } + if (cache.remainingCapacity() == 0) { + // set 中的resultItem 使用权依然传递了出去,cache的使用全保留,考虑到后面也用不上 resultItem,所以发布出去问题也不大 + // temp的修改权限被发布,理由同上,想添加,删除都可以,反正以后也用不上了; + Set temp = new HashSet<>(cache); + if (executorService != null && !executorService.isShutdown()) { + executorService.execute(() -> process(temp, task)); + } else { + process(temp, task); + } + cache.clear(); + } + + } + + protected abstract void process(Collection resultItems, Task task); + + private synchronized void flush(Collection resultItems) { + process(resultItems, null); + + } + + /** + * 结合源码,实现关闭时处理剩余的缓存,直接交由主线程处理 + * + * @throws IOException 关闭可能出现异常,由上层处理 + */ + @Override + public final void close() throws IOException { + if (!cache.isEmpty()) { + flush(cache); + cache.clear(); + } + } +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/AbstractRefreshableProxyProvider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/AbstractRefreshableProxyProvider.java index 8e7cb08..781553c 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/AbstractRefreshableProxyProvider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/AbstractRefreshableProxyProvider.java @@ -3,12 +3,9 @@ package us.codecraft.webmagic.proxy; import lombok.extern.slf4j.Slf4j; import us.codecraft.webmagic.Task; -import java.math.BigDecimal; -import java.math.RoundingMode; import java.util.Comparator; import java.util.concurrent.*; import java.util.concurrent.atomic.AtomicReference; -import java.util.concurrent.atomic.LongAdder; /** * @author yaoqiang @@ -17,26 +14,18 @@ import java.util.concurrent.atomic.LongAdder; @Slf4j public abstract class AbstractRefreshableProxyProvider implements RefreshableProxyProvider { - private final LongAdder totalGet = new LongAdder(); - - private final LongAdder canUse = new LongAdder(); private final AtomicReference> usedProxyCache = new AtomicReference<>(); private final PriorityBlockingQueue ipQueue = new PriorityBlockingQueue<>(1000, Comparator.comparing(ExpirableProxy::getExpireTime)); - private final int maxHostNum; - - public AbstractRefreshableProxyProvider(int maxHostNum) { - this.maxHostNum = maxHostNum; - } protected void doPut(ExpirableProxy expirableProxy) { - synchronized (ipQueue) { - if (ipQueue.size() <= maxHostNum) { - ipQueue.put(expirableProxy); - } - } + ipQueue.put(expirableProxy); + } + + protected int hostSize() { + return ipQueue.size(); } @Override @@ -127,7 +116,6 @@ public abstract class AbstractRefreshableProxyProvider implements RefreshablePro do { proxy = ipQueue.take(); } while (proxy.isExpire()); - log.info("切换到proxy:ip:{},port:{},ip可用率:{}", proxy.getHost(), proxy.getPort(), BigDecimal.valueOf(canUse.sum()).divide(BigDecimal.valueOf(totalGet.sum()), 2, RoundingMode.HALF_DOWN).doubleValue()); return proxy; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ExpirableProxy.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ExpirableProxy.java index f23caaf..d22b7dc 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ExpirableProxy.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ExpirableProxy.java @@ -1,5 +1,6 @@ package us.codecraft.webmagic.proxy; +import lombok.Getter; import org.apache.http.annotation.Contract; import org.apache.http.annotation.ThreadingBehavior; @@ -13,6 +14,7 @@ import java.time.temporal.ChronoUnit; */ @Contract(threading = ThreadingBehavior.IMMUTABLE_CONDITIONAL) public class ExpirableProxy extends Proxy { + @Getter private final int ttl; private final LocalDateTime expireTime;