diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 4b89cac..0cea05f 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -1,5 +1,6 @@ - + us.codecraft webmagic-parent @@ -24,6 +25,12 @@ org.apache.commons commons-lang3 + + org.projectlombok + lombok + 1.18.10 + provided + us.codecraft diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java index 72cc7d0..bf603b3 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java @@ -1,5 +1,6 @@ package us.codecraft.webmagic; +import com.sun.org.apache.regexp.internal.RE; import us.codecraft.webmagic.utils.HttpConstant; import java.util.*; @@ -35,8 +36,12 @@ public class Site { private static final Set DEFAULT_STATUS_CODE_SET = new HashSet(); + private static final Set DEFAULT_REFRESH_CODE_SET = new HashSet<>(); + + private Set refreshCode = DEFAULT_REFRESH_CODE_SET; private Set acceptStatCode = DEFAULT_STATUS_CODE_SET; + private Map headers = new HashMap(); private boolean useGzip = true; @@ -44,6 +49,7 @@ public class Site { private boolean disableCookieManagement = false; static { + DEFAULT_REFRESH_CODE_SET.add(HttpConstant.StatusCode.FORBIDDEN); DEFAULT_STATUS_CODE_SET.add(HttpConstant.StatusCode.CODE_200); } @@ -192,6 +198,15 @@ public class Site { return this; } + public Site setRefreshCode(Set refreshCode){ + this.refreshCode = refreshCode; + return this; + } + public Set getRefreshCode(){ + return refreshCode; + + } + /** * get acceptStatCode * diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 886e74a..aa8c5f3 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -24,6 +24,7 @@ import java.util.concurrent.ExecutorService; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicLong; +import java.util.concurrent.atomic.LongAdder; import java.util.concurrent.locks.Condition; import java.util.concurrent.locks.ReentrantLock; @@ -102,7 +103,7 @@ public class Spider implements Runnable, Task { private List spiderListeners; - private final AtomicLong pageCount = new AtomicLong(0); + private final LongAdder pageCount = new LongAdder(); private Date startTime; @@ -323,7 +324,7 @@ public class Spider implements Runnable, Task { onError(request); logger.error("process request " + request + " error", e); } finally { - pageCount.incrementAndGet(); + pageCount.increment(); signalNewUrl(); } } @@ -335,7 +336,7 @@ public class Spider implements Runnable, Task { if (destroyWhenExit) { close(); } - logger.info("Spider {} closed! {} pages downloaded.", getUUID(), pageCount.get()); + logger.info("Spider {} closed! {} pages downloaded.", getUUID(), pageCount.sumThenReset()); } protected void onError(Request request) { @@ -423,7 +424,11 @@ public class Spider implements Runnable, Task { pipeline.process(page.getResultItems(), this); } } - } else { + } else if(site.getRefreshCode().contains(page.getStatusCode())) { + logger.info("page status code error, page {} , code: {}, start refresh downloader", request.getUrl(), page.getStatusCode()); + downloader.refreshComponent(this); + failHandler(request); + }else { logger.info("page status code error, page {} , code: {}", request.getUrl(), page.getStatusCode()); } sleep(site.getSleepTime()); @@ -431,6 +436,10 @@ public class Spider implements Runnable, Task { } private void onDownloaderFail(Request request) { + failHandler(request); + } + + private void failHandler(Request request){ if (site.getCycleRetryTimes() == 0) { sleep(site.getSleepTime()); } else { @@ -650,7 +659,7 @@ public class Spider implements Runnable, Task { * @since 0.4.1 */ public long getPageCount() { - return pageCount.get(); + return pageCount.sum(); } /** diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java index c27292d..05f5686 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java @@ -3,6 +3,7 @@ package us.codecraft.webmagic.downloader; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.proxy.ProxyProvider; import us.codecraft.webmagic.selector.Html; /** @@ -38,7 +39,7 @@ public abstract class AbstractDownloader implements Downloader { protected void onSuccess(Request request) { } - protected void onError(Request request) { + protected void onError(Request request, Throwable throwable, ProxyProvider proxyProvider) { } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java index f7ced49..5095501 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java @@ -18,14 +18,18 @@ public interface Downloader { * Downloads web pages and store in Page object. * * @param request request - * @param task task + * @param task task * @return page */ - public Page download(Request request, Task task); + Page download(Request request, Task task); /** * Tell the downloader how many threads the spider used. + * * @param threadNum number of threads */ - public void setThread(int threadNum); + void setThread(int threadNum); + + + void refreshComponent(Task task); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 24889c8..8e8676d 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -13,6 +13,8 @@ import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.proxy.Proxy; import us.codecraft.webmagic.proxy.ProxyProvider; +import us.codecraft.webmagic.proxy.RefreshableProxyProvider; +import us.codecraft.webmagic.proxy.ReturnableProxyProvider; import us.codecraft.webmagic.selector.PlainText; import us.codecraft.webmagic.utils.CharsetUtils; import us.codecraft.webmagic.utils.HttpClientUtils; @@ -21,6 +23,8 @@ import java.io.IOException; import java.nio.charset.Charset; import java.util.HashMap; import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; +import java.util.function.Predicate; /** @@ -31,17 +35,29 @@ import java.util.Map; */ public class HttpClientDownloader extends AbstractDownloader { - private Logger logger = LoggerFactory.getLogger(getClass()); - - private final Map httpClients = new HashMap(); - - private HttpClientGenerator httpClientGenerator = new HttpClientGenerator(); + private final Map httpClients = new ConcurrentHashMap<>(); + private final Logger logger = LoggerFactory.getLogger(getClass()); + private final HttpClientGenerator httpClientGenerator = new HttpClientGenerator(); private HttpUriRequestConverter httpUriRequestConverter = new HttpUriRequestConverter(); - + private ProxyProvider proxyProvider; - private boolean responseHeader = true; + private final boolean responseHeader = true; + + + private Predicate refreshProxyOnError = t -> false; + + + private Predicate refreshClientOnError = t -> false; + + + public void setRefreshClientOnError(Predicate clientOnError){ + this.refreshClientOnError = clientOnError; + } + public void setRefreshProxyOnError(Predicate proxyOnError) { + this.refreshProxyOnError = proxyOnError; + } public void setHttpUriRequestConverter(HttpUriRequestConverter httpUriRequestConverter) { this.httpUriRequestConverter = httpUriRequestConverter; @@ -56,17 +72,8 @@ public class HttpClientDownloader extends AbstractDownloader { return httpClientGenerator.getClient(null); } String domain = site.getDomain(); - CloseableHttpClient httpClient = httpClients.get(domain); - if (httpClient == null) { - synchronized (this) { - httpClient = httpClients.get(domain); - if (httpClient == null) { - httpClient = httpClientGenerator.getClient(site); - httpClients.put(domain, httpClient); - } - } - } - return httpClient; + return httpClients.computeIfAbsent(domain,k->httpClientGenerator.getClient(site)); + } @Override @@ -87,19 +94,37 @@ public class HttpClientDownloader extends AbstractDownloader { return page; } catch (IOException e) { logger.warn("download page {} error", request.getUrl(), e); - onError(request); + onError(request, e, proxyProvider); + if (proxyProvider != null && proxy != null && proxyProvider instanceof RefreshableProxyProvider && refreshProxyOnError.test(e)) { + ((RefreshableProxyProvider)proxyProvider).refreshProxy(task,proxy); + } + if(refreshClientOnError.test(e)) { + httpClients.remove(task.getSite().getDomain()); + } return page; } finally { if (httpResponse != null) { //ensure the connection is released back to pool EntityUtils.consumeQuietly(httpResponse.getEntity()); } - if (proxyProvider != null && proxy != null) { - proxyProvider.returnProxy(proxy, page, task); + if (proxyProvider != null && proxy != null && proxyProvider instanceof ReturnableProxyProvider) { + ((ReturnableProxyProvider) proxyProvider).returnProxy(proxy, page, task); + } } } + + @Override + public void refreshComponent(Task task) { + if (proxyProvider != null && proxyProvider instanceof RefreshableProxyProvider) { + ((RefreshableProxyProvider) proxyProvider).refreshProxy(task, ((RefreshableProxyProvider) proxyProvider).getCurrentProxy(task)); + } + + httpClients.remove(task.getSite().getDomain()); + + } + @Override public void setThread(int thread) { httpClientGenerator.setPoolSize(thread); @@ -110,7 +135,7 @@ public class HttpClientDownloader extends AbstractDownloader { String contentType = httpResponse.getEntity().getContentType() == null ? "" : httpResponse.getEntity().getContentType().getValue(); Page page = new Page(); page.setBytes(bytes); - if (!request.isBinaryContent()){ + if (!request.isBinaryContent()) { if (charset == null) { charset = getHtmlCharset(contentType, bytes); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java index 80e0f10..2d27b79 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java @@ -1,13 +1,17 @@ package us.codecraft.webmagic.downloader; +import java.io.File; import java.io.IOException; import java.security.KeyManagementException; +import java.security.KeyStore; +import java.security.KeyStoreException; import java.security.NoSuchAlgorithmException; import java.security.cert.CertificateException; import java.security.cert.X509Certificate; import java.util.Map; import javax.net.ssl.SSLContext; +import javax.net.ssl.SSLContextSpi; import javax.net.ssl.TrustManager; import javax.net.ssl.X509TrustManager; @@ -24,6 +28,7 @@ import org.apache.http.conn.socket.ConnectionSocketFactory; import org.apache.http.conn.socket.PlainConnectionSocketFactory; import org.apache.http.conn.ssl.DefaultHostnameVerifier; import org.apache.http.conn.ssl.SSLConnectionSocketFactory; +import org.apache.http.conn.ssl.TrustSelfSignedStrategy; import org.apache.http.impl.client.BasicCookieStore; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.DefaultHttpRequestRetryHandler; @@ -32,6 +37,7 @@ import org.apache.http.impl.client.HttpClients; import org.apache.http.impl.conn.PoolingHttpClientConnectionManager; import org.apache.http.impl.cookie.BasicClientCookie; import org.apache.http.protocol.HttpContext; +import org.apache.http.ssl.SSLContexts; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -69,7 +75,7 @@ public class HttpClientGenerator { return new SSLConnectionSocketFactory(sslContext, supportedProtocols, null, new DefaultHostnameVerifier()); // 优先绕过安全证书 - } catch (KeyManagementException e) { + } catch (KeyManagementException | CertificateException | KeyStoreException | IOException e) { logger.error("ssl connection fail", e); } catch (NoSuchAlgorithmException e) { logger.error("ssl connection fail", e); @@ -77,8 +83,8 @@ public class HttpClientGenerator { return SSLConnectionSocketFactory.getSocketFactory(); } - private SSLContext createIgnoreVerifySSL() throws NoSuchAlgorithmException, KeyManagementException { - // 实现一个X509TrustManager接口,用于绕过验证,不用修改里面的方法 + private SSLContext createIgnoreVerifySSL() throws NoSuchAlgorithmException, KeyManagementException, CertificateException, KeyStoreException, IOException { +// 实现一个X509TrustManager接口,用于绕过验证,不用修改里面的方法 X509TrustManager trustManager = new X509TrustManager() { @Override @@ -96,10 +102,10 @@ public class HttpClientGenerator { }; - SSLContext sc = SSLContext.getInstance("TLS"); + SSLContext sc = SSLContext.getInstance("SSLv3"); sc.init(null, new TrustManager[] { trustManager }, null); return sc; - } + } public HttpClientGenerator setPoolSize(int poolSize) { connectionManager.setMaxTotal(poolSize); @@ -137,6 +143,7 @@ public class HttpClientGenerator { SocketConfig.Builder socketConfigBuilder = SocketConfig.custom(); socketConfigBuilder.setSoKeepAlive(true).setTcpNoDelay(true); socketConfigBuilder.setSoTimeout(site.getTimeOut()); + SocketConfig socketConfig = socketConfigBuilder.build(); httpClientBuilder.setDefaultSocketConfig(socketConfig); connectionManager.setDefaultSocketConfig(socketConfig); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/CachePipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/CachePipeline.java new file mode 100644 index 0000000..e0acd90 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/CachePipeline.java @@ -0,0 +1,18 @@ +package us.codecraft.webmagic.pipeline; + +import java.util.Collection; + +/** + * @author yaoqiang + * + * 为pipeline提供缓存能力 + * 在某个时机执行批处理任务 + */ +public interface CachePipeline extends Pipeline{ + + /** + * @param collection 缓存批处理 + * + */ + void process(Collection collection); +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/CloseableCachePipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/CloseableCachePipeline.java new file mode 100644 index 0000000..4ba433e --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/CloseableCachePipeline.java @@ -0,0 +1,87 @@ +package us.codecraft.webmagic.pipeline; + +import lombok.extern.slf4j.Slf4j; +import us.codecraft.webmagic.ResultItems; +import us.codecraft.webmagic.Task; + +import java.io.Closeable; +import java.io.IOException; +import java.util.Collection; +import java.util.HashSet; +import java.util.Set; +import java.util.concurrent.ArrayBlockingQueue; +import java.util.concurrent.BlockingQueue; +import java.util.concurrent.ExecutorService; + +/** + * @author yaoqiang + * 提供关闭时刷新能力 + *

+ *

+ * 不负责创建 {@link ExecutorService},如果需要异步执行,那么需要从外界传入,由外界自己管理 {@link ExecutorService}生命周期 + * @see ExecutorService + */ +@Slf4j +public abstract class CloseableCachePipeline implements CachePipeline, Closeable { + + + private final BlockingQueue cache; + + private final ExecutorService executorService; + + public CloseableCachePipeline(int max, ExecutorService executorService) { + this.cache = new ArrayBlockingQueue<>(max, false); + this.executorService = executorService; + } + + public CloseableCachePipeline(int max) { + this(max, null); + } + + /** + * @param resultItems 接收到的信息 + * @param task 执行的任务 + */ + @Override + public final void process(ResultItems resultItems, Task task) { + try { + cache.put(resultItems); + } catch (InterruptedException e) { + e.printStackTrace(); + Thread.currentThread().interrupt(); + log.error(e.getMessage(), e); + } + if (cache.remainingCapacity() == 0) { + // set 中的resultItem 使用权依然传递了出去,cache的使用全保留,考虑到后面也用不上 resultItem,所以发布出去问题也不大 + // temp的修改权限被发布,理由同上,想添加,删除都可以,反正以后也用不上了; + Set temp = new HashSet<>(cache); + if (executorService != null && !executorService.isShutdown()) { + executorService.execute(() -> process(temp, task)); + } else { + process(temp, task); + } + cache.clear(); + } + + } + + protected abstract void process(Collection resultItems, Task task); + + private synchronized void flush(Collection resultItems) { + process(resultItems, null); + + } + + /** + * 结合源码,实现关闭时处理剩余的缓存,直接交由主线程处理 + * + * @throws IOException 关闭可能出现异常,由上层处理 + */ + @Override + public final void close() throws IOException { + if (!cache.isEmpty()) { + flush(cache); + cache.clear(); + } + } +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/AbstractRefreshableProxyProvider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/AbstractRefreshableProxyProvider.java new file mode 100644 index 0000000..781553c --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/AbstractRefreshableProxyProvider.java @@ -0,0 +1,123 @@ +package us.codecraft.webmagic.proxy; + +import lombok.extern.slf4j.Slf4j; +import us.codecraft.webmagic.Task; + +import java.util.Comparator; +import java.util.concurrent.*; +import java.util.concurrent.atomic.AtomicReference; + +/** + * @author yaoqiang + * 可刷新的代理提供商抽象实现 + */ +@Slf4j +public abstract class AbstractRefreshableProxyProvider implements RefreshableProxyProvider { + + + private final AtomicReference> usedProxyCache = new AtomicReference<>(); + + private final PriorityBlockingQueue ipQueue = new PriorityBlockingQueue<>(1000, Comparator.comparing(ExpirableProxy::getExpireTime)); + + + protected void doPut(ExpirableProxy expirableProxy) { + ipQueue.put(expirableProxy); + } + + protected int hostSize() { + return ipQueue.size(); + } + + @Override + public void refreshProxy(Task task, Proxy proxy) { + if (proxy != null) { + FutureTask proxyFutureTask = usedProxyCache.get(); + Proxy currentProxy = getCurrentProxy(task); + // 如果在出错到这里的过程中,usedProxyCache被更新过,proxy 就不可能相等,如果依然相等,说明没有更新过 + // 可能没有使用代理的情况 + if (proxy.equals(currentProxy)) { + // 如果此时依然没有更新过,就设置为空 + usedProxyCache.compareAndSet(proxyFutureTask, null); + } + } + } + + @Override + public Proxy getCurrentProxy(Task task) { + FutureTask cache = usedProxyCache.get(); + Proxy currentProxy = null; + try { + if (cache != null) + currentProxy = cache.get(5, TimeUnit.SECONDS); + } catch (InterruptedException e) { + e.printStackTrace(); + log.error(e.getMessage(), e); + Thread.currentThread().interrupt(); + } catch (ExecutionException e) { + e.printStackTrace(); + log.error(e.getCause().getMessage(), e); + } catch (TimeoutException e) { + log.error(e.getMessage(), e); + e.printStackTrace(); + } + return currentProxy; + } + + + private FutureTask buildCacheTask() { + return new FutureTask<>(this::doGet); + } + + + /** + * 特别注意,防止活锁,集cache中总是抛出异常,那么将无限循环,无限报错 + * + * @param task 下载任务 + * @return 返回代理 + */ + @Override + public Proxy getProxy(Task task) { + while (!Thread.currentThread().isInterrupted()) { + FutureTask cache = usedProxyCache.get(); + if (cache == null) { + FutureTask futureTask = buildCacheTask(); + if (usedProxyCache.compareAndSet(null, futureTask)) { + cache = futureTask; + futureTask.run(); + } else { + // 交换失败,需要更新到最新数据 + cache = usedProxyCache.get(); + } + } + try { + if (cache != null) { + + ExpirableProxy proxy = (ExpirableProxy) cache.get(5, TimeUnit.SECONDS); + if (!proxy.isExpire()) + return proxy; + } + usedProxyCache.compareAndSet(cache, null); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + log.error(e.getMessage(), e); + usedProxyCache.compareAndSet(cache, null); + } catch (ExecutionException e) { + log.error(e.getMessage(), e); + usedProxyCache.compareAndSet(cache, null); + } catch (TimeoutException e) { + log.error(e.getMessage(), e); + } + } + return null; + } + + private Proxy doGet() throws InterruptedException { + ExpirableProxy proxy; + do { + proxy = ipQueue.take(); + } while (proxy.isExpire()); + return proxy; + } + + +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ExpirableProxy.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ExpirableProxy.java new file mode 100644 index 0000000..d22b7dc --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ExpirableProxy.java @@ -0,0 +1,36 @@ +package us.codecraft.webmagic.proxy; + +import lombok.Getter; +import org.apache.http.annotation.Contract; +import org.apache.http.annotation.ThreadingBehavior; + +import java.time.LocalDateTime; +import java.time.temporal.ChronoUnit; + +/** + * @author yaoqiang + * + * 可以过期的代理 + */ +@Contract(threading = ThreadingBehavior.IMMUTABLE_CONDITIONAL) +public class ExpirableProxy extends Proxy { + @Getter + private final int ttl; + private final LocalDateTime expireTime; + + + public ExpirableProxy(String host, int port, int ttl, ChronoUnit chronoUnit) { + super(host, port); + this.ttl = ttl; + this.expireTime = LocalDateTime.now().plus(ttl, chronoUnit); + + } + + public boolean isExpire() { + return LocalDateTime.now().isAfter(expireTime); + } + public LocalDateTime getExpireTime(){ + return expireTime; + } + +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java index 6554fab..ffae4be 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java @@ -6,33 +6,30 @@ import java.net.URISyntaxException; import java.net.URLEncoder; import java.nio.charset.StandardCharsets; +import jdk.nashorn.internal.ir.annotations.Immutable; import org.apache.commons.lang3.StringUtils; +import org.apache.http.annotation.Contract; +import org.apache.http.annotation.ThreadingBehavior; +@Contract(threading = ThreadingBehavior.IMMUTABLE) public class Proxy { - private String scheme; + private final String scheme; - private String host; + private final String host; - private int port; + private final int port; - private String username; + private final String username; - private String password; + private final String password; - public static Proxy create(final URI uri) { - Proxy proxy = new Proxy(uri.getHost(), uri.getPort(), uri.getScheme()); - String userInfo = uri.getUserInfo(); - if (userInfo != null) { - String[] up = userInfo.split(":"); - if (up.length == 1) { - proxy.username = up[0].isEmpty() ? null : up[0]; - } else { - proxy.username = up[0].isEmpty() ? null : up[0]; - proxy.password = up[1].isEmpty() ? null : up[1]; - } - } - return proxy; + public Proxy(String host, int port, String scheme, String username, String password) { + this.scheme = scheme; + this.host = host; + this.port = port; + this.username = username; + this.password = password; } public Proxy(String host, int port) { @@ -40,27 +37,30 @@ public class Proxy { } public Proxy(String host, int port, String scheme) { - this.host = host; - this.port = port; - this.scheme = scheme; + this(host, port, scheme, null, null); } public Proxy(String host, int port, String username, String password) { - this.host = host; - this.port = port; - this.username = username; - this.password = password; + this(host, port, null, username, password); } - public String getScheme() { - return scheme; + public static Proxy create(final URI uri) { + String userInfo = uri.getUserInfo(); + String username = null; + String password = null; + if (userInfo != null) { + String[] up = userInfo.split(":"); + if (up.length == 1) { + username = up[0].isEmpty() ? null : up[0]; + } else { + username = up[0].isEmpty() ? null : up[0]; + password = up[1].isEmpty() ? null : up[1]; + } + } + return new Proxy(uri.getHost(), uri.getPort(), uri.getScheme(), username, password); } - public void setScheme(String scheme) { - this.scheme = scheme; - } - - public String getHost() { + public String getHost() { return host; } @@ -68,6 +68,8 @@ public class Proxy { return port; } + public String getScheme(){return scheme;} + public String getUsername() { return username; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java index 0cef4ed..b567d58 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java @@ -1,6 +1,5 @@ package us.codecraft.webmagic.proxy; -import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Task; /** @@ -10,14 +9,6 @@ import us.codecraft.webmagic.Task; */ public interface ProxyProvider { - /** - * - * Return proxy to Provider when complete a download. - * @param proxy the proxy config contains host,port and identify info - * @param page the download result - * @param task the download task - */ - void returnProxy(Proxy proxy, Page page, Task task); /** * Get a proxy for task by some strategy. diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/RefreshableProxyProvider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/RefreshableProxyProvider.java new file mode 100644 index 0000000..77e1ce2 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/RefreshableProxyProvider.java @@ -0,0 +1,30 @@ +package us.codecraft.webmagic.proxy; + +import us.codecraft.webmagic.Task; + +/** + * @author yaoqiang + * + * 可以手动刷新的代理供应商 + */ +public interface RefreshableProxyProvider extends ProxyProvider{ + + /** + * 代理IP是珍贵资源,有可能代理提供者内部代理没有过期,就一直提供某个IP,但这个IP又不可以使用,所以提供一种方式通知提供者,这个代理该刷新了 + * + * @param task 爬虫任务 + * @param proxy 需要对代理进行验证,如果确实持有的时错误代理,则刷新,否则,继续执行 + */ + void refreshProxy(Task task,Proxy proxy); + + + /** + * + * 获取当前正在提供的代理 + * + * @param task 工作中的爬虫任务 + * @return 获取当前正在使用的代理 + */ + Proxy getCurrentProxy(Task task); + +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ReturnableProxyProvider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ReturnableProxyProvider.java new file mode 100644 index 0000000..43b49fc --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ReturnableProxyProvider.java @@ -0,0 +1,22 @@ +package us.codecraft.webmagic.proxy; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Task; + +/** + * @author yaoqiang + * + * 可归还的代理提供商,代理被取出后,实用完成,可以归还给代理提供商 + */ +public interface ReturnableProxyProvider { + + /** + * + * Return proxy to Provider when complete a download. + * @param proxy the proxy config contains host,port and identify info + * @param page the download result + * @param task the download task + */ + void returnProxy(Proxy proxy, Page page, Task task); + +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java index ddef6a8..fda3e23 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java @@ -1,6 +1,5 @@ package us.codecraft.webmagic.proxy; -import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Task; import java.util.ArrayList; @@ -30,6 +29,7 @@ public class SimpleProxyProvider implements ProxyProvider { this.pointer = pointer; } + public static SimpleProxyProvider from(Proxy... proxies) { List proxiesTemp = new ArrayList(proxies.length); for (Proxy proxy : proxies) { @@ -38,11 +38,6 @@ public class SimpleProxyProvider implements ProxyProvider { return new SimpleProxyProvider(Collections.unmodifiableList(proxiesTemp)); } - @Override - public void returnProxy(Proxy proxy, Page page, Task task) { - //Donothing - } - @Override public Proxy getProxy(Task task) { return proxies.get(incrForLoop()); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/HttpConstant.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/HttpConstant.java index 2d6b8fe..bfacec3 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/HttpConstant.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/HttpConstant.java @@ -28,6 +28,7 @@ public abstract class HttpConstant { public static abstract class StatusCode { public static final int CODE_200 = 200; + public static final int FORBIDDEN = 403; } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/SpiderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/SpiderTest.java index 4f4a280..6b9c423 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/SpiderTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/SpiderTest.java @@ -57,6 +57,11 @@ public class SpiderTest { return Site.me().setSleepTime(0); } }).setDownloader(new Downloader() { + @Override + public void refreshComponent(Task task) { + + } + @Override public Page download(Request request, Task task) { return new Page().setRawText(""); diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java index 3aa742c..6d764a5 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java @@ -28,6 +28,11 @@ public class MockGithubDownloader implements Downloader { return page; } + @Override + public void refreshComponent(Task task) { + + } + @Override public void setThread(int threadNum) { } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java index 6055bdb..f3751d6 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java @@ -42,7 +42,12 @@ public class PhantomJSDownloader extends AbstractDownloader { this.initPhantomjsCrawlPath(); PhantomJSDownloader.phantomJsCommand = phantomJsCommand; } - + + @Override + public void refreshComponent(Task task) { + + } + /** * 新增构造函数,支持crawl.js路径自定义,因为当其他项目依赖此jar包时,runtime.exec()执行phantomjs命令时无使用法jar包中的crawl.js *

diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java
index 91e3698..7744692 100644
--- a/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java
+++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java
@@ -9,6 +9,10 @@ import us.codecraft.webmagic.selector.PlainText;
  * @author code4crafter@gmail.com
  */
 public class MockGithubDownloader implements Downloader{
+    @Override
+    public void refreshComponent(Task task) {
+
+    }
 
     private String html = "\n" +
             "\n" +
diff --git a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java
index cce293f..11b2356 100644
--- a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java
+++ b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java
@@ -59,6 +59,11 @@ public class SeleniumDownloader implements Downloader, Closeable {
 		// "/Users/Bingo/Downloads/phantomjs-1.9.7-macosx/bin/phantomjs");
 	}
 
+	@Override
+	public void refreshComponent(Task task) {
+
+	}
+
 	/**
 	 * set sleep time to wait until load success
 	 *