From 1b04a7f2b39546bf8c522b2c2c0059be57587dbd Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sun, 9 Apr 2017 09:23:10 +0800 Subject: [PATCH] #527 move logic check from downloaderto spider --- .../main/java/us/codecraft/webmagic/Page.java | 36 ++-- .../main/java/us/codecraft/webmagic/Site.java | 4 +- .../java/us/codecraft/webmagic/Spider.java | 60 ++++-- .../downloader/AbstractDownloader.java | 16 -- .../downloader/HttpClientDownloader.java | 24 +-- .../webmagic/proxy/ProxyProvider.java | 18 +- .../webmagic/proxy/ResponseChecker.java | 13 -- .../webmagic/proxy/TimerReuseProxy.java | 159 -------------- .../webmagic/proxy/TimerReuseProxyPool.java | 204 ------------------ .../webmagic/utils/HttpConstant.java | 6 + .../downloader/HttpClientDownloaderTest.java | 7 +- .../downloader/MockGithubDownloader.java | 4 +- 12 files changed, 100 insertions(+), 451 deletions(-) delete mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ResponseChecker.java delete mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/proxy/TimerReuseProxy.java delete mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/proxy/TimerReuseProxyPool.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java index 7dd48f8..a945607 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java @@ -4,6 +4,7 @@ import org.apache.commons.lang3.StringUtils; import us.codecraft.webmagic.selector.Html; import us.codecraft.webmagic.selector.Json; import us.codecraft.webmagic.selector.Selectable; +import us.codecraft.webmagic.utils.HttpConstant; import us.codecraft.webmagic.utils.UrlUtils; import java.util.ArrayList; @@ -41,15 +42,21 @@ public class Page { private Map> headers; - private int statusCode; + private int statusCode = HttpConstant.StatusCode.CODE_200; - private boolean needCycleRetry; + private boolean downloadSuccess = true; private List targetRequests = new ArrayList(); public Page() { } + public static Page fail(){ + Page page = new Page(); + page.setDownloadSuccess(false); + return page; + } + public Page setSkip(boolean skip) { resultItems.setSkip(skip); return this; @@ -179,14 +186,6 @@ public class Page { return request; } - public boolean isNeedCycleRetry() { - return needCycleRetry; - } - - public void setNeedCycleRetry(boolean needCycleRetry) { - this.needCycleRetry = needCycleRetry; - } - public void setRequest(Request request) { this.request = request; this.resultItems.setRequest(request); @@ -221,22 +220,27 @@ public class Page { this.headers = headers; } + public boolean isDownloadSuccess() { + return downloadSuccess; + } + + public void setDownloadSuccess(boolean downloadSuccess) { + this.downloadSuccess = downloadSuccess; + } + @Override public String toString() { return "Page{" + "request=" + request + ", resultItems=" + resultItems + + ", html=" + html + + ", json=" + json + ", rawText='" + rawText + '\'' + ", url=" + url + ", headers=" + headers + ", statusCode=" + statusCode + - ", needCycleRetry=" + needCycleRetry + + ", success=" + downloadSuccess + ", targetRequests=" + targetRequests + - ", headers=" + headers+ '}'; } - - - - } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java index 520902d..a77fca6 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java @@ -1,5 +1,7 @@ package us.codecraft.webmagic; +import us.codecraft.webmagic.utils.HttpConstant; + import java.util.*; /** @@ -40,7 +42,7 @@ public class Site { private boolean useGzip = true; static { - DEFAULT_STATUS_CODE_SET.add(200); + DEFAULT_STATUS_CODE_SET.add(HttpConstant.StatusCode.CODE_200); } /** diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 5e785af..0c5ce2d 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -1,6 +1,7 @@ package us.codecraft.webmagic; import org.apache.commons.collections.CollectionUtils; +import org.apache.commons.lang3.SerializationUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import us.codecraft.webmagic.downloader.Downloader; @@ -398,34 +399,59 @@ public class Spider implements Runnable, Task { } } - protected void processRequest(Request request) { + private void processRequest(Request request) { Page page = downloader.download(request, this); - if (page == null) { - sleep(site.getSleepTime()); - onError(request); - return; + if (page.isDownloadSuccess()){ + onDownloadSuccess(request, page); + } else { + onDownloaderFail(request); } - // for cycle retry - if (page.isNeedCycleRetry()) { - extractAndAddRequests(page, true); - sleep(site.getRetrySleepTime()); - return; - } - pageProcessor.process(page); - extractAndAddRequests(page, spawnUrl); - if (!page.getResultItems().isSkip()) { - for (Pipeline pipeline : pipelines) { - pipeline.process(page.getResultItems(), this); + } + + private void onDownloadSuccess(Request request, Page page) { + onSuccess(request); + if (site.getAcceptStatCode().contains(page.getStatusCode())){ + pageProcessor.process(page); + extractAndAddRequests(page, spawnUrl); + if (!page.getResultItems().isSkip()) { + for (Pipeline pipeline : pipelines) { + pipeline.process(page.getResultItems(), this); + } } } sleep(site.getSleepTime()); + return; + } + + private void onDownloaderFail(Request request) { + if (site.getCycleRetryTimes() == 0) { + sleep(site.getSleepTime()); + } else { + // for cycle retry + doCycleRetry(request); + } + onError(request); + } + + private void doCycleRetry(Request request) { + Object cycleTriedTimesObject = request.getExtra(Request.CYCLE_TRIED_TIMES); + if (cycleTriedTimesObject == null) { + addRequest(SerializationUtils.clone(request).setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, 1)); + } else { + int cycleTriedTimes = (Integer) cycleTriedTimesObject; + cycleTriedTimes++; + if (cycleTriedTimes < site.getCycleRetryTimes()) { + addRequest(SerializationUtils.clone(request).setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, cycleTriedTimes)); + } + } + sleep(site.getRetrySleepTime()); } protected void sleep(int time) { try { Thread.sleep(time); } catch (InterruptedException e) { - e.printStackTrace(); + logger.error("Thread interrupted when sleep",e); } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java index c835dc8..c27292d 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java @@ -41,20 +41,4 @@ public abstract class AbstractDownloader implements Downloader { protected void onError(Request request) { } - protected Page addToCycleRetry(Request request, Site site) { - Page page = new Page(); - Object cycleTriedTimesObject = request.getExtra(Request.CYCLE_TRIED_TIMES); - if (cycleTriedTimesObject == null) { - page.addTargetRequest(request.setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, 1)); - } else { - int cycleTriedTimes = (Integer) cycleTriedTimesObject; - cycleTriedTimes++; - if (cycleTriedTimes >= site.getCycleRetryTimes()) { - return null; - } - page.addTargetRequest(request.setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, cycleTriedTimes)); - } - page.setNeedCycleRetry(true); - return page; - } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 284702d..bf7993b 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -77,27 +77,18 @@ public class HttpClientDownloader extends AbstractDownloader { } logger.debug("downloading page {}", request.getUrl()); CloseableHttpResponse httpResponse = null; - Site site = task.getSite(); - CloseableHttpClient httpClient = getHttpClient(site); - HttpClientRequestContext requestContext = httpUriRequestConverter.convert(request, site, proxyProvider != null ? proxyProvider.getProxy(task) : null); + CloseableHttpClient httpClient = getHttpClient(task.getSite()); + HttpClientRequestContext requestContext = httpUriRequestConverter.convert(request, task.getSite(), proxyProvider != null ? proxyProvider.getProxy(task) : null); try { httpResponse = httpClient.execute(requestContext.getHttpUriRequest(), requestContext.getHttpClientContext()); - int statusCode = httpResponse.getStatusLine().getStatusCode(); - if (site.getAcceptStatCode().contains(statusCode)) { - Page page = handleResponse(request, site.getCharset(), httpResponse, task); - onSuccess(request); - return page; - } else { - logger.warn("get page {} error, status code {} ",request.getUrl(),statusCode); - return null; - } + Page page = handleResponse(request, task.getSite().getCharset(), httpResponse, task); + onSuccess(request); + logger.debug("downloading page success {}", page); + return page; } catch (IOException e) { logger.warn("download page {} error", request.getUrl(), e); - if (site != null && site.getCycleRetryTimes() > 0) { - return addToCycleRetry(request, site); - } onError(request); - return null; + return Page.fail(); } finally { if (httpResponse != null) { //ensure the connection is released back to pool @@ -118,6 +109,7 @@ public class HttpClientDownloader extends AbstractDownloader { page.setUrl(new PlainText(request.getUrl())); page.setRequest(request); page.setStatusCode(httpResponse.getStatusLine().getStatusCode()); + page.setDownloadSuccess(true); if (responseHeader) { page.setHeaders(HttpClientUtils.convertHeaders(httpResponse.getAllHeaders())); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java index 4266d78..6772d9d 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java @@ -1,14 +1,28 @@ package us.codecraft.webmagic.proxy; +import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Task; /** - * Created by edwardsbean on 15-2-28. + * Proxy provider.
+ * + * @since 0.7.0 */ public interface ProxyProvider { - void returnProxy(Proxy proxy, boolean banned, Task task); + /** + * + * @param proxy + * @param page + * @param task + */ + void returnProxy(Proxy proxy, Page page, Task task); + /** + * Get a proxy for task by some strategy. + * @param task task + * @return + */ Proxy getProxy(Task task); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ResponseChecker.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ResponseChecker.java deleted file mode 100644 index 3e68c11..0000000 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ResponseChecker.java +++ /dev/null @@ -1,13 +0,0 @@ -package us.codecraft.webmagic.proxy; - -import org.apache.http.HttpResponse; - -/** - * @author code4crafter@gmail.com - * Date: 17/3/20 - * Time: 下午10:52 - */ -public interface ResponseChecker { - - boolean isBanned(HttpResponse httpResponse); -} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/TimerReuseProxy.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/TimerReuseProxy.java deleted file mode 100644 index 7002df4..0000000 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/TimerReuseProxy.java +++ /dev/null @@ -1,159 +0,0 @@ -package us.codecraft.webmagic.proxy; - -import java.io.Serializable; -import java.util.ArrayList; -import java.util.List; -import java.util.concurrent.Delayed; -import java.util.concurrent.TimeUnit; - -/** - * >>>> Proxy lifecycle - - +----------+ +-----+ - | last use | | new | - +-----+----+ +---+-+ - | +------+ | - +->| init |<--+ - +--+---+ - | - v - +--------+ - +--->| borrow | - | +---+----+ - | |+------------------+ - | v - | +--------+ - | | in use | Respone Time - | +---+----+ - | |+------------------+ - | v - | +--------+ - | | return | - | +---+----+ - | |+-------------------+ - | v - | +-------+ reuse interval - | | delay | (delay time) - | +---+---+ - | |+-------------------+ - | v - | +------+ - | | idle | idle time - | +---+--+ - | |+-------------------+ - +--------+ - */ - -/** - * Object has these status of lifecycle above.
- * - * @author yxssfxwzy@sina.com
- * @since 0.5.1 - * @see TimerReuseProxyPool - */ - -public class TimerReuseProxy extends Proxy implements Delayed, Serializable { - - private static final long serialVersionUID = 228939737383625551L; - public static final int ERROR_403 = 403; - public static final int ERROR_404 = 404; - public static final int ERROR_BANNED = 10000;// banned by website - public static final int ERROR_Proxy = 10001;// the proxy itself failed - public static final int SUCCESS = 200; - - private int reuseTimeInterval = 1500;// ms - private Long canReuseTime = 0L; - private Long lastBorrowTime = System.currentTimeMillis(); - private Long responseTime = 0L; - - private int failedNum = 0; - private int successNum = 0; - private int borrowNum = 0; - - private List failedErrorType = new ArrayList(); - - public TimerReuseProxy(String host, int port, String username, String password) { - super(host, port, username, password); - } - - - public int getSuccessNum() { - return successNum; - } - - public void successNumIncrement(int increment) { - this.successNum += increment; - } - - public Long getLastUseTime() { - return lastBorrowTime; - } - - public void setLastBorrowTime(Long lastBorrowTime) { - this.lastBorrowTime = lastBorrowTime; - } - - public void recordResponse() { - this.responseTime = (System.currentTimeMillis() - lastBorrowTime + responseTime) / 2; - this.lastBorrowTime = System.currentTimeMillis(); - } - - public List getFailedErrorType() { - return failedErrorType; - } - - public void setFailedErrorType(List failedErrorType) { - this.failedErrorType = failedErrorType; - } - - public void fail(int failedErrorType) { - this.failedNum++; - this.failedErrorType.add(failedErrorType); - } - - public void setFailedNum(int failedNum) { - this.failedNum = failedNum; - } - - public int getFailedNum() { - return failedNum; - } - - public String getFailedType() { - String re = ""; - for (Integer i : this.failedErrorType) { - re += i + " . "; - } - return re; - } - - public int getReuseTimeInterval() { - return reuseTimeInterval; - } - - public void setReuseTimeInterval(int reuseTimeInterval) { - this.reuseTimeInterval = reuseTimeInterval; - this.canReuseTime = System.nanoTime() + TimeUnit.NANOSECONDS.convert(reuseTimeInterval, TimeUnit.MILLISECONDS); - - } - - @Override - public long getDelay(TimeUnit unit) { - return unit.convert(canReuseTime - System.nanoTime(), TimeUnit.NANOSECONDS); - } - - @Override - public int compareTo(Delayed o) { - TimerReuseProxy that = (TimerReuseProxy) o; - return canReuseTime > that.canReuseTime ? 1 : (canReuseTime < that.canReuseTime ? -1 : 0); - - } - - public void borrowNumIncrement(int increment) { - this.borrowNum += increment; - } - - public int getBorrowNum() { - return borrowNum; - } -} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/TimerReuseProxyPool.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/TimerReuseProxyPool.java deleted file mode 100644 index 6dbac5d..0000000 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/TimerReuseProxyPool.java +++ /dev/null @@ -1,204 +0,0 @@ -package us.codecraft.webmagic.proxy; - -import us.codecraft.webmagic.Task; - -/** - * Pooled Proxy Object - * - * @author yxssfxwzy@sina.com
- * @see Proxy - * @since 0.5.1 - */ -public class TimerReuseProxyPool implements ProxyProvider { - @Override - public void returnProxy(Proxy proxy, boolean banned, Task task) { - - } - - @Override - public Proxy getProxy(Task task) { - return null; - } - -// private Logger logger = LoggerFactory.getLogger(getClass()); -// -// private BlockingQueue proxyQueue = new DelayQueue(); -// private Map allProxy = new ConcurrentHashMap(); -// -// private int reuseInterval = 1500;// ms -// private int reviveTime = 2 * 60 * 60 * 1000;// ms -// private int saveProxyInterval = 10 * 60 * 1000;// ms -// -// private boolean isEnable = false; -// private boolean validateWhenInit = false; -// // private boolean isUseLastProxy = true; -// -// public TimerReuseProxyPool(List httpProxyList) { -// this(httpProxyList, true); -// } -// -// private void addProxy(Map httpProxyMap) { -// isEnable = true; -// for (Entry entry : httpProxyMap.entrySet()) { -// try { -// if (allProxy.containsKey(entry.getKey())) { -// continue; -// } -// if (!validateWhenInit || ProxyUtils.validateProxy(entry.getValue().getHttpHost())) { -// entry.getValue().setFailedNum(0); -// entry.getValue().setReuseTimeInterval(reuseInterval); -// proxyQueue.add(entry.getValue()); -// allProxy.put(entry.getKey(), entry.getValue()); -// } -// } catch (NumberFormatException e) { -// logger.error("HttpHost init error:", e); -// } -// } -// logger.info("proxy pool size>>>>" + allProxy.size()); -// } -// -// public void addProxy(Proxy... httpProxyList) { -// isEnable = true; -// for (Proxy proxy : httpProxyList) { -// if (!validateWhenInit || ProxyUtils.validateProxy(proxy.getProxyHost())) { -// TimerReuseProxy p = new TimerReuseProxy(proxy.getProxyHost(), proxy.getUsername(), proxy.getPassword(), reuseInterval); -// proxyQueue.add(p); -// allProxy.put(p.getProxyHost().getHost(), p); -// } -// } -// logger.info("proxy pool size>>>>" + allProxy.size()); -// } -// -// public TimerReuseProxy getProxy() { -// TimerReuseProxy proxy = null; -// try { -// Long time = System.currentTimeMillis(); -// proxy = proxyQueue.take(); -// double costTime = (System.currentTimeMillis() - time) / 1000.0; -// if (costTime > reuseInterval) { -// logger.info("get proxy time >>>> " + costTime); -// } -// TimerReuseProxy p = allProxy.get(proxy.getProxyHost().getHost()); -// p.setLastBorrowTime(System.currentTimeMillis()); -// p.borrowNumIncrement(1); -// } catch (InterruptedException e) { -// logger.error("get proxy error", e); -// } -// if (proxy == null) { -// throw new NoSuchElementException(); -// } -// return proxy; -// } -// -// public void returnProxy(Proxy proxy, int statusCode) { -// TimerReuseProxy p = allProxy.get(proxy.getProxyHost()); -// if (p == null) { -// return; -// } -// switch (statusCode) { -// case TimerReuseProxy.SUCCESS: -// p.setReuseTimeInterval(reuseInterval); -// p.setFailedNum(0); -// p.setFailedErrorType(new ArrayList()); -// p.recordResponse(); -// p.successNumIncrement(1); -// break; -// case TimerReuseProxy.ERROR_403: -// // banned,try longer interval -// p.fail(TimerReuseProxy.ERROR_403); -// p.setReuseTimeInterval(reuseInterval * p.getFailedNum()); -// logger.info(proxy + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0); -// break; -// case TimerReuseProxy.ERROR_BANNED: -// p.fail(TimerReuseProxy.ERROR_BANNED); -// p.setReuseTimeInterval(10 * 60 * 1000 * p.getFailedNum()); -// logger.info(proxy + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0); -// break; -// case TimerReuseProxy.ERROR_404: -// // p.fail(Proxy.ERROR_404); -// // p.setReuseTimeInterval(reuseInterval * p.getFailedNum()); -// break; -// default: -// p.fail(statusCode); -// break; -// } -// if (p.getFailedNum() > 20) { -// p.setReuseTimeInterval(reviveTime); -// logger.error("remove proxy >>>> " + proxy + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size()); -// return; -// } -// if (p.getFailedNum() > 0 && p.getFailedNum() % 5 == 0) { -// if (!ProxyUtils.validateProxy(proxy)) { -// p.setReuseTimeInterval(reviveTime); -// logger.error("remove proxy >>>> " + proxy + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size()); -// return; -// } -// } -// try { -// proxyQueue.put(p); -// } catch (InterruptedException e) { -// logger.warn("proxyQueue return proxy error", e); -// } -// } -// -// public String allProxyStatus() { -// String re = "all proxy info >>>> \n"; -// for (Entry entry : allProxy.entrySet()) { -// re += entry.getValue().toString() + "\n"; -// } -// return re; -// } -// -// public int getIdleNum() { -// return proxyQueue.size(); -// } -// -// public int getReuseInterval() { -// return reuseInterval; -// } -// -// public void setReuseInterval(int reuseInterval) { -// this.reuseInterval = reuseInterval; -// } -// -// public void enable(boolean isEnable) { -// this.isEnable = isEnable; -// } -// -// public boolean isEnable() { -// return isEnable; -// } -// -// public int getReviveTime() { -// return reviveTime; -// } -// -// public void setReviveTime(int reviveTime) { -// this.reviveTime = reviveTime; -// } -// -// public boolean isValidateWhenInit() { -// return validateWhenInit; -// } -// -// public void validateWhenInit(boolean validateWhenInit) { -// this.validateWhenInit = validateWhenInit; -// } -// -// public int getSaveProxyInterval() { -// return saveProxyInterval; -// } -// -// public void setSaveProxyInterval(int saveProxyInterval) { -// this.saveProxyInterval = saveProxyInterval; -// } -// -// public String getProxyFilePath() { -// return proxyFilePath; -// } -// -// public void setProxyFilePath(String proxyFilePath) { -// this.proxyFilePath = proxyFilePath; -// } - -} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/HttpConstant.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/HttpConstant.java index 2a76ecc..2d6b8fe 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/HttpConstant.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/HttpConstant.java @@ -25,6 +25,12 @@ public abstract class HttpConstant { } + public static abstract class StatusCode { + + public static final int CODE_200 = 200; + + } + public static abstract class Header { public static final String REFERER = "Referer"; diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java index 9c93915..f412502 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java @@ -50,15 +50,12 @@ public class HttpClientDownloaderTest { } @Test - public void testCycleTriedTimes() { + public void test_download_fail() { HttpClientDownloader httpClientDownloader = new HttpClientDownloader(); Task task = Site.me().setDomain("localhost").setCycleRetryTimes(5).toTask(); Request request = new Request(PAGE_ALWAYS_NOT_EXISTS); Page page = httpClientDownloader.download(request, task); - assertThat(page.getTargetRequests().size() > 0); - assertThat((Integer) page.getTargetRequests().get(0).getExtra(Request.CYCLE_TRIED_TIMES)).isEqualTo(1); - page = httpClientDownloader.download(page.getTargetRequests().get(0), task); - assertThat((Integer) page.getTargetRequests().get(0).getExtra(Request.CYCLE_TRIED_TIMES)).isEqualTo(2); + assertThat(page.isDownloadSuccess()).isFalse(); } @Test diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java index 6baee72..91e3698 100644 --- a/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java @@ -3,7 +3,6 @@ package us.codecraft.webmagic.downloader; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; -import us.codecraft.webmagic.selector.Html; import us.codecraft.webmagic.selector.PlainText; /** @@ -937,7 +936,8 @@ public class MockGithubDownloader implements Downloader{ @Override public Page download(Request request, Task task) { Page page = new Page(); - page.setHtml(new Html(html)); + page.setRawText(html); + page.setStatusCode(200); page.setRequest(new Request("https://github.com/code4craft/webmagic")); page.setUrl(new PlainText("https://github.com/code4craft/webmagic")); return page;