From 88b50d4182b5e062112a7a662f51b03eccbadb29 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Tue, 4 Mar 2014 07:33:07 +0800 Subject: [PATCH] bugfix: cycleRetry will not work when spawnUrl is set to false #62 --- .../main/java/us/codecraft/webmagic/Page.java | 10 ++++ .../java/us/codecraft/webmagic/Spider.java | 13 ++--- .../downloader/AbstractDownloader.java | 53 +++++++++++++++++++ .../downloader/HttpClientDownloader.java | 40 +------------- 4 files changed, 71 insertions(+), 45 deletions(-) create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java index aeccb5b..a22fbdc 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java @@ -37,6 +37,8 @@ public class Page { private int statusCode; + private boolean needCycleRetry; + private List targetRequests = new ArrayList(); public Page() { @@ -165,6 +167,14 @@ public class Page { return request; } + public boolean isNeedCycleRetry() { + return needCycleRetry; + } + + public void setNeedCycleRetry(boolean needCycleRetry) { + this.needCycleRetry = needCycleRetry; + } + public void setRequest(Request request) { this.request = request; this.resultItems.setRequest(request); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 6a6b956..b6f95ac 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -376,13 +376,13 @@ public class Spider implements Runnable, Task { return; } // for cycle retry - if (page.getRawText() == null) { - extractAndAddRequests(page); + if (page.isNeedCycleRetry()) { + extractAndAddRequests(page, true); sleep(site.getSleepTime()); return; } 
pageProcessor.process(page); - extractAndAddRequests(page); + extractAndAddRequests(page, spawnUrl); if (!page.getResultItems().isSkip()) { for (Pipeline pipeline : pipelines) { pipeline.process(page.getResultItems(), this); @@ -399,7 +399,7 @@ public class Spider implements Runnable, Task { } } - protected void extractAndAddRequests(Page page) { + protected void extractAndAddRequests(Page page, boolean spawnUrl) { if (spawnUrl && CollectionUtils.isNotEmpty(page.getTargetRequests())) { for (Request request : page.getTargetRequests()) { addRequest(request); @@ -588,8 +588,8 @@ public class Spider implements Runnable, Task { * @see Status * @since 0.4.1 */ - public Status getStatus(){ - return Status.fromValue(stat.get()); + public Status getStatus() { + return Status.fromValue(stat.get()); } @@ -619,6 +619,7 @@ public class Spider implements Runnable, Task { /** * Get thread count which is running + * * @return thread count which is running * @since 0.4.1 */ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java new file mode 100644 index 0000000..2336856 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java @@ -0,0 +1,53 @@ +package us.codecraft.webmagic.downloader; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.selector.Html; + +/** + * Base class of downloader with some common methods. + * + * @author code4crafter@gmail.com + * @since 0.5.0 + */ +public abstract class AbstractDownloader implements Downloader { + + /** + * A simple method to download a url. + * + * @param url + * @return html + */ + public Html download(String url) { + return download(url, null); + } + + /** + * A simple method to download a url. 
+ * + * @param url + * @return html + */ + public Html download(String url, String charset) { + Page page = download(new Request(url), Site.me().setCharset(charset).toTask()); + return (Html) page.getHtml(); + } + + protected Page addToCycleRetry(Request request, Site site) { + Page page = new Page(); + Object cycleTriedTimesObject = request.getExtra(Request.CYCLE_TRIED_TIMES); + if (cycleTriedTimesObject == null) { + page.addTargetRequest(request.setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, 1)); + } else { + int cycleTriedTimes = (Integer) cycleTriedTimesObject; + cycleTriedTimes++; + if (cycleTriedTimes >= site.getCycleRetryTimes()) { + return null; + } + page.addTargetRequest(request.setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, cycleTriedTimes)); + } + page.setNeedCycleRetry(true); + return page; + } +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index c036e34..bcf4a53 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -16,7 +16,6 @@ import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Task; -import us.codecraft.webmagic.selector.Html; import us.codecraft.webmagic.selector.PlainText; import us.codecraft.webmagic.utils.UrlUtils; @@ -33,7 +32,7 @@ import java.util.Set; * @since 0.1.0 */ @ThreadSafe -public class HttpClientDownloader implements Downloader { +public class HttpClientDownloader extends AbstractDownloader { private Logger logger = LoggerFactory.getLogger(getClass()); @@ -41,27 +40,6 @@ public class HttpClientDownloader implements Downloader { private HttpClientGenerator httpClientGenerator = new HttpClientGenerator(); - /** - * A simple method to download a url. 
- * - * @param url - * @return html - */ - public Html download(String url) { - return download(url, null); - } - - /** - * A simple method to download a url. - * - * @param url - * @return html - */ - public Html download(String url, String charset) { - Page page = download(new Request(url), Site.me().setCharset(charset).toTask()); - return (Html) page.getHtml(); - } - private CloseableHttpClient getHttpClient(Site site) { if (site == null) { return httpClientGenerator.getClient(null); @@ -145,22 +123,6 @@ public class HttpClientDownloader implements Downloader { } } - private Page addToCycleRetry(Request request, Site site) { - Page page = new Page(); - Object cycleTriedTimesObject = request.getExtra(Request.CYCLE_TRIED_TIMES); - if (cycleTriedTimesObject == null) { - page.addTargetRequest(request.setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, 1)); - } else { - int cycleTriedTimes = (Integer) cycleTriedTimesObject; - cycleTriedTimes++; - if (cycleTriedTimes >= site.getCycleRetryTimes()) { - return null; - } - page.addTargetRequest(request.setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, cycleTriedTimes)); - } - return page; - } - protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException { String content = IOUtils.toString(httpResponse.getEntity().getContent(), charset); Page page = new Page();