bugfix: cycleRetry will not work when spawnUrl is set to false #62

master
yihua.huang 2014-03-04 07:33:07 +08:00
parent 16ae066c62
commit 88b50d4182
4 changed files with 71 additions and 45 deletions

View File

@ -37,6 +37,8 @@ public class Page {
private int statusCode;
private boolean needCycleRetry;
private List<Request> targetRequests = new ArrayList<Request>();
public Page() {
@ -165,6 +167,14 @@ public class Page {
return request;
}
/**
 * Reports whether this page has been flagged as needing a cycle retry.
 *
 * @return the current value of the {@code needCycleRetry} flag
 */
public boolean isNeedCycleRetry() {
return needCycleRetry;
}
/**
 * Sets the flag that marks this page as needing a cycle retry.
 *
 * @param needCycleRetry the new value of the flag
 */
public void setNeedCycleRetry(boolean needCycleRetry) {
this.needCycleRetry = needCycleRetry;
}
public void setRequest(Request request) {
this.request = request;
this.resultItems.setRequest(request);

View File

@ -376,13 +376,13 @@ public class Spider implements Runnable, Task {
return;
}
// for cycle retry
if (page.getRawText() == null) {
extractAndAddRequests(page);
if (page.isNeedCycleRetry()) {
extractAndAddRequests(page, true);
sleep(site.getSleepTime());
return;
}
pageProcessor.process(page);
extractAndAddRequests(page);
extractAndAddRequests(page, spawnUrl);
if (!page.getResultItems().isSkip()) {
for (Pipeline pipeline : pipelines) {
pipeline.process(page.getResultItems(), this);
@ -399,7 +399,7 @@ public class Spider implements Runnable, Task {
}
}
protected void extractAndAddRequests(Page page) {
protected void extractAndAddRequests(Page page, boolean spawnUrl) {
if (spawnUrl && CollectionUtils.isNotEmpty(page.getTargetRequests())) {
for (Request request : page.getTargetRequests()) {
addRequest(request);
@ -588,7 +588,7 @@ public class Spider implements Runnable, Task {
* @see Status
* @since 0.4.1
*/
public Status getStatus(){
public Status getStatus() {
return Status.fromValue(stat.get());
}
@ -619,6 +619,7 @@ public class Spider implements Runnable, Task {
/**
* Get thread count which is running
*
* @return thread count which is running
* @since 0.4.1
*/

View File

@ -0,0 +1,53 @@
package us.codecraft.webmagic.downloader;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.selector.Html;
/**
 * Base class of downloader with some common methods.
 *
 * <p>Holds the single-url {@code download} convenience overloads and the
 * cycle-retry bookkeeping so that concrete downloaders can share them.
 *
 * @author code4crafter@gmail.com
 * @since 0.5.0
 */
public abstract class AbstractDownloader implements Downloader {

    /**
     * A simple method to download a url without naming a charset.
     *
     * @param url the address to download
     * @return the result of {@link #download(String, String)} called with a
     *         {@code null} charset
     */
    public Html download(String url) {
        return download(url, null);
    }

    /**
     * A simple method to download a url with an explicit charset.
     *
     * <p>The url is wrapped in a new {@link Request} and a fresh {@link Site}
     * configured with {@code charset} is converted to a task for the
     * request-based download method.
     *
     * @param url     the address to download
     * @param charset the charset handed to the site; may be {@code null}
     * @return the html of the downloaded page, or {@code null} when the
     *         download produced no page
     */
    public Html download(String url, String charset) {
        Page page = download(new Request(url), Site.me().setCharset(charset).toTask());
        // A download may yield no page at all (addToCycleRetry below returns null
        // once retries are used up), so do not dereference blindly.
        if (page == null) {
            return null;
        }
        return (Html) page.getHtml();
    }

    /**
     * Builds a page that asks for {@code request} to be scheduled again.
     *
     * <p>The request's priority is set to 0 and its
     * {@link Request#CYCLE_TRIED_TIMES} extra is set to 1 on the first call, or
     * incremented by one on later calls. The request is then added to the
     * returned page as a target request and the page is flagged with
     * {@code setNeedCycleRetry(true)}.
     *
     * @param request the request to retry; it is modified in place
     * @param site    supplies the cycle retry limit via {@code getCycleRetryTimes()}
     * @return the retry page, or {@code null} when the incremented tried count
     *         has reached {@code site.getCycleRetryTimes()}
     */
    protected Page addToCycleRetry(Request request, Site site) {
        Page page = new Page();
        Object cycleTriedTimesObject = request.getExtra(Request.CYCLE_TRIED_TIMES);
        if (cycleTriedTimesObject == null) {
            // First retry of this request: start counting at 1.
            page.addTargetRequest(request.setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, 1));
        } else {
            int cycleTriedTimes = (Integer) cycleTriedTimesObject;
            cycleTriedTimes++;
            if (cycleTriedTimes >= site.getCycleRetryTimes()) {
                // Retry budget used up: signal "give up" with null.
                return null;
            }
            page.addTargetRequest(request.setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, cycleTriedTimes));
        }
        page.setNeedCycleRetry(true);
        return page;
    }
}

View File

@ -16,7 +16,6 @@ import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.PlainText;
import us.codecraft.webmagic.utils.UrlUtils;
@ -33,7 +32,7 @@ import java.util.Set;
* @since 0.1.0
*/
@ThreadSafe
public class HttpClientDownloader implements Downloader {
public class HttpClientDownloader extends AbstractDownloader {
private Logger logger = LoggerFactory.getLogger(getClass());
@ -41,27 +40,6 @@ public class HttpClientDownloader implements Downloader {
private HttpClientGenerator httpClientGenerator = new HttpClientGenerator();
/**
 * Convenience overload that downloads a url without naming a charset.
 *
 * @param url the address to download
 * @return the result of {@link #download(String, String)} called with a
 *         {@code null} charset
 */
public Html download(String url) {
    final String unspecifiedCharset = null;
    return download(url, unspecifiedCharset);
}
/**
 * Convenience overload that downloads a url with an explicit charset.
 *
 * <p>Wraps the url in a new {@link Request}, configures a fresh {@link Site}
 * with {@code charset}, and hands both to the request-based download method.
 *
 * @param url     the address to download
 * @param charset the charset handed to the site
 * @return the html of the downloaded page
 */
public Html download(String url, String charset) {
    Request request = new Request(url);
    return (Html) download(request, Site.me().setCharset(charset).toTask()).getHtml();
}
private CloseableHttpClient getHttpClient(Site site) {
if (site == null) {
return httpClientGenerator.getClient(null);
@ -145,22 +123,6 @@ public class HttpClientDownloader implements Downloader {
}
}
/**
 * Builds a page that carries {@code request} back as a target request so it
 * can be tried again.
 *
 * <p>The request's priority is set to 0 and its
 * {@link Request#CYCLE_TRIED_TIMES} extra is set to 1 on the first call, or
 * incremented by one on later calls.
 *
 * @param request the request to retry; it is modified in place
 * @param site    supplies the cycle retry limit via {@code getCycleRetryTimes()}
 * @return the retry page, or {@code null} when the incremented tried count
 *         has reached {@code site.getCycleRetryTimes()}
 */
private Page addToCycleRetry(Request request, Site site) {
    Page retryPage = new Page();
    Object recordedTries = request.getExtra(Request.CYCLE_TRIED_TIMES);

    // No counter recorded yet means this is the first retry.
    int nextTryCount = 1;
    if (recordedTries != null) {
        nextTryCount = (Integer) recordedTries + 1;
        if (nextTryCount >= site.getCycleRetryTimes()) {
            return null;
        }
    }

    retryPage.addTargetRequest(request.setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, nextTryCount));
    return retryPage;
}
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
String content = IOUtils.toString(httpResponse.getEntity().getContent(), charset);
Page page = new Page();