bugfix: cycleRetry will not work when spawnUrl is set to false #62
parent
16ae066c62
commit
88b50d4182
|
@ -37,6 +37,8 @@ public class Page {
|
|||
|
||||
private int statusCode;
|
||||
|
||||
private boolean needCycleRetry;
|
||||
|
||||
private List<Request> targetRequests = new ArrayList<Request>();
|
||||
|
||||
public Page() {
|
||||
|
@ -165,6 +167,14 @@ public class Page {
|
|||
return request;
|
||||
}
|
||||
|
||||
public boolean isNeedCycleRetry() {
|
||||
return needCycleRetry;
|
||||
}
|
||||
|
||||
public void setNeedCycleRetry(boolean needCycleRetry) {
|
||||
this.needCycleRetry = needCycleRetry;
|
||||
}
|
||||
|
||||
public void setRequest(Request request) {
|
||||
this.request = request;
|
||||
this.resultItems.setRequest(request);
|
||||
|
|
|
@ -376,13 +376,13 @@ public class Spider implements Runnable, Task {
|
|||
return;
|
||||
}
|
||||
// for cycle retry
|
||||
if (page.getRawText() == null) {
|
||||
extractAndAddRequests(page);
|
||||
if (page.isNeedCycleRetry()) {
|
||||
extractAndAddRequests(page, true);
|
||||
sleep(site.getSleepTime());
|
||||
return;
|
||||
}
|
||||
pageProcessor.process(page);
|
||||
extractAndAddRequests(page);
|
||||
extractAndAddRequests(page, spawnUrl);
|
||||
if (!page.getResultItems().isSkip()) {
|
||||
for (Pipeline pipeline : pipelines) {
|
||||
pipeline.process(page.getResultItems(), this);
|
||||
|
@ -399,7 +399,7 @@ public class Spider implements Runnable, Task {
|
|||
}
|
||||
}
|
||||
|
||||
protected void extractAndAddRequests(Page page) {
|
||||
protected void extractAndAddRequests(Page page, boolean spawnUrl) {
|
||||
if (spawnUrl && CollectionUtils.isNotEmpty(page.getTargetRequests())) {
|
||||
for (Request request : page.getTargetRequests()) {
|
||||
addRequest(request);
|
||||
|
@ -588,8 +588,8 @@ public class Spider implements Runnable, Task {
|
|||
* @see Status
|
||||
* @since 0.4.1
|
||||
*/
|
||||
public Status getStatus(){
|
||||
return Status.fromValue(stat.get());
|
||||
public Status getStatus() {
|
||||
return Status.fromValue(stat.get());
|
||||
}
|
||||
|
||||
|
||||
|
@ -619,6 +619,7 @@ public class Spider implements Runnable, Task {
|
|||
|
||||
/**
|
||||
* Get thread count which is running
|
||||
*
|
||||
* @return thread count which is running
|
||||
* @since 0.4.1
|
||||
*/
|
||||
|
|
|
@ -0,0 +1,53 @@
|
|||
package us.codecraft.webmagic.downloader;
|
||||
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.Request;
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.selector.Html;
|
||||
|
||||
/**
|
||||
* Base class of downloader with some common methods.
|
||||
*
|
||||
* @author code4crafter@gmail.com
|
||||
* @since 0.5.0
|
||||
*/
|
||||
public abstract class AbstractDownloader implements Downloader {
|
||||
|
||||
/**
|
||||
* A simple method to download a url.
|
||||
*
|
||||
* @param url
|
||||
* @return html
|
||||
*/
|
||||
public Html download(String url) {
|
||||
return download(url, null);
|
||||
}
|
||||
|
||||
/**
|
||||
* A simple method to download a url.
|
||||
*
|
||||
* @param url
|
||||
* @return html
|
||||
*/
|
||||
public Html download(String url, String charset) {
|
||||
Page page = download(new Request(url), Site.me().setCharset(charset).toTask());
|
||||
return (Html) page.getHtml();
|
||||
}
|
||||
|
||||
protected Page addToCycleRetry(Request request, Site site) {
|
||||
Page page = new Page();
|
||||
Object cycleTriedTimesObject = request.getExtra(Request.CYCLE_TRIED_TIMES);
|
||||
if (cycleTriedTimesObject == null) {
|
||||
page.addTargetRequest(request.setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, 1));
|
||||
} else {
|
||||
int cycleTriedTimes = (Integer) cycleTriedTimesObject;
|
||||
cycleTriedTimes++;
|
||||
if (cycleTriedTimes >= site.getCycleRetryTimes()) {
|
||||
return null;
|
||||
}
|
||||
page.addTargetRequest(request.setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, cycleTriedTimes));
|
||||
}
|
||||
page.setNeedCycleRetry(true);
|
||||
return page;
|
||||
}
|
||||
}
|
|
@ -16,7 +16,6 @@ import us.codecraft.webmagic.Page;
|
|||
import us.codecraft.webmagic.Request;
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.Task;
|
||||
import us.codecraft.webmagic.selector.Html;
|
||||
import us.codecraft.webmagic.selector.PlainText;
|
||||
import us.codecraft.webmagic.utils.UrlUtils;
|
||||
|
||||
|
@ -33,7 +32,7 @@ import java.util.Set;
|
|||
* @since 0.1.0
|
||||
*/
|
||||
@ThreadSafe
|
||||
public class HttpClientDownloader implements Downloader {
|
||||
public class HttpClientDownloader extends AbstractDownloader {
|
||||
|
||||
private Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
|
@ -41,27 +40,6 @@ public class HttpClientDownloader implements Downloader {
|
|||
|
||||
private HttpClientGenerator httpClientGenerator = new HttpClientGenerator();
|
||||
|
||||
/**
|
||||
* A simple method to download a url.
|
||||
*
|
||||
* @param url
|
||||
* @return html
|
||||
*/
|
||||
public Html download(String url) {
|
||||
return download(url, null);
|
||||
}
|
||||
|
||||
/**
|
||||
* A simple method to download a url.
|
||||
*
|
||||
* @param url
|
||||
* @return html
|
||||
*/
|
||||
public Html download(String url, String charset) {
|
||||
Page page = download(new Request(url), Site.me().setCharset(charset).toTask());
|
||||
return (Html) page.getHtml();
|
||||
}
|
||||
|
||||
private CloseableHttpClient getHttpClient(Site site) {
|
||||
if (site == null) {
|
||||
return httpClientGenerator.getClient(null);
|
||||
|
@ -145,22 +123,6 @@ public class HttpClientDownloader implements Downloader {
|
|||
}
|
||||
}
|
||||
|
||||
private Page addToCycleRetry(Request request, Site site) {
|
||||
Page page = new Page();
|
||||
Object cycleTriedTimesObject = request.getExtra(Request.CYCLE_TRIED_TIMES);
|
||||
if (cycleTriedTimesObject == null) {
|
||||
page.addTargetRequest(request.setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, 1));
|
||||
} else {
|
||||
int cycleTriedTimes = (Integer) cycleTriedTimesObject;
|
||||
cycleTriedTimes++;
|
||||
if (cycleTriedTimes >= site.getCycleRetryTimes()) {
|
||||
return null;
|
||||
}
|
||||
page.addTargetRequest(request.setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, cycleTriedTimes));
|
||||
}
|
||||
return page;
|
||||
}
|
||||
|
||||
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
|
||||
String content = IOUtils.toString(httpResponse.getEntity().getContent(), charset);
|
||||
Page page = new Page();
|
||||
|
|
Loading…
Reference in New Issue