bugfix: cycle retry will not work when spawnUrl is set to false #62
parent
16ae066c62
commit
88b50d4182
|
@ -37,6 +37,8 @@ public class Page {
|
||||||
|
|
||||||
private int statusCode;
|
private int statusCode;
|
||||||
|
|
||||||
|
private boolean needCycleRetry;
|
||||||
|
|
||||||
private List<Request> targetRequests = new ArrayList<Request>();
|
private List<Request> targetRequests = new ArrayList<Request>();
|
||||||
|
|
||||||
public Page() {
|
public Page() {
|
||||||
|
@ -165,6 +167,14 @@ public class Page {
|
||||||
return request;
|
return request;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public boolean isNeedCycleRetry() {
|
||||||
|
return needCycleRetry;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setNeedCycleRetry(boolean needCycleRetry) {
|
||||||
|
this.needCycleRetry = needCycleRetry;
|
||||||
|
}
|
||||||
|
|
||||||
public void setRequest(Request request) {
|
public void setRequest(Request request) {
|
||||||
this.request = request;
|
this.request = request;
|
||||||
this.resultItems.setRequest(request);
|
this.resultItems.setRequest(request);
|
||||||
|
|
|
@ -376,13 +376,13 @@ public class Spider implements Runnable, Task {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
// for cycle retry
|
// for cycle retry
|
||||||
if (page.getRawText() == null) {
|
if (page.isNeedCycleRetry()) {
|
||||||
extractAndAddRequests(page);
|
extractAndAddRequests(page, true);
|
||||||
sleep(site.getSleepTime());
|
sleep(site.getSleepTime());
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
pageProcessor.process(page);
|
pageProcessor.process(page);
|
||||||
extractAndAddRequests(page);
|
extractAndAddRequests(page, spawnUrl);
|
||||||
if (!page.getResultItems().isSkip()) {
|
if (!page.getResultItems().isSkip()) {
|
||||||
for (Pipeline pipeline : pipelines) {
|
for (Pipeline pipeline : pipelines) {
|
||||||
pipeline.process(page.getResultItems(), this);
|
pipeline.process(page.getResultItems(), this);
|
||||||
|
@ -399,7 +399,7 @@ public class Spider implements Runnable, Task {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
protected void extractAndAddRequests(Page page) {
|
protected void extractAndAddRequests(Page page, boolean spawnUrl) {
|
||||||
if (spawnUrl && CollectionUtils.isNotEmpty(page.getTargetRequests())) {
|
if (spawnUrl && CollectionUtils.isNotEmpty(page.getTargetRequests())) {
|
||||||
for (Request request : page.getTargetRequests()) {
|
for (Request request : page.getTargetRequests()) {
|
||||||
addRequest(request);
|
addRequest(request);
|
||||||
|
@ -588,8 +588,8 @@ public class Spider implements Runnable, Task {
|
||||||
* @see Status
|
* @see Status
|
||||||
* @since 0.4.1
|
* @since 0.4.1
|
||||||
*/
|
*/
|
||||||
public Status getStatus(){
|
public Status getStatus() {
|
||||||
return Status.fromValue(stat.get());
|
return Status.fromValue(stat.get());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -619,6 +619,7 @@ public class Spider implements Runnable, Task {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Get thread count which is running
|
* Get thread count which is running
|
||||||
|
*
|
||||||
* @return thread count which is running
|
* @return thread count which is running
|
||||||
* @since 0.4.1
|
* @since 0.4.1
|
||||||
*/
|
*/
|
||||||
|
|
|
@ -0,0 +1,53 @@
|
||||||
|
package us.codecraft.webmagic.downloader;
|
||||||
|
|
||||||
|
import us.codecraft.webmagic.Page;
|
||||||
|
import us.codecraft.webmagic.Request;
|
||||||
|
import us.codecraft.webmagic.Site;
|
||||||
|
import us.codecraft.webmagic.selector.Html;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Base class of downloader with some common methods.
|
||||||
|
*
|
||||||
|
* @author code4crafter@gmail.com
|
||||||
|
* @since 0.5.0
|
||||||
|
*/
|
||||||
|
public abstract class AbstractDownloader implements Downloader {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A simple method to download a url.
|
||||||
|
*
|
||||||
|
* @param url
|
||||||
|
* @return html
|
||||||
|
*/
|
||||||
|
public Html download(String url) {
|
||||||
|
return download(url, null);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A simple method to download a url using the given charset.
|
||||||
|
*
|
||||||
|
* @param url
|
||||||
|
* @return html
|
||||||
|
*/
|
||||||
|
public Html download(String url, String charset) {
|
||||||
|
Page page = download(new Request(url), Site.me().setCharset(charset).toTask());
|
||||||
|
return (Html) page.getHtml();
|
||||||
|
}
|
||||||
|
|
||||||
|
protected Page addToCycleRetry(Request request, Site site) {
|
||||||
|
Page page = new Page();
|
||||||
|
Object cycleTriedTimesObject = request.getExtra(Request.CYCLE_TRIED_TIMES);
|
||||||
|
if (cycleTriedTimesObject == null) {
|
||||||
|
page.addTargetRequest(request.setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, 1));
|
||||||
|
} else {
|
||||||
|
int cycleTriedTimes = (Integer) cycleTriedTimesObject;
|
||||||
|
cycleTriedTimes++;
|
||||||
|
if (cycleTriedTimes >= site.getCycleRetryTimes()) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
page.addTargetRequest(request.setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, cycleTriedTimes));
|
||||||
|
}
|
||||||
|
page.setNeedCycleRetry(true);
|
||||||
|
return page;
|
||||||
|
}
|
||||||
|
}
|
|
@ -16,7 +16,6 @@ import us.codecraft.webmagic.Page;
|
||||||
import us.codecraft.webmagic.Request;
|
import us.codecraft.webmagic.Request;
|
||||||
import us.codecraft.webmagic.Site;
|
import us.codecraft.webmagic.Site;
|
||||||
import us.codecraft.webmagic.Task;
|
import us.codecraft.webmagic.Task;
|
||||||
import us.codecraft.webmagic.selector.Html;
|
|
||||||
import us.codecraft.webmagic.selector.PlainText;
|
import us.codecraft.webmagic.selector.PlainText;
|
||||||
import us.codecraft.webmagic.utils.UrlUtils;
|
import us.codecraft.webmagic.utils.UrlUtils;
|
||||||
|
|
||||||
|
@ -33,7 +32,7 @@ import java.util.Set;
|
||||||
* @since 0.1.0
|
* @since 0.1.0
|
||||||
*/
|
*/
|
||||||
@ThreadSafe
|
@ThreadSafe
|
||||||
public class HttpClientDownloader implements Downloader {
|
public class HttpClientDownloader extends AbstractDownloader {
|
||||||
|
|
||||||
private Logger logger = LoggerFactory.getLogger(getClass());
|
private Logger logger = LoggerFactory.getLogger(getClass());
|
||||||
|
|
||||||
|
@ -41,27 +40,6 @@ public class HttpClientDownloader implements Downloader {
|
||||||
|
|
||||||
private HttpClientGenerator httpClientGenerator = new HttpClientGenerator();
|
private HttpClientGenerator httpClientGenerator = new HttpClientGenerator();
|
||||||
|
|
||||||
/**
|
|
||||||
* A simple method to download a url.
|
|
||||||
*
|
|
||||||
* @param url
|
|
||||||
* @return html
|
|
||||||
*/
|
|
||||||
public Html download(String url) {
|
|
||||||
return download(url, null);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* A simple method to download a url.
|
|
||||||
*
|
|
||||||
* @param url
|
|
||||||
* @return html
|
|
||||||
*/
|
|
||||||
public Html download(String url, String charset) {
|
|
||||||
Page page = download(new Request(url), Site.me().setCharset(charset).toTask());
|
|
||||||
return (Html) page.getHtml();
|
|
||||||
}
|
|
||||||
|
|
||||||
private CloseableHttpClient getHttpClient(Site site) {
|
private CloseableHttpClient getHttpClient(Site site) {
|
||||||
if (site == null) {
|
if (site == null) {
|
||||||
return httpClientGenerator.getClient(null);
|
return httpClientGenerator.getClient(null);
|
||||||
|
@ -145,22 +123,6 @@ public class HttpClientDownloader implements Downloader {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private Page addToCycleRetry(Request request, Site site) {
|
|
||||||
Page page = new Page();
|
|
||||||
Object cycleTriedTimesObject = request.getExtra(Request.CYCLE_TRIED_TIMES);
|
|
||||||
if (cycleTriedTimesObject == null) {
|
|
||||||
page.addTargetRequest(request.setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, 1));
|
|
||||||
} else {
|
|
||||||
int cycleTriedTimes = (Integer) cycleTriedTimesObject;
|
|
||||||
cycleTriedTimes++;
|
|
||||||
if (cycleTriedTimes >= site.getCycleRetryTimes()) {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
page.addTargetRequest(request.setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, cycleTriedTimes));
|
|
||||||
}
|
|
||||||
return page;
|
|
||||||
}
|
|
||||||
|
|
||||||
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
|
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
|
||||||
String content = IOUtils.toString(httpResponse.getEntity().getContent(), charset);
|
String content = IOUtils.toString(httpResponse.getEntity().getContent(), charset);
|
||||||
Page page = new Page();
|
Page page = new Page();
|
||||||
|
|
Loading…
Reference in New Issue