add retry
parent
a1ef2523cc
commit
d141541ef3
|
@ -17,6 +17,8 @@ public class Request implements Serializable {
|
||||||
|
|
||||||
private static final long serialVersionUID = 2062192774891352043L;
|
private static final long serialVersionUID = 2062192774891352043L;
|
||||||
|
|
||||||
|
public static final String CYCLE_TRIED_TIMES = "_cycle_tried_times";
|
||||||
|
|
||||||
private String url;
|
private String url;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -30,6 +30,8 @@ public class Site {
|
||||||
|
|
||||||
private int retryTimes = 0;
|
private int retryTimes = 0;
|
||||||
|
|
||||||
|
private int cycleRetryTimes = 0;
|
||||||
|
|
||||||
private static final Set<Integer> DEFAULT_STATUS_CODE_SET = new HashSet<Integer>();
|
private static final Set<Integer> DEFAULT_STATUS_CODE_SET = new HashSet<Integer>();
|
||||||
|
|
||||||
private Set<Integer> acceptStatCode = DEFAULT_STATUS_CODE_SET;
|
private Set<Integer> acceptStatCode = DEFAULT_STATUS_CODE_SET;
|
||||||
|
@ -200,7 +202,7 @@ public class Site {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Get retry times when download fail, 0 by default.<br>
|
* Get the number of immediate retries when a download fails, 0 by default.<br>
|
||||||
*
|
*
|
||||||
* @return retry times when download fail
|
* @return retry times when download fail
|
||||||
*/
|
*/
|
||||||
|
@ -218,6 +220,25 @@ public class Site {
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* When cycleRetryTimes is more than 0, a request whose download failed is added back to the scheduler and downloaded again. <br>
|
||||||
|
*
|
||||||
|
* @return cycle retry times when a download fails
|
||||||
|
*/
|
||||||
|
public int getCycleRetryTimes() {
|
||||||
|
return cycleRetryTimes;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Set the cycle retry times used when a download fails, 0 by default. Only works with RedisScheduler. <br>
|
||||||
|
*
|
||||||
|
* @return this
|
||||||
|
*/
|
||||||
|
public Site setCycleRetryTimes(int cycleRetryTimes) {
|
||||||
|
this.cycleRetryTimes = cycleRetryTimes;
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean equals(Object o) {
|
public boolean equals(Object o) {
|
||||||
if (this == o) return true;
|
if (this == o) return true;
|
||||||
|
|
|
@ -52,7 +52,7 @@ public class HttpClientDownloader implements Downloader {
|
||||||
* @param url
|
* @param url
|
||||||
* @return html
|
* @return html
|
||||||
*/
|
*/
|
||||||
public Html download(String url,String charset) {
|
public Html download(String url, String charset) {
|
||||||
Page page = download(new Request(url), Site.me().setCharset(charset).toTask());
|
Page page = download(new Request(url), Site.me().setCharset(charset).toTask());
|
||||||
return (Html) page.getHtml();
|
return (Html) page.getHtml();
|
||||||
}
|
}
|
||||||
|
@ -90,6 +90,21 @@ public class HttpClientDownloader implements Downloader {
|
||||||
|
|
||||||
if (tried > retryTimes) {
|
if (tried > retryTimes) {
|
||||||
logger.warn("download page " + request.getUrl() + " error", e);
|
logger.warn("download page " + request.getUrl() + " error", e);
|
||||||
|
if (site.getCycleRetryTimes() > 0) {
|
||||||
|
Page page = new Page();
|
||||||
|
Object cycleTriedTimesObject = request.getExtra(Request.CYCLE_TRIED_TIMES);
|
||||||
|
if (cycleTriedTimesObject == null) {
|
||||||
|
page.addTargetRequest(request.setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, 1));
|
||||||
|
} else {
|
||||||
|
int cycleTriedTimes = (Integer) cycleTriedTimesObject;
|
||||||
|
cycleTriedTimes++;
|
||||||
|
if (cycleTriedTimes >= site.getCycleRetryTimes()) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
// Store the incremented counter (not the constant 1) so repeated failures can reach the cycleRetryTimes limit instead of re-queuing forever.
page.addTargetRequest(request.setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, cycleTriedTimes));
|
||||||
|
}
|
||||||
|
return page;
|
||||||
|
}
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
logger.info("download page " + request.getUrl() + " error, retry the " + tried + " time!");
|
logger.info("download page " + request.getUrl() + " error, retry the " + tried + " time!");
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
package us.codecraft.webmagic.selector;
|
package us.codecraft.webmagic.selector;
|
||||||
|
|
||||||
|
import org.apache.log4j.Logger;
|
||||||
import org.jsoup.Jsoup;
|
import org.jsoup.Jsoup;
|
||||||
import org.jsoup.nodes.Document;
|
import org.jsoup.nodes.Document;
|
||||||
import us.codecraft.webmagic.utils.EnvironmentUtil;
|
import us.codecraft.webmagic.utils.EnvironmentUtil;
|
||||||
|
@ -15,6 +16,8 @@ import java.util.List;
|
||||||
*/
|
*/
|
||||||
public class Html extends PlainText {
|
public class Html extends PlainText {
|
||||||
|
|
||||||
|
private Logger logger = Logger.getLogger(getClass());
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Store parsed document for better performance when only one text exist.
|
* Store parsed document for better performance when only one text exist.
|
||||||
*/
|
*/
|
||||||
|
@ -26,7 +29,11 @@ public class Html extends PlainText {
|
||||||
|
|
||||||
public Html(String text) {
|
public Html(String text) {
|
||||||
super(text);
|
super(text);
|
||||||
this.document = Jsoup.parse(text);
|
try {
|
||||||
|
this.document = Jsoup.parse(text);
|
||||||
|
} catch (Exception e) {
|
||||||
|
logger.warn("parse document error ", e);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public Html(Document document) {
|
public Html(Document document) {
|
||||||
|
@ -108,7 +115,7 @@ public class Html extends PlainText {
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getText() {
|
public String getText() {
|
||||||
if (strings!=null&&strings.size()>0){
|
if (strings != null && strings.size() > 0) {
|
||||||
return strings.get(0);
|
return strings.get(0);
|
||||||
}
|
}
|
||||||
return document.html();
|
return document.html();
|
||||||
|
|
|
@ -36,9 +36,11 @@ public class RedisScheduler implements Scheduler {
|
||||||
public synchronized void push(Request request, Task task) {
|
public synchronized void push(Request request, Task task) {
|
||||||
Jedis jedis = pool.getResource();
|
Jedis jedis = pool.getResource();
|
||||||
try {
|
try {
|
||||||
//使用Set进行url去重
|
// if cycleRetriedTimes is set, allow duplicated.
|
||||||
if (!jedis.sismember(SET_PREFIX + task.getUUID(), request.getUrl())) {
|
Object cycleRetriedTimes = request.getExtra(Request.CYCLE_TRIED_TIMES);
|
||||||
//使用List保存队列
|
// use set to remove duplicate url
|
||||||
|
if (cycleRetriedTimes != null || !jedis.sismember(SET_PREFIX + task.getUUID(), request.getUrl())) {
|
||||||
|
// use list to store queue
|
||||||
jedis.rpush(QUEUE_PREFIX + task.getUUID(), request.getUrl());
|
jedis.rpush(QUEUE_PREFIX + task.getUUID(), request.getUrl());
|
||||||
jedis.sadd(SET_PREFIX + task.getUUID(), request.getUrl());
|
jedis.sadd(SET_PREFIX + task.getUUID(), request.getUrl());
|
||||||
if (request.getExtras() != null) {
|
if (request.getExtras() != null) {
|
||||||
|
|
Loading…
Reference in New Issue