Fix duplicate-removal error (#117)
parent
22652c4521
commit
e8d4a9be2b
|
@ -18,7 +18,7 @@ public abstract class DuplicatedRemoveScheduler implements Scheduler {
|
|||
@Override
|
||||
public void push(Request request, Task task) {
|
||||
logger.trace("get a candidate url {}", request.getUrl());
|
||||
if (isDuplicate(request, task) || shouldReserved(request)) {
|
||||
if (!isDuplicate(request, task) || shouldReserved(request)) {
|
||||
logger.debug("push to queue {}", request.getUrl());
|
||||
pushWhenNoDuplicate(request, task);
|
||||
}
|
||||
|
|
|
@ -24,7 +24,7 @@ public abstract class LocalDuplicatedRemoveScheduler extends DuplicatedRemoveSch
|
|||
|
||||
@Override
|
||||
protected boolean isDuplicate(Request request, Task task) {
|
||||
return urls.add(request.getUrl());
|
||||
return !urls.add(request.getUrl());
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -46,7 +46,7 @@ public class RedisScheduler extends DuplicatedRemoveScheduler implements Monitor
|
|||
protected boolean isDuplicate(Request request, Task task) {
|
||||
Jedis jedis = pool.getResource();
|
||||
try {
|
||||
boolean isDuplicate = !jedis.sismember(getSetKey(task), request.getUrl());
|
||||
boolean isDuplicate = jedis.sismember(getSetKey(task), request.getUrl());
|
||||
if (!isDuplicate) {
|
||||
jedis.sadd(getSetKey(task), request.getUrl());
|
||||
}
|
||||
|
|
|
@ -0,0 +1,53 @@
|
|||
package us.codecraft.webmagic.samples;
|
||||
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.Spider;
|
||||
import us.codecraft.webmagic.processor.PageProcessor;
|
||||
import us.codecraft.webmagic.selector.Html;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* @author code4crafer@gmail.com
|
||||
*/
|
||||
public class AmanzonPageProcessor implements PageProcessor{
|
||||
public void process(Page page) {
|
||||
|
||||
Html html = page.getHtml();
|
||||
List<String> questionList = html.xpath("//table[@class='tgCustomerCommunityCenterColumn']//div[@class='content']//table[@class='dataGrid']//tr").all();
|
||||
|
||||
if(questionList != null && questionList.size() > 1)
|
||||
{
|
||||
//i=0是列名称,所以i从1开始
|
||||
for( int i = 1 ; i < questionList.size(); i++)
|
||||
{
|
||||
System.out.println(questionList.get(i));
|
||||
Html tempHtml = Html.create("<table>"+questionList.get(i)+"</table>");
|
||||
String comment = tempHtml.xpath("//td[@class='title']//a/text()").toString();
|
||||
System.out.println(comment);
|
||||
String answerNum = tempHtml.xpath("//td[@class='num']/text()").toString();
|
||||
System.out.println(answerNum);
|
||||
String createTime = tempHtml.xpath("//td[3]/text()").toString();
|
||||
System.out.println(createTime);
|
||||
|
||||
/* Document doc = Jsoup.parse(questionList.get(i));
|
||||
Html hmt = Html.create(questionList.get(i)) ;
|
||||
String str = hmt.links().toString();
|
||||
String content = doc.getElementsByTag("a").text();
|
||||
String ss = doc.text();*/
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public Site getSite() {
|
||||
return Site.me();
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
Spider.create(new AmanzonPageProcessor()).test("http://www.amazon.de/forum/Fx27CUFD8S7LJ5D");
|
||||
}
|
||||
}
|
|
@ -4,6 +4,7 @@ import us.codecraft.webmagic.Page;
|
|||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.Spider;
|
||||
import us.codecraft.webmagic.processor.PageProcessor;
|
||||
import us.codecraft.webmagic.scheduler.RedisScheduler;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
|
@ -30,6 +31,6 @@ public class OschinaBlogPageProcesser implements PageProcessor {
|
|||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
Spider.create(new OschinaBlogPageProcesser()).run();
|
||||
Spider.create(new OschinaBlogPageProcesser()).setScheduler(new RedisScheduler("localhost")).run();
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue