Fix inverted duplicate check in the duplicate-removing schedulers (#117)

master
yihua.huang 2014-04-29 20:32:06 +08:00
parent 22652c4521
commit e8d4a9be2b
5 changed files with 58 additions and 4 deletions

View File

@ -18,7 +18,7 @@ public abstract class DuplicatedRemoveScheduler implements Scheduler {
@Override
public void push(Request request, Task task) {
logger.trace("get a candidate url {}", request.getUrl());
if (isDuplicate(request, task) || shouldReserved(request)) {
if (!isDuplicate(request, task) || shouldReserved(request)) {
logger.debug("push to queue {}", request.getUrl());
pushWhenNoDuplicate(request, task);
}

View File

@ -24,7 +24,7 @@ public abstract class LocalDuplicatedRemoveScheduler extends DuplicatedRemoveSch
@Override
protected boolean isDuplicate(Request request, Task task) {
return urls.add(request.getUrl());
return !urls.add(request.getUrl());
}
@Override

View File

@ -46,7 +46,7 @@ public class RedisScheduler extends DuplicatedRemoveScheduler implements Monitor
protected boolean isDuplicate(Request request, Task task) {
Jedis jedis = pool.getResource();
try {
boolean isDuplicate = !jedis.sismember(getSetKey(task), request.getUrl());
boolean isDuplicate = jedis.sismember(getSetKey(task), request.getUrl());
if (!isDuplicate) {
jedis.sadd(getSetKey(task), request.getUrl());
}

View File

@ -0,0 +1,53 @@
package us.codecraft.webmagic.samples;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;
import java.util.List;
/**
 * Sample {@link PageProcessor} that extracts the rows of the question table from an
 * Amazon forum page and prints, for every data row, the raw row markup, the question
 * title, the answer count and the creation time to standard output.
 *
 * @author code4crafer@gmail.com
 */
public class AmanzonPageProcessor implements PageProcessor {

    /**
     * Selects all rows of the forum's data grid and prints the fields of each data row.
     *
     * @param page the downloaded page whose HTML is inspected
     */
    @Override
    public void process(Page page) {
        Html html = page.getHtml();
        List<String> questionList = html.xpath("//table[@class='tgCustomerCommunityCenterColumn']//div[@class='content']//table[@class='dataGrid']//tr").all();
        if (questionList == null) {
            return;
        }
        // Row 0 holds the column names, so iteration starts at index 1.
        // A list with zero or one entry therefore produces no output.
        for (int i = 1; i < questionList.size(); i++) {
            String row = questionList.get(i);
            System.out.println(row);

            // The row fragment is wrapped in a <table> element before parsing —
            // presumably so the <tr>/<td> markup survives HTML parsing; TODO confirm.
            Html tempHtml = Html.create("<table>" + row + "</table>");

            String comment = tempHtml.xpath("//td[@class='title']//a/text()").toString();
            System.out.println(comment);

            String answerNum = tempHtml.xpath("//td[@class='num']/text()").toString();
            System.out.println(answerNum);

            String createTime = tempHtml.xpath("//td[3]/text()").toString();
            System.out.println(createTime);
        }
    }

    /**
     * Supplies the site configuration for this processor.
     *
     * @return the instance produced by {@code Site.me()}, without further customisation
     */
    @Override
    public Site getSite() {
        return Site.me();
    }

    /**
     * Runs this processor against a single Amazon forum URL via {@code Spider.test}.
     *
     * @param args command-line arguments; not used
     */
    public static void main(String[] args) {
        Spider.create(new AmanzonPageProcessor()).test("http://www.amazon.de/forum/Fx27CUFD8S7LJ5D");
    }
}

View File

@ -4,6 +4,7 @@ import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.RedisScheduler;
import java.util.List;
@ -30,6 +31,6 @@ public class OschinaBlogPageProcesser implements PageProcessor {
}
public static void main(String[] args) {
Spider.create(new OschinaBlogPageProcesser()).run();
Spider.create(new OschinaBlogPageProcesser()).setScheduler(new RedisScheduler("localhost")).run();
}
}