diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/DuplicatedRemoveScheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/DuplicatedRemoveScheduler.java index 7b319b6..4b70b83 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/DuplicatedRemoveScheduler.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/DuplicatedRemoveScheduler.java @@ -18,7 +18,7 @@ public abstract class DuplicatedRemoveScheduler implements Scheduler { @Override public void push(Request request, Task task) { logger.trace("get a candidate url {}", request.getUrl()); - if (isDuplicate(request, task) || shouldReserved(request)) { + if (!isDuplicate(request, task) || shouldReserved(request)) { logger.debug("push to queue {}", request.getUrl()); pushWhenNoDuplicate(request, task); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/LocalDuplicatedRemoveScheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/LocalDuplicatedRemoveScheduler.java index c127c98..a1b0cab 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/LocalDuplicatedRemoveScheduler.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/LocalDuplicatedRemoveScheduler.java @@ -24,7 +24,7 @@ public abstract class LocalDuplicatedRemoveScheduler extends DuplicatedRemoveSch @Override protected boolean isDuplicate(Request request, Task task) { - return urls.add(request.getUrl()); + return !urls.add(request.getUrl()); } @Override diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java index dc2ee2e..338f5af 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java @@ -46,7 +46,7 @@ public class RedisScheduler extends DuplicatedRemoveScheduler implements Monitor protected boolean isDuplicate(Request request, Task task) { Jedis jedis = pool.getResource(); try { - boolean isDuplicate = !jedis.sismember(getSetKey(task), request.getUrl()); + boolean isDuplicate = jedis.sismember(getSetKey(task), request.getUrl()); if (!isDuplicate) { jedis.sadd(getSetKey(task), request.getUrl()); } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/AmanzonPageProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/AmanzonPageProcessor.java new file mode 100644 index 0000000..a980851 --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/AmanzonPageProcessor.java @@ -0,0 +1,53 @@ +package us.codecraft.webmagic.samples; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.processor.PageProcessor; +import us.codecraft.webmagic.selector.Html; + +import java.util.List; + +/** + * @author code4crafer@gmail.com + */ +public class AmanzonPageProcessor implements PageProcessor{ + public void process(Page page) { + + Html html = page.getHtml(); + List questionList = html.xpath("//table[@class='tgCustomerCommunityCenterColumn']//div[@class='content']//table[@class='dataGrid']//tr").all(); + + if(questionList != null && questionList.size() > 1) + { + //i=0是列名称,所以i从1开始 + for( int i = 1 ; i < questionList.size(); i++) + { + System.out.println(questionList.get(i)); + Html tempHtml = Html.create(""+questionList.get(i)+"
"); + String comment = tempHtml.xpath("//td[@class='title']//a/text()").toString(); + System.out.println(comment); + String answerNum = tempHtml.xpath("//td[@class='num']/text()").toString(); + System.out.println(answerNum); + String createTime = tempHtml.xpath("//td[3]/text()").toString(); + System.out.println(createTime); + + /* Document doc = Jsoup.parse(questionList.get(i)); + Html hmt = Html.create(questionList.get(i)) ; + String str = hmt.links().toString(); + String content = doc.getElementsByTag("a").text(); + String ss = doc.text();*/ + + } + } + + } + + @Override + public Site getSite() { + return Site.me(); + } + + public static void main(String[] args) { + Spider.create(new AmanzonPageProcessor()).test("http://www.amazon.de/forum/Fx27CUFD8S7LJ5D"); + } +} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java index ded1a5f..8055b36 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java @@ -4,6 +4,7 @@ import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; +import us.codecraft.webmagic.scheduler.RedisScheduler; import java.util.List; @@ -30,6 +31,6 @@ public class OschinaBlogPageProcesser implements PageProcessor { } public static void main(String[] args) { - Spider.create(new OschinaBlogPageProcesser()).run(); + Spider.create(new OschinaBlogPageProcesser()).setScheduler(new RedisScheduler("localhost")).run(); } }