From 4f22f1210e58977f94a9161b1c63206358cdac73 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 2 May 2014 20:38:49 +0800 Subject: [PATCH] some bug fix #118 --- .../webmagic/scheduler/DuplicateRemovedScheduler.java | 3 ++- .../component/BloomFilterDuplicateRemover.java | 2 +- .../scheduler/BloomFilterDuplicateRemoverTest.java | 4 ++-- .../webmagic/samples/OschinaBlogPageProcesser.java | 11 ++++++++--- 4 files changed, 13 insertions(+), 7 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/DuplicateRemovedScheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/DuplicateRemovedScheduler.java index 9319912..b14d8ee 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/DuplicateRemovedScheduler.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/DuplicateRemovedScheduler.java @@ -5,6 +5,7 @@ import org.slf4j.LoggerFactory; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.scheduler.component.DuplicateRemover; +import us.codecraft.webmagic.scheduler.component.HashSetDuplicateRemover; /** * Remove duplicate urls and only push urls which are not duplicate.

@@ -16,7 +17,7 @@ public abstract class DuplicateRemovedScheduler implements Scheduler { protected Logger logger = LoggerFactory.getLogger(getClass()); - private DuplicateRemover duplicatedRemover; + private DuplicateRemover duplicatedRemover = new HashSetDuplicateRemover(); public DuplicateRemover getDuplicateRemover() { return duplicatedRemover; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/BloomFilterDuplicateRemover.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/BloomFilterDuplicateRemover.java index d16c3ad..34d6f7c 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/BloomFilterDuplicateRemover.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/BloomFilterDuplicateRemover.java @@ -43,7 +43,7 @@ public class BloomFilterDuplicateRemover implements DuplicateRemover { public boolean isDuplicate(Request request, Task task) { boolean isDuplicate = bloomFilter.mightContain(request.getUrl()); if (!isDuplicate) { - bloomFilter.apply(request.getUrl()); + bloomFilter.put(request.getUrl()); counter.incrementAndGet(); } return isDuplicate; diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/scheduler/BloomFilterDuplicateRemoverTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/scheduler/BloomFilterDuplicateRemoverTest.java index b6fc5e0..a82d74e 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/scheduler/BloomFilterDuplicateRemoverTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/scheduler/BloomFilterDuplicateRemoverTest.java @@ -17,11 +17,11 @@ public class BloomFilterDuplicateRemoverTest { boolean isDuplicate = bloomFilterDuplicateRemover.isDuplicate(new Request("a"), null); assertThat(isDuplicate).isFalse(); isDuplicate = bloomFilterDuplicateRemover.isDuplicate(new Request("a"), null); - assertThat(isDuplicate); + assertThat(isDuplicate).isTrue(); isDuplicate = bloomFilterDuplicateRemover.isDuplicate(new Request("b"), null); assertThat(isDuplicate).isFalse(); isDuplicate = bloomFilterDuplicateRemover.isDuplicate(new Request("b"), null); - assertThat(isDuplicate); + assertThat(isDuplicate).isTrue(); } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java index 8055b36..2e3996c 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java @@ -3,9 +3,12 @@ package us.codecraft.webmagic.samples; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.monitor.SpiderMonitor; import us.codecraft.webmagic.processor.PageProcessor; -import us.codecraft.webmagic.scheduler.RedisScheduler; +import us.codecraft.webmagic.scheduler.QueueScheduler; +import us.codecraft.webmagic.scheduler.component.BloomFilterDuplicateRemover; +import javax.management.JMException; import java.util.List; /** @@ -30,7 +33,9 @@ public class OschinaBlogPageProcesser implements PageProcessor { } - public static void main(String[] args) { - Spider.create(new OschinaBlogPageProcesser()).setScheduler(new RedisScheduler("localhost")).run(); + public static void main(String[] args) throws JMException { + Spider spider = Spider.create(new OschinaBlogPageProcesser()).setScheduler(new QueueScheduler().setDuplicateRemover(new BloomFilterDuplicateRemover(2000))); + SpiderMonitor.instance().register(spider); + spider.run(); } }