diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/BloomFilterDuplicateRemover.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/BloomFilterDuplicateRemover.java index 34d6f7c..1c89e5c 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/BloomFilterDuplicateRemover.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/BloomFilterDuplicateRemover.java @@ -23,9 +23,14 @@ public class BloomFilterDuplicateRemover implements DuplicateRemover { private AtomicInteger counter; public BloomFilterDuplicateRemover(int expectedInsertions) { - this(expectedInsertions, 0.03); + this(expectedInsertions, 0.01); } + /** + * + * @param expectedInsertions the number of expected insertions to the constructed bloom filter + * @param fpp the desired false positive probability (must be positive and less than 1.0) + */ public BloomFilterDuplicateRemover(int expectedInsertions, double fpp) { this.expectedInsertions = expectedInsertions; this.fpp = fpp; diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/scheduler/BloomFilterDuplicateRemoverTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/scheduler/BloomFilterDuplicateRemoverTest.java index a82d74e..f8e0b9a 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/scheduler/BloomFilterDuplicateRemoverTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/scheduler/BloomFilterDuplicateRemoverTest.java @@ -1,8 +1,11 @@ package us.codecraft.webmagic.scheduler; +import org.junit.Ignore; import org.junit.Test; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.scheduler.component.BloomFilterDuplicateRemover; +import us.codecraft.webmagic.scheduler.component.DuplicateRemover; +import us.codecraft.webmagic.scheduler.component.HashSetDuplicateRemover; import static org.assertj.core.api.Assertions.assertThat; @@ -24,4 +27,54 @@ public class BloomFilterDuplicateRemoverTest { 
assertThat(isDuplicate).isTrue(); } + + @Ignore("long time") + @Test + public void testMemory() throws Exception { + int times = 5000000; + DuplicateRemover duplicateRemover = new BloomFilterDuplicateRemover(times,0.005); + long freeMemory = Runtime.getRuntime().freeMemory(); + long time = System.currentTimeMillis(); + for (int i = 0; i < times; i++) { + duplicateRemover.isDuplicate(new Request(String.valueOf(i)), null); + } + System.out.println("Time used by bloomfilter:" + (System.currentTimeMillis() - time)); + System.out.println("Memory used by bloomfilter:" + (freeMemory - Runtime.getRuntime().freeMemory())); + + duplicateRemover = new HashSetDuplicateRemover(); + System.gc(); + freeMemory = Runtime.getRuntime().freeMemory(); + time = System.currentTimeMillis(); + for (int i = 0; i < times; i++) { + duplicateRemover.isDuplicate(new Request(String.valueOf(i)), null); + } + System.out.println("Time used by hashset:" + (System.currentTimeMillis() - time)); + System.out.println("Memory used by hashset:" + (freeMemory - Runtime.getRuntime().freeMemory())); + } + + @Ignore("long time") + @Test + public void testMissHit() throws Exception { + int times = 5000000; + DuplicateRemover duplicateRemover = new BloomFilterDuplicateRemover(times, 0.01); + int right = 0; + int wrong = 0; + int missCheck = 0; + for (int i = 0; i < times; i++) { + boolean duplicate = duplicateRemover.isDuplicate(new Request(String.valueOf(i)), null); + if (duplicate) { + wrong++; + } else { + right++; + } + duplicate = duplicateRemover.isDuplicate(new Request(String.valueOf(i)), null); + if (!duplicate) { + missCheck++; + } + } + + System.out.println("Right count: " + right + " Wrong count: " + wrong + " Miss check: " + missCheck); + } + + }