diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/BloomFilterDuplicateRemover.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/BloomFilterDuplicateRemover.java
index 1c89e5c..6d5e597 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/BloomFilterDuplicateRemover.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/BloomFilterDuplicateRemover.java
@@ -46,14 +46,18 @@ public class BloomFilterDuplicateRemover implements DuplicateRemover {
 
     @Override
     public boolean isDuplicate(Request request, Task task) {
-        boolean isDuplicate = bloomFilter.mightContain(request.getUrl());
+        boolean isDuplicate = bloomFilter.mightContain(getUrl(request));
         if (!isDuplicate) {
-            bloomFilter.put(request.getUrl());
+            bloomFilter.put(getUrl(request));
             counter.incrementAndGet();
         }
         return isDuplicate;
     }
 
+    protected String getUrl(Request request) {
+        return request.getUrl();
+    }
+
     @Override
     public void resetDuplicateCheck(Task task) {
         rebuildBloomFilter();
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/HashSetDuplicateRemover.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/HashSetDuplicateRemover.java
index f8bcf26..1190762 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/HashSetDuplicateRemover.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/HashSetDuplicateRemover.java
@@ -16,7 +16,11 @@ public class HashSetDuplicateRemover implements DuplicateRemover {
 
     @Override
     public boolean isDuplicate(Request request, Task task) {
-        return !urls.add(request.getUrl());
+        return !urls.add(getUrl(request));
+    }
+
+    protected String getUrl(Request request) {
+        return request.getUrl();
     }
 
     @Override