From 01aec7e1ab60c5e18cc9c00704827d070bbc6492 Mon Sep 17 00:00:00 2001
From: "yihua.huang"
Date: Fri, 2 May 2014 23:23:23 +0800
Subject: [PATCH] extension point of geturl #118

---
 .../scheduler/component/BloomFilterDuplicateRemover.java | 8 ++++++--
 .../scheduler/component/HashSetDuplicateRemover.java | 6 +++++-
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/BloomFilterDuplicateRemover.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/BloomFilterDuplicateRemover.java
index 1c89e5c..6d5e597 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/BloomFilterDuplicateRemover.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/BloomFilterDuplicateRemover.java
@@ -46,14 +46,18 @@ public class BloomFilterDuplicateRemover implements DuplicateRemover {
 
     @Override
     public boolean isDuplicate(Request request, Task task) {
-        boolean isDuplicate = bloomFilter.mightContain(request.getUrl());
+        boolean isDuplicate = bloomFilter.mightContain(getUrl(request));
         if (!isDuplicate) {
-            bloomFilter.put(request.getUrl());
+            bloomFilter.put(getUrl(request));
             counter.incrementAndGet();
         }
         return isDuplicate;
     }
 
+    protected String getUrl(Request request) {
+        return request.getUrl();
+    }
+
     @Override
     public void resetDuplicateCheck(Task task) {
         rebuildBloomFilter();
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/HashSetDuplicateRemover.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/HashSetDuplicateRemover.java
index f8bcf26..1190762 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/HashSetDuplicateRemover.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/HashSetDuplicateRemover.java
@@ -16,7 +16,11 @@ public class HashSetDuplicateRemover implements DuplicateRemover {
 
     @Override
     public boolean isDuplicate(Request request, Task task) {
-        return !urls.add(request.getUrl());
+        return !urls.add(getUrl(request));
+    }
+
+    protected String getUrl(Request request) {
+        return request.getUrl();
     }
 
     @Override