From 571061454a581510774d046023077710c916a577 Mon Sep 17 00:00:00 2001
From: "yihua.huang"
Date: Thu, 27 Feb 2014 23:54:30 +0800
Subject: [PATCH] #58 add CYCLE_TRIED_TIMES support to QueueScheduler and PriorityScheduler

---
Review notes (free text before the first "diff --git" line is not part of the patch):
 - LocalDuplicatedRemovedScheduler: the url set is now typed, the debug log uses an SLF4J
   placeholder instead of string concatenation, and the class Javadoc describes the retry bypass.
 - The new file still has 33 lines; the blob id in its "index" line was not recomputed.

 .../LocalDuplicatedRemovedScheduler.java      | 33 +++++++++++++++++++
 .../webmagic/scheduler/PriorityScheduler.java | 26 +++++----------
 .../webmagic/scheduler/QueueScheduler.java    | 20 ++---------
 3 files changed, 44 insertions(+), 35 deletions(-)
 create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/LocalDuplicatedRemovedScheduler.java

diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/LocalDuplicatedRemovedScheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/LocalDuplicatedRemovedScheduler.java
new file mode 100644
index 0000000..c4b08f3
--- /dev/null
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/LocalDuplicatedRemovedScheduler.java
@@ -0,0 +1,33 @@
+package us.codecraft.webmagic.scheduler;
+
+import com.google.common.collect.Sets;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import us.codecraft.webmagic.Request;
+import us.codecraft.webmagic.Task;
+
+import java.util.Set;
+import java.util.concurrent.ConcurrentHashMap;
+
+/**
+ * Base scheduler that de-duplicates pushed urls in a local in-memory set; requests that carry
+ * {@link Request#CYCLE_TRIED_TIMES} as an extra skip the check so they can be queued again.
+ *
+ * @author code4crafter@gmail.com
+ * @since 0.5.0
+ */
+public abstract class LocalDuplicatedRemovedScheduler implements Scheduler {
+
+    protected Logger logger = LoggerFactory.getLogger(getClass());
+
+    private final Set<String> urls = Sets.newSetFromMap(new ConcurrentHashMap<String, Boolean>());
+
+    @Override
+    public void push(Request request, Task task) {
+        logger.debug("push to queue {}", request.getUrl());
+        if (request.getExtra(Request.CYCLE_TRIED_TIMES) != null || urls.add(request.getUrl())) {
+            pushWhenNoDuplicate(request, task);
+        }
+    }
+
+    protected abstract void pushWhenNoDuplicate(Request request, Task task);
+}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/PriorityScheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/PriorityScheduler.java
index 9a3fda7..d68c22c 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/PriorityScheduler.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/PriorityScheduler.java
@@ -1,15 +1,11 @@
 package us.codecraft.webmagic.scheduler;
 
 import org.apache.http.annotation.ThreadSafe;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
 import us.codecraft.webmagic.Request;
 import us.codecraft.webmagic.Task;
 import us.codecraft.webmagic.utils.NumberUtils;
 
 import java.util.Comparator;
-import java.util.HashSet;
-import java.util.Set;
 import java.util.concurrent.BlockingQueue;
 import java.util.concurrent.LinkedBlockingQueue;
 import java.util.concurrent.PriorityBlockingQueue;
@@ -21,12 +17,10 @@ import java.util.concurrent.PriorityBlockingQueue;
  * @since 0.2.1
  */
 @ThreadSafe
-public class PriorityScheduler implements Scheduler {
+public class PriorityScheduler extends LocalDuplicatedRemovedScheduler {
 
     public static final int INITIAL_CAPACITY = 5;
 
-    private Logger logger = LoggerFactory.getLogger(getClass());
-
     private BlockingQueue noPriorityQueue = new LinkedBlockingQueue();
 
     private PriorityBlockingQueue priorityQueuePlus = new PriorityBlockingQueue(INITIAL_CAPACITY, new Comparator() {
@@ -43,19 +37,15 @@ public class PriorityScheduler implements Scheduler {
         }
     });
 
-    private Set urls = new HashSet();
 
     @Override
-    public synchronized void push(Request request, Task task) {
-        logger.debug("push to queue " + request.getUrl());
-        if (urls.add(request.getUrl())) {
-            if (request.getPriority() == 0) {
-                noPriorityQueue.add(request);
-            } else if (request.getPriority() > 0) {
-                priorityQueuePlus.put(request);
-            } else {
-                priorityQueueMinus.put(request);
-            }
+    public void pushWhenNoDuplicate(Request request, Task task) {
+        if (request.getPriority() == 0) {
+            noPriorityQueue.add(request);
+        } else if (request.getPriority() > 0) {
+            priorityQueuePlus.put(request);
+        } else {
+            priorityQueueMinus.put(request);
         }
     }
 
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java
index ebab857..ab288df 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java
@@ -1,13 +1,9 @@
 package us.codecraft.webmagic.scheduler;
 
 import org.apache.http.annotation.ThreadSafe;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
 import us.codecraft.webmagic.Request;
 import us.codecraft.webmagic.Task;
 
-import java.util.HashSet;
-import java.util.Set;
 import java.util.concurrent.BlockingQueue;
 import java.util.concurrent.LinkedBlockingQueue;
 
@@ -20,23 +16,13 @@ import java.util.concurrent.LinkedBlockingQueue;
  * @since 0.1.0
  */
 @ThreadSafe
-public class QueueScheduler implements Scheduler {
-
-    private Logger logger = LoggerFactory.getLogger(getClass());
+public class QueueScheduler extends LocalDuplicatedRemovedScheduler {
 
     private BlockingQueue queue = new LinkedBlockingQueue();
 
-    private Set urls = new HashSet();
-
     @Override
-    public synchronized void push(Request request, Task task) {
-        if (logger.isDebugEnabled()) {
-            logger.debug("push to queue " + request.getUrl());
-        }
-        if (urls.add(request.getUrl())) {
-            queue.add(request);
-        }
-
+    public void pushWhenNoDuplicate(Request request, Task task) {
+        queue.add(request);
     }
 
     @Override