diff --git a/pom.xml b/pom.xml index b692f87..a0c9993 100644 --- a/pom.xml +++ b/pom.xml @@ -8,7 +8,7 @@ 7 us.codecraft - 0.2.1-SNAPSHOT + 0.2.1 4.0.0 pom webmagic-parent @@ -24,6 +24,11 @@ Yihua huang code4crafer@gmail.com + + yuany + Ligang Yao + ligang.yao@answers.com + scm:git:git@github.com:code4craft/webmagic.git diff --git a/release-note.md b/release-note.md index 9582302..7e8f958 100755 --- a/release-note.md +++ b/release-note.md @@ -1,5 +1,15 @@ Release Notes ---- +*2012-8-20* `version:0.2.1` + +ComboExtractor support for annotation. + +Request priority support (using `PriorityScheduler`). + +Complete some I18n work (comments and documents). + + + *2012-8-9* `version:0.2.0` 此次更新的主题是"方便"(之前的主题是"灵活")。 diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index c8d1e73..28d5507 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -5,7 +5,7 @@ us.codecraft webmagic-parent - 0.2.1-SNAPSHOT + 0.2.1 4.0.0 diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java index fd7f60c..bbea59f 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java @@ -31,7 +31,7 @@ public class Request implements Serializable { * But no scheduler in webmagic supporting priority now (: */ @Experimental - private double priority; + private long priority; public Request() { } @@ -40,7 +40,7 @@ public class Request implements Serializable { this.url = url; } - public double getPriority() { + public long getPriority() { return priority; } @@ -53,7 +53,7 @@ public class Request implements Serializable { * @return this */ @Experimental - public Request setPriority(double priority) { + public Request setPriority(long priority) { this.priority = priority; return this; } diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index 8a5059d..37322f6 100644 --- a/webmagic-extension/pom.xml +++ 
b/webmagic-extension/pom.xml @@ -5,7 +5,7 @@ us.codecraft webmagic-parent - 0.2.1-SNAPSHOT + 0.2.1 4.0.0 diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/PriorityScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/PriorityScheduler.java new file mode 100644 index 0000000..7ce44f0 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/PriorityScheduler.java @@ -0,0 +1,74 @@ +package us.codecraft.webmagic.scheduler; + +import org.apache.http.annotation.ThreadSafe; +import org.apache.log4j.Logger; +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Task; + +import java.util.Comparator; +import java.util.HashSet; +import java.util.Set; +import java.util.concurrent.BlockingQueue; +import java.util.concurrent.LinkedBlockingQueue; +import java.util.concurrent.PriorityBlockingQueue; + +/** + * Priority scheduler. Requests with a higher priority are polled earlier.
+ * + * @author code4crafter@gmail.com
+ * @since 0.2.1 + */ +@ThreadSafe +public class PriorityScheduler implements Scheduler { + + public static final int INITIAL_CAPACITY = 5; + + private Logger logger = Logger.getLogger(getClass()); + + private BlockingQueue noPriorityQueue = new LinkedBlockingQueue(); + + private PriorityBlockingQueue priorityQueuePlus = new PriorityBlockingQueue(INITIAL_CAPACITY, new Comparator() { + @Override + public int compare(Request o1, Request o2) { + return -(new Long(o1.getPriority()).compareTo(o2.getPriority())); + } + }); + + private PriorityBlockingQueue priorityQueueMinus = new PriorityBlockingQueue(INITIAL_CAPACITY, new Comparator() { + @Override + public int compare(Request o1, Request o2) { + return -(new Long(o1.getPriority()).compareTo(o2.getPriority())); + } + }); + + private Set urls = new HashSet(); + + @Override + public synchronized void push(Request request, Task task) { + if (logger.isDebugEnabled()) { + logger.debug("push to queue " + request.getUrl()); + } + if (urls.add(request.getUrl())) { + if (request.getPriority() == 0) { + noPriorityQueue.add(request); + } else if (request.getPriority() > 0) { + priorityQueuePlus.put(request); + } else { + priorityQueueMinus.put(request); + } + } + } + + @Override + public synchronized Request poll(Task task) { + Request poll = priorityQueuePlus.poll(); + if (poll != null) { + return poll; + } + poll = noPriorityQueue.poll(); + if (poll != null) { + return poll; + } + return priorityQueueMinus.poll(); + } +} diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/scheduler/PrioritySchedulerTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/scheduler/PrioritySchedulerTest.java new file mode 100644 index 0000000..700f454 --- /dev/null +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/scheduler/PrioritySchedulerTest.java @@ -0,0 +1,75 @@ +package us.codecraft.webmagic.scheduler; + +import junit.framework.Assert; +import org.junit.Test; +import us.codecraft.webmagic.Request; 
+import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Task; + +/** + * @author code4crafter@gmail.com
+ */ +public class PrioritySchedulerTest { + + private PriorityScheduler priorityScheduler = new PriorityScheduler(); + + private Task task = new Task() { + @Override + public String getUUID() { + return "1"; + } + + @Override + public Site getSite() { + return null; + } + }; + + @Test + public void testDifferentPriority() { + Request request = new Request("a"); + request.setPriority(100); + priorityScheduler.push(request,task); + + request = new Request("b"); + request.setPriority(900); + priorityScheduler.push(request,task); + + request = new Request("c"); + priorityScheduler.push(request,task); + + request = new Request("d"); + request.setPriority(-900); + priorityScheduler.push(request,task); + + Request poll = priorityScheduler.poll(task); + Assert.assertEquals("b",poll.getUrl()); + poll = priorityScheduler.poll(task); + Assert.assertEquals("a",poll.getUrl()); + poll = priorityScheduler.poll(task); + Assert.assertEquals("c",poll.getUrl()); + poll = priorityScheduler.poll(task); + Assert.assertEquals("d",poll.getUrl()); + } + + @Test + public void testNoPriority() { + Request request = new Request("a"); + priorityScheduler.push(request,task); + + request = new Request("b"); + priorityScheduler.push(request,task); + + request = new Request("c"); + priorityScheduler.push(request,task); + + Request poll = priorityScheduler.poll(task); + Assert.assertEquals("a",poll.getUrl()); + + poll = priorityScheduler.poll(task); + Assert.assertEquals("b",poll.getUrl()); + + poll = priorityScheduler.poll(task); + Assert.assertEquals("c",poll.getUrl()); + } +} diff --git a/webmagic-lucene/pom.xml b/webmagic-lucene/pom.xml index d54d239..97946cc 100644 --- a/webmagic-lucene/pom.xml +++ b/webmagic-lucene/pom.xml @@ -5,7 +5,7 @@ webmagic-parent us.codecraft - 0.2.1-SNAPSHOT + 0.2.1 4.0.0 diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index 98bc7a2..35ddcaa 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -5,7 +5,7 @@ webmagic-parent 
us.codecraft - 0.2.1-SNAPSHOT + 0.2.1 4.0.0 diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index cfed143..efa8291 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -5,7 +5,7 @@ webmagic-parent us.codecraft - 0.2.1-SNAPSHOT + 0.2.1 4.0.0 diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index 4469e3e..43bbcfb 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -5,7 +5,7 @@ webmagic-parent us.codecraft - 0.2.1-SNAPSHOT + 0.2.1 4.0.0