diff --git a/README.md b/README.md
index cee747c..b23bf83 100644
--- a/README.md
+++ b/README.md
@@ -41,12 +41,12 @@ webmagic使用maven管理依赖,在项目中添加对应的依赖即可使用w
us.codecraft
webmagic-core
- 0.5.0
+ 0.5.1
us.codecraft
webmagic-extension
- 0.5.0
+ 0.5.1
```
@@ -192,6 +192,7 @@ webmagic遵循[Apache 2.0协议](http://opensource.org/licenses/Apache-2.0)
* [lidongyang](http://my.oschina.net/lidongyang)
* [seveniu](https://github.com/seveniu)
* [sebastian1118](https://github.com/sebastian1118)
+* [codev777](https://github.com/codev777)
### 邮件组:
diff --git a/en_docs/README.md b/en_docs/README.md
index cc63925..5ae494c 100644
--- a/en_docs/README.md
+++ b/en_docs/README.md
@@ -25,12 +25,12 @@ Add dependencies to your pom.xml:
us.codecraft
webmagic-core
- 0.5.0
+ 0.5.1
us.codecraft
webmagic-extension
- 0.5.0
+ 0.5.1
```
@@ -145,6 +145,7 @@ Thanks these people for commiting source code, reporting bugs or suggesting for
* [lidongyang](http://my.oschina.net/lidongyang)
* [seveniu](https://github.com/seveniu)
* [sebastian1118](https://github.com/sebastian1118)
+* [codev777](https://github.com/codev777)
### Thanks:
diff --git a/pom.xml b/pom.xml
index 9bfc505..4ab38b1 100644
--- a/pom.xml
+++ b/pom.xml
@@ -6,7 +6,7 @@
7
us.codecraft
- 0.5.0
+ 0.5.2-SNAPSHOT
4.0.0
pom
@@ -54,7 +54,7 @@
webmagic-selenium
webmagic-saxon
webmagic-samples
- webmagic-avalon
+
diff --git a/webmagic-avalon/forger/pom.xml b/webmagic-avalon/forger/pom.xml
index 89796d1..44b42f9 100644
--- a/webmagic-avalon/forger/pom.xml
+++ b/webmagic-avalon/forger/pom.xml
@@ -7,7 +7,7 @@
us.codecraft
forger
- 0.1.0
+ 0.1.1-SNAPSHOT
4.0.0
jar
diff --git a/webmagic-avalon/pom.xml b/webmagic-avalon/pom.xml
index d1fadc4..0dbb369 100644
--- a/webmagic-avalon/pom.xml
+++ b/webmagic-avalon/pom.xml
@@ -3,7 +3,7 @@
webmagic-parent
us.codecraft
- 0.5.0
+ 0.5.1-SNAPSHOT
4.0.0
@@ -39,12 +39,6 @@
1.1.1
-
- us.codecraft
- forger
- 0.1.1-SNAPSHOT
-
-
org.freemarker
freemarker
diff --git a/webmagic-avalon/webmagic-admin/pom.xml b/webmagic-avalon/webmagic-admin/pom.xml
index 020ca8a..ed364c1 100644
--- a/webmagic-avalon/webmagic-admin/pom.xml
+++ b/webmagic-avalon/webmagic-admin/pom.xml
@@ -3,7 +3,7 @@
webmagic-avalon
us.codecraft
- 0.5.0
+ 0.5.1-SNAPSHOT
4.0.0
diff --git a/webmagic-avalon/webmagic-avalon-common/pom.xml b/webmagic-avalon/webmagic-avalon-common/pom.xml
index 32eb8b4..9c7199a 100644
--- a/webmagic-avalon/webmagic-avalon-common/pom.xml
+++ b/webmagic-avalon/webmagic-avalon-common/pom.xml
@@ -3,7 +3,7 @@
webmagic-avalon
us.codecraft
- 0.5.0
+ 0.5.1-SNAPSHOT
4.0.0
@@ -26,7 +26,7 @@
us.codecraft
forger
- 0.1.0
+ 0.1.1-SNAPSHOT
@@ -150,18 +150,4 @@
-
-
- sonatype-nexus-snapshots
- Sonatype Nexus Snapshots
- https://oss.sonatype.org/content/repositories/snapshots
-
- false
-
-
- true
-
-
-
-
diff --git a/webmagic-avalon/webmagic-worker/pom.xml b/webmagic-avalon/webmagic-worker/pom.xml
index f085c82..ebc5174 100644
--- a/webmagic-avalon/webmagic-worker/pom.xml
+++ b/webmagic-avalon/webmagic-worker/pom.xml
@@ -3,7 +3,7 @@
webmagic-avalon
us.codecraft
- 0.5.0
+ 0.5.1-SNAPSHOT
4.0.0
diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml
index 93ced05..6191b96 100644
--- a/webmagic-core/pom.xml
+++ b/webmagic-core/pom.xml
@@ -3,7 +3,7 @@
us.codecraft
webmagic-parent
- 0.5.0
+ 0.5.2-SNAPSHOT
4.0.0
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/DuplicatedRemoveScheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/DuplicateRemovedScheduler.java
similarity index 57%
rename from webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/DuplicatedRemoveScheduler.java
rename to webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/DuplicateRemovedScheduler.java
index 7b319b6..b14d8ee 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/DuplicatedRemoveScheduler.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/DuplicateRemovedScheduler.java
@@ -4,6 +4,8 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
+import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
+import us.codecraft.webmagic.scheduler.component.HashSetDuplicateRemover;
/**
* Remove duplicate urls and only push urls which are not duplicate.
@@ -11,30 +13,30 @@ import us.codecraft.webmagic.Task;
* @author code4crafer@gmail.com
* @since 0.5.0
*/
-public abstract class DuplicatedRemoveScheduler implements Scheduler {
+public abstract class DuplicateRemovedScheduler implements Scheduler {
protected Logger logger = LoggerFactory.getLogger(getClass());
+ private DuplicateRemover duplicatedRemover = new HashSetDuplicateRemover();
+
+ public DuplicateRemover getDuplicateRemover() {
+ return duplicatedRemover;
+ }
+
+ public DuplicateRemovedScheduler setDuplicateRemover(DuplicateRemover duplicatedRemover) {
+ this.duplicatedRemover = duplicatedRemover;
+ return this;
+ }
+
@Override
public void push(Request request, Task task) {
logger.trace("get a candidate url {}", request.getUrl());
- if (isDuplicate(request, task) || shouldReserved(request)) {
+ if (!duplicatedRemover.isDuplicate(request, task) || shouldReserved(request)) {
logger.debug("push to queue {}", request.getUrl());
pushWhenNoDuplicate(request, task);
}
}
- /**
- * Reset duplicate check.
- */
- public abstract void resetDuplicateCheck(Task task);
-
- /**
- * @param request
- * @return
- */
- protected abstract boolean isDuplicate(Request request, Task task);
-
protected boolean shouldReserved(Request request) {
return request.getExtra(Request.CYCLE_TRIED_TIMES) != null;
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/PriorityScheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/PriorityScheduler.java
index 38c9b6c..8fa1b9e 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/PriorityScheduler.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/PriorityScheduler.java
@@ -17,7 +17,7 @@ import java.util.concurrent.PriorityBlockingQueue;
* @since 0.2.1
*/
@ThreadSafe
-public class PriorityScheduler extends LocalDuplicatedRemoveScheduler {
+public class PriorityScheduler extends DuplicateRemovedScheduler implements MonitorableScheduler {
public static final int INITIAL_CAPACITY = 5;
@@ -65,4 +65,9 @@ public class PriorityScheduler extends LocalDuplicatedRemoveScheduler {
public int getLeftRequestsCount(Task task) {
return noPriorityQueue.size();
}
+
+ @Override
+ public int getTotalRequestsCount(Task task) {
+ return getDuplicateRemover().getTotalRequestsCount(task);
+ }
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java
index 511d8a0..c38311f 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java
@@ -16,7 +16,7 @@ import java.util.concurrent.LinkedBlockingQueue;
* @since 0.1.0
*/
@ThreadSafe
-public class QueueScheduler extends LocalDuplicatedRemoveScheduler {
+public class QueueScheduler extends DuplicateRemovedScheduler implements MonitorableScheduler {
private BlockingQueue queue = new LinkedBlockingQueue();
@@ -34,4 +34,9 @@ public class QueueScheduler extends LocalDuplicatedRemoveScheduler {
public int getLeftRequestsCount(Task task) {
return queue.size();
}
+
+ @Override
+ public int getTotalRequestsCount(Task task) {
+ return getDuplicateRemover().getTotalRequestsCount(task);
+ }
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/BloomFilterDuplicateRemover.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/BloomFilterDuplicateRemover.java
new file mode 100644
index 0000000..6d5e597
--- /dev/null
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/BloomFilterDuplicateRemover.java
@@ -0,0 +1,70 @@
+package us.codecraft.webmagic.scheduler.component;
+
+import com.google.common.hash.BloomFilter;
+import com.google.common.hash.Funnels;
+import us.codecraft.webmagic.Request;
+import us.codecraft.webmagic.Task;
+
+import java.nio.charset.Charset;
+import java.util.concurrent.atomic.AtomicInteger;
+
+/**
+ * {@link DuplicateRemover} backed by a Guava Bloom filter, intended for a huge number of urls.
+ * The filter is probabilistic: an unseen url may occasionally be reported as a duplicate.
+ * @author code4crafter@gmail.com
+ * @since 0.5.1
+ */
+public class BloomFilterDuplicateRemover implements DuplicateRemover {
+
+    private int expectedInsertions;
+
+    private double fpp;
+
+    private AtomicInteger counter;
+
+    public BloomFilterDuplicateRemover(int expectedInsertions) {
+        this(expectedInsertions, 0.01); // default false positive probability of 0.01
+    }
+
+    /**
+     * Creates a remover whose filter is sized for the given load.
+     * @param expectedInsertions the number of expected insertions to the constructed filter
+     * @param fpp the desired false positive probability (must be positive and less than 1.0)
+     */
+    public BloomFilterDuplicateRemover(int expectedInsertions, double fpp) {
+        this.expectedInsertions = expectedInsertions;
+        this.fpp = fpp;
+        this.bloomFilter = rebuildBloomFilter();
+    }
+    // Restarts the counter and returns a new, empty filter built from the configured size and fpp.
+    protected BloomFilter<CharSequence> rebuildBloomFilter() {
+        counter = new AtomicInteger(0);
+        return BloomFilter.create(Funnels.stringFunnel(Charset.defaultCharset()), expectedInsertions, fpp);
+    }
+
+    private BloomFilter<CharSequence> bloomFilter; // not final: replaced by resetDuplicateCheck
+    // Reports whether the url was (probably) seen before; an unseen url is added and counted.
+    @Override
+    public boolean isDuplicate(Request request, Task task) {
+        boolean isDuplicate = bloomFilter.mightContain(getUrl(request));
+        if (!isDuplicate) {
+            bloomFilter.put(getUrl(request));
+            counter.incrementAndGet();
+        }
+        return isDuplicate;
+    }
+
+    protected String getUrl(Request request) {
+        return request.getUrl();
+    }
+    // Forgets everything seen so far: the filter is replaced and the counter restarts at 0.
+    @Override
+    public void resetDuplicateCheck(Task task) {
+        bloomFilter = rebuildBloomFilter(); // previously the rebuilt filter was discarded
+    }
+
+    @Override
+    public int getTotalRequestsCount(Task task) {
+        return counter.get();
+    }
+}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/DuplicateRemover.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/DuplicateRemover.java
new file mode 100644
index 0000000..fa88976
--- /dev/null
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/DuplicateRemover.java
@@ -0,0 +1,35 @@
+package us.codecraft.webmagic.scheduler.component;
+
+import us.codecraft.webmagic.Request;
+import us.codecraft.webmagic.Task;
+
+/**
+ * Remove duplicate requests.
+ * @author code4crafter@gmail.com
+ * @since 0.5.1
+ */
+public interface DuplicateRemover {
+ /**
+ * Check whether the request is duplicate.
+ * The implementations in this package also record an unseen request's url during the check,
+ * so asking again about the same url then reports it as a duplicate.
+ * @param request the request whose url is checked
+ * @param task the task the request belongs to
+ * @return true if the request is a duplicate, false otherwise
+ */
+ public boolean isDuplicate(Request request, Task task);
+
+ /**
+ * Reset duplicate check.
+ * @param task the task whose duplicate check is reset
+ */
+ public void resetDuplicateCheck(Task task);
+
+ /**
+ * Get TotalRequestsCount for monitor.
+ * @param task the task to report on
+ * @return the total number of requests counted by this remover
+ */
+ public int getTotalRequestsCount(Task task);
+
+}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/LocalDuplicatedRemoveScheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/HashSetDuplicateRemover.java
similarity index 56%
rename from webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/LocalDuplicatedRemoveScheduler.java
rename to webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/HashSetDuplicateRemover.java
index c127c98..1190762 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/LocalDuplicatedRemoveScheduler.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/HashSetDuplicateRemover.java
@@ -1,4 +1,4 @@
-package us.codecraft.webmagic.scheduler;
+package us.codecraft.webmagic.scheduler.component;
import com.google.common.collect.Sets;
import us.codecraft.webmagic.Request;
@@ -8,25 +8,26 @@ import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
/**
- * Base Scheduler with duplicated urls removed by hash set.
- *
- * @author code4crafter@gmail.com
- * @since 0.5.0
+ * @author code4crafter@gmail.com
*/
-public abstract class LocalDuplicatedRemoveScheduler extends DuplicatedRemoveScheduler implements MonitorableScheduler {
+public class HashSetDuplicateRemover implements DuplicateRemover {
private Set urls = Sets.newSetFromMap(new ConcurrentHashMap());
+ @Override
+ public boolean isDuplicate(Request request, Task task) {
+ return !urls.add(getUrl(request));
+ }
+
+ protected String getUrl(Request request) {
+ return request.getUrl();
+ }
+
@Override
public void resetDuplicateCheck(Task task) {
urls.clear();
}
- @Override
- protected boolean isDuplicate(Request request, Task task) {
- return urls.add(request.getUrl());
- }
-
@Override
public int getTotalRequestsCount(Task task) {
return urls.size();
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/package.html b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/package.html
new file mode 100644
index 0000000..213707c
--- /dev/null
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/package.html
@@ -0,0 +1,5 @@
+
+
+Component of scheduler.
+
+
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/FilePersistentBase.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/FilePersistentBase.java
index e65e2f9..79b9efe 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/FilePersistentBase.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/FilePersistentBase.java
@@ -22,10 +22,10 @@ public class FilePersistentBase {
}
public void setPath(String path) {
- this.path = path;
if (!path.endsWith(PATH_SEPERATOR)) {
path += PATH_SEPERATOR;
}
+ this.path = path;
}
public File getFile(String fullName) {
diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/scheduler/BloomFilterDuplicateRemoverTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/scheduler/BloomFilterDuplicateRemoverTest.java
new file mode 100644
index 0000000..f8e0b9a
--- /dev/null
+++ b/webmagic-core/src/test/java/us/codecraft/webmagic/scheduler/BloomFilterDuplicateRemoverTest.java
@@ -0,0 +1,80 @@
+package us.codecraft.webmagic.scheduler;
+
+import org.junit.Ignore;
+import org.junit.Test;
+import us.codecraft.webmagic.Request;
+import us.codecraft.webmagic.scheduler.component.BloomFilterDuplicateRemover;
+import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
+import us.codecraft.webmagic.scheduler.component.HashSetDuplicateRemover;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+/**
+ * Tests for BloomFilterDuplicateRemover; the two long-running comparisons are ignored by default.
+ * @author code4crafter@gmail.com
+ */
+public class BloomFilterDuplicateRemoverTest {
+ // First sighting of a url must not be a duplicate; the second sighting must be.
+ @Test
+ public void testRemove() throws Exception {
+ BloomFilterDuplicateRemover bloomFilterDuplicateRemover = new BloomFilterDuplicateRemover(10);
+ boolean isDuplicate = bloomFilterDuplicateRemover.isDuplicate(new Request("a"), null);
+ assertThat(isDuplicate).isFalse();
+ isDuplicate = bloomFilterDuplicateRemover.isDuplicate(new Request("a"), null);
+ assertThat(isDuplicate).isTrue();
+ isDuplicate = bloomFilterDuplicateRemover.isDuplicate(new Request("b"), null);
+ assertThat(isDuplicate).isFalse();
+ isDuplicate = bloomFilterDuplicateRemover.isDuplicate(new Request("b"), null);
+ assertThat(isDuplicate).isTrue();
+
+ }
+ // Prints time and free-memory deltas for the bloom filter and then the hash set; asserts nothing.
+ @Ignore("long time")
+ @Test
+ public void testMemory() throws Exception {
+ int times = 5000000;
+ DuplicateRemover duplicateRemover = new BloomFilterDuplicateRemover(times,0.005);
+ long freeMemory = Runtime.getRuntime().freeMemory();
+ long time = System.currentTimeMillis();
+ for (int i = 0; i < times; i++) {
+ duplicateRemover.isDuplicate(new Request(String.valueOf(i)), null);
+ }
+ System.out.println("Time used by bloomfilter:" + (System.currentTimeMillis() - time));
+ System.out.println("Memory used by bloomfilter:" + (freeMemory - Runtime.getRuntime().freeMemory()));
+ // Same workload again with the hash-set implementation for comparison.
+ duplicateRemover = new HashSetDuplicateRemover();
+ System.gc(); // NOTE(review): presumably to start the second measurement from a cleaner heap — the figures are only indicative
+ freeMemory = Runtime.getRuntime().freeMemory();
+ time = System.currentTimeMillis();
+ for (int i = 0; i < times; i++) {
+ duplicateRemover.isDuplicate(new Request(String.valueOf(i)), null);
+ }
+ System.out.println("Time used by hashset:" + (System.currentTimeMillis() - time));
+ System.out.println("Memory used by hashset:" + (freeMemory - Runtime.getRuntime().freeMemory()));
+ }
+ // Counts false positives (wrong) and missed duplicates (missCheck) over distinct urls; prints the totals.
+ @Ignore("long time")
+ @Test
+ public void testMissHit() throws Exception {
+ int times = 5000000;
+ DuplicateRemover duplicateRemover = new BloomFilterDuplicateRemover(times, 0.01);
+ int right = 0;
+ int wrong = 0;
+ int missCheck = 0;
+ for (int i = 0; i < times; i++) {
+ boolean duplicate = duplicateRemover.isDuplicate(new Request(String.valueOf(i)), null);
+ if (duplicate) { // a never-seen url reported as duplicate: false positive
+ wrong++;
+ } else {
+ right++;
+ }
+ duplicate = duplicateRemover.isDuplicate(new Request(String.valueOf(i)), null);
+ if (!duplicate) { // the same url asked again but not recognised
+ missCheck++;
+ }
+ }
+
+ System.out.println("Right count: " + right + " Wrong count: " + wrong + " Miss check: " + missCheck);
+ }
+
+
+}
diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml
index 5d93cdc..45a71cd 100644
--- a/webmagic-extension/pom.xml
+++ b/webmagic-extension/pom.xml
@@ -3,7 +3,7 @@
us.codecraft
webmagic-parent
- 0.5.0
+ 0.5.2-SNAPSHOT
4.0.0
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java
index 4215ab8..211b698 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java
@@ -2,8 +2,6 @@ package us.codecraft.webmagic.scheduler;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.math.NumberUtils;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
@@ -23,9 +21,7 @@ import java.util.concurrent.atomic.AtomicInteger;
* @author code4crafter@gmail.com
* @since 0.2.0
*/
-public class FileCacheQueueScheduler extends LocalDuplicatedRemoveScheduler {
-
- private Logger logger = LoggerFactory.getLogger(getClass());
+public class FileCacheQueueScheduler extends DuplicateRemovedScheduler implements MonitorableScheduler {
private String filePath = System.getProperty("java.io.tmpdir");
@@ -166,4 +162,9 @@ public class FileCacheQueueScheduler extends LocalDuplicatedRemoveScheduler {
public int getLeftRequestsCount(Task task) {
return queue.size();
}
+
+ @Override
+ public int getTotalRequestsCount(Task task) {
+ return getDuplicateRemover().getTotalRequestsCount(task);
+ }
}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java
index dc2ee2e..bbb945c 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java
@@ -7,6 +7,7 @@ import redis.clients.jedis.JedisPool;
import redis.clients.jedis.JedisPoolConfig;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
+import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
/**
* Use Redis as url scheduler for distributed crawlers.
@@ -14,7 +15,7 @@ import us.codecraft.webmagic.Task;
* @author code4crafter@gmail.com
* @since 0.2.0
*/
-public class RedisScheduler extends DuplicatedRemoveScheduler implements MonitorableScheduler {
+public class RedisScheduler extends DuplicateRemovedScheduler implements MonitorableScheduler, DuplicateRemover {
private JedisPool pool;
@@ -25,11 +26,12 @@ public class RedisScheduler extends DuplicatedRemoveScheduler implements Monitor
private static final String ITEM_PREFIX = "item_";
public RedisScheduler(String host) {
- pool = new JedisPool(new JedisPoolConfig(), host);
+ this(new JedisPool(new JedisPoolConfig(), host));
}
public RedisScheduler(JedisPool pool) {
this.pool = pool;
+ setDuplicateRemover(this);
}
@Override
@@ -43,10 +45,10 @@ public class RedisScheduler extends DuplicatedRemoveScheduler implements Monitor
}
@Override
- protected boolean isDuplicate(Request request, Task task) {
+ public boolean isDuplicate(Request request, Task task) {
Jedis jedis = pool.getResource();
try {
- boolean isDuplicate = !jedis.sismember(getSetKey(task), request.getUrl());
+ boolean isDuplicate = jedis.sismember(getSetKey(task), request.getUrl());
if (!isDuplicate) {
jedis.sadd(getSetKey(task), request.getUrl());
}
diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml
index 4769c21..df0ebfc 100644
--- a/webmagic-samples/pom.xml
+++ b/webmagic-samples/pom.xml
@@ -3,7 +3,7 @@
webmagic-parent
us.codecraft
- 0.5.0
+ 0.5.2-SNAPSHOT
4.0.0
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/AmanzonPageProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/AmanzonPageProcessor.java
new file mode 100644
index 0000000..a980851
--- /dev/null
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/AmanzonPageProcessor.java
@@ -0,0 +1,53 @@
+package us.codecraft.webmagic.samples;
+
+import us.codecraft.webmagic.Page;
+import us.codecraft.webmagic.Site;
+import us.codecraft.webmagic.Spider;
+import us.codecraft.webmagic.processor.PageProcessor;
+import us.codecraft.webmagic.selector.Html;
+
+import java.util.List;
+
+/** Sample processor that selects table rows from the page by xpath and prints them with three extracted values.
+ * @author code4crafter@gmail.com
+ */
+public class AmanzonPageProcessor implements PageProcessor{
+ public void process(Page page) { // NOTE(review): lacks the @Override that getSite() below carries
+
+ Html html = page.getHtml();
+ List questionList = html.xpath("//table[@class='tgCustomerCommunityCenterColumn']//div[@class='content']//table[@class='dataGrid']//tr").all();
+
+ if(questionList != null && questionList.size() > 1)
+ {
+ // index 0 holds the column names, so iteration starts at 1
+ for( int i = 1 ; i < questionList.size(); i++)
+ {
+ System.out.println(questionList.get(i));
+ Html tempHtml = Html.create(""); // NOTE(review): built from an empty string, not from questionList.get(i) — confirm this is intended
+ String comment = tempHtml.xpath("//td[@class='title']//a/text()").toString();
+ System.out.println(comment);
+ String answerNum = tempHtml.xpath("//td[@class='num']/text()").toString();
+ System.out.println(answerNum);
+ String createTime = tempHtml.xpath("//td[3]/text()").toString();
+ System.out.println(createTime);
+
+ /* Document doc = Jsoup.parse(questionList.get(i));
+ Html hmt = Html.create(questionList.get(i)) ;
+ String str = hmt.links().toString();
+ String content = doc.getElementsByTag("a").text();
+ String ss = doc.text();*/
+
+ }
+ }
+
+ }
+
+ @Override
+ public Site getSite() {
+ return Site.me();
+ }
+ // Runs this processor against a single forum url via Spider.test.
+ public static void main(String[] args) {
+ Spider.create(new AmanzonPageProcessor()).test("http://www.amazon.de/forum/Fx27CUFD8S7LJ5D");
+ }
+}
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GithubRepo.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GithubRepo.java
new file mode 100644
index 0000000..0aecb7b
--- /dev/null
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GithubRepo.java
@@ -0,0 +1,37 @@
+package us.codecraft.webmagic.samples;
+
+/** Simple bean holding the name, author and readme text of a GitHub repository.
+ * @author code4crafter@gmail.com
+ */
+public class GithubRepo {
+ // Three string fields, each exposed through a plain getter and setter.
+ private String name;
+
+ private String author;
+
+ private String readme;
+
+ public String getName() {
+ return name;
+ }
+
+ public void setName(String name) {
+ this.name = name;
+ }
+
+ public String getAuthor() {
+ return author;
+ }
+
+ public void setAuthor(String author) {
+ this.author = author;
+ }
+
+ public String getReadme() {
+ return readme;
+ }
+
+ public void setReadme(String readme) {
+ this.readme = readme;
+ }
+}
\ No newline at end of file
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GithubRepoPageProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GithubRepoPageProcessor.java
new file mode 100644
index 0000000..0de61fb
--- /dev/null
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GithubRepoPageProcessor.java
@@ -0,0 +1,40 @@
+package us.codecraft.webmagic.samples;
+
+import us.codecraft.webmagic.Page;
+import us.codecraft.webmagic.Site;
+import us.codecraft.webmagic.Spider;
+import us.codecraft.webmagic.processor.PageProcessor;
+
+/** Sample processor that follows github.com links and stores a GithubRepo for each page that yields a repository name.
+ * @author code4crafter@gmail.com
+ * @since 0.5.1
+ */
+public class GithubRepoPageProcessor implements PageProcessor {
+ // Site settings used by this sample: 3 retries, sleep time 0.
+ private Site site = Site.me().setRetryTimes(3).setSleepTime(0);
+ // Queues further github links, then fills a GithubRepo from the current page.
+ @Override
+ public void process(Page page) {
+ page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all()); // links of the form github.com/x/y
+ page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+)").all()); // links of the form github.com/x
+ GithubRepo githubRepo = new GithubRepo();
+ githubRepo.setAuthor(page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString()); // first path segment of the page url
+ githubRepo.setName(page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString());
+ githubRepo.setReadme(page.getHtml().xpath("//div[@id='readme']/tidyText()").toString());
+ if (githubRepo.getName() == null) {
+ // no repository name was extracted, so mark the page as skipped
+ page.setSkip(true);
+ } else {
+ page.putField("repo", githubRepo);
+ }
+ }
+
+ @Override
+ public Site getSite() {
+ return site;
+ }
+ // Starts a 5-thread crawl from a single seed url.
+ public static void main(String[] args) {
+ Spider.create(new GithubRepoPageProcessor()).addUrl("https://github.com/code4craft").thread(5).run();
+ }
+}
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java
index ded1a5f..2e3996c 100644
--- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java
@@ -3,8 +3,12 @@ package us.codecraft.webmagic.samples;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
+import us.codecraft.webmagic.monitor.SpiderMonitor;
import us.codecraft.webmagic.processor.PageProcessor;
+import us.codecraft.webmagic.scheduler.QueueScheduler;
+import us.codecraft.webmagic.scheduler.component.BloomFilterDuplicateRemover;
+import javax.management.JMException;
import java.util.List;
/**
@@ -29,7 +33,9 @@ public class OschinaBlogPageProcesser implements PageProcessor {
}
- public static void main(String[] args) {
- Spider.create(new OschinaBlogPageProcesser()).run();
+ public static void main(String[] args) throws JMException {
+ Spider spider = Spider.create(new OschinaBlogPageProcesser()).setScheduler(new QueueScheduler().setDuplicateRemover(new BloomFilterDuplicateRemover(2000)));
+ SpiderMonitor.instance().register(spider);
+ spider.run();
}
}
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/pipeline/ReplacePipeline.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/pipeline/ReplacePipeline.java
new file mode 100644
index 0000000..2458c8a
--- /dev/null
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/pipeline/ReplacePipeline.java
@@ -0,0 +1,7 @@
+package us.codecraft.webmagic.samples.pipeline;
+
+/** Empty class: it declares no members and implements no interface.
+ * @author code4crafter@gmail.com
+ */
+public class ReplacePipeline {
+}
diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml
index a444d39..a45bcdc 100644
--- a/webmagic-saxon/pom.xml
+++ b/webmagic-saxon/pom.xml
@@ -3,7 +3,7 @@
webmagic-parent
us.codecraft
- 0.5.0
+ 0.5.2-SNAPSHOT
4.0.0
diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml
index 2b846d8..fcf8d1a 100755
--- a/webmagic-scripts/pom.xml
+++ b/webmagic-scripts/pom.xml
@@ -3,7 +3,7 @@
webmagic-parent
us.codecraft
- 0.5.0
+ 0.5.2-SNAPSHOT
4.0.0
diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml
index 038b371..a8c6707 100644
--- a/webmagic-selenium/pom.xml
+++ b/webmagic-selenium/pom.xml
@@ -3,7 +3,7 @@
webmagic-parent
us.codecraft
- 0.5.0
+ 0.5.2-SNAPSHOT
4.0.0
diff --git a/zh_docs/README.md b/zh_docs/README.md
index cee747c..b23bf83 100644
--- a/zh_docs/README.md
+++ b/zh_docs/README.md
@@ -41,12 +41,12 @@ webmagic使用maven管理依赖,在项目中添加对应的依赖即可使用w
us.codecraft
webmagic-core
- 0.5.0
+ 0.5.1
us.codecraft
webmagic-extension
- 0.5.0
+ 0.5.1
```
@@ -192,6 +192,7 @@ webmagic遵循[Apache 2.0协议](http://opensource.org/licenses/Apache-2.0)
* [lidongyang](http://my.oschina.net/lidongyang)
* [seveniu](https://github.com/seveniu)
* [sebastian1118](https://github.com/sebastian1118)
+* [codev777](https://github.com/codev777)
### 邮件组: