diff --git a/README-zh.md b/README-zh.md
index c5ebe15..62b3c9a 100644
--- a/README-zh.md
+++ b/README-zh.md
@@ -1,9 +1,10 @@

+[](https://maven-badges.herokuapp.com/maven-central/us.codecraft/webmagic-parent/)
+[](https://www.apache.org/licenses/LICENSE-2.0.html)
[](https://travis-ci.org/code4craft/webmagic)
-
官方网站[http://webmagic.io/](http://webmagic.io/)
>webmagic是一个开源的Java垂直爬虫框架,目标是简化爬虫的开发流程,让开发者专注于逻辑功能的开发。webmagic的核心非常简单,但是覆盖爬虫的整个流程,也是很好的学习爬虫开发的材料。
@@ -38,12 +39,12 @@ webmagic使用maven管理依赖,在项目中添加对应的依赖即可使用w
us.codecraft
webmagic-core
- 0.7.4
+ 0.7.5
us.codecraft
webmagic-extension
- 0.7.4
+ 0.7.5
```
diff --git a/README.md b/README.md
index e5cd511..14aeac7 100644
--- a/README.md
+++ b/README.md
@@ -3,6 +3,8 @@
[Readme in Chinese](https://github.com/code4craft/webmagic/tree/master/README-zh.md)
+[](https://maven-badges.herokuapp.com/maven-central/us.codecraft/webmagic-parent/)
+[](https://www.apache.org/licenses/LICENSE-2.0.html)
[](https://travis-ci.org/code4craft/webmagic)
>A scalable crawler framework. It covers the whole lifecycle of crawler: downloading, url management, content extraction and persistent. It can simplify the development of a specific crawler.
@@ -23,12 +25,12 @@ Add dependencies to your pom.xml:
us.codecraft
webmagic-core
- 0.7.4
+ 0.7.5
us.codecraft
webmagic-extension
- 0.7.4
+ 0.7.5
```
diff --git a/pom.xml b/pom.xml
index f4a268a..d721fa4 100644
--- a/pom.xml
+++ b/pom.xml
@@ -1,13 +1,14 @@
us.codecraft
- 0.7.4
+ 0.7.5
4.0.0
pom
UTF-8
UTF-8
- 1.8
+ 1.8
+ 1.8
4.0.0.RELEASE
webmagic-parent
@@ -33,7 +34,7 @@
scm:git:git@github.com:code4craft/webmagic.git
scm:git:git@github.com:code4craft/webmagic.git
git@github.com:code4craft/webmagic.git
- webmagic-parent-0.6.1
+ WebMagic-${project.version}
@@ -49,6 +50,7 @@
webmagic-selenium
webmagic-saxon
webmagic-samples
+ webmagic-coverage
@@ -73,17 +75,17 @@
org.apache.httpcomponents
httpcore
- 4.4.13
+ 4.4.14
com.google.guava
guava
- 30.0-android
+ 30.1-jre
com.jayway.jsonpath
json-path
- 2.6.0
+ 2.5.0
org.slf4j
@@ -98,12 +100,12 @@
us.codecraft
xsoup
- 0.3.1
+ 0.3.2
com.alibaba
fastjson
- 1.2.69
+ 1.2.75
com.github.dreamhead
@@ -125,13 +127,13 @@
org.assertj
assertj-core
- 3.16.1
+ 3.18.1
test
org.apache.commons
commons-lang3
- 3.10
+ 3.11
commons-collections
@@ -139,24 +141,19 @@
3.2.2
- commons-io
- commons-io
- 2.7
-
+ commons-io
+ commons-io
+ 2.8.0
+
org.codehaus.groovy
groovy-all
- 2.4.19
+ 3.0.7
org.jruby
jruby
- 9.2.11.1
-
-
- org.jsoup
- jsoup
- 1.10.3
+ 9.2.14.0
org.python
@@ -171,12 +168,12 @@
net.sf.saxon
Saxon-HE
- 10.1
+ 10.3
net.sourceforge.htmlcleaner
htmlcleaner
- 2.5
+ 2.9
com.github.detro
@@ -191,7 +188,7 @@
redis.clients
jedis
- 2.9.3
+ 3.6.0
@@ -211,7 +208,7 @@
- 3.0.5
+ 3.3.9
@@ -221,19 +218,10 @@
org.apache.maven.plugins
maven-surefire-plugin
- 3.0.0-M4
-
- 0
-
org.apache.maven.plugins
maven-compiler-plugin
- 3.8.1
-
- ${java.version}
- ${java.version}
-
@@ -258,12 +246,10 @@
org.apache.maven.plugins
maven-resources-plugin
- 3.1.0
org.apache.maven.plugins
maven-jar-plugin
- 3.2.0
log4j.xml
@@ -289,7 +275,7 @@
3.2.0
UTF-8
- WebMagic 0.7.4
+ WebMagic ${project.version}
en_US
@@ -317,9 +303,147 @@
maven-release-plugin
3.0.0-M1
+
+ org.jacoco
+ jacoco-maven-plugin
+
+
+
+ prepare-agent
+
+
+
+ report
+ verify
+
+ report
+
+
+
+
+
+ com.amashchenko.maven.plugin
+ gitflow-maven-plugin
+
+
+ WebMagic-
+
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-clean-plugin
+ 3.1.0
+
+
+ org.apache.maven.plugins
+ maven-compiler-plugin
+ 3.8.1
+
+
+ org.apache.maven.plugins
+ maven-deploy-plugin
+ 3.0.0-M1
+
+
+ org.apache.maven.plugins
+ maven-install-plugin
+ 3.0.0-M1
+
+
+ org.apache.maven.plugins
+ maven-jar-plugin
+ 3.2.0
+
+
+ org.apache.maven.plugins
+ maven-jxr-plugin
+ 3.1.1
+
+
+ org.apache.maven.plugins
+ maven-pmd-plugin
+ 3.14.0
+
+
+ org.apache.maven.plugins
+ maven-resources-plugin
+ 3.2.0
+
+
+ org.apache.maven.plugins
+ maven-site-plugin
+ 3.9.1
+
+
+ org.apache.maven.plugins
+ maven-surefire-plugin
+ 3.0.0-M5
+
+
+ org.apache.maven.plugins
+ maven-surefire-report-plugin
+ 3.0.0-M5
+
+
+ org.codehaus.mojo
+ taglist-maven-plugin
+ 2.4
+
+
+ org.jacoco
+ jacoco-maven-plugin
+ 0.8.7
+
+
+ com.amashchenko.maven.plugin
+ gitflow-maven-plugin
+ 1.15.0
+
+
+ com.github.spotbugs
+ spotbugs-maven-plugin
+ 4.2.3
+
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-javadoc-plugin
+
+ none
+
+
+
+ org.apache.maven.plugins
+ maven-jxr-plugin
+
+
+ org.apache.maven.plugins
+ maven-pmd-plugin
+
+
+ org.apache.maven.plugins
+ maven-surefire-report-plugin
+
+
+ org.codehaus.mojo
+ taglist-maven-plugin
+
+
+ com.github.spotbugs
+ spotbugs-maven-plugin
+
+
+
+
release
diff --git a/src/site/site.xml b/src/site/site.xml
new file mode 100644
index 0000000..d2d5caa
--- /dev/null
+++ b/src/site/site.xml
@@ -0,0 +1,23 @@
+
+
+ org.apache.maven.skins
+ maven-fluido-skin
+ 1.9
+
+
+
+
+
+
+
+
+ true
+ true
+ true
+ pull-right
+
+
+
diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml
index 4b89cac..ec718a1 100644
--- a/webmagic-core/pom.xml
+++ b/webmagic-core/pom.xml
@@ -3,7 +3,7 @@
us.codecraft
webmagic-parent
- 0.7.4
+ 0.7.5
4.0.0
@@ -61,11 +61,6 @@
assertj-core
-
- org.jsoup
- jsoup
-
-
commons-io
commons-io
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
index 886e74a..5940e73 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
@@ -208,7 +208,8 @@ public class Spider implements Runnable, Task {
* @see #addPipeline(us.codecraft.webmagic.pipeline.Pipeline)
* @deprecated
*/
- public Spider pipeline(Pipeline pipeline) {
+ @Deprecated
+ public Spider pipeline(Pipeline pipeline) {
return addPipeline(pipeline);
}
@@ -258,7 +259,8 @@ public class Spider implements Runnable, Task {
* @see #setDownloader(us.codecraft.webmagic.downloader.Downloader)
* @deprecated
*/
- public Spider downloader(Downloader downloader) {
+ @Deprecated
+ public Spider downloader(Downloader downloader) {
return setDownloader(downloader);
}
@@ -320,7 +322,7 @@ public class Spider implements Runnable, Task {
processRequest(request);
onSuccess(request);
} catch (Exception e) {
- onError(request);
+ onError(request, e);
logger.error("process request " + request + " error", e);
} finally {
pageCount.incrementAndGet();
@@ -338,10 +340,19 @@ public class Spider implements Runnable, Task {
logger.info("Spider {} closed! {} pages downloaded.", getUUID(), pageCount.get());
}
+ /**
+ * @deprecated Use {@link #onError(Request, Exception)} instead.
+ */
+ @Deprecated
protected void onError(Request request) {
+ }
+
+ protected void onError(Request request, Exception e) {
+ this.onError(request);
+
if (CollectionUtils.isNotEmpty(spiderListeners)) {
for (SpiderListener spiderListener : spiderListeners) {
- spiderListener.onError(request);
+ spiderListener.onError(request, e);
}
}
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/SpiderListener.java b/webmagic-core/src/main/java/us/codecraft/webmagic/SpiderListener.java
index 0678180..8f10e0e 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/SpiderListener.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/SpiderListener.java
@@ -10,5 +10,14 @@ public interface SpiderListener {
public void onSuccess(Request request);
+ /**
+ * @deprecated Use {@link #onError(Request, Exception)} instead.
+ */
+ @Deprecated
public void onError(Request request);
+
+ default void onError(Request request, Exception e) {
+ this.onError(request);
+ }
+
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/LinksSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/LinksSelector.java
index 5296a74..2dafe8e 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/LinksSelector.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/LinksSelector.java
@@ -1,12 +1,12 @@
package us.codecraft.webmagic.selector;
-import org.jsoup.helper.StringUtil;
-import org.jsoup.nodes.Element;
-import org.jsoup.select.Elements;
-
import java.util.ArrayList;
import java.util.List;
+import org.apache.commons.lang3.StringUtils;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+
/**
* Links selector based on jsoup. Use absolute url.
*
@@ -23,9 +23,9 @@ public class LinksSelector extends BaseElementSelector {
@Override
public List selectList(Element element) {
Elements elements = element.select("a");
- List links = new ArrayList(elements.size());
+ List links = new ArrayList<>(elements.size());
for (Element element0 : elements) {
- if (!StringUtil.isBlank(element0.baseUri())) {
+ if (StringUtils.isNotBlank(element0.baseUri())) {
links.add(element0.attr("abs:href"));
} else {
links.add(element0.attr("href"));
diff --git a/webmagic-coverage/pom.xml b/webmagic-coverage/pom.xml
new file mode 100644
index 0000000..16ed1b4
--- /dev/null
+++ b/webmagic-coverage/pom.xml
@@ -0,0 +1,72 @@
+
+
+ 4.0.0
+
+
+ us.codecraft
+ webmagic-parent
+ 0.7.5
+
+
+ webmagic-coverage
+ pom
+ webmagic-coverage
+ Compute aggregated test code coverage
+
+
+ true
+
+
+
+
+ ${project.groupId}
+ webmagic-core
+ ${project.version}
+
+
+ ${project.groupId}
+ webmagic-extension
+ ${project.version}
+
+
+ ${project.groupId}
+ webmagic-scripts
+ ${project.version}
+
+
+ ${project.groupId}
+ webmagic-selenium
+ ${project.version}
+
+
+ ${project.groupId}
+ webmagic-saxon
+ ${project.version}
+
+
+ ${project.groupId}
+ webmagic-samples
+ ${project.version}
+
+
+
+
+
+
+ org.jacoco
+ jacoco-maven-plugin
+
+
+
+ report-aggregate
+
+
+
+
+
+
+
+
diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml
index 87900ef..85d5c63 100644
--- a/webmagic-extension/pom.xml
+++ b/webmagic-extension/pom.xml
@@ -3,7 +3,7 @@
us.codecraft
webmagic-parent
- 0.7.4
+ 0.7.5
4.0.0
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java
index cfb4a82..b213dda 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java
@@ -68,6 +68,10 @@ public class SpiderMonitor {
return new SpiderStatus(spider, monitorSpiderListener);
}
+ protected List getSpiderStatuses() {
+ return this.spiderStatuses;
+ }
+
public static SpiderMonitor instance() {
return INSTANCE;
}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderStatus.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderStatus.java
index a87c040..69afe04 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderStatus.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderStatus.java
@@ -84,8 +84,13 @@ public class SpiderStatus implements SpiderStatusMXBean {
@Override
public int getPagePerSecond() {
- int runSeconds = (int) (System.currentTimeMillis() - getStartTime().getTime()) / 1000;
- return getSuccessPageCount() / runSeconds;
+        if (getStartTime() == null) {
+            return -1; // no start time recorded, so no rate can be computed
+        }
+        // Divide while still a long: narrowing the millisecond delta to int first wraps after ~24.8 days.
+        long runSeconds = (System.currentTimeMillis() - getStartTime().getTime()) / 1000;
+        // -1 also covers the first second of the run, where dividing would throw ArithmeticException.
+        return runSeconds == 0 ? -1 : (int) (getSuccessPageCount() / runSeconds);
}
}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisPriorityScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisPriorityScheduler.java
index 540574a..46d47e5 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisPriorityScheduler.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisPriorityScheduler.java
@@ -1,22 +1,23 @@
package us.codecraft.webmagic.scheduler;
-import com.alibaba.fastjson.JSON;
+import java.util.Set;
+
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.lang3.StringUtils;
+
+import com.alibaba.fastjson.JSON;
+
import redis.clients.jedis.Jedis;
import redis.clients.jedis.JedisPool;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
-import java.util.Set;
-
/**
* the redis scheduler with priority
* @author sai
* Created by sai on 16-5-27.
*/
-public class RedisPriorityScheduler extends RedisScheduler
-{
+public class RedisPriorityScheduler extends RedisScheduler {
private static final String ZSET_PREFIX = "zset_";
@@ -37,62 +38,44 @@ public class RedisPriorityScheduler extends RedisScheduler
}
@Override
- protected void pushWhenNoDuplicate(Request request, Task task)
- {
- Jedis jedis = pool.getResource();
- try
- {
- if(request.getPriority() > 0)
+ protected void pushWhenNoDuplicate(Request request, Task task) {
+ try (Jedis jedis = pool.getResource()) {
+ if (request.getPriority() > 0) {
jedis.zadd(getZsetPlusPriorityKey(task), request.getPriority(), request.getUrl());
- else if(request.getPriority() < 0)
+ } else if (request.getPriority() < 0) {
jedis.zadd(getZsetMinusPriorityKey(task), request.getPriority(), request.getUrl());
- else
+ } else {
jedis.lpush(getQueueNoPriorityKey(task), request.getUrl());
+ }
setExtrasInItem(jedis, request, task);
}
- finally
- {
- pool.returnResource(jedis);
- }
}
@Override
- public synchronized Request poll(Task task)
- {
- Jedis jedis = pool.getResource();
- try
- {
+ public synchronized Request poll(Task task) {
+ try (Jedis jedis = pool.getResource()) {
String url = getRequest(jedis, task);
- if(StringUtils.isBlank(url))
+ if (StringUtils.isBlank(url)) {
return null;
+ }
return getExtrasInItem(jedis, url, task);
}
- finally
- {
- pool.returnResource(jedis);
- }
}
- private String getRequest(Jedis jedis, Task task)
- {
+ private String getRequest(Jedis jedis, Task task) {
String url;
Set urls = jedis.zrevrange(getZsetPlusPriorityKey(task), 0, 0);
- if(urls.isEmpty())
- {
+ if (urls.isEmpty()) {
url = jedis.lpop(getQueueNoPriorityKey(task));
- if(StringUtils.isBlank(url))
- {
+ if (StringUtils.isBlank(url)) {
urls = jedis.zrevrange(getZsetMinusPriorityKey(task), 0, 0);
- if(!urls.isEmpty())
- {
+ if (!urls.isEmpty()) {
url = urls.toArray(new String[0])[0];
jedis.zrem(getZsetMinusPriorityKey(task), url);
}
}
- }
- else
- {
+ } else {
url = urls.toArray(new String[0])[0];
jedis.zrem(getZsetPlusPriorityKey(task), url);
}
@@ -100,51 +83,39 @@ public class RedisPriorityScheduler extends RedisScheduler
}
@Override
- public void resetDuplicateCheck(Task task)
- {
- Jedis jedis = pool.getResource();
- try
- {
+ public void resetDuplicateCheck(Task task) {
+ try (Jedis jedis = pool.getResource()) {
jedis.del(getSetKey(task));
}
- finally
- {
- pool.returnResource(jedis);
- }
}
- private String getZsetPlusPriorityKey(Task task)
- {
+ private String getZsetPlusPriorityKey(Task task) {
return ZSET_PREFIX + task.getUUID() + PLUS_PRIORITY_SUFFIX;
}
- private String getQueueNoPriorityKey(Task task)
- {
+ private String getQueueNoPriorityKey(Task task) {
return QUEUE_PREFIX + task.getUUID() + NO_PRIORITY_SUFFIX;
}
- private String getZsetMinusPriorityKey(Task task)
- {
+ private String getZsetMinusPriorityKey(Task task) {
return ZSET_PREFIX + task.getUUID() + MINUS_PRIORITY_SUFFIX;
}
- private void setExtrasInItem(Jedis jedis,Request request, Task task)
- {
- if(request.getExtras() != null)
- {
- String field = DigestUtils.shaHex(request.getUrl());
+ private void setExtrasInItem(Jedis jedis,Request request, Task task) {
+ if (request.getExtras() != null) {
+ String field = DigestUtils.sha1Hex(request.getUrl());
String value = JSON.toJSONString(request);
jedis.hset(getItemKey(task), field, value);
}
}
- private Request getExtrasInItem(Jedis jedis, String url, Task task)
- {
+ private Request getExtrasInItem(Jedis jedis, String url, Task task) {
String key = getItemKey(task);
- String field = DigestUtils.shaHex(url);
+ String field = DigestUtils.sha1Hex(url);
byte[] bytes = jedis.hget(key.getBytes(), field.getBytes());
- if(bytes != null)
+ if (bytes != null) {
return JSON.parseObject(new String(bytes), Request.class);
+ }
return new Request(url);
}
}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java
index c70d885..19e8313 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java
@@ -1,8 +1,10 @@
package us.codecraft.webmagic.scheduler;
-import com.alibaba.fastjson.JSON;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.lang3.StringUtils;
+
+import com.alibaba.fastjson.JSON;
+
import redis.clients.jedis.Jedis;
import redis.clients.jedis.JedisPool;
import redis.clients.jedis.JedisPoolConfig;
@@ -37,21 +39,15 @@ public class RedisScheduler extends DuplicateRemovedScheduler implements Monitor
@Override
public void resetDuplicateCheck(Task task) {
- Jedis jedis = pool.getResource();
- try {
+ try (Jedis jedis = pool.getResource()) {
jedis.del(getSetKey(task));
- } finally {
- pool.returnResource(jedis);
}
}
@Override
public boolean isDuplicate(Request request, Task task) {
- Jedis jedis = pool.getResource();
- try {
+ try (Jedis jedis = pool.getResource()) {
return jedis.sadd(getSetKey(task), request.getUrl()) == 0;
- } finally {
- pool.returnResource(jedis);
}
}
@@ -62,7 +58,7 @@ public class RedisScheduler extends DuplicateRemovedScheduler implements Monitor
try {
jedis.rpush(getQueueKey(task), request.getUrl());
if (checkForAdditionalInfo(request)) {
- String field = DigestUtils.shaHex(request.getUrl());
+ String field = DigestUtils.sha1Hex(request.getUrl());
String value = JSON.toJSONString(request);
jedis.hset((ITEM_PREFIX + task.getUUID()), field, value);
}
@@ -100,14 +96,13 @@ public class RedisScheduler extends DuplicateRemovedScheduler implements Monitor
@Override
public synchronized Request poll(Task task) {
- Jedis jedis = pool.getResource();
- try {
+ try (Jedis jedis = pool.getResource()) {
String url = jedis.lpop(getQueueKey(task));
if (url == null) {
return null;
}
String key = ITEM_PREFIX + task.getUUID();
- String field = DigestUtils.shaHex(url);
+ String field = DigestUtils.sha1Hex(url);
byte[] bytes = jedis.hget(key.getBytes(), field.getBytes());
if (bytes != null) {
Request o = JSON.parseObject(new String(bytes), Request.class);
@@ -115,8 +110,6 @@ public class RedisScheduler extends DuplicateRemovedScheduler implements Monitor
}
Request request = new Request(url);
return request;
- } finally {
- pool.returnResource(jedis);
}
}
@@ -134,23 +127,17 @@ public class RedisScheduler extends DuplicateRemovedScheduler implements Monitor
@Override
public int getLeftRequestsCount(Task task) {
- Jedis jedis = pool.getResource();
- try {
+ try (Jedis jedis = pool.getResource()) {
Long size = jedis.llen(getQueueKey(task));
return size.intValue();
- } finally {
- pool.returnResource(jedis);
}
}
@Override
public int getTotalRequestsCount(Task task) {
- Jedis jedis = pool.getResource();
- try {
+ try (Jedis jedis = pool.getResource()) {
Long size = jedis.scard(getSetKey(task));
return size.intValue();
- } finally {
- pool.returnResource(jedis);
}
}
}
diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml
index 3699fa6..dda1821 100644
--- a/webmagic-samples/pom.xml
+++ b/webmagic-samples/pom.xml
@@ -3,7 +3,7 @@
webmagic-parent
us.codecraft
- 0.7.4
+ 0.7.5
4.0.0
@@ -24,6 +24,26 @@
junit
junit
+
+ org.mapdb
+ mapdb
+ 3.0.8
+
+
+ com.fasterxml.jackson.core
+ jackson-core
+ 2.13.0-rc1
+
+
+ com.fasterxml.jackson.core
+ jackson-annotations
+ 2.13.0-rc1
+
+
+ com.fasterxml.jackson.core
+ jackson-databind
+ 2.13.0-rc1
+
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/DuplicateStorageRemover.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/DuplicateStorageRemover.java
new file mode 100644
index 0000000..bee80e7
--- /dev/null
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/DuplicateStorageRemover.java
@@ -0,0 +1,78 @@
+package us.codecraft.webmagic.recover;
+
+import com.google.common.base.Charsets;
+import com.google.common.hash.BloomFilter;
+import com.google.common.hash.Funnels;
+import org.mapdb.DB;
+import org.mapdb.DBMaker;
+import org.mapdb.IndexTreeList;
+import org.mapdb.Serializer;
+import us.codecraft.webmagic.Request;
+import us.codecraft.webmagic.Task;
+import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
+
+import java.util.concurrent.atomic.AtomicInteger;
+
+/**
+ * Sample {@link DuplicateRemover} that stores every seen URL in a MapDB file and answers lookups from
+ * an in-memory Guava Bloom filter rebuilt from that file on startup; filter hits count as duplicates.
+ *
+ * @author :linweisen
+ */
+public class DuplicateStorageRemover implements DuplicateRemover {
+
+    private static final String DATABASE_NAME = "duplicate";
+
+    private final DB db;
+    private final IndexTreeList<String> urlDuplicateQueue;
+    private final AtomicInteger counter;
+
+    private BloomFilter<CharSequence> bloomFilter;
+
+    /** Opens (or creates) the MapDB file at {@code path} and replays its stored URLs into a new filter. */
+    public DuplicateStorageRemover(String path) {
+        this.db = DBMaker.fileDB(path)
+                .fileMmapEnableIfSupported()
+                .fileMmapPreclearDisable()
+                .cleanerHackEnable()
+                .closeOnJvmShutdown()
+                .transactionEnable()
+                .concurrencyScale(128)
+                .make();
+        this.urlDuplicateQueue = db.indexTreeList(DATABASE_NAME, Serializer.STRING).createOrOpen();
+        this.counter = new AtomicInteger(this.urlDuplicateQueue.size());
+        this.bloomFilter = BloomFilter.create(Funnels.stringFunnel(Charsets.UTF_8), 200000, 1E-7);
+        for (String url : this.urlDuplicateQueue) {
+            bloomFilter.put(url);
+        }
+    }
+
+    /** Records a first-seen URL (filter, MapDB list, counter) and returns whether it was already known. */
+    @Override
+    public boolean isDuplicate(Request request, Task task) {
+        String url = request.getUrl();
+        boolean isDuplicate = bloomFilter.mightContain(url);
+        if (!isDuplicate) {
+            bloomFilter.put(url);
+            urlDuplicateQueue.add(url);
+            this.db.commit();
+            counter.incrementAndGet();
+        }
+        return isDuplicate;
+    }
+
+    /** Forgets every URL; the cleared list is committed and the counter zeroed to match the new filter. */
+    @Override
+    public void resetDuplicateCheck(Task task) {
+        this.bloomFilter = BloomFilter.create(Funnels.stringFunnel(Charsets.UTF_8), 200000, 1E-7);
+        this.urlDuplicateQueue.clear();
+        this.db.commit();
+        this.counter.set(0);
+    }
+
+    /** Returns the number of URLs recorded, including those loaded from the file at construction. */
+    @Override
+    public int getTotalRequestsCount(Task task) {
+        return counter.get();
+    }
+}
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/MmapQueueScheduler.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/MmapQueueScheduler.java
new file mode 100644
index 0000000..4cee18a
--- /dev/null
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/MmapQueueScheduler.java
@@ -0,0 +1,85 @@
+package us.codecraft.webmagic.recover;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import org.apache.commons.lang3.StringUtils;
+import org.mapdb.DB;
+import org.mapdb.DBMaker;
+import org.mapdb.IndexTreeList;
+import org.mapdb.Serializer;
+import us.codecraft.webmagic.Request;
+import us.codecraft.webmagic.Task;
+import us.codecraft.webmagic.scheduler.DuplicateRemovedScheduler;
+import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
+
+import java.io.IOException;
+
+/**
+ * Sample scheduler whose pending requests are stored as JSON strings in a MapDB-backed list, so the
+ * queue is reopened from the same file on the next start; duplicates are filtered by the given remover.
+ *
+ * @author :linweisen
+ */
+public class MmapQueueScheduler extends DuplicateRemovedScheduler {
+
+    private static final String DATABASE_NAME = "queue";
+
+    /** Shared mapper, created once rather than re-assigned to a static field by every constructor call. */
+    private static final ObjectMapper MAPPER = new ObjectMapper();
+
+    private final DB db;
+    private final IndexTreeList<String> queue;
+
+    /** Installs {@code duplicateRemover} and opens (or creates) the MapDB queue file at {@code path}. */
+    public MmapQueueScheduler(DuplicateRemover duplicateRemover, String path) {
+        super.setDuplicateRemover(duplicateRemover);
+        this.db = DBMaker.fileDB(path)
+                .fileMmapEnableIfSupported()
+                .fileMmapPreclearDisable()
+                .cleanerHackEnable()
+                .closeOnJvmShutdown()
+                .transactionEnable()
+                .concurrencyScale(128)
+                .make();
+        this.queue = db.indexTreeList(DATABASE_NAME, Serializer.STRING).createOrOpen();
+    }
+
+    /** Removes and returns the request at the head of the list, or {@code null} when it is empty. */
+    @Override
+    public synchronized Request poll(Task task) {
+        // NOTE(review): the removal is only persisted by the next commit in pushWhenNoDuplicate - confirm.
+        return queue.isEmpty() ? null : fromJson(queue.remove(0), Request.class);
+    }
+
+    /** Appends the request as JSON and commits; a request that cannot be serialized is skipped. */
+    @Override
+    public void pushWhenNoDuplicate(Request request, Task task) {
+        String json = toJson(request);
+        if (json != null) { // toJson has already logged the failure; never store null in the list
+            queue.add(json);
+            this.db.commit();
+        }
+    }
+
+    /** Serializes {@code object} to JSON; logs a warning and returns {@code null} on failure. */
+    public String toJson(Object object) {
+        try {
+            return MAPPER.writeValueAsString(object);
+        } catch (IOException e) {
+            logger.warn("write to json string error:" + object, e);
+            return null;
+        }
+    }
+
+    /** Parses {@code jsonString} into {@code clazz}; returns {@code null} for empty input or on failure. */
+    public <T> T fromJson(String jsonString, Class<T> clazz) {
+        if (StringUtils.isEmpty(jsonString)) {
+            return null;
+        }
+        try {
+            return MAPPER.readValue(jsonString, clazz);
+        } catch (IOException e) {
+            logger.warn("parse json string error:" + jsonString, e);
+            return null;
+        }
+    }
+}
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/RecoverSample.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/RecoverSample.java
new file mode 100644
index 0000000..4fb91a0
--- /dev/null
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/RecoverSample.java
@@ -0,0 +1,22 @@
+package us.codecraft.webmagic.recover;
+
+
+import us.codecraft.webmagic.Spider;
+import us.codecraft.webmagic.samples.SinaBlogProcessor;
+import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
+
+/**
+ * @author code4crafter@gmail.com
+ */
+public class RecoverSample {
+    /** Crawls a Sina blog article list using the file-backed queue and duplicate store of this package. */
+    public static void main(String[] args) {
+        // MapDB file names handed to the scheduler and to the duplicate remover respectively.
+        final String queueFile = "queue";
+        final String seenFile = "duplicate";
+        Spider crawler = new Spider(new SinaBlogProcessor());
+        DuplicateRemover seenUrls = new DuplicateStorageRemover(seenFile);
+        crawler.setScheduler(new MmapQueueScheduler(seenUrls, queueFile));
+        crawler.addUrl("http://blog.sina.com.cn/s/articlelist_1487828712_0_1.html").run();
+    }
+}
diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml
index d3a57f2..119e50f 100644
--- a/webmagic-saxon/pom.xml
+++ b/webmagic-saxon/pom.xml
@@ -3,7 +3,7 @@
webmagic-parent
us.codecraft
- 0.7.4
+ 0.7.5
4.0.0
diff --git a/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java b/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java
index d8aab6c..1f1f0a5 100644
--- a/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java
+++ b/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java
@@ -1,16 +1,11 @@
package us.codecraft.webmagic.selector;
-import net.sf.saxon.lib.NamespaceConstant;
-import net.sf.saxon.xpath.XPathEvaluator;
-import org.htmlcleaner.CleanerProperties;
-import org.htmlcleaner.DomSerializer;
-import org.htmlcleaner.HtmlCleaner;
-import org.htmlcleaner.TagNode;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import org.w3c.dom.Document;
-import org.w3c.dom.Node;
-import org.w3c.dom.NodeList;
+import java.io.StringWriter;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
import javax.xml.namespace.NamespaceContext;
import javax.xml.transform.OutputKeys;
@@ -21,12 +16,19 @@ import javax.xml.transform.stream.StreamResult;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
-import java.io.StringWriter;
-import java.util.ArrayList;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-import java.util.concurrent.ConcurrentHashMap;
+
+import org.htmlcleaner.CleanerProperties;
+import org.htmlcleaner.DomSerializer;
+import org.htmlcleaner.HtmlCleaner;
+import org.htmlcleaner.TagNode;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.w3c.dom.Document;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+
+import net.sf.saxon.lib.NamespaceConstant;
+import net.sf.saxon.xpath.XPathEvaluator;
/**
* 支持xpath2.0的选择器。包装了HtmlCleaner和Saxon HE。
diff --git a/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java b/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java
index aa3765a..32906b5 100644
--- a/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java
+++ b/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java
@@ -1,5 +1,7 @@
package us.codecraft.webmagic.selector;
+import java.util.List;
+
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.htmlcleaner.XPatherException;
@@ -8,6 +10,7 @@ import org.jsoup.nodes.Document;
import org.junit.Assert;
import org.junit.Ignore;
import org.junit.Test;
+
import us.codecraft.xsoup.XPathEvaluator;
import us.codecraft.xsoup.Xsoup;
@@ -1367,15 +1370,19 @@ public class XpathSelectorTest {
public void testXPath2() {
String text = "眉山:扎实推进农业农村工作 促农持续增收
\n" +
"2013-07-31 23:29:45 来源:眉山网 责任编辑:张斯炜
";
- XpathSelector xpathSelector = new XpathSelector("//h1/text()");
- System.out.println(xpathSelector.select(text));
+ Xpath2Selector xpathSelector = new Xpath2Selector("//h1/text()");
+ Assert.assertEquals("眉山:扎实推进农业农村工作 促农持续增收", xpathSelector.select(text));
}
@Test
public void testXpath2Selector() {
Xpath2Selector xpath2Selector = new Xpath2Selector("//a/@href");
String select = xpath2Selector.select(html);
- Assert.assertNotNull(select);
+ Assert.assertEquals("http://www.oschina.net/", select);
+
+ List selectList = xpath2Selector.selectList(html);
+ Assert.assertEquals(113, selectList.size());
+ Assert.assertEquals("http://www.oschina.net/", selectList.get(0));
}
@Ignore("take long time")
diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml
index 121aafa..1aca5b3 100755
--- a/webmagic-scripts/pom.xml
+++ b/webmagic-scripts/pom.xml
@@ -3,7 +3,7 @@
webmagic-parent
us.codecraft
- 0.7.4
+ 0.7.5
4.0.0
@@ -22,10 +22,6 @@
kotlin-stdlib
${kotlin.version}
-
- org.codehaus.groovy
- groovy-all
-
org.python
jython
diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml
index d0cb77c..42a6da9 100644
--- a/webmagic-selenium/pom.xml
+++ b/webmagic-selenium/pom.xml
@@ -3,7 +3,7 @@
webmagic-parent
us.codecraft
- 0.7.4
+ 0.7.5
4.0.0