From e14a7626321d9324164b21ff23d4fc17be81d57d Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Tue, 5 Jan 2021 23:14:24 +0800 Subject: [PATCH 01/22] Add gitflow-maven-plugin. --- pom.xml | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/pom.xml b/pom.xml index 6341bc0..0a24519 100644 --- a/pom.xml +++ b/pom.xml @@ -317,7 +317,20 @@ maven-release-plugin 3.0.0-M1 + + com.amashchenko.maven.plugin + gitflow-maven-plugin + + + + + com.amashchenko.maven.plugin + gitflow-maven-plugin + 1.15.0 + + + From 0d73f08ef6bdb3abb972b6ddcb3fd1737d93d8eb Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Wed, 6 Jan 2021 02:29:34 +0800 Subject: [PATCH 02/22] Upgrade maven plugins. --- pom.xml | 44 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 40 insertions(+), 4 deletions(-) diff --git a/pom.xml b/pom.xml index 0a24519..08250fd 100644 --- a/pom.xml +++ b/pom.xml @@ -221,7 +221,6 @@ org.apache.maven.plugins maven-surefire-plugin - 3.0.0-M4 0 @@ -229,7 +228,6 @@ org.apache.maven.plugins maven-compiler-plugin - 3.8.1 ${java.version} ${java.version} @@ -258,12 +256,10 @@ org.apache.maven.plugins maven-resources-plugin - 3.1.0 org.apache.maven.plugins maven-jar-plugin - 3.2.0 log4j.xml @@ -324,6 +320,46 @@ + + org.apache.maven.plugins + maven-clean-plugin + 3.1.0 + + + org.apache.maven.plugins + maven-compiler-plugin + 3.8.1 + + + org.apache.maven.plugins + maven-deploy-plugin + 3.0.0-M1 + + + org.apache.maven.plugins + maven-install-plugin + 3.0.0-M1 + + + org.apache.maven.plugins + maven-jar-plugin + 3.2.0 + + + org.apache.maven.plugins + maven-resources-plugin + 3.1.0 + + + org.apache.maven.plugins + maven-site-plugin + 3.9.0 + + + org.apache.maven.plugins + maven-surefire-plugin + 3.0.0-M5 + com.amashchenko.maven.plugin gitflow-maven-plugin From 0e01550a79883e7df6c0bd8d0b0ab31156a9412a Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Wed, 6 Jan 2021 03:13:50 +0800 Subject: [PATCH 03/22] Upgrade dependencies, including the jedis from 2.9.3 to 3.4.1. --- pom.xml | 30 +++--- .../webmagic/selector/LinksSelector.java | 12 +-- .../scheduler/RedisPriorityScheduler.java | 95 +++++++------------ .../webmagic/scheduler/RedisScheduler.java | 33 ++----- 4 files changed, 64 insertions(+), 106 deletions(-) diff --git a/pom.xml b/pom.xml index 08250fd..81d0d6c 100644 --- a/pom.xml +++ b/pom.xml @@ -73,17 +73,17 @@ org.apache.httpcomponents httpcore - 4.4.13 + 4.4.14 com.google.guava guava - 30.0-android + 30.1-jre com.jayway.jsonpath json-path - 2.4.0 + 2.5.0 org.slf4j @@ -103,7 +103,7 @@ com.alibaba fastjson - 1.2.69 + 1.2.75 com.github.dreamhead @@ -125,13 +125,13 @@ org.assertj assertj-core - 3.16.1 + 3.18.1 test org.apache.commons commons-lang3 - 3.10 + 3.11 commons-collections @@ -139,19 +139,19 @@ 3.2.2 - commons-io - commons-io - 2.7 - + commons-io + commons-io + 2.8.0 + org.codehaus.groovy groovy-all - 2.4.19 + 3.0.7 org.jruby jruby - 9.2.11.1 + 9.2.14.0 org.jsoup @@ -171,12 +171,12 @@ net.sf.saxon Saxon-HE - 10.1 + 10.3 net.sourceforge.htmlcleaner htmlcleaner - 2.5 + 2.24 com.github.detro @@ -191,7 +191,7 @@ redis.clients jedis - 2.9.3 + 3.4.1 diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/LinksSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/LinksSelector.java index 5296a74..2dafe8e 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/LinksSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/LinksSelector.java @@ -1,12 +1,12 @@ package us.codecraft.webmagic.selector; -import org.jsoup.helper.StringUtil; -import org.jsoup.nodes.Element; -import org.jsoup.select.Elements; - import java.util.ArrayList; import java.util.List; +import org.apache.commons.lang3.StringUtils; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; + /** * Links selector based on jsoup. Use absolute url.
* @@ -23,9 +23,9 @@ public class LinksSelector extends BaseElementSelector { @Override public List selectList(Element element) { Elements elements = element.select("a"); - List links = new ArrayList(elements.size()); + List links = new ArrayList<>(elements.size()); for (Element element0 : elements) { - if (!StringUtil.isBlank(element0.baseUri())) { + if (StringUtils.isNotBlank(element0.baseUri())) { links.add(element0.attr("abs:href")); } else { links.add(element0.attr("href")); diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisPriorityScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisPriorityScheduler.java index 540574a..46d47e5 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisPriorityScheduler.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisPriorityScheduler.java @@ -1,22 +1,23 @@ package us.codecraft.webmagic.scheduler; -import com.alibaba.fastjson.JSON; +import java.util.Set; + import org.apache.commons.codec.digest.DigestUtils; import org.apache.commons.lang3.StringUtils; + +import com.alibaba.fastjson.JSON; + import redis.clients.jedis.Jedis; import redis.clients.jedis.JedisPool; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; -import java.util.Set; - /** * the redis scheduler with priority * @author sai * Created by sai on 16-5-27. */ -public class RedisPriorityScheduler extends RedisScheduler -{ +public class RedisPriorityScheduler extends RedisScheduler { private static final String ZSET_PREFIX = "zset_"; @@ -37,62 +38,44 @@ public class RedisPriorityScheduler extends RedisScheduler } @Override - protected void pushWhenNoDuplicate(Request request, Task task) - { - Jedis jedis = pool.getResource(); - try - { - if(request.getPriority() > 0) + protected void pushWhenNoDuplicate(Request request, Task task) { + try (Jedis jedis = pool.getResource()) { + if (request.getPriority() > 0) { jedis.zadd(getZsetPlusPriorityKey(task), request.getPriority(), request.getUrl()); - else if(request.getPriority() < 0) + } else if (request.getPriority() < 0) { jedis.zadd(getZsetMinusPriorityKey(task), request.getPriority(), request.getUrl()); - else + } else { jedis.lpush(getQueueNoPriorityKey(task), request.getUrl()); + } setExtrasInItem(jedis, request, task); } - finally - { - pool.returnResource(jedis); - } } @Override - public synchronized Request poll(Task task) - { - Jedis jedis = pool.getResource(); - try - { + public synchronized Request poll(Task task) { + try (Jedis jedis = pool.getResource()) { String url = getRequest(jedis, task); - if(StringUtils.isBlank(url)) + if (StringUtils.isBlank(url)) { return null; + } return getExtrasInItem(jedis, url, task); } - finally - { - pool.returnResource(jedis); - } } - private String getRequest(Jedis jedis, Task task) - { + private String getRequest(Jedis jedis, Task task) { String url; Set urls = jedis.zrevrange(getZsetPlusPriorityKey(task), 0, 0); - if(urls.isEmpty()) - { + if (urls.isEmpty()) { url = jedis.lpop(getQueueNoPriorityKey(task)); - if(StringUtils.isBlank(url)) - { + if (StringUtils.isBlank(url)) { urls = jedis.zrevrange(getZsetMinusPriorityKey(task), 0, 0); - if(!urls.isEmpty()) - { + if (!urls.isEmpty()) { url = urls.toArray(new String[0])[0]; jedis.zrem(getZsetMinusPriorityKey(task), url); } } - } - else - { + } else { url = urls.toArray(new String[0])[0]; jedis.zrem(getZsetPlusPriorityKey(task), url); } @@ -100,51 +83,39 @@ public class RedisPriorityScheduler extends RedisScheduler } @Override - public void resetDuplicateCheck(Task task) - { - Jedis jedis = pool.getResource(); - try - { + public void resetDuplicateCheck(Task task) { + try (Jedis jedis = pool.getResource()) { jedis.del(getSetKey(task)); } - finally - { - pool.returnResource(jedis); - } } - private String getZsetPlusPriorityKey(Task task) - { + private String getZsetPlusPriorityKey(Task task) { return ZSET_PREFIX + task.getUUID() + PLUS_PRIORITY_SUFFIX; } - private String getQueueNoPriorityKey(Task task) - { + private String getQueueNoPriorityKey(Task task) { return QUEUE_PREFIX + task.getUUID() + NO_PRIORITY_SUFFIX; } - private String getZsetMinusPriorityKey(Task task) - { + private String getZsetMinusPriorityKey(Task task) { return ZSET_PREFIX + task.getUUID() + MINUS_PRIORITY_SUFFIX; } - private void setExtrasInItem(Jedis jedis,Request request, Task task) - { - if(request.getExtras() != null) - { - String field = DigestUtils.shaHex(request.getUrl()); + private void setExtrasInItem(Jedis jedis,Request request, Task task) { + if (request.getExtras() != null) { + String field = DigestUtils.sha1Hex(request.getUrl()); String value = JSON.toJSONString(request); jedis.hset(getItemKey(task), field, value); } } - private Request getExtrasInItem(Jedis jedis, String url, Task task) - { + private Request getExtrasInItem(Jedis jedis, String url, Task task) { String key = getItemKey(task); - String field = DigestUtils.shaHex(url); + String field = DigestUtils.sha1Hex(url); byte[] bytes = jedis.hget(key.getBytes(), field.getBytes()); - if(bytes != null) + if (bytes != null) { return JSON.parseObject(new String(bytes), Request.class); + } return new Request(url); } } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java index c70d885..19e8313 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java @@ -1,8 +1,10 @@ package us.codecraft.webmagic.scheduler; -import com.alibaba.fastjson.JSON; import org.apache.commons.codec.digest.DigestUtils; import org.apache.commons.lang3.StringUtils; + +import com.alibaba.fastjson.JSON; + import redis.clients.jedis.Jedis; import redis.clients.jedis.JedisPool; import redis.clients.jedis.JedisPoolConfig; @@ -37,21 +39,15 @@ public class RedisScheduler extends DuplicateRemovedScheduler implements Monitor @Override public void resetDuplicateCheck(Task task) { - Jedis jedis = pool.getResource(); - try { + try (Jedis jedis = pool.getResource()) { jedis.del(getSetKey(task)); - } finally { - pool.returnResource(jedis); } } @Override public boolean isDuplicate(Request request, Task task) { - Jedis jedis = pool.getResource(); - try { + try (Jedis jedis = pool.getResource()) { return jedis.sadd(getSetKey(task), request.getUrl()) == 0; - } finally { - pool.returnResource(jedis); } } @@ -62,7 +58,7 @@ public class RedisScheduler extends DuplicateRemovedScheduler implements Monitor try { jedis.rpush(getQueueKey(task), request.getUrl()); if (checkForAdditionalInfo(request)) { - String field = DigestUtils.shaHex(request.getUrl()); + String field = DigestUtils.sha1Hex(request.getUrl()); String value = JSON.toJSONString(request); jedis.hset((ITEM_PREFIX + task.getUUID()), field, value); } @@ -100,14 +96,13 @@ public class RedisScheduler extends DuplicateRemovedScheduler implements Monitor @Override public synchronized Request poll(Task task) { - Jedis jedis = pool.getResource(); - try { + try (Jedis jedis = pool.getResource()) { String url = jedis.lpop(getQueueKey(task)); if (url == null) { return null; } String key = ITEM_PREFIX + task.getUUID(); - String field = DigestUtils.shaHex(url); + String field = DigestUtils.sha1Hex(url); byte[] bytes = jedis.hget(key.getBytes(), field.getBytes()); if (bytes != null) { Request o = JSON.parseObject(new String(bytes), Request.class); @@ -115,8 +110,6 @@ public class RedisScheduler extends DuplicateRemovedScheduler implements Monitor } Request request = new Request(url); return request; - } finally { - pool.returnResource(jedis); } } @@ -134,23 +127,17 @@ public class RedisScheduler extends DuplicateRemovedScheduler implements Monitor @Override public int getLeftRequestsCount(Task task) { - Jedis jedis = pool.getResource(); - try { + try (Jedis jedis = pool.getResource()) { Long size = jedis.llen(getQueueKey(task)); return size.intValue(); - } finally { - pool.returnResource(jedis); } } @Override public int getTotalRequestsCount(Task task) { - Jedis jedis = pool.getResource(); - try { + try (Jedis jedis = pool.getResource()) { Long size = jedis.scard(getSetKey(task)); return size.intValue(); - } finally { - pool.returnResource(jedis); } } } From d0e2776991b3aae0eb745f4e76562712584eb44e Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Sun, 10 Jan 2021 14:10:32 +0800 Subject: [PATCH 04/22] Upgrade xsoup from 0.3.1 to 0.3.2. --- pom.xml | 7 +------ webmagic-core/pom.xml | 5 ----- webmagic-scripts/pom.xml | 4 ---- 3 files changed, 1 insertion(+), 15 deletions(-) diff --git a/pom.xml b/pom.xml index 81d0d6c..c5b7dfe 100644 --- a/pom.xml +++ b/pom.xml @@ -98,7 +98,7 @@ us.codecraft xsoup - 0.3.1 + 0.3.2 com.alibaba @@ -153,11 +153,6 @@ jruby 9.2.14.0 - - org.jsoup - jsoup - 1.10.3 - org.python jython diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 4b89cac..820651a 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -61,11 +61,6 @@ assertj-core - - org.jsoup - jsoup - - commons-io commons-io diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index 121aafa..85b735f 100755 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -22,10 +22,6 @@ kotlin-stdlib ${kotlin.version} - - org.codehaus.groovy - groovy-all - org.python jython From 2f71f7912c1d104fbd42f7ca14fea4ac764efd8a Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Sun, 10 Jan 2021 14:31:40 +0800 Subject: [PATCH 05/22] Fix scm tag. --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index c5b7dfe..97897db 100644 --- a/pom.xml +++ b/pom.xml @@ -33,7 +33,7 @@ scm:git:git@github.com:code4craft/webmagic.git scm:git:git@github.com:code4craft/webmagic.git git@github.com:code4craft/webmagic.git - webmagic-parent-0.6.1 + WebMagic-${project.version} From 683db09133b16ada8f6ea6de12a9b62a1a0705d4 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Mon, 11 Jan 2021 00:35:22 +0800 Subject: [PATCH 06/22] Complete testXPath2 assertion. --- .../java/us/codecraft/webmagic/selector/XpathSelectorTest.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java b/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java index aa3765a..38aac15 100644 --- a/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java +++ b/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java @@ -8,6 +8,7 @@ import org.jsoup.nodes.Document; import org.junit.Assert; import org.junit.Ignore; import org.junit.Test; + import us.codecraft.xsoup.XPathEvaluator; import us.codecraft.xsoup.Xsoup; @@ -1368,7 +1369,7 @@ public class XpathSelectorTest { String text = "

眉山:扎实推进农业农村工作 促农持续增收
\n" + "2013-07-31 23:29:45   来源:眉山网      责任编辑:张斯炜

"; XpathSelector xpathSelector = new XpathSelector("//h1/text()"); - System.out.println(xpathSelector.select(text)); + Assert.assertEquals("眉山:扎实推进农业农村工作 促农持续增收 ", xpathSelector.select(text)); } @Test From 124c52b9884b1c855e47cfcdddbc1e7d9c613dbe Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Mon, 11 Jan 2021 01:25:41 +0800 Subject: [PATCH 07/22] Downgrade htmlcleaner from 2.24 back to 2.5, to make Xpath2Selector pass the test cases. --- pom.xml | 2 +- .../webmagic/selector/Xpath2Selector.java | 36 ++++++++++--------- .../webmagic/selector/XpathSelectorTest.java | 12 +++++-- 3 files changed, 29 insertions(+), 21 deletions(-) diff --git a/pom.xml b/pom.xml index 97897db..16e14cf 100644 --- a/pom.xml +++ b/pom.xml @@ -171,7 +171,7 @@ net.sourceforge.htmlcleaner htmlcleaner - 2.24 + 2.5 com.github.detro diff --git a/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java b/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java index d8aab6c..1f1f0a5 100644 --- a/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java +++ b/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java @@ -1,16 +1,11 @@ package us.codecraft.webmagic.selector; -import net.sf.saxon.lib.NamespaceConstant; -import net.sf.saxon.xpath.XPathEvaluator; -import org.htmlcleaner.CleanerProperties; -import org.htmlcleaner.DomSerializer; -import org.htmlcleaner.HtmlCleaner; -import org.htmlcleaner.TagNode; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.w3c.dom.Document; -import org.w3c.dom.Node; -import org.w3c.dom.NodeList; +import java.io.StringWriter; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; import javax.xml.namespace.NamespaceContext; import javax.xml.transform.OutputKeys; @@ -21,12 +16,19 @@ import javax.xml.transform.stream.StreamResult; import javax.xml.xpath.XPathConstants; import javax.xml.xpath.XPathExpression; import javax.xml.xpath.XPathExpressionException; -import java.io.StringWriter; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; -import java.util.Map; -import java.util.concurrent.ConcurrentHashMap; + +import org.htmlcleaner.CleanerProperties; +import org.htmlcleaner.DomSerializer; +import org.htmlcleaner.HtmlCleaner; +import org.htmlcleaner.TagNode; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.w3c.dom.Document; +import org.w3c.dom.Node; +import org.w3c.dom.NodeList; + +import net.sf.saxon.lib.NamespaceConstant; +import net.sf.saxon.xpath.XPathEvaluator; /** * 支持xpath2.0的选择器。包装了HtmlCleaner和Saxon HE。
diff --git a/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java b/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java index 38aac15..32906b5 100644 --- a/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java +++ b/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java @@ -1,5 +1,7 @@ package us.codecraft.webmagic.selector; +import java.util.List; + import org.htmlcleaner.HtmlCleaner; import org.htmlcleaner.TagNode; import org.htmlcleaner.XPatherException; @@ -1368,15 +1370,19 @@ public class XpathSelectorTest { public void testXPath2() { String text = "

眉山:扎实推进农业农村工作 促农持续增收
\n" + "2013-07-31 23:29:45   来源:眉山网      责任编辑:张斯炜

"; - XpathSelector xpathSelector = new XpathSelector("//h1/text()"); - Assert.assertEquals("眉山:扎实推进农业农村工作 促农持续增收 ", xpathSelector.select(text)); + Xpath2Selector xpathSelector = new Xpath2Selector("//h1/text()"); + Assert.assertEquals("眉山:扎实推进农业农村工作 促农持续增收", xpathSelector.select(text)); } @Test public void testXpath2Selector() { Xpath2Selector xpath2Selector = new Xpath2Selector("//a/@href"); String select = xpath2Selector.select(html); - Assert.assertNotNull(select); + Assert.assertEquals("http://www.oschina.net/", select); + + List selectList = xpath2Selector.selectList(html); + Assert.assertEquals(113, selectList.size()); + Assert.assertEquals("http://www.oschina.net/", selectList.get(0)); } @Ignore("take long time") From d92dc8397f336c2757ce4559ac92daf7bf82aa61 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Mon, 11 Jan 2021 01:46:32 +0800 Subject: [PATCH 08/22] Upgrade htmlcleaner from 2.5 to 2.9, this is the highest version to let Xpath2Selector pass the test cases. --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 16e14cf..df7d6da 100644 --- a/pom.xml +++ b/pom.xml @@ -171,7 +171,7 @@ net.sourceforge.htmlcleaner htmlcleaner - 2.5 + 2.9 com.github.detro From 54127318a4266fc53037e9f1b51a6eb3102e7aaf Mon Sep 17 00:00:00 2001 From: JustThink Date: Wed, 3 Feb 2021 02:43:53 +1300 Subject: [PATCH 09/22] =?UTF-8?q?SpiderStatus=E4=B8=ADgetPagePerSecond()?= =?UTF-8?q?=E6=96=B9=E6=B3=95=EF=BC=8C=E5=A2=9E=E5=8A=A0=E9=AA=8C=E8=AF=81?= =?UTF-8?q?=E9=80=BB=E8=BE=91=EF=BC=8C=E9=81=BF=E5=85=8D=E7=A9=BA=E6=8C=87?= =?UTF-8?q?=E9=92=88=EF=BC=8C=E9=81=BF=E5=85=8D=E9=99=A4=E6=95=B0=E4=B8=BA?= =?UTF-8?q?=E9=9B=B6=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../java/us/codecraft/webmagic/monitor/SpiderStatus.java | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderStatus.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderStatus.java index a87c040..69afe04 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderStatus.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderStatus.java @@ -84,8 +84,13 @@ public class SpiderStatus implements SpiderStatusMXBean { @Override public int getPagePerSecond() { - int runSeconds = (int) (System.currentTimeMillis() - getStartTime().getTime()) / 1000; - return getSuccessPageCount() / runSeconds; + if (getStartTime() != null) { + int runSeconds = (int) (System.currentTimeMillis() - getStartTime().getTime()) / 1000; + if (runSeconds != 0) { + return getSuccessPageCount() / runSeconds; + } + } + return -1; } } From 528a8908afe92a858b4ea0bcb3f403137fa9847a Mon Sep 17 00:00:00 2001 From: wecandoitjustthink Date: Sat, 27 Feb 2021 19:59:05 +1300 Subject: [PATCH 10/22] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E4=BA=86List=E5=B1=9E=E6=80=A7=E7=9A=84get=E6=96=B9=E6=B3=95,?= =?UTF-8?q?=E4=BE=9BSpiderMonitor=E7=9A=84=E5=AD=90=E7=B1=BB=E8=8E=B7?= =?UTF-8?q?=E5=8F=96.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../java/us/codecraft/webmagic/monitor/SpiderMonitor.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java index cfb4a82..b213dda 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java @@ -68,6 +68,10 @@ public class SpiderMonitor { return new SpiderStatus(spider, monitorSpiderListener); } + protected List getSpiderStatuses() { + return this.spiderStatuses; + } + public static SpiderMonitor instance() { return INSTANCE; } From dcfd23841310face234c0355b824416c5c12046e Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Mon, 1 Mar 2021 01:06:42 +0800 Subject: [PATCH 11/22] Polish java version setting. --- pom.xml | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/pom.xml b/pom.xml index df7d6da..12c3dbf 100644 --- a/pom.xml +++ b/pom.xml @@ -7,7 +7,8 @@ UTF-8 UTF-8 - 1.8 + 1.8 + 1.8 4.0.0.RELEASE webmagic-parent @@ -223,10 +224,6 @@ org.apache.maven.plugins maven-compiler-plugin - - ${java.version} - ${java.version} - From 4e8a086dae80e13a503f5a72e0e17d7b96c884fc Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Mon, 22 Mar 2021 18:18:10 +0800 Subject: [PATCH 12/22] Pass exception to onError. Fixes #1005. --- .../src/main/java/us/codecraft/webmagic/Spider.java | 13 +++++++++++-- .../java/us/codecraft/webmagic/SpiderListener.java | 9 +++++++++ 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 886e74a..54fc220 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -320,7 +320,7 @@ public class Spider implements Runnable, Task { processRequest(request); onSuccess(request); } catch (Exception e) { - onError(request); + onError(request, e); logger.error("process request " + request + " error", e); } finally { pageCount.incrementAndGet(); @@ -338,10 +338,19 @@ public class Spider implements Runnable, Task { logger.info("Spider {} closed! {} pages downloaded.", getUUID(), pageCount.get()); } + /** + * @deprecated Use {@link #onError(Request, Exception)} instead. + */ + @Deprecated protected void onError(Request request) { + } + + protected void onError(Request request, Exception e) { + this.onError(request); + if (CollectionUtils.isNotEmpty(spiderListeners)) { for (SpiderListener spiderListener : spiderListeners) { - spiderListener.onError(request); + spiderListener.onError(request, e); } } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/SpiderListener.java b/webmagic-core/src/main/java/us/codecraft/webmagic/SpiderListener.java index 0678180..8f10e0e 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/SpiderListener.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/SpiderListener.java @@ -10,5 +10,14 @@ public interface SpiderListener { public void onSuccess(Request request); + /** + * @deprecated Use {@link #onError(Request, Exception)} instead. + */ + @Deprecated public void onError(Request request); + + default void onError(Request request, Exception e) { + this.onError(request); + } + } From be6f5ff77114eed558f3af86781395dabe9ad8f6 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Mon, 22 Mar 2021 18:18:42 +0800 Subject: [PATCH 13/22] Add missing @Deprecated annotations. --- .../src/main/java/us/codecraft/webmagic/Spider.java | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 54fc220..5940e73 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -208,7 +208,8 @@ public class Spider implements Runnable, Task { * @see #addPipeline(us.codecraft.webmagic.pipeline.Pipeline) * @deprecated */ - public Spider pipeline(Pipeline pipeline) { + @Deprecated + public Spider pipeline(Pipeline pipeline) { return addPipeline(pipeline); } @@ -258,7 +259,8 @@ public class Spider implements Runnable, Task { * @see #setDownloader(us.codecraft.webmagic.downloader.Downloader) * @deprecated */ - public Spider downloader(Downloader downloader) { + @Deprecated + public Spider downloader(Downloader downloader) { return setDownloader(downloader); } From 76f625c02e552ccd2834cd38cb0f46e2f3037db7 Mon Sep 17 00:00:00 2001 From: linweisen Date: Fri, 9 Apr 2021 17:00:00 +0800 Subject: [PATCH 14/22] =?UTF-8?q?=E6=8F=90=E4=BA=A4=E5=8F=AF=E6=81=A2?= =?UTF-8?q?=E5=A4=8D=E7=88=AC=E5=8F=96=E5=86=85=E5=AE=B9=E4=BE=8B=E5=AD=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- webmagic-samples/pom.xml | 20 +++++ .../recover/DuplicateStorageRemover.java | 82 +++++++++++++++++ .../webmagic/recover/MmapQueueScheduler.java | 89 +++++++++++++++++++ .../webmagic/recover/RecoverSample.java | 22 +++++ 4 files changed, 213 insertions(+) create mode 100644 webmagic-samples/src/main/java/us/codecraft/webmagic/recover/DuplicateStorageRemover.java create mode 100644 webmagic-samples/src/main/java/us/codecraft/webmagic/recover/MmapQueueScheduler.java create mode 100644 webmagic-samples/src/main/java/us/codecraft/webmagic/recover/RecoverSample.java diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index 3699fa6..6c0e59b 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -24,6 +24,26 @@ junit junit + + org.mapdb + mapdb + 3.0.7 + + + com.fasterxml.jackson.core + jackson-core + 2.9.5 + + + com.fasterxml.jackson.core + jackson-annotations + 2.9.5 + + + com.fasterxml.jackson.core + jackson-databind + 2.9.5 + diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/DuplicateStorageRemover.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/DuplicateStorageRemover.java new file mode 100644 index 0000000..5bf249e --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/DuplicateStorageRemover.java @@ -0,0 +1,82 @@ +package us.codecraft.webmagic.recover; + +import com.google.common.base.Charsets; +import com.google.common.hash.BloomFilter; +import com.google.common.hash.Funnels; +import org.mapdb.DB; +import org.mapdb.DBMaker; +import org.mapdb.IndexTreeList; +import org.mapdb.Serializer; +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.scheduler.component.DuplicateRemover; + +import java.util.concurrent.atomic.AtomicInteger; + +/** + * @author :linweisen + * @date :Created in 2021/4/9 14:46 + * @description:${description} + * @modified By: + * @version: 1.0 + */ +public class DuplicateStorageRemover implements DuplicateRemover { + + private DB db; + + private static String DATABASE_NAME = "duplicate"; + + private IndexTreeList urlDuplicateQueue; + + private BloomFilter bloomFilter; + + private AtomicInteger counter; + + public DuplicateStorageRemover(String path) { + + String duplicatStoragePath = path; + + DB db = DBMaker.fileDB(duplicatStoragePath) + .fileMmapEnableIfSupported() + .fileMmapPreclearDisable() + .cleanerHackEnable() + .closeOnJvmShutdown() + .transactionEnable() + .concurrencyScale(128) + .make(); + this.db = db; + + this.urlDuplicateQueue = db.indexTreeList(DATABASE_NAME, Serializer.STRING).createOrOpen(); + + counter = new AtomicInteger(this.urlDuplicateQueue.size()); + this.bloomFilter = BloomFilter.create(Funnels.stringFunnel(Charsets.UTF_8), 200000, 1E-7); + for (String url : this.urlDuplicateQueue){ + bloomFilter.put(url); + } + + } + + @Override + public boolean isDuplicate(Request request, Task task) { + String url = request.getUrl(); + boolean isDuplicate = bloomFilter.mightContain(url); + if (!isDuplicate) { + bloomFilter.put(url); + urlDuplicateQueue.add(url); + this.db.commit(); + counter.incrementAndGet(); + } + return isDuplicate; + } + + @Override + public void resetDuplicateCheck(Task task) { + this.bloomFilter = BloomFilter.create(Funnels.stringFunnel(Charsets.UTF_8), 200000, 1E-7); + this.urlDuplicateQueue.clear(); + } + + @Override + public int getTotalRequestsCount(Task task) { + return counter.get(); + } +} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/MmapQueueScheduler.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/MmapQueueScheduler.java new file mode 100644 index 0000000..07cfa22 --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/MmapQueueScheduler.java @@ -0,0 +1,89 @@ +package us.codecraft.webmagic.recover; + +import com.fasterxml.jackson.databind.ObjectMapper; +import org.apache.commons.lang3.StringUtils; +import org.mapdb.DB; +import org.mapdb.DBMaker; +import org.mapdb.IndexTreeList; +import org.mapdb.Serializer; +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.scheduler.DuplicateRemovedScheduler; +import us.codecraft.webmagic.scheduler.component.DuplicateRemover; + +import java.io.IOException; + +/** + * @author :linweisen + * @date :Created in 2021/4/9 14:38 + * @description:${description} + * @modified By: + * @version: 1.0 + */ +public class MmapQueueScheduler extends DuplicateRemovedScheduler { + + private DB db; + + private static String DATABASE_NAME = "queue"; + + private IndexTreeList queue; + + private static ObjectMapper mapper; + + public MmapQueueScheduler(DuplicateRemover duplicateRemover, String path) { + super.setDuplicateRemover(duplicateRemover); + + String queuePath = path; + + DB db = DBMaker.fileDB(queuePath) + .fileMmapEnableIfSupported() + .fileMmapPreclearDisable() + .cleanerHackEnable() + .closeOnJvmShutdown() + .transactionEnable() + .concurrencyScale(128) + .make(); + this.db = db; + this.mapper = new ObjectMapper(); + this.queue = db.indexTreeList(MmapQueueScheduler.DATABASE_NAME, Serializer.STRING).createOrOpen(); + } + + @Override + public Request poll(Task task) { + if (this.queue.size() > 0){ + String s = queue.remove(0); + return fromJson(s, Request.class); + }else{ + return null; + } + + } + + @Override + public void pushWhenNoDuplicate(Request request, Task task) { + queue.add(toJson(request)); + this.db.commit(); + } + + public String toJson(Object object) { + try { + return mapper.writeValueAsString(object); + } catch (IOException e) { + logger.warn("write to json string error:" + object, e); + return null; + } + } + + public T fromJson(String jsonString, Class clazz) { + if (StringUtils.isEmpty(jsonString)) { + return null; + } + try { + return mapper.readValue(jsonString, clazz); + } catch (IOException e) { + logger.warn("parse json string error:" + jsonString, e); + return null; + } + } + +} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/RecoverSample.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/RecoverSample.java new file mode 100644 index 0000000..4fb91a0 --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/RecoverSample.java @@ -0,0 +1,22 @@ +package us.codecraft.webmagic.recover; + + +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.samples.SinaBlogProcessor; +import us.codecraft.webmagic.scheduler.component.DuplicateRemover; + +/** + * @author code4crafter@gmail.com
+ */ +public class RecoverSample { + + public static void main(String[] args) { + String storage = "queue"; + String duplicate = "duplicate"; + Spider spider = new Spider(new SinaBlogProcessor()); + DuplicateRemover remover = new DuplicateStorageRemover(duplicate); + spider.setScheduler(new MmapQueueScheduler(remover, storage)); + spider.addUrl("http://blog.sina.com.cn/s/articlelist_1487828712_0_1.html") + .run(); + } +} From dba0ddb92cd9a80553a0d01eec94b0ac97e4f7de Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Wed, 28 Apr 2021 12:17:52 +0800 Subject: [PATCH 15/22] Remove unknown tag from javadoc. --- .../codecraft/webmagic/recover/DuplicateStorageRemover.java | 4 ---- .../us/codecraft/webmagic/recover/MmapQueueScheduler.java | 4 ---- 2 files changed, 8 deletions(-) diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/DuplicateStorageRemover.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/DuplicateStorageRemover.java index 5bf249e..bee80e7 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/DuplicateStorageRemover.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/DuplicateStorageRemover.java @@ -15,10 +15,6 @@ import java.util.concurrent.atomic.AtomicInteger; /** * @author :linweisen - * @date :Created in 2021/4/9 14:46 - * @description:${description} - * @modified By: - * @version: 1.0 */ public class DuplicateStorageRemover implements DuplicateRemover { diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/MmapQueueScheduler.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/MmapQueueScheduler.java index 07cfa22..4cee18a 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/MmapQueueScheduler.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/MmapQueueScheduler.java @@ -15,10 +15,6 @@ import java.io.IOException; /** * @author :linweisen - * @date :Created in 2021/4/9 14:38 - * @description:${description} - * @modified By: - * @version: 1.0 */ public class MmapQueueScheduler extends DuplicateRemovedScheduler { From 189c5962e6a6f68ed1c76c517d59353031cbb77a Mon Sep 17 00:00:00 2001 From: Guy Korland Date: Tue, 18 May 2021 16:55:29 +0300 Subject: [PATCH 16/22] Update to Jedis 3.6.0 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 12c3dbf..c1f16b7 100644 --- a/pom.xml +++ b/pom.xml @@ -187,7 +187,7 @@ redis.clients jedis - 3.4.1 + 3.6.0 From db70b6e095cf070bdea46816371be3f33848b9b9 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Tue, 22 Jun 2021 21:58:49 +0800 Subject: [PATCH 17/22] Add maven reports. --- pom.xml | 59 ++++++++++++++++++++++++++++++-- src/site/site.xml | 23 +++++++++++++ webmagic-coverage/pom.xml | 72 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 151 insertions(+), 3 deletions(-) create mode 100644 src/site/site.xml create mode 100644 webmagic-coverage/pom.xml diff --git a/pom.xml b/pom.xml index c1f16b7..4468ba0 100644 --- a/pom.xml +++ b/pom.xml @@ -50,6 +50,7 @@ webmagic-selenium webmagic-saxon webmagic-samples + webmagic-coverage @@ -217,9 +218,6 @@ org.apache.maven.plugins maven-surefire-plugin - - 0 - org.apache.maven.plugins @@ -305,6 +303,24 @@ maven-release-plugin 3.0.0-M1 + + org.jacoco + jacoco-maven-plugin + + + + prepare-agent + + + + report + verify + + report + + + + com.amashchenko.maven.plugin gitflow-maven-plugin @@ -352,6 +368,11 @@ maven-surefire-plugin 3.0.0-M5 + + org.jacoco + jacoco-maven-plugin + 0.8.7 + com.amashchenko.maven.plugin gitflow-maven-plugin @@ -361,6 +382,38 @@
+ + + + org.apache.maven.plugins + maven-javadoc-plugin + + none + + + + org.apache.maven.plugins + maven-jxr-plugin + + + org.apache.maven.plugins + maven-pmd-plugin + + + org.apache.maven.plugins + maven-surefire-report-plugin + + + org.codehaus.mojo + taglist-maven-plugin + + + com.github.spotbugs + spotbugs-maven-plugin + + + + release diff --git a/src/site/site.xml b/src/site/site.xml new file mode 100644 index 0000000..d2d5caa --- /dev/null +++ b/src/site/site.xml @@ -0,0 +1,23 @@ + + + org.apache.maven.skins + maven-fluido-skin + 1.9 + + + + + + + + + true + true + true + pull-right + + + diff --git a/webmagic-coverage/pom.xml b/webmagic-coverage/pom.xml new file mode 100644 index 0000000..b1998a3 --- /dev/null +++ b/webmagic-coverage/pom.xml @@ -0,0 +1,72 @@ + + + 4.0.0 + + + us.codecraft + webmagic-parent + 0.7.4 + + + webmagic-coverage + pom + webmagic-coverage + Compute aggregated test code coverage + + + true + + + + + ${project.groupId} + webmagic-core + ${project.version} + + + ${project.groupId} + webmagic-extension + ${project.version} + + + ${project.groupId} + webmagic-scripts + ${project.version} + + + ${project.groupId} + webmagic-selenium + ${project.version} + + + ${project.groupId} + webmagic-saxon + ${project.version} + + + ${project.groupId} + webmagic-samples + ${project.version} + + + + + + + org.jacoco + jacoco-maven-plugin + + + + report-aggregate + + + + + + + + From 31a00f5f8e73b90bc28833b06ec6aa649b07f245 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Thu, 22 Jul 2021 12:09:02 +0800 Subject: [PATCH 18/22] Set gitflow-maven-plugin versionTagPrefix. --- pom.xml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pom.xml b/pom.xml index 4468ba0..efa5b42 100644 --- a/pom.xml +++ b/pom.xml @@ -324,6 +324,11 @@ com.amashchenko.maven.plugin gitflow-maven-plugin + + + WebMagic- + + From 14b09a33852a022209437216af48f509f654b1c6 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Thu, 22 Jul 2021 12:36:11 +0800 Subject: [PATCH 19/22] Update maven plugin versions. --- pom.xml | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/pom.xml b/pom.xml index efa5b42..6a237e6 100644 --- a/pom.xml +++ b/pom.xml @@ -208,7 +208,7 @@ - 3.0.5 + 3.3.9 @@ -358,21 +358,41 @@ maven-jar-plugin 3.2.0 + + org.apache.maven.plugins + maven-jxr-plugin + 3.1.1 + + + org.apache.maven.plugins + maven-pmd-plugin + 3.14.0 + org.apache.maven.plugins maven-resources-plugin - 3.1.0 + 3.2.0 org.apache.maven.plugins maven-site-plugin - 3.9.0 + 3.9.1 org.apache.maven.plugins maven-surefire-plugin 3.0.0-M5 + + org.apache.maven.plugins + maven-surefire-report-plugin + 3.0.0-M5 + + + org.codehaus.mojo + taglist-maven-plugin + 2.4 + org.jacoco jacoco-maven-plugin @@ -383,6 +403,11 @@ gitflow-maven-plugin 1.15.0 + + com.github.spotbugs + spotbugs-maven-plugin + 4.2.3 + From e3e66fb270782813cc35e1189949dd4ac2465299 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Thu, 22 Jul 2021 12:38:38 +0800 Subject: [PATCH 20/22] Upgrade webmagic-samples dependencies. --- webmagic-samples/pom.xml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index 6c0e59b..bdca2b6 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -27,22 +27,22 @@ org.mapdb mapdb - 3.0.7 + 3.0.8 com.fasterxml.jackson.core jackson-core - 2.9.5 + 2.13.0-rc1 com.fasterxml.jackson.core jackson-annotations - 2.9.5 + 2.13.0-rc1 com.fasterxml.jackson.core jackson-databind - 2.9.5 + 2.13.0-rc1 From 4e51a4f68bbba56a4dc4dc9cf2c3128027a2991b Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Thu, 22 Jul 2021 12:42:55 +0800 Subject: [PATCH 21/22] Update versions for release --- pom.xml | 2 +- webmagic-core/pom.xml | 2 +- webmagic-coverage/pom.xml | 2 +- webmagic-extension/pom.xml | 2 +- webmagic-samples/pom.xml | 2 +- webmagic-saxon/pom.xml | 2 +- webmagic-scripts/pom.xml | 2 +- webmagic-selenium/pom.xml | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pom.xml b/pom.xml index 6a237e6..990805e 100644 --- a/pom.xml +++ b/pom.xml @@ -1,7 +1,7 @@ us.codecraft - 0.7.4 + 0.7.5 4.0.0 pom diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 820651a..ec718a1 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.7.4 + 0.7.5 4.0.0 diff --git a/webmagic-coverage/pom.xml b/webmagic-coverage/pom.xml index b1998a3..16ed1b4 100644 --- a/webmagic-coverage/pom.xml +++ b/webmagic-coverage/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic-parent - 0.7.4 + 0.7.5 webmagic-coverage diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index 87900ef..85d5c63 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.7.4 + 0.7.5 4.0.0 diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index bdca2b6..dda1821 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.7.4 + 0.7.5 4.0.0 diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index d3a57f2..119e50f 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.7.4 + 0.7.5 4.0.0 diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index 85b735f..1aca5b3 100755 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.7.4 + 0.7.5 4.0.0 diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index d0cb77c..42a6da9 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.7.4 + 0.7.5 4.0.0 From 113eaa4baeb8dcb8d45a62d49aff5b75ead34c2e Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Thu, 22 Jul 2021 12:57:14 +0800 Subject: [PATCH 22/22] Bump version number to 0.7.5. --- README-zh.md | 7 ++++--- README.md | 6 ++++-- pom.xml | 2 +- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/README-zh.md b/README-zh.md index c5ebe15..62b3c9a 100644 --- a/README-zh.md +++ b/README-zh.md @@ -1,9 +1,10 @@ ![logo](http://webmagic.io/images/logo.jpeg) +[![Maven Central](https://maven-badges.herokuapp.com/maven-central/us.codecraft/webmagic-parent/badge.svg?subject=Maven%20Central)](https://maven-badges.herokuapp.com/maven-central/us.codecraft/webmagic-parent/) +[![License](https://img.shields.io/badge/License-Apache%20License%202.0-blue.svg)](https://www.apache.org/licenses/LICENSE-2.0.html) [![Build Status](https://travis-ci.org/code4craft/webmagic.png?branch=master)](https://travis-ci.org/code4craft/webmagic) - 官方网站[http://webmagic.io/](http://webmagic.io/) >webmagic是一个开源的Java垂直爬虫框架,目标是简化爬虫的开发流程,让开发者专注于逻辑功能的开发。webmagic的核心非常简单,但是覆盖爬虫的整个流程,也是很好的学习爬虫开发的材料。 @@ -38,12 +39,12 @@ webmagic使用maven管理依赖,在项目中添加对应的依赖即可使用w us.codecraft webmagic-core - 0.7.4 + 0.7.5 us.codecraft webmagic-extension - 0.7.4 + 0.7.5 ``` diff --git a/README.md b/README.md index e5cd511..14aeac7 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,8 @@ [Readme in Chinese](https://github.com/code4craft/webmagic/tree/master/README-zh.md) +[![Maven Central](https://maven-badges.herokuapp.com/maven-central/us.codecraft/webmagic-parent/badge.svg?subject=Maven%20Central)](https://maven-badges.herokuapp.com/maven-central/us.codecraft/webmagic-parent/) +[![License](https://img.shields.io/badge/License-Apache%20License%202.0-blue.svg)](https://www.apache.org/licenses/LICENSE-2.0.html) [![Build Status](https://travis-ci.org/code4craft/webmagic.png?branch=master)](https://travis-ci.org/code4craft/webmagic) >A scalable crawler framework. It covers the whole lifecycle of crawler: downloading, url management, content extraction and persistent. It can simplify the development of a specific crawler. @@ -23,12 +25,12 @@ Add dependencies to your pom.xml: us.codecraft webmagic-core - 0.7.4 + 0.7.5 us.codecraft webmagic-extension - 0.7.4 + 0.7.5 ``` diff --git a/pom.xml b/pom.xml index 990805e..51e6fdb 100644 --- a/pom.xml +++ b/pom.xml @@ -275,7 +275,7 @@ 3.2.0 UTF-8 - WebMagic 0.7.4 + WebMagic ${project.version} en_US