From 0e01550a79883e7df6c0bd8d0b0ab31156a9412a Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Wed, 6 Jan 2021 03:13:50 +0800 Subject: [PATCH] Upgrade dependencies, including the jedis from 2.9.3 to 3.4.1. --- pom.xml | 30 +++--- .../webmagic/selector/LinksSelector.java | 12 +-- .../scheduler/RedisPriorityScheduler.java | 95 +++++++------------ .../webmagic/scheduler/RedisScheduler.java | 33 ++----- 4 files changed, 64 insertions(+), 106 deletions(-) diff --git a/pom.xml b/pom.xml index 08250fd..81d0d6c 100644 --- a/pom.xml +++ b/pom.xml @@ -73,17 +73,17 @@ org.apache.httpcomponents httpcore - 4.4.13 + 4.4.14 com.google.guava guava - 30.0-android + 30.1-jre com.jayway.jsonpath json-path - 2.4.0 + 2.5.0 org.slf4j @@ -103,7 +103,7 @@ com.alibaba fastjson - 1.2.69 + 1.2.75 com.github.dreamhead @@ -125,13 +125,13 @@ org.assertj assertj-core - 3.16.1 + 3.18.1 test org.apache.commons commons-lang3 - 3.10 + 3.11 commons-collections @@ -139,19 +139,19 @@ 3.2.2 - commons-io - commons-io - 2.7 - + commons-io + commons-io + 2.8.0 + org.codehaus.groovy groovy-all - 2.4.19 + 3.0.7 org.jruby jruby - 9.2.11.1 + 9.2.14.0 org.jsoup @@ -171,12 +171,12 @@ net.sf.saxon Saxon-HE - 10.1 + 10.3 net.sourceforge.htmlcleaner htmlcleaner - 2.5 + 2.24 com.github.detro @@ -191,7 +191,7 @@ redis.clients jedis - 2.9.3 + 3.4.1 diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/LinksSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/LinksSelector.java index 5296a74..2dafe8e 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/LinksSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/LinksSelector.java @@ -1,12 +1,12 @@ package us.codecraft.webmagic.selector; -import org.jsoup.helper.StringUtil; -import org.jsoup.nodes.Element; -import org.jsoup.select.Elements; - import java.util.ArrayList; import java.util.List; +import org.apache.commons.lang3.StringUtils; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; + /** * Links selector based on jsoup. Use absolute url.
* @@ -23,9 +23,9 @@ public class LinksSelector extends BaseElementSelector { @Override public List selectList(Element element) { Elements elements = element.select("a"); - List links = new ArrayList(elements.size()); + List links = new ArrayList<>(elements.size()); for (Element element0 : elements) { - if (!StringUtil.isBlank(element0.baseUri())) { + if (StringUtils.isNotBlank(element0.baseUri())) { links.add(element0.attr("abs:href")); } else { links.add(element0.attr("href")); diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisPriorityScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisPriorityScheduler.java index 540574a..46d47e5 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisPriorityScheduler.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisPriorityScheduler.java @@ -1,22 +1,23 @@ package us.codecraft.webmagic.scheduler; -import com.alibaba.fastjson.JSON; +import java.util.Set; + import org.apache.commons.codec.digest.DigestUtils; import org.apache.commons.lang3.StringUtils; + +import com.alibaba.fastjson.JSON; + import redis.clients.jedis.Jedis; import redis.clients.jedis.JedisPool; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; -import java.util.Set; - /** * the redis scheduler with priority * @author sai * Created by sai on 16-5-27. */ -public class RedisPriorityScheduler extends RedisScheduler -{ +public class RedisPriorityScheduler extends RedisScheduler { private static final String ZSET_PREFIX = "zset_"; @@ -37,62 +38,44 @@ public class RedisPriorityScheduler extends RedisScheduler } @Override - protected void pushWhenNoDuplicate(Request request, Task task) - { - Jedis jedis = pool.getResource(); - try - { - if(request.getPriority() > 0) + protected void pushWhenNoDuplicate(Request request, Task task) { + try (Jedis jedis = pool.getResource()) { + if (request.getPriority() > 0) { jedis.zadd(getZsetPlusPriorityKey(task), request.getPriority(), request.getUrl()); - else if(request.getPriority() < 0) + } else if (request.getPriority() < 0) { jedis.zadd(getZsetMinusPriorityKey(task), request.getPriority(), request.getUrl()); - else + } else { jedis.lpush(getQueueNoPriorityKey(task), request.getUrl()); + } setExtrasInItem(jedis, request, task); } - finally - { - pool.returnResource(jedis); - } } @Override - public synchronized Request poll(Task task) - { - Jedis jedis = pool.getResource(); - try - { + public synchronized Request poll(Task task) { + try (Jedis jedis = pool.getResource()) { String url = getRequest(jedis, task); - if(StringUtils.isBlank(url)) + if (StringUtils.isBlank(url)) { return null; + } return getExtrasInItem(jedis, url, task); } - finally - { - pool.returnResource(jedis); - } } - private String getRequest(Jedis jedis, Task task) - { + private String getRequest(Jedis jedis, Task task) { String url; Set urls = jedis.zrevrange(getZsetPlusPriorityKey(task), 0, 0); - if(urls.isEmpty()) - { + if (urls.isEmpty()) { url = jedis.lpop(getQueueNoPriorityKey(task)); - if(StringUtils.isBlank(url)) - { + if (StringUtils.isBlank(url)) { urls = jedis.zrevrange(getZsetMinusPriorityKey(task), 0, 0); - if(!urls.isEmpty()) - { + if (!urls.isEmpty()) { url = urls.toArray(new String[0])[0]; jedis.zrem(getZsetMinusPriorityKey(task), url); } } - } - else - { + } else { url = urls.toArray(new String[0])[0]; jedis.zrem(getZsetPlusPriorityKey(task), url); } @@ -100,51 +83,39 @@ public class RedisPriorityScheduler extends RedisScheduler } @Override - public void resetDuplicateCheck(Task task) - { - Jedis jedis = pool.getResource(); - try - { + public void resetDuplicateCheck(Task task) { + try (Jedis jedis = pool.getResource()) { jedis.del(getSetKey(task)); } - finally - { - pool.returnResource(jedis); - } } - private String getZsetPlusPriorityKey(Task task) - { + private String getZsetPlusPriorityKey(Task task) { return ZSET_PREFIX + task.getUUID() + PLUS_PRIORITY_SUFFIX; } - private String getQueueNoPriorityKey(Task task) - { + private String getQueueNoPriorityKey(Task task) { return QUEUE_PREFIX + task.getUUID() + NO_PRIORITY_SUFFIX; } - private String getZsetMinusPriorityKey(Task task) - { + private String getZsetMinusPriorityKey(Task task) { return ZSET_PREFIX + task.getUUID() + MINUS_PRIORITY_SUFFIX; } - private void setExtrasInItem(Jedis jedis,Request request, Task task) - { - if(request.getExtras() != null) - { - String field = DigestUtils.shaHex(request.getUrl()); + private void setExtrasInItem(Jedis jedis,Request request, Task task) { + if (request.getExtras() != null) { + String field = DigestUtils.sha1Hex(request.getUrl()); String value = JSON.toJSONString(request); jedis.hset(getItemKey(task), field, value); } } - private Request getExtrasInItem(Jedis jedis, String url, Task task) - { + private Request getExtrasInItem(Jedis jedis, String url, Task task) { String key = getItemKey(task); - String field = DigestUtils.shaHex(url); + String field = DigestUtils.sha1Hex(url); byte[] bytes = jedis.hget(key.getBytes(), field.getBytes()); - if(bytes != null) + if (bytes != null) { return JSON.parseObject(new String(bytes), Request.class); + } return new Request(url); } } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java index c70d885..19e8313 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java @@ -1,8 +1,10 @@ package us.codecraft.webmagic.scheduler; -import com.alibaba.fastjson.JSON; import org.apache.commons.codec.digest.DigestUtils; import org.apache.commons.lang3.StringUtils; + +import com.alibaba.fastjson.JSON; + import redis.clients.jedis.Jedis; import redis.clients.jedis.JedisPool; import redis.clients.jedis.JedisPoolConfig; @@ -37,21 +39,15 @@ public class RedisScheduler extends DuplicateRemovedScheduler implements Monitor @Override public void resetDuplicateCheck(Task task) { - Jedis jedis = pool.getResource(); - try { + try (Jedis jedis = pool.getResource()) { jedis.del(getSetKey(task)); - } finally { - pool.returnResource(jedis); } } @Override public boolean isDuplicate(Request request, Task task) { - Jedis jedis = pool.getResource(); - try { + try (Jedis jedis = pool.getResource()) { return jedis.sadd(getSetKey(task), request.getUrl()) == 0; - } finally { - pool.returnResource(jedis); } } @@ -62,7 +58,7 @@ public class RedisScheduler extends DuplicateRemovedScheduler implements Monitor try { jedis.rpush(getQueueKey(task), request.getUrl()); if (checkForAdditionalInfo(request)) { - String field = DigestUtils.shaHex(request.getUrl()); + String field = DigestUtils.sha1Hex(request.getUrl()); String value = JSON.toJSONString(request); jedis.hset((ITEM_PREFIX + task.getUUID()), field, value); } @@ -100,14 +96,13 @@ public class RedisScheduler extends DuplicateRemovedScheduler implements Monitor @Override public synchronized Request poll(Task task) { - Jedis jedis = pool.getResource(); - try { + try (Jedis jedis = pool.getResource()) { String url = jedis.lpop(getQueueKey(task)); if (url == null) { return null; } String key = ITEM_PREFIX + task.getUUID(); - String field = DigestUtils.shaHex(url); + String field = DigestUtils.sha1Hex(url); byte[] bytes = jedis.hget(key.getBytes(), field.getBytes()); if (bytes != null) { Request o = JSON.parseObject(new String(bytes), Request.class); @@ -115,8 +110,6 @@ public class RedisScheduler extends DuplicateRemovedScheduler implements Monitor } Request request = new Request(url); return request; - } finally { - pool.returnResource(jedis); } } @@ -134,23 +127,17 @@ public class RedisScheduler extends DuplicateRemovedScheduler implements Monitor @Override public int getLeftRequestsCount(Task task) { - Jedis jedis = pool.getResource(); - try { + try (Jedis jedis = pool.getResource()) { Long size = jedis.llen(getQueueKey(task)); return size.intValue(); - } finally { - pool.returnResource(jedis); } } @Override public int getTotalRequestsCount(Task task) { - Jedis jedis = pool.getResource(); - try { + try (Jedis jedis = pool.getResource()) { Long size = jedis.scard(getSetKey(task)); return size.intValue(); - } finally { - pool.returnResource(jedis); } } }