Upgrade dependencies, including jedis from 2.9.3 to 3.4.1.
parent
0d73f08ef6
commit
0e01550a79
24
pom.xml
24
pom.xml
|
@ -73,17 +73,17 @@
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.httpcomponents</groupId>
|
<groupId>org.apache.httpcomponents</groupId>
|
||||||
<artifactId>httpcore</artifactId>
|
<artifactId>httpcore</artifactId>
|
||||||
<version>4.4.13</version>
|
<version>4.4.14</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>com.google.guava</groupId>
|
<groupId>com.google.guava</groupId>
|
||||||
<artifactId>guava</artifactId>
|
<artifactId>guava</artifactId>
|
||||||
<version>30.0-android</version>
|
<version>30.1-jre</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>com.jayway.jsonpath</groupId>
|
<groupId>com.jayway.jsonpath</groupId>
|
||||||
<artifactId>json-path</artifactId>
|
<artifactId>json-path</artifactId>
|
||||||
<version>2.4.0</version>
|
<version>2.5.0</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.slf4j</groupId>
|
<groupId>org.slf4j</groupId>
|
||||||
|
@ -103,7 +103,7 @@
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>com.alibaba</groupId>
|
<groupId>com.alibaba</groupId>
|
||||||
<artifactId>fastjson</artifactId>
|
<artifactId>fastjson</artifactId>
|
||||||
<version>1.2.69</version>
|
<version>1.2.75</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>com.github.dreamhead</groupId>
|
<groupId>com.github.dreamhead</groupId>
|
||||||
|
@ -125,13 +125,13 @@
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.assertj</groupId>
|
<groupId>org.assertj</groupId>
|
||||||
<artifactId>assertj-core</artifactId>
|
<artifactId>assertj-core</artifactId>
|
||||||
<version>3.16.1</version>
|
<version>3.18.1</version>
|
||||||
<scope>test</scope>
|
<scope>test</scope>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.commons</groupId>
|
<groupId>org.apache.commons</groupId>
|
||||||
<artifactId>commons-lang3</artifactId>
|
<artifactId>commons-lang3</artifactId>
|
||||||
<version>3.10</version>
|
<version>3.11</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>commons-collections</groupId>
|
<groupId>commons-collections</groupId>
|
||||||
|
@ -141,17 +141,17 @@
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>commons-io</groupId>
|
<groupId>commons-io</groupId>
|
||||||
<artifactId>commons-io</artifactId>
|
<artifactId>commons-io</artifactId>
|
||||||
<version>2.7</version>
|
<version>2.8.0</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.codehaus.groovy</groupId>
|
<groupId>org.codehaus.groovy</groupId>
|
||||||
<artifactId>groovy-all</artifactId>
|
<artifactId>groovy-all</artifactId>
|
||||||
<version>2.4.19</version>
|
<version>3.0.7</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.jruby</groupId>
|
<groupId>org.jruby</groupId>
|
||||||
<artifactId>jruby</artifactId>
|
<artifactId>jruby</artifactId>
|
||||||
<version>9.2.11.1</version>
|
<version>9.2.14.0</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.jsoup</groupId>
|
<groupId>org.jsoup</groupId>
|
||||||
|
@ -171,12 +171,12 @@
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>net.sf.saxon</groupId>
|
<groupId>net.sf.saxon</groupId>
|
||||||
<artifactId>Saxon-HE</artifactId>
|
<artifactId>Saxon-HE</artifactId>
|
||||||
<version>10.1</version>
|
<version>10.3</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>net.sourceforge.htmlcleaner</groupId>
|
<groupId>net.sourceforge.htmlcleaner</groupId>
|
||||||
<artifactId>htmlcleaner</artifactId>
|
<artifactId>htmlcleaner</artifactId>
|
||||||
<version>2.5</version>
|
<version>2.24</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>com.github.detro</groupId>
|
<groupId>com.github.detro</groupId>
|
||||||
|
@ -191,7 +191,7 @@
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>redis.clients</groupId>
|
<groupId>redis.clients</groupId>
|
||||||
<artifactId>jedis</artifactId>
|
<artifactId>jedis</artifactId>
|
||||||
<version>2.9.3</version>
|
<version>3.4.1</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
</dependencies>
|
</dependencies>
|
||||||
</dependencyManagement>
|
</dependencyManagement>
|
||||||
|
|
|
@ -1,12 +1,12 @@
|
||||||
package us.codecraft.webmagic.selector;
|
package us.codecraft.webmagic.selector;
|
||||||
|
|
||||||
import org.jsoup.helper.StringUtil;
|
|
||||||
import org.jsoup.nodes.Element;
|
|
||||||
import org.jsoup.select.Elements;
|
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
import org.jsoup.nodes.Element;
|
||||||
|
import org.jsoup.select.Elements;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Links selector based on jsoup. Use absolute url. <br>
|
* Links selector based on jsoup. Use absolute url. <br>
|
||||||
*
|
*
|
||||||
|
@ -23,9 +23,9 @@ public class LinksSelector extends BaseElementSelector {
|
||||||
@Override
|
@Override
|
||||||
public List<String> selectList(Element element) {
|
public List<String> selectList(Element element) {
|
||||||
Elements elements = element.select("a");
|
Elements elements = element.select("a");
|
||||||
List<String> links = new ArrayList<String>(elements.size());
|
List<String> links = new ArrayList<>(elements.size());
|
||||||
for (Element element0 : elements) {
|
for (Element element0 : elements) {
|
||||||
if (!StringUtil.isBlank(element0.baseUri())) {
|
if (StringUtils.isNotBlank(element0.baseUri())) {
|
||||||
links.add(element0.attr("abs:href"));
|
links.add(element0.attr("abs:href"));
|
||||||
} else {
|
} else {
|
||||||
links.add(element0.attr("href"));
|
links.add(element0.attr("href"));
|
||||||
|
|
|
@ -1,22 +1,23 @@
|
||||||
package us.codecraft.webmagic.scheduler;
|
package us.codecraft.webmagic.scheduler;
|
||||||
|
|
||||||
import com.alibaba.fastjson.JSON;
|
import java.util.Set;
|
||||||
|
|
||||||
import org.apache.commons.codec.digest.DigestUtils;
|
import org.apache.commons.codec.digest.DigestUtils;
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
|
||||||
|
import com.alibaba.fastjson.JSON;
|
||||||
|
|
||||||
import redis.clients.jedis.Jedis;
|
import redis.clients.jedis.Jedis;
|
||||||
import redis.clients.jedis.JedisPool;
|
import redis.clients.jedis.JedisPool;
|
||||||
import us.codecraft.webmagic.Request;
|
import us.codecraft.webmagic.Request;
|
||||||
import us.codecraft.webmagic.Task;
|
import us.codecraft.webmagic.Task;
|
||||||
|
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* the redis scheduler with priority
|
* the redis scheduler with priority
|
||||||
* @author sai
|
* @author sai
|
||||||
* Created by sai on 16-5-27.
|
* Created by sai on 16-5-27.
|
||||||
*/
|
*/
|
||||||
public class RedisPriorityScheduler extends RedisScheduler
|
public class RedisPriorityScheduler extends RedisScheduler {
|
||||||
{
|
|
||||||
|
|
||||||
private static final String ZSET_PREFIX = "zset_";
|
private static final String ZSET_PREFIX = "zset_";
|
||||||
|
|
||||||
|
@ -37,62 +38,44 @@ public class RedisPriorityScheduler extends RedisScheduler
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected void pushWhenNoDuplicate(Request request, Task task)
|
protected void pushWhenNoDuplicate(Request request, Task task) {
|
||||||
{
|
try (Jedis jedis = pool.getResource()) {
|
||||||
Jedis jedis = pool.getResource();
|
if (request.getPriority() > 0) {
|
||||||
try
|
|
||||||
{
|
|
||||||
if(request.getPriority() > 0)
|
|
||||||
jedis.zadd(getZsetPlusPriorityKey(task), request.getPriority(), request.getUrl());
|
jedis.zadd(getZsetPlusPriorityKey(task), request.getPriority(), request.getUrl());
|
||||||
else if(request.getPriority() < 0)
|
} else if (request.getPriority() < 0) {
|
||||||
jedis.zadd(getZsetMinusPriorityKey(task), request.getPriority(), request.getUrl());
|
jedis.zadd(getZsetMinusPriorityKey(task), request.getPriority(), request.getUrl());
|
||||||
else
|
} else {
|
||||||
jedis.lpush(getQueueNoPriorityKey(task), request.getUrl());
|
jedis.lpush(getQueueNoPriorityKey(task), request.getUrl());
|
||||||
|
}
|
||||||
|
|
||||||
setExtrasInItem(jedis, request, task);
|
setExtrasInItem(jedis, request, task);
|
||||||
}
|
}
|
||||||
finally
|
|
||||||
{
|
|
||||||
pool.returnResource(jedis);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public synchronized Request poll(Task task)
|
public synchronized Request poll(Task task) {
|
||||||
{
|
try (Jedis jedis = pool.getResource()) {
|
||||||
Jedis jedis = pool.getResource();
|
|
||||||
try
|
|
||||||
{
|
|
||||||
String url = getRequest(jedis, task);
|
String url = getRequest(jedis, task);
|
||||||
if(StringUtils.isBlank(url))
|
if (StringUtils.isBlank(url)) {
|
||||||
return null;
|
return null;
|
||||||
return getExtrasInItem(jedis, url, task);
|
|
||||||
}
|
}
|
||||||
finally
|
return getExtrasInItem(jedis, url, task);
|
||||||
{
|
|
||||||
pool.returnResource(jedis);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private String getRequest(Jedis jedis, Task task)
|
private String getRequest(Jedis jedis, Task task) {
|
||||||
{
|
|
||||||
String url;
|
String url;
|
||||||
Set<String> urls = jedis.zrevrange(getZsetPlusPriorityKey(task), 0, 0);
|
Set<String> urls = jedis.zrevrange(getZsetPlusPriorityKey(task), 0, 0);
|
||||||
if(urls.isEmpty())
|
if (urls.isEmpty()) {
|
||||||
{
|
|
||||||
url = jedis.lpop(getQueueNoPriorityKey(task));
|
url = jedis.lpop(getQueueNoPriorityKey(task));
|
||||||
if(StringUtils.isBlank(url))
|
if (StringUtils.isBlank(url)) {
|
||||||
{
|
|
||||||
urls = jedis.zrevrange(getZsetMinusPriorityKey(task), 0, 0);
|
urls = jedis.zrevrange(getZsetMinusPriorityKey(task), 0, 0);
|
||||||
if(!urls.isEmpty())
|
if (!urls.isEmpty()) {
|
||||||
{
|
|
||||||
url = urls.toArray(new String[0])[0];
|
url = urls.toArray(new String[0])[0];
|
||||||
jedis.zrem(getZsetMinusPriorityKey(task), url);
|
jedis.zrem(getZsetMinusPriorityKey(task), url);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
url = urls.toArray(new String[0])[0];
|
url = urls.toArray(new String[0])[0];
|
||||||
jedis.zrem(getZsetPlusPriorityKey(task), url);
|
jedis.zrem(getZsetPlusPriorityKey(task), url);
|
||||||
}
|
}
|
||||||
|
@ -100,51 +83,39 @@ public class RedisPriorityScheduler extends RedisScheduler
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void resetDuplicateCheck(Task task)
|
public void resetDuplicateCheck(Task task) {
|
||||||
{
|
try (Jedis jedis = pool.getResource()) {
|
||||||
Jedis jedis = pool.getResource();
|
|
||||||
try
|
|
||||||
{
|
|
||||||
jedis.del(getSetKey(task));
|
jedis.del(getSetKey(task));
|
||||||
}
|
}
|
||||||
finally
|
|
||||||
{
|
|
||||||
pool.returnResource(jedis);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private String getZsetPlusPriorityKey(Task task)
|
private String getZsetPlusPriorityKey(Task task) {
|
||||||
{
|
|
||||||
return ZSET_PREFIX + task.getUUID() + PLUS_PRIORITY_SUFFIX;
|
return ZSET_PREFIX + task.getUUID() + PLUS_PRIORITY_SUFFIX;
|
||||||
}
|
}
|
||||||
|
|
||||||
private String getQueueNoPriorityKey(Task task)
|
private String getQueueNoPriorityKey(Task task) {
|
||||||
{
|
|
||||||
return QUEUE_PREFIX + task.getUUID() + NO_PRIORITY_SUFFIX;
|
return QUEUE_PREFIX + task.getUUID() + NO_PRIORITY_SUFFIX;
|
||||||
}
|
}
|
||||||
|
|
||||||
private String getZsetMinusPriorityKey(Task task)
|
private String getZsetMinusPriorityKey(Task task) {
|
||||||
{
|
|
||||||
return ZSET_PREFIX + task.getUUID() + MINUS_PRIORITY_SUFFIX;
|
return ZSET_PREFIX + task.getUUID() + MINUS_PRIORITY_SUFFIX;
|
||||||
}
|
}
|
||||||
|
|
||||||
private void setExtrasInItem(Jedis jedis,Request request, Task task)
|
private void setExtrasInItem(Jedis jedis,Request request, Task task) {
|
||||||
{
|
if (request.getExtras() != null) {
|
||||||
if(request.getExtras() != null)
|
String field = DigestUtils.sha1Hex(request.getUrl());
|
||||||
{
|
|
||||||
String field = DigestUtils.shaHex(request.getUrl());
|
|
||||||
String value = JSON.toJSONString(request);
|
String value = JSON.toJSONString(request);
|
||||||
jedis.hset(getItemKey(task), field, value);
|
jedis.hset(getItemKey(task), field, value);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private Request getExtrasInItem(Jedis jedis, String url, Task task)
|
private Request getExtrasInItem(Jedis jedis, String url, Task task) {
|
||||||
{
|
|
||||||
String key = getItemKey(task);
|
String key = getItemKey(task);
|
||||||
String field = DigestUtils.shaHex(url);
|
String field = DigestUtils.sha1Hex(url);
|
||||||
byte[] bytes = jedis.hget(key.getBytes(), field.getBytes());
|
byte[] bytes = jedis.hget(key.getBytes(), field.getBytes());
|
||||||
if(bytes != null)
|
if (bytes != null) {
|
||||||
return JSON.parseObject(new String(bytes), Request.class);
|
return JSON.parseObject(new String(bytes), Request.class);
|
||||||
|
}
|
||||||
return new Request(url);
|
return new Request(url);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,8 +1,10 @@
|
||||||
package us.codecraft.webmagic.scheduler;
|
package us.codecraft.webmagic.scheduler;
|
||||||
|
|
||||||
import com.alibaba.fastjson.JSON;
|
|
||||||
import org.apache.commons.codec.digest.DigestUtils;
|
import org.apache.commons.codec.digest.DigestUtils;
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
|
||||||
|
import com.alibaba.fastjson.JSON;
|
||||||
|
|
||||||
import redis.clients.jedis.Jedis;
|
import redis.clients.jedis.Jedis;
|
||||||
import redis.clients.jedis.JedisPool;
|
import redis.clients.jedis.JedisPool;
|
||||||
import redis.clients.jedis.JedisPoolConfig;
|
import redis.clients.jedis.JedisPoolConfig;
|
||||||
|
@ -37,21 +39,15 @@ public class RedisScheduler extends DuplicateRemovedScheduler implements Monitor
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void resetDuplicateCheck(Task task) {
|
public void resetDuplicateCheck(Task task) {
|
||||||
Jedis jedis = pool.getResource();
|
try (Jedis jedis = pool.getResource()) {
|
||||||
try {
|
|
||||||
jedis.del(getSetKey(task));
|
jedis.del(getSetKey(task));
|
||||||
} finally {
|
|
||||||
pool.returnResource(jedis);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean isDuplicate(Request request, Task task) {
|
public boolean isDuplicate(Request request, Task task) {
|
||||||
Jedis jedis = pool.getResource();
|
try (Jedis jedis = pool.getResource()) {
|
||||||
try {
|
|
||||||
return jedis.sadd(getSetKey(task), request.getUrl()) == 0;
|
return jedis.sadd(getSetKey(task), request.getUrl()) == 0;
|
||||||
} finally {
|
|
||||||
pool.returnResource(jedis);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -62,7 +58,7 @@ public class RedisScheduler extends DuplicateRemovedScheduler implements Monitor
|
||||||
try {
|
try {
|
||||||
jedis.rpush(getQueueKey(task), request.getUrl());
|
jedis.rpush(getQueueKey(task), request.getUrl());
|
||||||
if (checkForAdditionalInfo(request)) {
|
if (checkForAdditionalInfo(request)) {
|
||||||
String field = DigestUtils.shaHex(request.getUrl());
|
String field = DigestUtils.sha1Hex(request.getUrl());
|
||||||
String value = JSON.toJSONString(request);
|
String value = JSON.toJSONString(request);
|
||||||
jedis.hset((ITEM_PREFIX + task.getUUID()), field, value);
|
jedis.hset((ITEM_PREFIX + task.getUUID()), field, value);
|
||||||
}
|
}
|
||||||
|
@ -100,14 +96,13 @@ public class RedisScheduler extends DuplicateRemovedScheduler implements Monitor
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public synchronized Request poll(Task task) {
|
public synchronized Request poll(Task task) {
|
||||||
Jedis jedis = pool.getResource();
|
try (Jedis jedis = pool.getResource()) {
|
||||||
try {
|
|
||||||
String url = jedis.lpop(getQueueKey(task));
|
String url = jedis.lpop(getQueueKey(task));
|
||||||
if (url == null) {
|
if (url == null) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
String key = ITEM_PREFIX + task.getUUID();
|
String key = ITEM_PREFIX + task.getUUID();
|
||||||
String field = DigestUtils.shaHex(url);
|
String field = DigestUtils.sha1Hex(url);
|
||||||
byte[] bytes = jedis.hget(key.getBytes(), field.getBytes());
|
byte[] bytes = jedis.hget(key.getBytes(), field.getBytes());
|
||||||
if (bytes != null) {
|
if (bytes != null) {
|
||||||
Request o = JSON.parseObject(new String(bytes), Request.class);
|
Request o = JSON.parseObject(new String(bytes), Request.class);
|
||||||
|
@ -115,8 +110,6 @@ public class RedisScheduler extends DuplicateRemovedScheduler implements Monitor
|
||||||
}
|
}
|
||||||
Request request = new Request(url);
|
Request request = new Request(url);
|
||||||
return request;
|
return request;
|
||||||
} finally {
|
|
||||||
pool.returnResource(jedis);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -134,23 +127,17 @@ public class RedisScheduler extends DuplicateRemovedScheduler implements Monitor
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public int getLeftRequestsCount(Task task) {
|
public int getLeftRequestsCount(Task task) {
|
||||||
Jedis jedis = pool.getResource();
|
try (Jedis jedis = pool.getResource()) {
|
||||||
try {
|
|
||||||
Long size = jedis.llen(getQueueKey(task));
|
Long size = jedis.llen(getQueueKey(task));
|
||||||
return size.intValue();
|
return size.intValue();
|
||||||
} finally {
|
|
||||||
pool.returnResource(jedis);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public int getTotalRequestsCount(Task task) {
|
public int getTotalRequestsCount(Task task) {
|
||||||
Jedis jedis = pool.getResource();
|
try (Jedis jedis = pool.getResource()) {
|
||||||
try {
|
|
||||||
Long size = jedis.scard(getSetKey(task));
|
Long size = jedis.scard(getSetKey(task));
|
||||||
return size.intValue();
|
return size.intValue();
|
||||||
} finally {
|
|
||||||
pool.returnResource(jedis);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue