Merge branch 'release/0.7.5'

master
Sutra Zhou 2021-07-22 12:59:39 +08:00
commit 04978f912d
23 changed files with 588 additions and 174 deletions

View File

@ -1,9 +1,10 @@
![logo](http://webmagic.io/images/logo.jpeg)
[![Maven Central](https://maven-badges.herokuapp.com/maven-central/us.codecraft/webmagic-parent/badge.svg?subject=Maven%20Central)](https://maven-badges.herokuapp.com/maven-central/us.codecraft/webmagic-parent/)
[![License](https://img.shields.io/badge/License-Apache%20License%202.0-blue.svg)](https://www.apache.org/licenses/LICENSE-2.0.html)
[![Build Status](https://travis-ci.org/code4craft/webmagic.png?branch=master)](https://travis-ci.org/code4craft/webmagic)
官方网站[http://webmagic.io/](http://webmagic.io/)
>webmagic是一个开源的Java垂直爬虫框架，目标是简化爬虫的开发流程，让开发者专注于逻辑功能的开发。webmagic的核心非常简单，但是覆盖爬虫的整个流程，也是很好的学习爬虫开发的材料。
@ -38,12 +39,12 @@ webmagic使用maven管理依赖在项目中添加对应的依赖即可使用w
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.7.4</version>
<version>0.7.5</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.7.4</version>
<version>0.7.5</version>
</dependency>
```

View File

@ -3,6 +3,8 @@
[Readme in Chinese](https://github.com/code4craft/webmagic/tree/master/README-zh.md)
[![Maven Central](https://maven-badges.herokuapp.com/maven-central/us.codecraft/webmagic-parent/badge.svg?subject=Maven%20Central)](https://maven-badges.herokuapp.com/maven-central/us.codecraft/webmagic-parent/)
[![License](https://img.shields.io/badge/License-Apache%20License%202.0-blue.svg)](https://www.apache.org/licenses/LICENSE-2.0.html)
[![Build Status](https://travis-ci.org/code4craft/webmagic.png?branch=master)](https://travis-ci.org/code4craft/webmagic)
>A scalable crawler framework. It covers the whole lifecycle of a crawler: downloading, URL management, content extraction and persistence. It can simplify the development of a specific crawler.
@ -23,12 +25,12 @@ Add dependencies to your pom.xml:
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.7.4</version>
<version>0.7.5</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.7.4</version>
<version>0.7.5</version>
</dependency>
```

198
pom.xml
View File

@ -1,13 +1,14 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<groupId>us.codecraft</groupId>
<version>0.7.4</version>
<version>0.7.5</version>
<modelVersion>4.0.0</modelVersion>
<packaging>pom</packaging>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
<java.version>1.8</java.version>
<maven.compiler.source>1.8</maven.compiler.source>
<maven.compiler.target>1.8</maven.compiler.target>
<spring-version>4.0.0.RELEASE</spring-version>
</properties>
<artifactId>webmagic-parent</artifactId>
@ -33,7 +34,7 @@
<connection>scm:git:git@github.com:code4craft/webmagic.git</connection>
<developerConnection>scm:git:git@github.com:code4craft/webmagic.git</developerConnection>
<url>git@github.com:code4craft/webmagic.git</url>
<tag>webmagic-parent-0.6.1</tag>
<tag>WebMagic-${project.version}</tag>
</scm>
<licenses>
<license>
@ -49,6 +50,7 @@
<module>webmagic-selenium</module>
<module>webmagic-saxon</module>
<module>webmagic-samples</module>
<module>webmagic-coverage</module>
</modules>
<dependencyManagement>
@ -73,17 +75,17 @@
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpcore</artifactId>
<version>4.4.13</version>
<version>4.4.14</version>
</dependency>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>30.0-android</version>
<version>30.1-jre</version>
</dependency>
<dependency>
<groupId>com.jayway.jsonpath</groupId>
<artifactId>json-path</artifactId>
<version>2.6.0</version>
<version>2.5.0</version>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
@ -98,12 +100,12 @@
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>xsoup</artifactId>
<version>0.3.1</version>
<version>0.3.2</version>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.69</version>
<version>1.2.75</version>
</dependency>
<dependency>
<groupId>com.github.dreamhead</groupId>
@ -125,13 +127,13 @@
<dependency>
<groupId>org.assertj</groupId>
<artifactId>assertj-core</artifactId>
<version>3.16.1</version>
<version>3.18.1</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>3.10</version>
<version>3.11</version>
</dependency>
<dependency>
<groupId>commons-collections</groupId>
@ -139,24 +141,19 @@
<version>3.2.2</version>
</dependency>
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>2.7</version>
</dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>2.8.0</version>
</dependency>
<dependency>
<groupId>org.codehaus.groovy</groupId>
<artifactId>groovy-all</artifactId>
<version>2.4.19</version>
<version>3.0.7</version>
</dependency>
<dependency>
<groupId>org.jruby</groupId>
<artifactId>jruby</artifactId>
<version>9.2.11.1</version>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.10.3</version>
<version>9.2.14.0</version>
</dependency>
<dependency>
<groupId>org.python</groupId>
@ -171,12 +168,12 @@
<dependency>
<groupId>net.sf.saxon</groupId>
<artifactId>Saxon-HE</artifactId>
<version>10.1</version>
<version>10.3</version>
</dependency>
<dependency>
<groupId>net.sourceforge.htmlcleaner</groupId>
<artifactId>htmlcleaner</artifactId>
<version>2.5</version>
<version>2.9</version>
</dependency>
<dependency>
<groupId>com.github.detro</groupId>
@ -191,7 +188,7 @@
<dependency>
<groupId>redis.clients</groupId>
<artifactId>jedis</artifactId>
<version>2.9.3</version>
<version>3.6.0</version>
</dependency>
</dependencies>
</dependencyManagement>
@ -211,7 +208,7 @@
<configuration>
<rules>
<requireMavenVersion>
<version>3.0.5</version>
<version>3.3.9</version>
</requireMavenVersion>
</rules>
</configuration>
@ -221,19 +218,10 @@
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<version>3.0.0-M4</version>
<configuration>
<forkCount>0</forkCount>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.8.1</version>
<configuration>
<source>${java.version}</source>
<target>${java.version}</target>
</configuration>
</plugin>
<!--<plugin>-->
<!--<groupId>org.apache.maven.plugins</groupId>-->
@ -258,12 +246,10 @@
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-resources-plugin</artifactId>
<version>3.1.0</version>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
<version>3.2.0</version>
<configuration>
<excludes>
<exclude>log4j.xml</exclude>
@ -289,7 +275,7 @@
<version>3.2.0</version>
<configuration>
<encoding>UTF-8</encoding>
<doctitle>WebMagic 0.7.4</doctitle>
<doctitle>WebMagic ${project.version}</doctitle>
<locale>en_US</locale>
<!-- avoid the issue: https://bugs.openjdk.java.net/browse/JDK-8212233 -->
@ -317,9 +303,147 @@
<artifactId>maven-release-plugin</artifactId>
<version>3.0.0-M1</version>
</plugin>
<plugin>
<groupId>org.jacoco</groupId>
<artifactId>jacoco-maven-plugin</artifactId>
<executions>
<execution>
<goals>
<goal>prepare-agent</goal>
</goals>
</execution>
<execution>
<id>report</id>
<phase>verify</phase>
<goals>
<goal>report</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>com.amashchenko.maven.plugin</groupId>
<artifactId>gitflow-maven-plugin</artifactId>
<configuration>
<gitFlowConfig>
<versionTagPrefix>WebMagic-</versionTagPrefix>
</gitFlowConfig>
</configuration>
</plugin>
</plugins>
<pluginManagement>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-clean-plugin</artifactId>
<version>3.1.0</version>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.8.1</version>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-deploy-plugin</artifactId>
<version>3.0.0-M1</version>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-install-plugin</artifactId>
<version>3.0.0-M1</version>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
<version>3.2.0</version>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jxr-plugin</artifactId>
<version>3.1.1</version>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-pmd-plugin</artifactId>
<version>3.14.0</version>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-resources-plugin</artifactId>
<version>3.2.0</version>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-site-plugin</artifactId>
<version>3.9.1</version>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<version>3.0.0-M5</version>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-report-plugin</artifactId>
<version>3.0.0-M5</version>
</plugin>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>taglist-maven-plugin</artifactId>
<version>2.4</version>
</plugin>
<plugin>
<groupId>org.jacoco</groupId>
<artifactId>jacoco-maven-plugin</artifactId>
<version>0.8.7</version>
</plugin>
<plugin>
<groupId>com.amashchenko.maven.plugin</groupId>
<artifactId>gitflow-maven-plugin</artifactId>
<version>1.15.0</version>
</plugin>
<plugin>
<groupId>com.github.spotbugs</groupId>
<artifactId>spotbugs-maven-plugin</artifactId>
<version>4.2.3</version>
</plugin>
</plugins>
</pluginManagement>
</build>
<reporting>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-javadoc-plugin</artifactId>
<configuration>
<doclint>none</doclint>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jxr-plugin</artifactId>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-pmd-plugin</artifactId>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-report-plugin</artifactId>
</plugin>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>taglist-maven-plugin</artifactId>
</plugin>
<plugin>
<groupId>com.github.spotbugs</groupId>
<artifactId>spotbugs-maven-plugin</artifactId>
</plugin>
</plugins>
</reporting>
<profiles>
<profile>
<id>release</id>

23
src/site/site.xml 100644
View File

@ -0,0 +1,23 @@
<project xmlns="http://maven.apache.org/DECORATION/1.6.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/DECORATION/1.6.0
http://maven.apache.org/xsd/decoration-1.6.0.xsd">
<skin>
<groupId>org.apache.maven.skins</groupId>
<artifactId>maven-fluido-skin</artifactId>
<version>1.9</version>
</skin>
<body>
<menu ref="parent" inherit="top" />
<menu ref="modules" inherit="top" />
<menu ref="reports" inherit="top" />
</body>
<custom>
<fluidoSkin>
<topBarEnabled>true</topBarEnabled>
<sideBarEnabled>true</sideBarEnabled>
<sourceLineNumbersEnabled>true</sourceLineNumbersEnabled>
<copyrightClass>pull-right</copyrightClass>
</fluidoSkin>
</custom>
</project>

View File

@ -3,7 +3,7 @@
<parent>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-parent</artifactId>
<version>0.7.4</version>
<version>0.7.5</version>
</parent>
<modelVersion>4.0.0</modelVersion>
@ -61,11 +61,6 @@
<artifactId>assertj-core</artifactId>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
</dependency>
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>

View File

@ -208,7 +208,8 @@ public class Spider implements Runnable, Task {
* @see #addPipeline(us.codecraft.webmagic.pipeline.Pipeline)
* @deprecated
*/
public Spider pipeline(Pipeline pipeline) {
@Deprecated
public Spider pipeline(Pipeline pipeline) {
return addPipeline(pipeline);
}
@ -258,7 +259,8 @@ public class Spider implements Runnable, Task {
* @see #setDownloader(us.codecraft.webmagic.downloader.Downloader)
* @deprecated
*/
public Spider downloader(Downloader downloader) {
@Deprecated
public Spider downloader(Downloader downloader) {
return setDownloader(downloader);
}
@ -320,7 +322,7 @@ public class Spider implements Runnable, Task {
processRequest(request);
onSuccess(request);
} catch (Exception e) {
onError(request);
onError(request, e);
logger.error("process request " + request + " error", e);
} finally {
pageCount.incrementAndGet();
@ -338,10 +340,19 @@ public class Spider implements Runnable, Task {
logger.info("Spider {} closed! {} pages downloaded.", getUUID(), pageCount.get());
}
/**
* @deprecated Use {@link #onError(Request, Exception)} instead.
*/
@Deprecated
protected void onError(Request request) {
}
protected void onError(Request request, Exception e) {
this.onError(request);
if (CollectionUtils.isNotEmpty(spiderListeners)) {
for (SpiderListener spiderListener : spiderListeners) {
spiderListener.onError(request);
spiderListener.onError(request, e);
}
}
}

View File

@ -10,5 +10,14 @@ public interface SpiderListener {
public void onSuccess(Request request);
/**
* @deprecated Use {@link #onError(Request, Exception)} instead.
*/
@Deprecated
public void onError(Request request);
default void onError(Request request, Exception e) {
this.onError(request);
}
}

View File

@ -1,12 +1,12 @@
package us.codecraft.webmagic.selector;
import org.jsoup.helper.StringUtil;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
* Links selector based on jsoup. Use absolute url. <br>
*
@ -23,9 +23,9 @@ public class LinksSelector extends BaseElementSelector {
@Override
public List<String> selectList(Element element) {
Elements elements = element.select("a");
List<String> links = new ArrayList<String>(elements.size());
List<String> links = new ArrayList<>(elements.size());
for (Element element0 : elements) {
if (!StringUtil.isBlank(element0.baseUri())) {
if (StringUtils.isNotBlank(element0.baseUri())) {
links.add(element0.attr("abs:href"));
} else {
links.add(element0.attr("href"));

View File

@ -0,0 +1,72 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-parent</artifactId>
<version>0.7.5</version>
</parent>
<artifactId>webmagic-coverage</artifactId>
<packaging>pom</packaging>
<name>webmagic-coverage</name>
<description>Compute aggregated test code coverage</description>
<properties>
<maven.deploy.skip>true</maven.deploy.skip>
</properties>
<dependencies>
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>webmagic-core</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>webmagic-extension</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>webmagic-scripts</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>webmagic-selenium</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>webmagic-saxon</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>webmagic-samples</artifactId>
<version>${project.version}</version>
</dependency>
</dependencies>
<reporting>
<plugins>
<plugin>
<groupId>org.jacoco</groupId>
<artifactId>jacoco-maven-plugin</artifactId>
<reportSets>
<reportSet>
<reports>
<report>report-aggregate</report>
</reports>
</reportSet>
</reportSets>
</plugin>
</plugins>
</reporting>
</project>

View File

@ -3,7 +3,7 @@
<parent>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-parent</artifactId>
<version>0.7.4</version>
<version>0.7.5</version>
</parent>
<modelVersion>4.0.0</modelVersion>

View File

@ -68,6 +68,10 @@ public class SpiderMonitor {
return new SpiderStatus(spider, monitorSpiderListener);
}
protected List<SpiderStatusMXBean> getSpiderStatuses() {
return this.spiderStatuses;
}
public static SpiderMonitor instance() {
return INSTANCE;
}

View File

@ -84,8 +84,13 @@ public class SpiderStatus implements SpiderStatusMXBean {
@Override
public int getPagePerSecond() {
int runSeconds = (int) (System.currentTimeMillis() - getStartTime().getTime()) / 1000;
return getSuccessPageCount() / runSeconds;
if (getStartTime() != null) {
int runSeconds = (int) (System.currentTimeMillis() - getStartTime().getTime()) / 1000;
if (runSeconds != 0) {
return getSuccessPageCount() / runSeconds;
}
}
return -1;
}
}

View File

@ -1,22 +1,23 @@
package us.codecraft.webmagic.scheduler;
import com.alibaba.fastjson.JSON;
import java.util.Set;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.lang3.StringUtils;
import com.alibaba.fastjson.JSON;
import redis.clients.jedis.Jedis;
import redis.clients.jedis.JedisPool;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
import java.util.Set;
/**
* the redis scheduler with priority
* @author sai
* Created by sai on 16-5-27.
*/
public class RedisPriorityScheduler extends RedisScheduler
{
public class RedisPriorityScheduler extends RedisScheduler {
private static final String ZSET_PREFIX = "zset_";
@ -37,62 +38,44 @@ public class RedisPriorityScheduler extends RedisScheduler
}
@Override
protected void pushWhenNoDuplicate(Request request, Task task)
{
Jedis jedis = pool.getResource();
try
{
if(request.getPriority() > 0)
protected void pushWhenNoDuplicate(Request request, Task task) {
try (Jedis jedis = pool.getResource()) {
if (request.getPriority() > 0) {
jedis.zadd(getZsetPlusPriorityKey(task), request.getPriority(), request.getUrl());
else if(request.getPriority() < 0)
} else if (request.getPriority() < 0) {
jedis.zadd(getZsetMinusPriorityKey(task), request.getPriority(), request.getUrl());
else
} else {
jedis.lpush(getQueueNoPriorityKey(task), request.getUrl());
}
setExtrasInItem(jedis, request, task);
}
finally
{
pool.returnResource(jedis);
}
}
@Override
public synchronized Request poll(Task task)
{
Jedis jedis = pool.getResource();
try
{
public synchronized Request poll(Task task) {
try (Jedis jedis = pool.getResource()) {
String url = getRequest(jedis, task);
if(StringUtils.isBlank(url))
if (StringUtils.isBlank(url)) {
return null;
}
return getExtrasInItem(jedis, url, task);
}
finally
{
pool.returnResource(jedis);
}
}
private String getRequest(Jedis jedis, Task task)
{
private String getRequest(Jedis jedis, Task task) {
String url;
Set<String> urls = jedis.zrevrange(getZsetPlusPriorityKey(task), 0, 0);
if(urls.isEmpty())
{
if (urls.isEmpty()) {
url = jedis.lpop(getQueueNoPriorityKey(task));
if(StringUtils.isBlank(url))
{
if (StringUtils.isBlank(url)) {
urls = jedis.zrevrange(getZsetMinusPriorityKey(task), 0, 0);
if(!urls.isEmpty())
{
if (!urls.isEmpty()) {
url = urls.toArray(new String[0])[0];
jedis.zrem(getZsetMinusPriorityKey(task), url);
}
}
}
else
{
} else {
url = urls.toArray(new String[0])[0];
jedis.zrem(getZsetPlusPriorityKey(task), url);
}
@ -100,51 +83,39 @@ public class RedisPriorityScheduler extends RedisScheduler
}
@Override
public void resetDuplicateCheck(Task task)
{
Jedis jedis = pool.getResource();
try
{
public void resetDuplicateCheck(Task task) {
try (Jedis jedis = pool.getResource()) {
jedis.del(getSetKey(task));
}
finally
{
pool.returnResource(jedis);
}
}
private String getZsetPlusPriorityKey(Task task)
{
private String getZsetPlusPriorityKey(Task task) {
return ZSET_PREFIX + task.getUUID() + PLUS_PRIORITY_SUFFIX;
}
private String getQueueNoPriorityKey(Task task)
{
private String getQueueNoPriorityKey(Task task) {
return QUEUE_PREFIX + task.getUUID() + NO_PRIORITY_SUFFIX;
}
private String getZsetMinusPriorityKey(Task task)
{
private String getZsetMinusPriorityKey(Task task) {
return ZSET_PREFIX + task.getUUID() + MINUS_PRIORITY_SUFFIX;
}
private void setExtrasInItem(Jedis jedis,Request request, Task task)
{
if(request.getExtras() != null)
{
String field = DigestUtils.shaHex(request.getUrl());
private void setExtrasInItem(Jedis jedis,Request request, Task task) {
if (request.getExtras() != null) {
String field = DigestUtils.sha1Hex(request.getUrl());
String value = JSON.toJSONString(request);
jedis.hset(getItemKey(task), field, value);
}
}
private Request getExtrasInItem(Jedis jedis, String url, Task task)
{
private Request getExtrasInItem(Jedis jedis, String url, Task task) {
String key = getItemKey(task);
String field = DigestUtils.shaHex(url);
String field = DigestUtils.sha1Hex(url);
byte[] bytes = jedis.hget(key.getBytes(), field.getBytes());
if(bytes != null)
if (bytes != null) {
return JSON.parseObject(new String(bytes), Request.class);
}
return new Request(url);
}
}

View File

@ -1,8 +1,10 @@
package us.codecraft.webmagic.scheduler;
import com.alibaba.fastjson.JSON;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.lang3.StringUtils;
import com.alibaba.fastjson.JSON;
import redis.clients.jedis.Jedis;
import redis.clients.jedis.JedisPool;
import redis.clients.jedis.JedisPoolConfig;
@ -37,21 +39,15 @@ public class RedisScheduler extends DuplicateRemovedScheduler implements Monitor
@Override
public void resetDuplicateCheck(Task task) {
Jedis jedis = pool.getResource();
try {
try (Jedis jedis = pool.getResource()) {
jedis.del(getSetKey(task));
} finally {
pool.returnResource(jedis);
}
}
@Override
public boolean isDuplicate(Request request, Task task) {
Jedis jedis = pool.getResource();
try {
try (Jedis jedis = pool.getResource()) {
return jedis.sadd(getSetKey(task), request.getUrl()) == 0;
} finally {
pool.returnResource(jedis);
}
}
@ -62,7 +58,7 @@ public class RedisScheduler extends DuplicateRemovedScheduler implements Monitor
try {
jedis.rpush(getQueueKey(task), request.getUrl());
if (checkForAdditionalInfo(request)) {
String field = DigestUtils.shaHex(request.getUrl());
String field = DigestUtils.sha1Hex(request.getUrl());
String value = JSON.toJSONString(request);
jedis.hset((ITEM_PREFIX + task.getUUID()), field, value);
}
@ -100,14 +96,13 @@ public class RedisScheduler extends DuplicateRemovedScheduler implements Monitor
@Override
public synchronized Request poll(Task task) {
Jedis jedis = pool.getResource();
try {
try (Jedis jedis = pool.getResource()) {
String url = jedis.lpop(getQueueKey(task));
if (url == null) {
return null;
}
String key = ITEM_PREFIX + task.getUUID();
String field = DigestUtils.shaHex(url);
String field = DigestUtils.sha1Hex(url);
byte[] bytes = jedis.hget(key.getBytes(), field.getBytes());
if (bytes != null) {
Request o = JSON.parseObject(new String(bytes), Request.class);
@ -115,8 +110,6 @@ public class RedisScheduler extends DuplicateRemovedScheduler implements Monitor
}
Request request = new Request(url);
return request;
} finally {
pool.returnResource(jedis);
}
}
@ -134,23 +127,17 @@ public class RedisScheduler extends DuplicateRemovedScheduler implements Monitor
@Override
public int getLeftRequestsCount(Task task) {
Jedis jedis = pool.getResource();
try {
try (Jedis jedis = pool.getResource()) {
Long size = jedis.llen(getQueueKey(task));
return size.intValue();
} finally {
pool.returnResource(jedis);
}
}
@Override
public int getTotalRequestsCount(Task task) {
Jedis jedis = pool.getResource();
try {
try (Jedis jedis = pool.getResource()) {
Long size = jedis.scard(getSetKey(task));
return size.intValue();
} finally {
pool.returnResource(jedis);
}
}
}

View File

@ -3,7 +3,7 @@
<parent>
<artifactId>webmagic-parent</artifactId>
<groupId>us.codecraft</groupId>
<version>0.7.4</version>
<version>0.7.5</version>
</parent>
<modelVersion>4.0.0</modelVersion>
@ -24,6 +24,26 @@
<groupId>junit</groupId>
<artifactId>junit</artifactId>
</dependency>
<dependency>
<groupId>org.mapdb</groupId>
<artifactId>mapdb</artifactId>
<version>3.0.8</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-core</artifactId>
<version>2.13.0-rc1</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-annotations</artifactId>
<version>2.13.0-rc1</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
<version>2.13.0-rc1</version>
</dependency>
</dependencies>
</project>

View File

@ -0,0 +1,78 @@
package us.codecraft.webmagic.recover;
import com.google.common.base.Charsets;
import com.google.common.hash.BloomFilter;
import com.google.common.hash.Funnels;
import org.mapdb.DB;
import org.mapdb.DBMaker;
import org.mapdb.IndexTreeList;
import org.mapdb.Serializer;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
import java.util.concurrent.atomic.AtomicInteger;
/**
 * A {@link DuplicateRemover} that records every URL it has seen in a MapDB
 * file, so the "already visited" set survives a restart of the crawler.
 *
 * <p>Membership is tested against an in-memory bloom filter that is rebuilt
 * from the stored URLs on construction. Because the check uses
 * {@code mightContain}, a URL that was never seen can occasionally be reported
 * as a duplicate (target probability {@value #FALSE_POSITIVE_PROBABILITY} for
 * up to {@value #EXPECTED_INSERTIONS} URLs).
 *
 * @author linweisen
 */
public class DuplicateStorageRemover implements DuplicateRemover {

    /** Name of the MapDB collection that holds the seen URLs. */
    private static final String DATABASE_NAME = "duplicate";

    /** Number of URLs the bloom filter is sized for. */
    private static final int EXPECTED_INSERTIONS = 200000;

    /** Target false-positive probability of the bloom filter. */
    private static final double FALSE_POSITIVE_PROBABILITY = 1E-7;

    private final DB db;

    private final IndexTreeList<String> urlDuplicateQueue;

    private final AtomicInteger counter;

    // Replaced wholesale by resetDuplicateCheck, hence not final.
    private BloomFilter<CharSequence> bloomFilter;

    /**
     * Opens (or creates) the URL store at {@code path} and loads every stored
     * URL into a fresh bloom filter.
     *
     * @param path file used by MapDB to persist the seen URLs
     */
    public DuplicateStorageRemover(String path) {
        this.db = DBMaker.fileDB(path)
                .fileMmapEnableIfSupported()
                .fileMmapPreclearDisable()
                .cleanerHackEnable()
                .closeOnJvmShutdown()
                .transactionEnable()
                .concurrencyScale(128)
                .make();
        this.urlDuplicateQueue = db.indexTreeList(DATABASE_NAME, Serializer.STRING).createOrOpen();
        this.counter = new AtomicInteger(this.urlDuplicateQueue.size());
        this.bloomFilter = newBloomFilter();
        for (String url : this.urlDuplicateQueue) {
            bloomFilter.put(url);
        }
    }

    /**
     * Reports whether the request's URL has (probably) been seen before; an
     * unseen URL is recorded and committed to disk before returning.
     *
     * @param request request whose URL is checked
     * @param task    owning task (not used by this implementation)
     * @return {@code true} if the bloom filter reports the URL as already present
     */
    @Override
    public boolean isDuplicate(Request request, Task task) {
        String url = request.getUrl();
        boolean isDuplicate = bloomFilter.mightContain(url);
        if (!isDuplicate) {
            bloomFilter.put(url);
            urlDuplicateQueue.add(url);
            this.db.commit();
            counter.incrementAndGet();
        }
        return isDuplicate;
    }

    /**
     * Forgets every URL seen so far, both in memory and in the backing file.
     *
     * @param task owning task (not used by this implementation)
     */
    @Override
    public void resetDuplicateCheck(Task task) {
        this.bloomFilter = newBloomFilter();
        this.urlDuplicateQueue.clear();
        // The store is opened with transactionEnable(), so the clear must be
        // committed or the old URLs come back after a restart.
        this.db.commit();
        // Keep getTotalRequestsCount in step with the now-empty store.
        this.counter.set(0);
    }

    /**
     * @param task owning task (not used by this implementation)
     * @return number of distinct URLs recorded since the last reset, including
     *         those loaded from the file at construction time
     */
    @Override
    public int getTotalRequestsCount(Task task) {
        return counter.get();
    }

    /** Creates an empty bloom filter with this class's fixed sizing. */
    private static BloomFilter<CharSequence> newBloomFilter() {
        return BloomFilter.create(Funnels.stringFunnel(Charsets.UTF_8),
                EXPECTED_INSERTIONS, FALSE_POSITIVE_PROBABILITY);
    }
}

View File

@ -0,0 +1,85 @@
package us.codecraft.webmagic.recover;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.commons.lang3.StringUtils;
import org.mapdb.DB;
import org.mapdb.DBMaker;
import org.mapdb.IndexTreeList;
import org.mapdb.Serializer;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.scheduler.DuplicateRemovedScheduler;
import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
import java.io.IOException;
/**
 * A scheduler whose pending-request queue is stored in a MapDB file, so an
 * interrupted crawl can be resumed from the same file.
 *
 * <p>Each {@link Request} is stored as a JSON string; requests are handed out
 * in the order they were pushed.
 *
 * @author linweisen
 */
public class MmapQueueScheduler extends DuplicateRemovedScheduler {

    /** Name of the MapDB collection that holds the queued requests. */
    private static final String DATABASE_NAME = "queue";

    /** Shared JSON mapper; created once instead of on every construction. */
    private static final ObjectMapper MAPPER = new ObjectMapper();

    private final DB db;

    private final IndexTreeList<String> queue;

    /**
     * Opens (or creates) the request queue stored at {@code path}.
     *
     * @param duplicateRemover remover used to filter already-seen requests
     * @param path             file used by MapDB to persist the queue
     */
    public MmapQueueScheduler(DuplicateRemover duplicateRemover, String path) {
        super.setDuplicateRemover(duplicateRemover);
        this.db = DBMaker.fileDB(path)
                .fileMmapEnableIfSupported()
                .fileMmapPreclearDisable()
                .cleanerHackEnable()
                .closeOnJvmShutdown()
                .transactionEnable()
                .concurrencyScale(128)
                .make();
        this.queue = db.indexTreeList(DATABASE_NAME, Serializer.STRING).createOrOpen();
    }

    /**
     * Removes and returns the oldest queued request.
     *
     * <p>Synchronized so that two callers cannot both see a single remaining
     * element and then both try to remove it.
     *
     * @param task owning task (not used by this implementation)
     * @return the next request, or {@code null} if the queue is empty or the
     *         stored entry cannot be parsed
     */
    @Override
    public synchronized Request poll(Task task) {
        if (this.queue.isEmpty()) {
            return null;
        }
        String json = queue.remove(0);
        // The store is opened with transactionEnable(); commit so the removal
        // is persisted here rather than whenever the next push happens.
        this.db.commit();
        return fromJson(json, Request.class);
    }

    /**
     * Appends the request to the persistent queue and commits it.
     *
     * @param request request to enqueue
     * @param task    owning task (not used by this implementation)
     */
    @Override
    public void pushWhenNoDuplicate(Request request, Task task) {
        String json = toJson(request);
        if (json == null) {
            // Serialisation failed and was already logged by toJson; do not
            // store a null entry in the string-typed queue.
            return;
        }
        queue.add(json);
        this.db.commit();
    }

    /**
     * Serialises an object to JSON.
     *
     * @param object object to serialise
     * @return the JSON string, or {@code null} if serialisation fails
     */
    public String toJson(Object object) {
        try {
            return MAPPER.writeValueAsString(object);
        } catch (IOException e) {
            logger.warn("write to json string error:" + object, e);
            return null;
        }
    }

    /**
     * Parses a JSON string into an instance of {@code clazz}.
     *
     * @param jsonString JSON text; may be {@code null} or empty
     * @param clazz      target type
     * @param <T>        target type
     * @return the parsed object, or {@code null} if the input is empty or
     *         cannot be parsed
     */
    public <T> T fromJson(String jsonString, Class<T> clazz) {
        if (StringUtils.isEmpty(jsonString)) {
            return null;
        }
        try {
            return MAPPER.readValue(jsonString, clazz);
        } catch (IOException e) {
            logger.warn("parse json string error:" + jsonString, e);
            return null;
        }
    }
}

View File

@ -0,0 +1,22 @@
package us.codecraft.webmagic.recover;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.samples.SinaBlogProcessor;
import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
/**
 * Sample showing a crawl whose request queue and seen-URL set are both kept
 * in files, using {@link MmapQueueScheduler} and {@link DuplicateStorageRemover}.
 *
 * @author code4crafter@gmail.com <br>
 */
public class RecoverSample {

    /**
     * Wires a file-backed scheduler into a spider and runs it on the calling thread.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        final String queuePath = "queue";
        final String seenUrlsPath = "duplicate";

        Spider crawler = new Spider(new SinaBlogProcessor());

        DuplicateRemover seenUrls = new DuplicateStorageRemover(seenUrlsPath);
        MmapQueueScheduler scheduler = new MmapQueueScheduler(seenUrls, queuePath);
        crawler.setScheduler(scheduler);

        crawler.addUrl("http://blog.sina.com.cn/s/articlelist_1487828712_0_1.html");
        crawler.run();
    }
}

View File

@ -3,7 +3,7 @@
<parent>
<artifactId>webmagic-parent</artifactId>
<groupId>us.codecraft</groupId>
<version>0.7.4</version>
<version>0.7.5</version>
</parent>
<modelVersion>4.0.0</modelVersion>

View File

@ -1,16 +1,11 @@
package us.codecraft.webmagic.selector;
import net.sf.saxon.lib.NamespaceConstant;
import net.sf.saxon.xpath.XPathEvaluator;
import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.DomSerializer;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import javax.xml.namespace.NamespaceContext;
import javax.xml.transform.OutputKeys;
@ -21,12 +16,19 @@ import javax.xml.transform.stream.StreamResult;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.DomSerializer;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import net.sf.saxon.lib.NamespaceConstant;
import net.sf.saxon.xpath.XPathEvaluator;
/**
* xpath2.0HtmlCleanerSaxon HE<br>

View File

@ -1,5 +1,7 @@
package us.codecraft.webmagic.selector;
import java.util.List;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.htmlcleaner.XPatherException;
@ -8,6 +10,7 @@ import org.jsoup.nodes.Document;
import org.junit.Assert;
import org.junit.Ignore;
import org.junit.Test;
import us.codecraft.xsoup.XPathEvaluator;
import us.codecraft.xsoup.Xsoup;
@ -1367,15 +1370,19 @@ public class XpathSelectorTest {
public void testXPath2() {
String text = "<h1>眉山:扎实推进农业农村工作 促农持续增收<br>\n" +
"<span>2013-07-31 23:29:45&nbsp;&nbsp;&nbsp;来源:<a href=\"http://www.mshw.net\" target=\"_blank\" style=\"color:#AAA\">眉山网</a>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;责任编辑:张斯炜</span></h1>";
XpathSelector xpathSelector = new XpathSelector("//h1/text()");
System.out.println(xpathSelector.select(text));
Xpath2Selector xpathSelector = new Xpath2Selector("//h1/text()");
Assert.assertEquals("眉山:扎实推进农业农村工作 促农持续增收", xpathSelector.select(text));
}
@Test
public void testXpath2Selector() {
Xpath2Selector xpath2Selector = new Xpath2Selector("//a/@href");
String select = xpath2Selector.select(html);
Assert.assertNotNull(select);
Assert.assertEquals("http://www.oschina.net/", select);
List<String> selectList = xpath2Selector.selectList(html);
Assert.assertEquals(113, selectList.size());
Assert.assertEquals("http://www.oschina.net/", selectList.get(0));
}
@Ignore("take long time")

View File

@ -3,7 +3,7 @@
<parent>
<artifactId>webmagic-parent</artifactId>
<groupId>us.codecraft</groupId>
<version>0.7.4</version>
<version>0.7.5</version>
</parent>
<modelVersion>4.0.0</modelVersion>
@ -22,10 +22,6 @@
<artifactId>kotlin-stdlib</artifactId>
<version>${kotlin.version}</version>
</dependency>
<dependency>
<groupId>org.codehaus.groovy</groupId>
<artifactId>groovy-all</artifactId>
</dependency>
<dependency>
<groupId>org.python</groupId>
<artifactId>jython</artifactId>

View File

@ -3,7 +3,7 @@
<parent>
<artifactId>webmagic-parent</artifactId>
<groupId>us.codecraft</groupId>
<version>0.7.4</version>
<version>0.7.5</version>
</parent>
<modelVersion>4.0.0</modelVersion>