Merge branch 'release/0.7.5'
commit
04978f912d
|
@ -1,9 +1,10 @@
|
|||

|
||||
|
||||
|
||||
[](https://maven-badges.herokuapp.com/maven-central/us.codecraft/webmagic-parent/)
|
||||
[](https://www.apache.org/licenses/LICENSE-2.0.html)
|
||||
[](https://travis-ci.org/code4craft/webmagic)
|
||||
|
||||
|
||||
官方网站[http://webmagic.io/](http://webmagic.io/)
|
||||
|
||||
>webmagic是一个开源的Java垂直爬虫框架,目标是简化爬虫的开发流程,让开发者专注于逻辑功能的开发。webmagic的核心非常简单,但是覆盖爬虫的整个流程,也是很好的学习爬虫开发的材料。
|
||||
|
@ -38,12 +39,12 @@ webmagic使用maven管理依赖,在项目中添加对应的依赖即可使用w
|
|||
<dependency>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>webmagic-core</artifactId>
|
||||
<version>0.7.4</version>
|
||||
<version>0.7.5</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>webmagic-extension</artifactId>
|
||||
<version>0.7.4</version>
|
||||
<version>0.7.5</version>
|
||||
</dependency>
|
||||
```
|
||||
|
||||
|
|
|
@ -3,6 +3,8 @@
|
|||
[Readme in Chinese](https://github.com/code4craft/webmagic/tree/master/README-zh.md)
|
||||
|
||||
|
||||
[](https://maven-badges.herokuapp.com/maven-central/us.codecraft/webmagic-parent/)
|
||||
[](https://www.apache.org/licenses/LICENSE-2.0.html)
|
||||
[](https://travis-ci.org/code4craft/webmagic)
|
||||
|
||||
>A scalable crawler framework. It covers the whole lifecycle of a crawler: downloading, URL management, content extraction and persistence. It can simplify the development of a specific crawler.
|
||||
|
@ -23,12 +25,12 @@ Add dependencies to your pom.xml:
|
|||
<dependency>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>webmagic-core</artifactId>
|
||||
<version>0.7.4</version>
|
||||
<version>0.7.5</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>webmagic-extension</artifactId>
|
||||
<version>0.7.4</version>
|
||||
<version>0.7.5</version>
|
||||
</dependency>
|
||||
```
|
||||
|
||||
|
|
192
pom.xml
192
pom.xml
|
@ -1,13 +1,14 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<groupId>us.codecraft</groupId>
|
||||
<version>0.7.4</version>
|
||||
<version>0.7.5</version>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
<packaging>pom</packaging>
|
||||
<properties>
|
||||
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
||||
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
|
||||
<java.version>1.8</java.version>
|
||||
<maven.compiler.source>1.8</maven.compiler.source>
|
||||
<maven.compiler.target>1.8</maven.compiler.target>
|
||||
<spring-version>4.0.0.RELEASE</spring-version>
|
||||
</properties>
|
||||
<artifactId>webmagic-parent</artifactId>
|
||||
|
@ -33,7 +34,7 @@
|
|||
<connection>scm:git:git@github.com:code4craft/webmagic.git</connection>
|
||||
<developerConnection>scm:git:git@github.com:code4craft/webmagic.git</developerConnection>
|
||||
<url>git@github.com:code4craft/webmagic.git</url>
|
||||
<tag>webmagic-parent-0.6.1</tag>
|
||||
<tag>WebMagic-${project.version}</tag>
|
||||
</scm>
|
||||
<licenses>
|
||||
<license>
|
||||
|
@ -49,6 +50,7 @@
|
|||
<module>webmagic-selenium</module>
|
||||
<module>webmagic-saxon</module>
|
||||
<module>webmagic-samples</module>
|
||||
<module>webmagic-coverage</module>
|
||||
</modules>
|
||||
|
||||
<dependencyManagement>
|
||||
|
@ -73,17 +75,17 @@
|
|||
<dependency>
|
||||
<groupId>org.apache.httpcomponents</groupId>
|
||||
<artifactId>httpcore</artifactId>
|
||||
<version>4.4.13</version>
|
||||
<version>4.4.14</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.google.guava</groupId>
|
||||
<artifactId>guava</artifactId>
|
||||
<version>30.0-android</version>
|
||||
<version>30.1-jre</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.jayway.jsonpath</groupId>
|
||||
<artifactId>json-path</artifactId>
|
||||
<version>2.6.0</version>
|
||||
<version>2.5.0</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.slf4j</groupId>
|
||||
|
@ -98,12 +100,12 @@
|
|||
<dependency>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>xsoup</artifactId>
|
||||
<version>0.3.1</version>
|
||||
<version>0.3.2</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.alibaba</groupId>
|
||||
<artifactId>fastjson</artifactId>
|
||||
<version>1.2.69</version>
|
||||
<version>1.2.75</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.github.dreamhead</groupId>
|
||||
|
@ -125,13 +127,13 @@
|
|||
<dependency>
|
||||
<groupId>org.assertj</groupId>
|
||||
<artifactId>assertj-core</artifactId>
|
||||
<version>3.16.1</version>
|
||||
<version>3.18.1</version>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.commons</groupId>
|
||||
<artifactId>commons-lang3</artifactId>
|
||||
<version>3.10</version>
|
||||
<version>3.11</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>commons-collections</groupId>
|
||||
|
@ -141,22 +143,17 @@
|
|||
<dependency>
|
||||
<groupId>commons-io</groupId>
|
||||
<artifactId>commons-io</artifactId>
|
||||
<version>2.7</version>
|
||||
<version>2.8.0</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.codehaus.groovy</groupId>
|
||||
<artifactId>groovy-all</artifactId>
|
||||
<version>2.4.19</version>
|
||||
<version>3.0.7</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.jruby</groupId>
|
||||
<artifactId>jruby</artifactId>
|
||||
<version>9.2.11.1</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.jsoup</groupId>
|
||||
<artifactId>jsoup</artifactId>
|
||||
<version>1.10.3</version>
|
||||
<version>9.2.14.0</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.python</groupId>
|
||||
|
@ -171,12 +168,12 @@
|
|||
<dependency>
|
||||
<groupId>net.sf.saxon</groupId>
|
||||
<artifactId>Saxon-HE</artifactId>
|
||||
<version>10.1</version>
|
||||
<version>10.3</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>net.sourceforge.htmlcleaner</groupId>
|
||||
<artifactId>htmlcleaner</artifactId>
|
||||
<version>2.5</version>
|
||||
<version>2.9</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.github.detro</groupId>
|
||||
|
@ -191,7 +188,7 @@
|
|||
<dependency>
|
||||
<groupId>redis.clients</groupId>
|
||||
<artifactId>jedis</artifactId>
|
||||
<version>2.9.3</version>
|
||||
<version>3.6.0</version>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
</dependencyManagement>
|
||||
|
@ -211,7 +208,7 @@
|
|||
<configuration>
|
||||
<rules>
|
||||
<requireMavenVersion>
|
||||
<version>3.0.5</version>
|
||||
<version>3.3.9</version>
|
||||
</requireMavenVersion>
|
||||
</rules>
|
||||
</configuration>
|
||||
|
@ -221,19 +218,10 @@
|
|||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-surefire-plugin</artifactId>
|
||||
<version>3.0.0-M4</version>
|
||||
<configuration>
|
||||
<forkCount>0</forkCount>
|
||||
</configuration>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-compiler-plugin</artifactId>
|
||||
<version>3.8.1</version>
|
||||
<configuration>
|
||||
<source>${java.version}</source>
|
||||
<target>${java.version}</target>
|
||||
</configuration>
|
||||
</plugin>
|
||||
<!--<plugin>-->
|
||||
<!--<groupId>org.apache.maven.plugins</groupId>-->
|
||||
|
@ -258,12 +246,10 @@
|
|||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-resources-plugin</artifactId>
|
||||
<version>3.1.0</version>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-jar-plugin</artifactId>
|
||||
<version>3.2.0</version>
|
||||
<configuration>
|
||||
<excludes>
|
||||
<exclude>log4j.xml</exclude>
|
||||
|
@ -289,7 +275,7 @@
|
|||
<version>3.2.0</version>
|
||||
<configuration>
|
||||
<encoding>UTF-8</encoding>
|
||||
<doctitle>WebMagic 0.7.4</doctitle>
|
||||
<doctitle>WebMagic ${project.version}</doctitle>
|
||||
<locale>en_US</locale>
|
||||
|
||||
<!-- avoid the issue: https://bugs.openjdk.java.net/browse/JDK-8212233 -->
|
||||
|
@ -317,9 +303,147 @@
|
|||
<artifactId>maven-release-plugin</artifactId>
|
||||
<version>3.0.0-M1</version>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.jacoco</groupId>
|
||||
<artifactId>jacoco-maven-plugin</artifactId>
|
||||
<executions>
|
||||
<execution>
|
||||
<goals>
|
||||
<goal>prepare-agent</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
<execution>
|
||||
<id>report</id>
|
||||
<phase>verify</phase>
|
||||
<goals>
|
||||
<goal>report</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>com.amashchenko.maven.plugin</groupId>
|
||||
<artifactId>gitflow-maven-plugin</artifactId>
|
||||
<configuration>
|
||||
<gitFlowConfig>
|
||||
<versionTagPrefix>WebMagic-</versionTagPrefix>
|
||||
</gitFlowConfig>
|
||||
</configuration>
|
||||
</plugin>
|
||||
</plugins>
|
||||
<pluginManagement>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-clean-plugin</artifactId>
|
||||
<version>3.1.0</version>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-compiler-plugin</artifactId>
|
||||
<version>3.8.1</version>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-deploy-plugin</artifactId>
|
||||
<version>3.0.0-M1</version>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-install-plugin</artifactId>
|
||||
<version>3.0.0-M1</version>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-jar-plugin</artifactId>
|
||||
<version>3.2.0</version>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-jxr-plugin</artifactId>
|
||||
<version>3.1.1</version>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-pmd-plugin</artifactId>
|
||||
<version>3.14.0</version>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-resources-plugin</artifactId>
|
||||
<version>3.2.0</version>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-site-plugin</artifactId>
|
||||
<version>3.9.1</version>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-surefire-plugin</artifactId>
|
||||
<version>3.0.0-M5</version>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-surefire-report-plugin</artifactId>
|
||||
<version>3.0.0-M5</version>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.codehaus.mojo</groupId>
|
||||
<artifactId>taglist-maven-plugin</artifactId>
|
||||
<version>2.4</version>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.jacoco</groupId>
|
||||
<artifactId>jacoco-maven-plugin</artifactId>
|
||||
<version>0.8.7</version>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>com.amashchenko.maven.plugin</groupId>
|
||||
<artifactId>gitflow-maven-plugin</artifactId>
|
||||
<version>1.15.0</version>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>com.github.spotbugs</groupId>
|
||||
<artifactId>spotbugs-maven-plugin</artifactId>
|
||||
<version>4.2.3</version>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</pluginManagement>
|
||||
</build>
|
||||
|
||||
<reporting>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-javadoc-plugin</artifactId>
|
||||
<configuration>
|
||||
<doclint>none</doclint>
|
||||
</configuration>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-jxr-plugin</artifactId>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-pmd-plugin</artifactId>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-surefire-report-plugin</artifactId>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.codehaus.mojo</groupId>
|
||||
<artifactId>taglist-maven-plugin</artifactId>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>com.github.spotbugs</groupId>
|
||||
<artifactId>spotbugs-maven-plugin</artifactId>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</reporting>
|
||||
|
||||
<profiles>
|
||||
<profile>
|
||||
<id>release</id>
|
||||
|
|
|
@ -0,0 +1,23 @@
|
|||
<project xmlns="http://maven.apache.org/DECORATION/1.6.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/DECORATION/1.6.0
|
||||
http://maven.apache.org/xsd/decoration-1.6.0.xsd">
|
||||
<skin>
|
||||
<groupId>org.apache.maven.skins</groupId>
|
||||
<artifactId>maven-fluido-skin</artifactId>
|
||||
<version>1.9</version>
|
||||
</skin>
|
||||
<body>
|
||||
<menu ref="parent" inherit="top" />
|
||||
<menu ref="modules" inherit="top" />
|
||||
<menu ref="reports" inherit="top" />
|
||||
</body>
|
||||
<custom>
|
||||
<fluidoSkin>
|
||||
<topBarEnabled>true</topBarEnabled>
|
||||
<sideBarEnabled>true</sideBarEnabled>
|
||||
<sourceLineNumbersEnabled>true</sourceLineNumbersEnabled>
|
||||
<copyrightClass>pull-right</copyrightClass>
|
||||
</fluidoSkin>
|
||||
</custom>
|
||||
</project>
|
|
@ -3,7 +3,7 @@
|
|||
<parent>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>webmagic-parent</artifactId>
|
||||
<version>0.7.4</version>
|
||||
<version>0.7.5</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
@ -61,11 +61,6 @@
|
|||
<artifactId>assertj-core</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.jsoup</groupId>
|
||||
<artifactId>jsoup</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>commons-io</groupId>
|
||||
<artifactId>commons-io</artifactId>
|
||||
|
|
|
@ -208,6 +208,7 @@ public class Spider implements Runnable, Task {
|
|||
* @see #addPipeline(us.codecraft.webmagic.pipeline.Pipeline)
|
||||
* @deprecated
|
||||
*/
|
||||
@Deprecated
|
||||
public Spider pipeline(Pipeline pipeline) {
|
||||
return addPipeline(pipeline);
|
||||
}
|
||||
|
@ -258,6 +259,7 @@ public class Spider implements Runnable, Task {
|
|||
* @see #setDownloader(us.codecraft.webmagic.downloader.Downloader)
|
||||
* @deprecated
|
||||
*/
|
||||
@Deprecated
|
||||
public Spider downloader(Downloader downloader) {
|
||||
return setDownloader(downloader);
|
||||
}
|
||||
|
@ -320,7 +322,7 @@ public class Spider implements Runnable, Task {
|
|||
processRequest(request);
|
||||
onSuccess(request);
|
||||
} catch (Exception e) {
|
||||
onError(request);
|
||||
onError(request, e);
|
||||
logger.error("process request " + request + " error", e);
|
||||
} finally {
|
||||
pageCount.incrementAndGet();
|
||||
|
@ -338,10 +340,19 @@ public class Spider implements Runnable, Task {
|
|||
logger.info("Spider {} closed! {} pages downloaded.", getUUID(), pageCount.get());
|
||||
}
|
||||
|
||||
/**
|
||||
* @deprecated Use {@link #onError(Request, Exception)} instead.
|
||||
*/
|
||||
@Deprecated
|
||||
protected void onError(Request request) {
|
||||
}
|
||||
|
||||
protected void onError(Request request, Exception e) {
|
||||
this.onError(request);
|
||||
|
||||
if (CollectionUtils.isNotEmpty(spiderListeners)) {
|
||||
for (SpiderListener spiderListener : spiderListeners) {
|
||||
spiderListener.onError(request);
|
||||
spiderListener.onError(request, e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -10,5 +10,14 @@ public interface SpiderListener {
|
|||
|
||||
public void onSuccess(Request request);
|
||||
|
||||
/**
|
||||
* @deprecated Use {@link #onError(Request, Exception)} instead.
|
||||
*/
|
||||
@Deprecated
|
||||
public void onError(Request request);
|
||||
|
||||
// Default implementation: forwards to the deprecated single-argument onError(Request);
// the exception argument is not passed on, so implementations that need it must override this method.
default void onError(Request request, Exception e) {
|
||||
this.onError(request);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,12 +1,12 @@
|
|||
package us.codecraft.webmagic.selector;
|
||||
|
||||
import org.jsoup.helper.StringUtil;
|
||||
import org.jsoup.nodes.Element;
|
||||
import org.jsoup.select.Elements;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.jsoup.nodes.Element;
|
||||
import org.jsoup.select.Elements;
|
||||
|
||||
/**
|
||||
* Links selector based on jsoup. Use absolute url. <br>
|
||||
*
|
||||
|
@ -23,9 +23,9 @@ public class LinksSelector extends BaseElementSelector {
|
|||
@Override
|
||||
public List<String> selectList(Element element) {
|
||||
Elements elements = element.select("a");
|
||||
List<String> links = new ArrayList<String>(elements.size());
|
||||
List<String> links = new ArrayList<>(elements.size());
|
||||
for (Element element0 : elements) {
|
||||
if (!StringUtil.isBlank(element0.baseUri())) {
|
||||
if (StringUtils.isNotBlank(element0.baseUri())) {
|
||||
links.add(element0.attr("abs:href"));
|
||||
} else {
|
||||
links.add(element0.attr("href"));
|
||||
|
|
|
@ -0,0 +1,72 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
|
||||
http://maven.apache.org/maven-v4_0_0.xsd">
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
<parent>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>webmagic-parent</artifactId>
|
||||
<version>0.7.5</version>
|
||||
</parent>
|
||||
|
||||
<artifactId>webmagic-coverage</artifactId>
|
||||
<packaging>pom</packaging>
|
||||
<name>webmagic-coverage</name>
|
||||
<description>Compute aggregated test code coverage</description>
|
||||
|
||||
<properties>
|
||||
<maven.deploy.skip>true</maven.deploy.skip>
|
||||
</properties>
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>${project.groupId}</groupId>
|
||||
<artifactId>webmagic-core</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>${project.groupId}</groupId>
|
||||
<artifactId>webmagic-extension</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>${project.groupId}</groupId>
|
||||
<artifactId>webmagic-scripts</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>${project.groupId}</groupId>
|
||||
<artifactId>webmagic-selenium</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>${project.groupId}</groupId>
|
||||
<artifactId>webmagic-saxon</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>${project.groupId}</groupId>
|
||||
<artifactId>webmagic-samples</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
|
||||
<reporting>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>org.jacoco</groupId>
|
||||
<artifactId>jacoco-maven-plugin</artifactId>
|
||||
<reportSets>
|
||||
<reportSet>
|
||||
<reports>
|
||||
<report>report-aggregate</report>
|
||||
</reports>
|
||||
</reportSet>
|
||||
</reportSets>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</reporting>
|
||||
|
||||
</project>
|
|
@ -3,7 +3,7 @@
|
|||
<parent>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>webmagic-parent</artifactId>
|
||||
<version>0.7.4</version>
|
||||
<version>0.7.5</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
|
|
@ -68,6 +68,10 @@ public class SpiderMonitor {
|
|||
return new SpiderStatus(spider, monitorSpiderListener);
|
||||
}
|
||||
|
||||
// Accessor for subclasses: returns the spiderStatuses field itself (no copy is made here).
protected List<SpiderStatusMXBean> getSpiderStatuses() {
|
||||
return this.spiderStatuses;
|
||||
}
|
||||
|
||||
// Static accessor: returns the value of the static INSTANCE field.
public static SpiderMonitor instance() {
|
||||
return INSTANCE;
|
||||
}
|
||||
|
|
|
@ -84,8 +84,13 @@ public class SpiderStatus implements SpiderStatusMXBean {
|
|||
|
||||
@Override
|
||||
public int getPagePerSecond() {
|
||||
if (getStartTime() != null) {
|
||||
int runSeconds = (int) (System.currentTimeMillis() - getStartTime().getTime()) / 1000;
|
||||
if (runSeconds != 0) {
|
||||
return getSuccessPageCount() / runSeconds;
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,22 +1,23 @@
|
|||
package us.codecraft.webmagic.scheduler;
|
||||
|
||||
import com.alibaba.fastjson.JSON;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.commons.codec.digest.DigestUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import com.alibaba.fastjson.JSON;
|
||||
|
||||
import redis.clients.jedis.Jedis;
|
||||
import redis.clients.jedis.JedisPool;
|
||||
import us.codecraft.webmagic.Request;
|
||||
import us.codecraft.webmagic.Task;
|
||||
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
* the redis scheduler with priority
|
||||
* @author sai
|
||||
* Created by sai on 16-5-27.
|
||||
*/
|
||||
public class RedisPriorityScheduler extends RedisScheduler
|
||||
{
|
||||
public class RedisPriorityScheduler extends RedisScheduler {
|
||||
|
||||
private static final String ZSET_PREFIX = "zset_";
|
||||
|
||||
|
@ -37,62 +38,44 @@ public class RedisPriorityScheduler extends RedisScheduler
|
|||
}
|
||||
|
||||
@Override
|
||||
protected void pushWhenNoDuplicate(Request request, Task task)
|
||||
{
|
||||
Jedis jedis = pool.getResource();
|
||||
try
|
||||
{
|
||||
if(request.getPriority() > 0)
|
||||
protected void pushWhenNoDuplicate(Request request, Task task) {
|
||||
try (Jedis jedis = pool.getResource()) {
|
||||
if (request.getPriority() > 0) {
|
||||
jedis.zadd(getZsetPlusPriorityKey(task), request.getPriority(), request.getUrl());
|
||||
else if(request.getPriority() < 0)
|
||||
} else if (request.getPriority() < 0) {
|
||||
jedis.zadd(getZsetMinusPriorityKey(task), request.getPriority(), request.getUrl());
|
||||
else
|
||||
} else {
|
||||
jedis.lpush(getQueueNoPriorityKey(task), request.getUrl());
|
||||
}
|
||||
|
||||
setExtrasInItem(jedis, request, task);
|
||||
}
|
||||
finally
|
||||
{
|
||||
pool.returnResource(jedis);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public synchronized Request poll(Task task)
|
||||
{
|
||||
Jedis jedis = pool.getResource();
|
||||
try
|
||||
{
|
||||
public synchronized Request poll(Task task) {
|
||||
try (Jedis jedis = pool.getResource()) {
|
||||
String url = getRequest(jedis, task);
|
||||
if(StringUtils.isBlank(url))
|
||||
if (StringUtils.isBlank(url)) {
|
||||
return null;
|
||||
return getExtrasInItem(jedis, url, task);
|
||||
}
|
||||
finally
|
||||
{
|
||||
pool.returnResource(jedis);
|
||||
return getExtrasInItem(jedis, url, task);
|
||||
}
|
||||
}
|
||||
|
||||
private String getRequest(Jedis jedis, Task task)
|
||||
{
|
||||
private String getRequest(Jedis jedis, Task task) {
|
||||
String url;
|
||||
Set<String> urls = jedis.zrevrange(getZsetPlusPriorityKey(task), 0, 0);
|
||||
if(urls.isEmpty())
|
||||
{
|
||||
if (urls.isEmpty()) {
|
||||
url = jedis.lpop(getQueueNoPriorityKey(task));
|
||||
if(StringUtils.isBlank(url))
|
||||
{
|
||||
if (StringUtils.isBlank(url)) {
|
||||
urls = jedis.zrevrange(getZsetMinusPriorityKey(task), 0, 0);
|
||||
if(!urls.isEmpty())
|
||||
{
|
||||
if (!urls.isEmpty()) {
|
||||
url = urls.toArray(new String[0])[0];
|
||||
jedis.zrem(getZsetMinusPriorityKey(task), url);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
} else {
|
||||
url = urls.toArray(new String[0])[0];
|
||||
jedis.zrem(getZsetPlusPriorityKey(task), url);
|
||||
}
|
||||
|
@ -100,51 +83,39 @@ public class RedisPriorityScheduler extends RedisScheduler
|
|||
}
|
||||
|
||||
@Override
|
||||
public void resetDuplicateCheck(Task task)
|
||||
{
|
||||
Jedis jedis = pool.getResource();
|
||||
try
|
||||
{
|
||||
public void resetDuplicateCheck(Task task) {
|
||||
try (Jedis jedis = pool.getResource()) {
|
||||
jedis.del(getSetKey(task));
|
||||
}
|
||||
finally
|
||||
{
|
||||
pool.returnResource(jedis);
|
||||
}
|
||||
}
|
||||
|
||||
private String getZsetPlusPriorityKey(Task task)
|
||||
{
|
||||
private String getZsetPlusPriorityKey(Task task) {
|
||||
return ZSET_PREFIX + task.getUUID() + PLUS_PRIORITY_SUFFIX;
|
||||
}
|
||||
|
||||
private String getQueueNoPriorityKey(Task task)
|
||||
{
|
||||
private String getQueueNoPriorityKey(Task task) {
|
||||
return QUEUE_PREFIX + task.getUUID() + NO_PRIORITY_SUFFIX;
|
||||
}
|
||||
|
||||
private String getZsetMinusPriorityKey(Task task)
|
||||
{
|
||||
private String getZsetMinusPriorityKey(Task task) {
|
||||
return ZSET_PREFIX + task.getUUID() + MINUS_PRIORITY_SUFFIX;
|
||||
}
|
||||
|
||||
private void setExtrasInItem(Jedis jedis,Request request, Task task)
|
||||
{
|
||||
if(request.getExtras() != null)
|
||||
{
|
||||
String field = DigestUtils.shaHex(request.getUrl());
|
||||
private void setExtrasInItem(Jedis jedis,Request request, Task task) {
|
||||
if (request.getExtras() != null) {
|
||||
String field = DigestUtils.sha1Hex(request.getUrl());
|
||||
String value = JSON.toJSONString(request);
|
||||
jedis.hset(getItemKey(task), field, value);
|
||||
}
|
||||
}
|
||||
|
||||
private Request getExtrasInItem(Jedis jedis, String url, Task task)
|
||||
{
|
||||
private Request getExtrasInItem(Jedis jedis, String url, Task task) {
|
||||
String key = getItemKey(task);
|
||||
String field = DigestUtils.shaHex(url);
|
||||
String field = DigestUtils.sha1Hex(url);
|
||||
byte[] bytes = jedis.hget(key.getBytes(), field.getBytes());
|
||||
if(bytes != null)
|
||||
if (bytes != null) {
|
||||
return JSON.parseObject(new String(bytes), Request.class);
|
||||
}
|
||||
return new Request(url);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,8 +1,10 @@
|
|||
package us.codecraft.webmagic.scheduler;
|
||||
|
||||
import com.alibaba.fastjson.JSON;
|
||||
import org.apache.commons.codec.digest.DigestUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import com.alibaba.fastjson.JSON;
|
||||
|
||||
import redis.clients.jedis.Jedis;
|
||||
import redis.clients.jedis.JedisPool;
|
||||
import redis.clients.jedis.JedisPoolConfig;
|
||||
|
@ -37,21 +39,15 @@ public class RedisScheduler extends DuplicateRemovedScheduler implements Monitor
|
|||
|
||||
@Override
|
||||
public void resetDuplicateCheck(Task task) {
|
||||
Jedis jedis = pool.getResource();
|
||||
try {
|
||||
try (Jedis jedis = pool.getResource()) {
|
||||
jedis.del(getSetKey(task));
|
||||
} finally {
|
||||
pool.returnResource(jedis);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isDuplicate(Request request, Task task) {
|
||||
Jedis jedis = pool.getResource();
|
||||
try {
|
||||
try (Jedis jedis = pool.getResource()) {
|
||||
return jedis.sadd(getSetKey(task), request.getUrl()) == 0;
|
||||
} finally {
|
||||
pool.returnResource(jedis);
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -62,7 +58,7 @@ public class RedisScheduler extends DuplicateRemovedScheduler implements Monitor
|
|||
try {
|
||||
jedis.rpush(getQueueKey(task), request.getUrl());
|
||||
if (checkForAdditionalInfo(request)) {
|
||||
String field = DigestUtils.shaHex(request.getUrl());
|
||||
String field = DigestUtils.sha1Hex(request.getUrl());
|
||||
String value = JSON.toJSONString(request);
|
||||
jedis.hset((ITEM_PREFIX + task.getUUID()), field, value);
|
||||
}
|
||||
|
@ -100,14 +96,13 @@ public class RedisScheduler extends DuplicateRemovedScheduler implements Monitor
|
|||
|
||||
@Override
|
||||
public synchronized Request poll(Task task) {
|
||||
Jedis jedis = pool.getResource();
|
||||
try {
|
||||
try (Jedis jedis = pool.getResource()) {
|
||||
String url = jedis.lpop(getQueueKey(task));
|
||||
if (url == null) {
|
||||
return null;
|
||||
}
|
||||
String key = ITEM_PREFIX + task.getUUID();
|
||||
String field = DigestUtils.shaHex(url);
|
||||
String field = DigestUtils.sha1Hex(url);
|
||||
byte[] bytes = jedis.hget(key.getBytes(), field.getBytes());
|
||||
if (bytes != null) {
|
||||
Request o = JSON.parseObject(new String(bytes), Request.class);
|
||||
|
@ -115,8 +110,6 @@ public class RedisScheduler extends DuplicateRemovedScheduler implements Monitor
|
|||
}
|
||||
Request request = new Request(url);
|
||||
return request;
|
||||
} finally {
|
||||
pool.returnResource(jedis);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -134,23 +127,17 @@ public class RedisScheduler extends DuplicateRemovedScheduler implements Monitor
|
|||
|
||||
@Override
|
||||
public int getLeftRequestsCount(Task task) {
|
||||
Jedis jedis = pool.getResource();
|
||||
try {
|
||||
try (Jedis jedis = pool.getResource()) {
|
||||
Long size = jedis.llen(getQueueKey(task));
|
||||
return size.intValue();
|
||||
} finally {
|
||||
pool.returnResource(jedis);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getTotalRequestsCount(Task task) {
|
||||
Jedis jedis = pool.getResource();
|
||||
try {
|
||||
try (Jedis jedis = pool.getResource()) {
|
||||
Long size = jedis.scard(getSetKey(task));
|
||||
return size.intValue();
|
||||
} finally {
|
||||
pool.returnResource(jedis);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
<parent>
|
||||
<artifactId>webmagic-parent</artifactId>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<version>0.7.4</version>
|
||||
<version>0.7.5</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
@ -24,6 +24,26 @@
|
|||
<groupId>junit</groupId>
|
||||
<artifactId>junit</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.mapdb</groupId>
|
||||
<artifactId>mapdb</artifactId>
|
||||
<version>3.0.8</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.fasterxml.jackson.core</groupId>
|
||||
<artifactId>jackson-core</artifactId>
|
||||
<version>2.13.0-rc1</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.fasterxml.jackson.core</groupId>
|
||||
<artifactId>jackson-annotations</artifactId>
|
||||
<version>2.13.0-rc1</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.fasterxml.jackson.core</groupId>
|
||||
<artifactId>jackson-databind</artifactId>
|
||||
<version>2.13.0-rc1</version>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
|
||||
</project>
|
||||
|
|
|
@ -0,0 +1,78 @@
|
|||
package us.codecraft.webmagic.recover;
|
||||
|
||||
import com.google.common.base.Charsets;
|
||||
import com.google.common.hash.BloomFilter;
|
||||
import com.google.common.hash.Funnels;
|
||||
import org.mapdb.DB;
|
||||
import org.mapdb.DBMaker;
|
||||
import org.mapdb.IndexTreeList;
|
||||
import org.mapdb.Serializer;
|
||||
import us.codecraft.webmagic.Request;
|
||||
import us.codecraft.webmagic.Task;
|
||||
import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
|
||||
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
|
||||
/**
|
||||
* @author :linweisen
|
||||
*/
|
||||
public class DuplicateStorageRemover implements DuplicateRemover {
|
||||
|
||||
private DB db;
|
||||
|
||||
private static String DATABASE_NAME = "duplicate";
|
||||
|
||||
private IndexTreeList<String> urlDuplicateQueue;
|
||||
|
||||
private BloomFilter<CharSequence> bloomFilter;
|
||||
|
||||
private AtomicInteger counter;
|
||||
|
||||
public DuplicateStorageRemover(String path) {
|
||||
|
||||
String duplicatStoragePath = path;
|
||||
|
||||
DB db = DBMaker.fileDB(duplicatStoragePath)
|
||||
.fileMmapEnableIfSupported()
|
||||
.fileMmapPreclearDisable()
|
||||
.cleanerHackEnable()
|
||||
.closeOnJvmShutdown()
|
||||
.transactionEnable()
|
||||
.concurrencyScale(128)
|
||||
.make();
|
||||
this.db = db;
|
||||
|
||||
this.urlDuplicateQueue = db.indexTreeList(DATABASE_NAME, Serializer.STRING).createOrOpen();
|
||||
|
||||
counter = new AtomicInteger(this.urlDuplicateQueue.size());
|
||||
this.bloomFilter = BloomFilter.create(Funnels.stringFunnel(Charsets.UTF_8), 200000, 1E-7);
|
||||
for (String url : this.urlDuplicateQueue){
|
||||
bloomFilter.put(url);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isDuplicate(Request request, Task task) {
|
||||
String url = request.getUrl();
|
||||
boolean isDuplicate = bloomFilter.mightContain(url);
|
||||
if (!isDuplicate) {
|
||||
bloomFilter.put(url);
|
||||
urlDuplicateQueue.add(url);
|
||||
this.db.commit();
|
||||
counter.incrementAndGet();
|
||||
}
|
||||
return isDuplicate;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void resetDuplicateCheck(Task task) {
|
||||
this.bloomFilter = BloomFilter.create(Funnels.stringFunnel(Charsets.UTF_8), 200000, 1E-7);
|
||||
this.urlDuplicateQueue.clear();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getTotalRequestsCount(Task task) {
|
||||
return counter.get();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,85 @@
|
|||
package us.codecraft.webmagic.recover;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.mapdb.DB;
|
||||
import org.mapdb.DBMaker;
|
||||
import org.mapdb.IndexTreeList;
|
||||
import org.mapdb.Serializer;
|
||||
import us.codecraft.webmagic.Request;
|
||||
import us.codecraft.webmagic.Task;
|
||||
import us.codecraft.webmagic.scheduler.DuplicateRemovedScheduler;
|
||||
import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* @author :linweisen
|
||||
*/
|
||||
public class MmapQueueScheduler extends DuplicateRemovedScheduler {
|
||||
|
||||
private DB db;
|
||||
|
||||
private static String DATABASE_NAME = "queue";
|
||||
|
||||
private IndexTreeList<String> queue;
|
||||
|
||||
private static ObjectMapper mapper;
|
||||
|
||||
public MmapQueueScheduler(DuplicateRemover duplicateRemover, String path) {
|
||||
super.setDuplicateRemover(duplicateRemover);
|
||||
|
||||
String queuePath = path;
|
||||
|
||||
DB db = DBMaker.fileDB(queuePath)
|
||||
.fileMmapEnableIfSupported()
|
||||
.fileMmapPreclearDisable()
|
||||
.cleanerHackEnable()
|
||||
.closeOnJvmShutdown()
|
||||
.transactionEnable()
|
||||
.concurrencyScale(128)
|
||||
.make();
|
||||
this.db = db;
|
||||
this.mapper = new ObjectMapper();
|
||||
this.queue = db.indexTreeList(MmapQueueScheduler.DATABASE_NAME, Serializer.STRING).createOrOpen();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Request poll(Task task) {
|
||||
if (this.queue.size() > 0){
|
||||
String s = queue.remove(0);
|
||||
return fromJson(s, Request.class);
|
||||
}else{
|
||||
return null;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public void pushWhenNoDuplicate(Request request, Task task) {
|
||||
queue.add(toJson(request));
|
||||
this.db.commit();
|
||||
}
|
||||
|
||||
public String toJson(Object object) {
|
||||
try {
|
||||
return mapper.writeValueAsString(object);
|
||||
} catch (IOException e) {
|
||||
logger.warn("write to json string error:" + object, e);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
public <T> T fromJson(String jsonString, Class<T> clazz) {
|
||||
if (StringUtils.isEmpty(jsonString)) {
|
||||
return null;
|
||||
}
|
||||
try {
|
||||
return mapper.readValue(jsonString, clazz);
|
||||
} catch (IOException e) {
|
||||
logger.warn("parse json string error:" + jsonString, e);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,22 @@
|
|||
package us.codecraft.webmagic.recover;
|
||||
|
||||
|
||||
import us.codecraft.webmagic.Spider;
|
||||
import us.codecraft.webmagic.samples.SinaBlogProcessor;
|
||||
import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com <br>
|
||||
*/
|
||||
public class RecoverSample {
|
||||
|
||||
public static void main(String[] args) {
|
||||
String storage = "queue";
|
||||
String duplicate = "duplicate";
|
||||
Spider spider = new Spider(new SinaBlogProcessor());
|
||||
DuplicateRemover remover = new DuplicateStorageRemover(duplicate);
|
||||
spider.setScheduler(new MmapQueueScheduler(remover, storage));
|
||||
spider.addUrl("http://blog.sina.com.cn/s/articlelist_1487828712_0_1.html")
|
||||
.run();
|
||||
}
|
||||
}
|
|
@ -3,7 +3,7 @@
|
|||
<parent>
|
||||
<artifactId>webmagic-parent</artifactId>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<version>0.7.4</version>
|
||||
<version>0.7.5</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
|
|
@ -1,16 +1,11 @@
|
|||
package us.codecraft.webmagic.selector;
|
||||
|
||||
import net.sf.saxon.lib.NamespaceConstant;
|
||||
import net.sf.saxon.xpath.XPathEvaluator;
|
||||
import org.htmlcleaner.CleanerProperties;
|
||||
import org.htmlcleaner.DomSerializer;
|
||||
import org.htmlcleaner.HtmlCleaner;
|
||||
import org.htmlcleaner.TagNode;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.w3c.dom.Document;
|
||||
import org.w3c.dom.Node;
|
||||
import org.w3c.dom.NodeList;
|
||||
import java.io.StringWriter;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
|
||||
import javax.xml.namespace.NamespaceContext;
|
||||
import javax.xml.transform.OutputKeys;
|
||||
|
@ -21,12 +16,19 @@ import javax.xml.transform.stream.StreamResult;
|
|||
import javax.xml.xpath.XPathConstants;
|
||||
import javax.xml.xpath.XPathExpression;
|
||||
import javax.xml.xpath.XPathExpressionException;
|
||||
import java.io.StringWriter;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
|
||||
import org.htmlcleaner.CleanerProperties;
|
||||
import org.htmlcleaner.DomSerializer;
|
||||
import org.htmlcleaner.HtmlCleaner;
|
||||
import org.htmlcleaner.TagNode;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.w3c.dom.Document;
|
||||
import org.w3c.dom.Node;
|
||||
import org.w3c.dom.NodeList;
|
||||
|
||||
import net.sf.saxon.lib.NamespaceConstant;
|
||||
import net.sf.saxon.xpath.XPathEvaluator;
|
||||
|
||||
/**
|
||||
* 支持xpath2.0的选择器。包装了HtmlCleaner和Saxon HE。<br>
|
||||
|
|
|
@ -1,5 +1,7 @@
|
|||
package us.codecraft.webmagic.selector;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import org.htmlcleaner.HtmlCleaner;
|
||||
import org.htmlcleaner.TagNode;
|
||||
import org.htmlcleaner.XPatherException;
|
||||
|
@ -8,6 +10,7 @@ import org.jsoup.nodes.Document;
|
|||
import org.junit.Assert;
|
||||
import org.junit.Ignore;
|
||||
import org.junit.Test;
|
||||
|
||||
import us.codecraft.xsoup.XPathEvaluator;
|
||||
import us.codecraft.xsoup.Xsoup;
|
||||
|
||||
|
@ -1367,15 +1370,19 @@ public class XpathSelectorTest {
|
|||
public void testXPath2() {
|
||||
String text = "<h1>眉山:扎实推进农业农村工作 促农持续增收<br>\n" +
|
||||
"<span>2013-07-31 23:29:45 来源:<a href=\"http://www.mshw.net\" target=\"_blank\" style=\"color:#AAA\">眉山网</a> 责任编辑:张斯炜</span></h1>";
|
||||
XpathSelector xpathSelector = new XpathSelector("//h1/text()");
|
||||
System.out.println(xpathSelector.select(text));
|
||||
Xpath2Selector xpathSelector = new Xpath2Selector("//h1/text()");
|
||||
Assert.assertEquals("眉山:扎实推进农业农村工作 促农持续增收", xpathSelector.select(text));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testXpath2Selector() {
|
||||
Xpath2Selector xpath2Selector = new Xpath2Selector("//a/@href");
|
||||
String select = xpath2Selector.select(html);
|
||||
Assert.assertNotNull(select);
|
||||
Assert.assertEquals("http://www.oschina.net/", select);
|
||||
|
||||
List<String> selectList = xpath2Selector.selectList(html);
|
||||
Assert.assertEquals(113, selectList.size());
|
||||
Assert.assertEquals("http://www.oschina.net/", selectList.get(0));
|
||||
}
|
||||
|
||||
@Ignore("take long time")
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
<parent>
|
||||
<artifactId>webmagic-parent</artifactId>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<version>0.7.4</version>
|
||||
<version>0.7.5</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
@ -22,10 +22,6 @@
|
|||
<artifactId>kotlin-stdlib</artifactId>
|
||||
<version>${kotlin.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.codehaus.groovy</groupId>
|
||||
<artifactId>groovy-all</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.python</groupId>
|
||||
<artifactId>jython</artifactId>
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
<parent>
|
||||
<artifactId>webmagic-parent</artifactId>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<version>0.7.4</version>
|
||||
<version>0.7.5</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
|
Loading…
Reference in New Issue