Merge branch 'release/1.0.0'
commit
9d75cce16d
|
@ -1,9 +1,77 @@
|
||||||
target
|
target/
|
||||||
*.iml
|
pom.xml.tag
|
||||||
out/
|
pom.xml.releaseBackup
|
||||||
.idea
|
pom.xml.versionsBackup
|
||||||
.classpath
|
pom.xml.next
|
||||||
|
release.properties
|
||||||
|
dependency-reduced-pom.xml
|
||||||
|
buildNumber.properties
|
||||||
|
.mvn/timing.properties
|
||||||
|
# https://github.com/takari/maven-wrapper#usage-without-binary-jar
|
||||||
|
.mvn/wrapper/maven-wrapper.jar
|
||||||
|
|
||||||
|
# Eclipse m2e generated files
|
||||||
|
# Eclipse Core
|
||||||
.project
|
.project
|
||||||
.settings/
|
# JDT-specific (Eclipse Java Development Tools)
|
||||||
|
.classpath
|
||||||
|
.metadata
|
||||||
bin/
|
bin/
|
||||||
.myeclipse
|
tmp/
|
||||||
|
*.tmp
|
||||||
|
*.bak
|
||||||
|
*.swp
|
||||||
|
*~.nib
|
||||||
|
local.properties
|
||||||
|
.settings/
|
||||||
|
.loadpath
|
||||||
|
.recommenders
|
||||||
|
|
||||||
|
# External tool builders
|
||||||
|
.externalToolBuilders/
|
||||||
|
|
||||||
|
# Locally stored "Eclipse launch configurations"
|
||||||
|
*.launch
|
||||||
|
|
||||||
|
# PyDev specific (Python IDE for Eclipse)
|
||||||
|
*.pydevproject
|
||||||
|
|
||||||
|
# CDT-specific (C/C++ Development Tooling)
|
||||||
|
.cproject
|
||||||
|
|
||||||
|
# CDT- autotools
|
||||||
|
.autotools
|
||||||
|
|
||||||
|
# Java annotation processor (APT)
|
||||||
|
.factorypath
|
||||||
|
|
||||||
|
# PDT-specific (PHP Development Tools)
|
||||||
|
.buildpath
|
||||||
|
|
||||||
|
# sbteclipse plugin
|
||||||
|
.target
|
||||||
|
|
||||||
|
# Tern plugin
|
||||||
|
.tern-project
|
||||||
|
|
||||||
|
# TeXlipse plugin
|
||||||
|
.texlipse
|
||||||
|
|
||||||
|
# STS (Spring Tool Suite)
|
||||||
|
.springBeans
|
||||||
|
|
||||||
|
# Code Recommenders
|
||||||
|
.recommenders/
|
||||||
|
|
||||||
|
# Annotation Processing
|
||||||
|
.apt_generated/
|
||||||
|
.apt_generated_test/
|
||||||
|
|
||||||
|
# Scala IDE specific (Scala & Java development for Eclipse)
|
||||||
|
.cache-main
|
||||||
|
.scala_dependencies
|
||||||
|
.worksheet
|
||||||
|
|
||||||
|
# Uncomment this line if you wish to ignore the project description file.
|
||||||
|
# Typically, this file would be tracked if it contains build/dependency configurations:
|
||||||
|
#.project
|
||||||
|
|
370
pom.xml
370
pom.xml
|
@ -1,14 +1,24 @@
|
||||||
<?xml version="1.0" encoding="UTF-8"?>
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
<project
|
||||||
<groupId>us.codecraft</groupId>
|
xmlns="http://maven.apache.org/POM/4.0.0"
|
||||||
<version>0.10.3</version>
|
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||||
|
xsi:schemaLocation="
|
||||||
|
http://maven.apache.org/POM/4.0.0
|
||||||
|
http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
<parent>
|
||||||
|
<groupId>org.oxerr</groupId>
|
||||||
|
<artifactId>oxerr-parent</artifactId>
|
||||||
|
<version>2.2.1</version>
|
||||||
|
</parent>
|
||||||
|
<groupId>us.codecraft</groupId>
|
||||||
|
<version>1.0.0</version>
|
||||||
<packaging>pom</packaging>
|
<packaging>pom</packaging>
|
||||||
<properties>
|
<properties>
|
||||||
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
||||||
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
|
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
|
||||||
<maven.compiler.source>1.8</maven.compiler.source>
|
<maven.compiler.source>11</maven.compiler.source>
|
||||||
<maven.compiler.target>1.8</maven.compiler.target>
|
<maven.compiler.target>11</maven.compiler.target>
|
||||||
<assertj.version>3.23.1</assertj.version>
|
<assertj.version>3.23.1</assertj.version>
|
||||||
<commons-cli.version>1.5.0</commons-cli.version>
|
<commons-cli.version>1.5.0</commons-cli.version>
|
||||||
<commons-collections4.version>4.4</commons-collections4.version>
|
<commons-collections4.version>4.4</commons-collections4.version>
|
||||||
|
@ -23,20 +33,21 @@
|
||||||
<jedis.version>3.7.1</jedis.version>
|
<jedis.version>3.7.1</jedis.version>
|
||||||
<jruby.version>9.3.9.0</jruby.version>
|
<jruby.version>9.3.9.0</jruby.version>
|
||||||
<json-path.version>2.9.0</json-path.version>
|
<json-path.version>2.9.0</json-path.version>
|
||||||
<junit.version>4.13.2</junit.version>
|
<junit.version>5.10.2</junit.version>
|
||||||
|
<junit.platform.version>1.10.2</junit.platform.version>
|
||||||
<jython.version>2.7.3</jython.version>
|
<jython.version>2.7.3</jython.version>
|
||||||
<log4j.version>1.2.17</log4j.version>
|
<log4j2.version>2.23.1</log4j2.version>
|
||||||
<mockito-all.version>2.0.2-beta</mockito-all.version>
|
<mockito-all.version>2.0.2-beta</mockito-all.version>
|
||||||
<moco.version>1.3.0</moco.version>
|
<moco.version>1.3.0</moco.version>
|
||||||
<phantomjsdriver.version>1.2.0</phantomjsdriver.version>
|
<phantomjsdriver.version>1.2.0</phantomjsdriver.version>
|
||||||
<saxon-he.version>11.4</saxon-he.version>
|
<saxon-he.version>12.4</saxon-he.version>
|
||||||
<selenium-java.version>3.141.59</selenium-java.version>
|
<selenium-java.version>4.14.1</selenium-java.version>
|
||||||
<slf4j.version>2.0.4</slf4j.version>
|
<slf4j.version>2.0.4</slf4j.version>
|
||||||
<spring-version>4.0.0.RELEASE</spring-version>
|
<spring-version>4.0.0.RELEASE</spring-version>
|
||||||
<xsoup.version>0.3.5</xsoup.version>
|
<xsoup.version>0.3.5</xsoup.version>
|
||||||
</properties>
|
</properties>
|
||||||
<artifactId>webmagic-parent</artifactId>
|
<artifactId>webmagic</artifactId>
|
||||||
<name>webmagic-parent</name>
|
<name>webmagic</name>
|
||||||
<description>
|
<description>
|
||||||
A crawler framework. It covers the whole lifecycle of crawler: downloading, url management, content
|
A crawler framework. It covers the whole lifecycle of crawler: downloading, url management, content
|
||||||
extraction and persistent. It can simply the development of a specific crawler.
|
extraction and persistent. It can simply the development of a specific crawler.
|
||||||
|
@ -77,14 +88,41 @@
|
||||||
<module>webmagic-coverage</module>
|
<module>webmagic-coverage</module>
|
||||||
</modules>
|
</modules>
|
||||||
|
|
||||||
<dependencyManagement>
|
|
||||||
<dependencies>
|
<dependencies>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>junit</groupId>
|
<groupId>org.apache.logging.log4j</groupId>
|
||||||
<artifactId>junit</artifactId>
|
<artifactId>log4j-core</artifactId>
|
||||||
<version>${junit.version}</version>
|
|
||||||
<scope>test</scope>
|
<scope>test</scope>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.apache.logging.log4j</groupId>
|
||||||
|
<artifactId>log4j-slf4j2-impl</artifactId>
|
||||||
|
<scope>test</scope>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.junit.jupiter</groupId>
|
||||||
|
<artifactId>junit-jupiter-engine</artifactId>
|
||||||
|
<scope>test</scope>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.junit.vintage</groupId>
|
||||||
|
<artifactId>junit-vintage-engine</artifactId>
|
||||||
|
<scope>test</scope>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.junit.platform</groupId>
|
||||||
|
<artifactId>junit-platform-launcher</artifactId>
|
||||||
|
<scope>test</scope>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.junit.platform</groupId>
|
||||||
|
<artifactId>junit-platform-runner</artifactId>
|
||||||
|
<scope>test</scope>
|
||||||
|
</dependency>
|
||||||
|
</dependencies>
|
||||||
|
|
||||||
|
<dependencyManagement>
|
||||||
|
<dependencies>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.mockito</groupId>
|
<groupId>org.mockito</groupId>
|
||||||
<artifactId>mockito-all</artifactId>
|
<artifactId>mockito-all</artifactId>
|
||||||
|
@ -101,6 +139,16 @@
|
||||||
<artifactId>httpcore</artifactId>
|
<artifactId>httpcore</artifactId>
|
||||||
<version>${httpcore.version}</version>
|
<version>${httpcore.version}</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.apache.logging.log4j</groupId>
|
||||||
|
<artifactId>log4j-core</artifactId>
|
||||||
|
<version>${log4j2.version}</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.apache.logging.log4j</groupId>
|
||||||
|
<artifactId>log4j-slf4j2-impl</artifactId>
|
||||||
|
<version>${log4j2.version}</version>
|
||||||
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>com.google.guava</groupId>
|
<groupId>com.google.guava</groupId>
|
||||||
<artifactId>guava</artifactId>
|
<artifactId>guava</artifactId>
|
||||||
|
@ -112,13 +160,28 @@
|
||||||
<version>${json-path.version}</version>
|
<version>${json-path.version}</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.slf4j</groupId>
|
<groupId>org.junit.jupiter</groupId>
|
||||||
<artifactId>slf4j-api</artifactId>
|
<artifactId>junit-jupiter-engine</artifactId>
|
||||||
<version>${slf4j.version}</version>
|
<version>${junit.version}</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.junit.vintage</groupId>
|
||||||
|
<artifactId>junit-vintage-engine</artifactId>
|
||||||
|
<version>${junit.version}</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.junit.platform</groupId>
|
||||||
|
<artifactId>junit-platform-launcher</artifactId>
|
||||||
|
<version>${junit.platform.version}</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.junit.platform</groupId>
|
||||||
|
<artifactId>junit-platform-runner</artifactId>
|
||||||
|
<version>${junit.platform.version}</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.slf4j</groupId>
|
<groupId>org.slf4j</groupId>
|
||||||
<artifactId>slf4j-log4j12</artifactId>
|
<artifactId>slf4j-api</artifactId>
|
||||||
<version>${slf4j.version}</version>
|
<version>${slf4j.version}</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
|
@ -143,11 +206,6 @@
|
||||||
</exclusion>
|
</exclusion>
|
||||||
</exclusions>
|
</exclusions>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
|
||||||
<groupId>log4j</groupId>
|
|
||||||
<artifactId>log4j</artifactId>
|
|
||||||
<version>${log4j.version}</version>
|
|
||||||
</dependency>
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.assertj</groupId>
|
<groupId>org.assertj</groupId>
|
||||||
<artifactId>assertj-core</artifactId>
|
<artifactId>assertj-core</artifactId>
|
||||||
|
@ -219,86 +277,10 @@
|
||||||
|
|
||||||
<build>
|
<build>
|
||||||
<plugins>
|
<plugins>
|
||||||
<plugin>
|
|
||||||
<groupId>org.apache.maven.plugins</groupId>
|
|
||||||
<artifactId>maven-enforcer-plugin</artifactId>
|
|
||||||
<version>3.1.0</version>
|
|
||||||
<executions>
|
|
||||||
<execution>
|
|
||||||
<id>enforce-maven</id>
|
|
||||||
<goals>
|
|
||||||
<goal>enforce</goal>
|
|
||||||
</goals>
|
|
||||||
<configuration>
|
|
||||||
<rules>
|
|
||||||
<requireMavenVersion>
|
|
||||||
<version>3.5.0</version>
|
|
||||||
</requireMavenVersion>
|
|
||||||
</rules>
|
|
||||||
</configuration>
|
|
||||||
</execution>
|
|
||||||
</executions>
|
|
||||||
</plugin>
|
|
||||||
<plugin>
|
|
||||||
<groupId>org.apache.maven.plugins</groupId>
|
|
||||||
<artifactId>maven-surefire-plugin</artifactId>
|
|
||||||
</plugin>
|
|
||||||
<plugin>
|
|
||||||
<groupId>org.apache.maven.plugins</groupId>
|
|
||||||
<artifactId>maven-compiler-plugin</artifactId>
|
|
||||||
</plugin>
|
|
||||||
<!--<plugin>-->
|
|
||||||
<!--<groupId>org.apache.maven.plugins</groupId>-->
|
|
||||||
<!--<artifactId>maven-dependency-plugin</artifactId>-->
|
|
||||||
<!--<version>2.8</version>-->
|
|
||||||
<!--<executions>-->
|
|
||||||
<!--<execution>-->
|
|
||||||
<!--<id>copy-dependencies</id>-->
|
|
||||||
<!--<phase>package</phase>-->
|
|
||||||
<!--<goals>-->
|
|
||||||
<!--<goal>copy-dependencies</goal>-->
|
|
||||||
<!--</goals>-->
|
|
||||||
<!--<configuration>-->
|
|
||||||
<!--<outputDirectory>${project.build.directory}/lib</outputDirectory>-->
|
|
||||||
<!--<overWriteReleases>false</overWriteReleases>-->
|
|
||||||
<!--<overWriteSnapshots>false</overWriteSnapshots>-->
|
|
||||||
<!--<overWriteIfNewer>true</overWriteIfNewer>-->
|
|
||||||
<!--</configuration>-->
|
|
||||||
<!--</execution>-->
|
|
||||||
<!--</executions>-->
|
|
||||||
<!--</plugin>-->
|
|
||||||
<plugin>
|
|
||||||
<groupId>org.apache.maven.plugins</groupId>
|
|
||||||
<artifactId>maven-resources-plugin</artifactId>
|
|
||||||
</plugin>
|
|
||||||
<plugin>
|
|
||||||
<groupId>org.apache.maven.plugins</groupId>
|
|
||||||
<artifactId>maven-jar-plugin</artifactId>
|
|
||||||
<configuration>
|
|
||||||
<excludes>
|
|
||||||
<exclude>log4j.xml</exclude>
|
|
||||||
</excludes>
|
|
||||||
</configuration>
|
|
||||||
</plugin>
|
|
||||||
<plugin>
|
|
||||||
<groupId>org.apache.maven.plugins</groupId>
|
|
||||||
<artifactId>maven-source-plugin</artifactId>
|
|
||||||
<version>3.2.1</version>
|
|
||||||
<executions>
|
|
||||||
<execution>
|
|
||||||
<id>attach-sources</id>
|
|
||||||
<goals>
|
|
||||||
<goal>jar</goal>
|
|
||||||
</goals>
|
|
||||||
</execution>
|
|
||||||
</executions>
|
|
||||||
</plugin>
|
|
||||||
<plugin>
|
<plugin>
|
||||||
<groupId>org.apache.maven.plugins</groupId>
|
<groupId>org.apache.maven.plugins</groupId>
|
||||||
<artifactId>maven-javadoc-plugin</artifactId>
|
<artifactId>maven-javadoc-plugin</artifactId>
|
||||||
<version>3.4.1</version>
|
|
||||||
<configuration>
|
<configuration>
|
||||||
<encoding>UTF-8</encoding>
|
|
||||||
<doctitle>WebMagic ${project.version}</doctitle>
|
<doctitle>WebMagic ${project.version}</doctitle>
|
||||||
<locale>en_US</locale>
|
<locale>en_US</locale>
|
||||||
|
|
||||||
|
@ -322,11 +304,6 @@
|
||||||
</execution>
|
</execution>
|
||||||
</executions>
|
</executions>
|
||||||
</plugin>
|
</plugin>
|
||||||
<plugin>
|
|
||||||
<groupId>org.apache.maven.plugins</groupId>
|
|
||||||
<artifactId>maven-release-plugin</artifactId>
|
|
||||||
<version>3.0.0-M6</version>
|
|
||||||
</plugin>
|
|
||||||
<plugin>
|
<plugin>
|
||||||
<groupId>org.jacoco</groupId>
|
<groupId>org.jacoco</groupId>
|
||||||
<artifactId>jacoco-maven-plugin</artifactId>
|
<artifactId>jacoco-maven-plugin</artifactId>
|
||||||
|
@ -355,189 +332,6 @@
|
||||||
</configuration>
|
</configuration>
|
||||||
</plugin>
|
</plugin>
|
||||||
</plugins>
|
</plugins>
|
||||||
<pluginManagement>
|
|
||||||
<plugins>
|
|
||||||
<plugin>
|
|
||||||
<groupId>org.apache.maven.plugins</groupId>
|
|
||||||
<artifactId>maven-clean-plugin</artifactId>
|
|
||||||
<version>3.2.0</version>
|
|
||||||
</plugin>
|
|
||||||
<plugin>
|
|
||||||
<groupId>org.apache.maven.plugins</groupId>
|
|
||||||
<artifactId>maven-compiler-plugin</artifactId>
|
|
||||||
<version>3.10.1</version>
|
|
||||||
</plugin>
|
|
||||||
<plugin>
|
|
||||||
<groupId>org.apache.maven.plugins</groupId>
|
|
||||||
<artifactId>maven-deploy-plugin</artifactId>
|
|
||||||
<version>3.0.0</version>
|
|
||||||
</plugin>
|
|
||||||
<plugin>
|
|
||||||
<groupId>org.apache.maven.plugins</groupId>
|
|
||||||
<artifactId>maven-install-plugin</artifactId>
|
|
||||||
<version>3.0.1</version>
|
|
||||||
</plugin>
|
|
||||||
<plugin>
|
|
||||||
<groupId>org.apache.maven.plugins</groupId>
|
|
||||||
<artifactId>maven-jar-plugin</artifactId>
|
|
||||||
<version>3.3.0</version>
|
|
||||||
</plugin>
|
|
||||||
<plugin>
|
|
||||||
<groupId>org.apache.maven.plugins</groupId>
|
|
||||||
<artifactId>maven-jxr-plugin</artifactId>
|
|
||||||
<version>3.3.0</version>
|
|
||||||
</plugin>
|
|
||||||
<plugin>
|
|
||||||
<groupId>org.apache.maven.plugins</groupId>
|
|
||||||
<artifactId>maven-pmd-plugin</artifactId>
|
|
||||||
<version>3.19.0</version>
|
|
||||||
</plugin>
|
|
||||||
<plugin>
|
|
||||||
<groupId>org.apache.maven.plugins</groupId>
|
|
||||||
<artifactId>maven-resources-plugin</artifactId>
|
|
||||||
<version>3.3.0</version>
|
|
||||||
</plugin>
|
|
||||||
<plugin>
|
|
||||||
<groupId>org.apache.maven.plugins</groupId>
|
|
||||||
<artifactId>maven-site-plugin</artifactId>
|
|
||||||
<version>4.0.0-M3</version>
|
|
||||||
</plugin>
|
|
||||||
<plugin>
|
|
||||||
<groupId>org.apache.maven.plugins</groupId>
|
|
||||||
<artifactId>maven-surefire-plugin</artifactId>
|
|
||||||
<version>3.0.0-M7</version>
|
|
||||||
</plugin>
|
|
||||||
<plugin>
|
|
||||||
<groupId>org.apache.maven.plugins</groupId>
|
|
||||||
<artifactId>maven-surefire-report-plugin</artifactId>
|
|
||||||
<version>3.0.0-M7</version>
|
|
||||||
</plugin>
|
|
||||||
<plugin>
|
|
||||||
<groupId>org.codehaus.mojo</groupId>
|
|
||||||
<artifactId>taglist-maven-plugin</artifactId>
|
|
||||||
<version>3.0.0</version>
|
|
||||||
</plugin>
|
|
||||||
<plugin>
|
|
||||||
<groupId>org.jacoco</groupId>
|
|
||||||
<artifactId>jacoco-maven-plugin</artifactId>
|
|
||||||
<version>0.8.8</version>
|
|
||||||
</plugin>
|
|
||||||
<plugin>
|
|
||||||
<groupId>com.amashchenko.maven.plugin</groupId>
|
|
||||||
<artifactId>gitflow-maven-plugin</artifactId>
|
|
||||||
<version>1.18.0</version>
|
|
||||||
</plugin>
|
|
||||||
<plugin>
|
|
||||||
<groupId>com.github.spotbugs</groupId>
|
|
||||||
<artifactId>spotbugs-maven-plugin</artifactId>
|
|
||||||
<version>4.7.2.0</version>
|
|
||||||
</plugin>
|
|
||||||
</plugins>
|
|
||||||
</pluginManagement>
|
|
||||||
</build>
|
</build>
|
||||||
|
|
||||||
<reporting>
|
|
||||||
<plugins>
|
|
||||||
<plugin>
|
|
||||||
<groupId>org.apache.maven.plugins</groupId>
|
|
||||||
<artifactId>maven-javadoc-plugin</artifactId>
|
|
||||||
<configuration>
|
|
||||||
<doclint>none</doclint>
|
|
||||||
</configuration>
|
|
||||||
</plugin>
|
|
||||||
<plugin>
|
|
||||||
<groupId>org.apache.maven.plugins</groupId>
|
|
||||||
<artifactId>maven-jxr-plugin</artifactId>
|
|
||||||
</plugin>
|
|
||||||
<plugin>
|
|
||||||
<groupId>org.apache.maven.plugins</groupId>
|
|
||||||
<artifactId>maven-pmd-plugin</artifactId>
|
|
||||||
</plugin>
|
|
||||||
<plugin>
|
|
||||||
<groupId>org.apache.maven.plugins</groupId>
|
|
||||||
<artifactId>maven-surefire-report-plugin</artifactId>
|
|
||||||
</plugin>
|
|
||||||
<plugin>
|
|
||||||
<groupId>org.codehaus.mojo</groupId>
|
|
||||||
<artifactId>taglist-maven-plugin</artifactId>
|
|
||||||
</plugin>
|
|
||||||
<plugin>
|
|
||||||
<groupId>com.github.spotbugs</groupId>
|
|
||||||
<artifactId>spotbugs-maven-plugin</artifactId>
|
|
||||||
</plugin>
|
|
||||||
</plugins>
|
|
||||||
</reporting>
|
|
||||||
|
|
||||||
<profiles>
|
|
||||||
<profile>
|
|
||||||
<id>release</id>
|
|
||||||
<build>
|
|
||||||
<plugins>
|
|
||||||
<!-- Source -->
|
|
||||||
<plugin>
|
|
||||||
<groupId>org.apache.maven.plugins</groupId>
|
|
||||||
<artifactId>maven-source-plugin</artifactId>
|
|
||||||
<version>3.2.1</version>
|
|
||||||
<executions>
|
|
||||||
<execution>
|
|
||||||
<phase>package</phase>
|
|
||||||
<goals>
|
|
||||||
<goal>jar-no-fork</goal>
|
|
||||||
</goals>
|
|
||||||
</execution>
|
|
||||||
</executions>
|
|
||||||
</plugin>
|
|
||||||
<!-- Javadoc -->
|
|
||||||
<plugin>
|
|
||||||
<groupId>org.apache.maven.plugins</groupId>
|
|
||||||
<artifactId>maven-javadoc-plugin</artifactId>
|
|
||||||
<version>3.4.1</version>
|
|
||||||
<executions>
|
|
||||||
<execution>
|
|
||||||
<phase>package</phase>
|
|
||||||
<goals>
|
|
||||||
<goal>jar</goal>
|
|
||||||
</goals>
|
|
||||||
</execution>
|
|
||||||
</executions>
|
|
||||||
</plugin>
|
|
||||||
<!-- GPG -->
|
|
||||||
<plugin>
|
|
||||||
<groupId>org.apache.maven.plugins</groupId>
|
|
||||||
<artifactId>maven-gpg-plugin</artifactId>
|
|
||||||
<version>3.0.1</version>
|
|
||||||
<executions>
|
|
||||||
<execution>
|
|
||||||
<phase>verify</phase>
|
|
||||||
<goals>
|
|
||||||
<goal>sign</goal>
|
|
||||||
</goals>
|
|
||||||
</execution>
|
|
||||||
</executions>
|
|
||||||
</plugin>
|
|
||||||
<plugin>
|
|
||||||
<groupId>org.sonatype.plugins</groupId>
|
|
||||||
<artifactId>nexus-staging-maven-plugin</artifactId>
|
|
||||||
<version>1.6.13</version>
|
|
||||||
<extensions>true</extensions>
|
|
||||||
<configuration>
|
|
||||||
<serverId>sonatype-nexus-staging</serverId>
|
|
||||||
<nexusUrl>https://oss.sonatype.org/</nexusUrl>
|
|
||||||
<autoReleaseAfterClose>true</autoReleaseAfterClose>
|
|
||||||
</configuration>
|
|
||||||
</plugin>
|
|
||||||
</plugins>
|
|
||||||
</build>
|
|
||||||
<distributionManagement>
|
|
||||||
<snapshotRepository>
|
|
||||||
<id>sonatype-nexus-snapshots</id>
|
|
||||||
<url>https://oss.sonatype.org/content/repositories/snapshots/</url>
|
|
||||||
</snapshotRepository>
|
|
||||||
<repository>
|
|
||||||
<id>sonatype-nexus-staging</id>
|
|
||||||
<url>https://oss.sonatype.org/service/local/staging/deploy/maven2/</url>
|
|
||||||
</repository>
|
|
||||||
</distributionManagement>
|
|
||||||
</profile>
|
|
||||||
</profiles>
|
|
||||||
</project>
|
</project>
|
||||||
|
|
|
@ -1,9 +1,14 @@
|
||||||
<?xml version="1.0" encoding="UTF-8"?>
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
<project
|
||||||
|
xmlns="http://maven.apache.org/POM/4.0.0"
|
||||||
|
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||||
|
xsi:schemaLocation="
|
||||||
|
http://maven.apache.org/POM/4.0.0
|
||||||
|
http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||||
<parent>
|
<parent>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<artifactId>webmagic-parent</artifactId>
|
<artifactId>webmagic</artifactId>
|
||||||
<version>0.10.3</version>
|
<version>1.0.0</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
@ -15,11 +20,6 @@
|
||||||
<artifactId>httpclient</artifactId>
|
<artifactId>httpclient</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
|
||||||
<groupId>junit</groupId>
|
|
||||||
<artifactId>junit</artifactId>
|
|
||||||
</dependency>
|
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.commons</groupId>
|
<groupId>org.apache.commons</groupId>
|
||||||
<artifactId>commons-lang3</artifactId>
|
<artifactId>commons-lang3</artifactId>
|
||||||
|
@ -45,12 +45,6 @@
|
||||||
<artifactId>mockito-all</artifactId>
|
<artifactId>mockito-all</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
|
||||||
<groupId>org.slf4j</groupId>
|
|
||||||
<artifactId>slf4j-log4j12</artifactId>
|
|
||||||
<optional>true</optional>
|
|
||||||
</dependency>
|
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.commons</groupId>
|
<groupId>org.apache.commons</groupId>
|
||||||
<artifactId>commons-collections4</artifactId>
|
<artifactId>commons-collections4</artifactId>
|
||||||
|
|
|
@ -71,6 +71,7 @@ public class Page {
|
||||||
* Returns a {@link Page} with {@link #downloadSuccess} is {@code false},
|
* Returns a {@link Page} with {@link #downloadSuccess} is {@code false},
|
||||||
* and {@link #request} is specified.
|
* and {@link #request} is specified.
|
||||||
*
|
*
|
||||||
|
* @param request the {@link Request}.
|
||||||
* @return the page.
|
* @return the page.
|
||||||
* @since 0.10.0
|
* @since 0.10.0
|
||||||
*/
|
*/
|
||||||
|
|
|
@ -1,13 +1,14 @@
|
||||||
package us.codecraft.webmagic;
|
package us.codecraft.webmagic;
|
||||||
|
|
||||||
|
import java.io.Serializable;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
import us.codecraft.webmagic.downloader.Downloader;
|
import us.codecraft.webmagic.downloader.Downloader;
|
||||||
import us.codecraft.webmagic.model.HttpRequestBody;
|
import us.codecraft.webmagic.model.HttpRequestBody;
|
||||||
import us.codecraft.webmagic.utils.Experimental;
|
import us.codecraft.webmagic.utils.Experimental;
|
||||||
|
|
||||||
import java.io.Serializable;
|
|
||||||
import java.util.HashMap;
|
|
||||||
import java.util.Map;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Object contains url to crawl.<br>
|
* Object contains url to crawl.<br>
|
||||||
* It contains some additional information.<br>
|
* It contains some additional information.<br>
|
||||||
|
@ -35,7 +36,7 @@ public class Request implements Serializable {
|
||||||
/**
|
/**
|
||||||
* Store additional information in extras.
|
* Store additional information in extras.
|
||||||
*/
|
*/
|
||||||
private Map<String, Object> extras;
|
private Map<String, Object> extras = new HashMap<>();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* cookies for current url, if not set use Site's cookies
|
* cookies for current url, if not set use Site's cookies
|
||||||
|
@ -93,9 +94,6 @@ public class Request implements Serializable {
|
||||||
}
|
}
|
||||||
|
|
||||||
public <T> Request putExtra(String key, T value) {
|
public <T> Request putExtra(String key, T value) {
|
||||||
if (extras == null) {
|
|
||||||
extras = new HashMap<String, Object>();
|
|
||||||
}
|
|
||||||
extras.put(key, value);
|
extras.put(key, value);
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
@ -105,11 +103,11 @@ public class Request implements Serializable {
|
||||||
}
|
}
|
||||||
|
|
||||||
public Map<String, Object> getExtras() {
|
public Map<String, Object> getExtras() {
|
||||||
return extras;
|
return Collections.unmodifiableMap(extras);
|
||||||
}
|
}
|
||||||
|
|
||||||
public Request setExtras(Map<String, Object> extras) {
|
public Request setExtras(Map<String, Object> extras) {
|
||||||
this.extras = extras;
|
this.extras.putAll(extras);
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -9,11 +9,8 @@ import java.util.Date;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.UUID;
|
import java.util.UUID;
|
||||||
import java.util.concurrent.ExecutorService;
|
import java.util.concurrent.ExecutorService;
|
||||||
import java.util.concurrent.TimeUnit;
|
|
||||||
import java.util.concurrent.atomic.AtomicInteger;
|
import java.util.concurrent.atomic.AtomicInteger;
|
||||||
import java.util.concurrent.atomic.AtomicLong;
|
import java.util.concurrent.atomic.AtomicLong;
|
||||||
import java.util.concurrent.locks.Condition;
|
|
||||||
import java.util.concurrent.locks.ReentrantLock;
|
|
||||||
import org.apache.commons.collections4.CollectionUtils;
|
import org.apache.commons.collections4.CollectionUtils;
|
||||||
import org.apache.commons.lang3.SerializationUtils;
|
import org.apache.commons.lang3.SerializationUtils;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
|
@ -76,7 +73,7 @@ public class Spider implements Runnable, Task {
|
||||||
|
|
||||||
protected String uuid;
|
protected String uuid;
|
||||||
|
|
||||||
protected Scheduler scheduler = new QueueScheduler();
|
protected SpiderScheduler scheduler;
|
||||||
|
|
||||||
protected Logger logger = LoggerFactory.getLogger(getClass());
|
protected Logger logger = LoggerFactory.getLogger(getClass());
|
||||||
|
|
||||||
|
@ -88,7 +85,7 @@ public class Spider implements Runnable, Task {
|
||||||
|
|
||||||
protected AtomicInteger stat = new AtomicInteger(STAT_INIT);
|
protected AtomicInteger stat = new AtomicInteger(STAT_INIT);
|
||||||
|
|
||||||
protected boolean exitWhenComplete = true;
|
protected volatile boolean exitWhenComplete = true;
|
||||||
|
|
||||||
protected final static int STAT_INIT = 0;
|
protected final static int STAT_INIT = 0;
|
||||||
|
|
||||||
|
@ -100,10 +97,6 @@ public class Spider implements Runnable, Task {
|
||||||
|
|
||||||
protected boolean destroyWhenExit = true;
|
protected boolean destroyWhenExit = true;
|
||||||
|
|
||||||
private ReentrantLock newUrlLock = new ReentrantLock();
|
|
||||||
|
|
||||||
private Condition newUrlCondition = newUrlLock.newCondition();
|
|
||||||
|
|
||||||
private List<SpiderListener> spiderListeners;
|
private List<SpiderListener> spiderListeners;
|
||||||
|
|
||||||
private final AtomicLong pageCount = new AtomicLong(0);
|
private final AtomicLong pageCount = new AtomicLong(0);
|
||||||
|
@ -131,6 +124,7 @@ public class Spider implements Runnable, Task {
|
||||||
public Spider(PageProcessor pageProcessor) {
|
public Spider(PageProcessor pageProcessor) {
|
||||||
this.pageProcessor = pageProcessor;
|
this.pageProcessor = pageProcessor;
|
||||||
this.site = pageProcessor.getSite();
|
this.site = pageProcessor.getSite();
|
||||||
|
this.scheduler = new SpiderScheduler(new QueueScheduler());
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -186,15 +180,15 @@ public class Spider implements Runnable, Task {
|
||||||
/**
|
/**
|
||||||
* set scheduler for Spider
|
* set scheduler for Spider
|
||||||
*
|
*
|
||||||
* @param scheduler scheduler
|
* @param updateScheduler scheduler
|
||||||
* @return this
|
* @return this
|
||||||
* @see Scheduler
|
* @see Scheduler
|
||||||
* @since 0.2.1
|
* @since 0.2.1
|
||||||
*/
|
*/
|
||||||
public Spider setScheduler(Scheduler scheduler) {
|
public Spider setScheduler(Scheduler updateScheduler) {
|
||||||
checkIfRunning();
|
checkIfRunning();
|
||||||
Scheduler oldScheduler = this.scheduler;
|
SpiderScheduler oldScheduler = this.scheduler;
|
||||||
this.scheduler = scheduler;
|
scheduler.setScheduler(updateScheduler);
|
||||||
if (oldScheduler != null) {
|
if (oldScheduler != null) {
|
||||||
Request request;
|
Request request;
|
||||||
while ((request = oldScheduler.poll(this)) != null) {
|
while ((request = oldScheduler.poll(this)) != null) {
|
||||||
|
@ -333,7 +327,7 @@ public class Spider implements Runnable, Task {
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// wait until new url added,
|
// wait until new url added,
|
||||||
if (waitNewUrl()) {
|
if (scheduler.waitNewUrl(threadPool, emptySleepTime)) {
|
||||||
// if interrupted
|
// if interrupted
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
@ -353,7 +347,7 @@ public class Spider implements Runnable, Task {
|
||||||
logger.error("process request " + request + " error", e);
|
logger.error("process request " + request + " error", e);
|
||||||
} finally {
|
} finally {
|
||||||
pageCount.incrementAndGet();
|
pageCount.incrementAndGet();
|
||||||
signalNewUrl();
|
scheduler.signalNewUrl();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
@ -536,7 +530,7 @@ public class Spider implements Runnable, Task {
|
||||||
for (String url : urls) {
|
for (String url : urls) {
|
||||||
addRequest(new Request(url));
|
addRequest(new Request(url));
|
||||||
}
|
}
|
||||||
signalNewUrl();
|
scheduler.signalNewUrl();
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -588,42 +582,10 @@ public class Spider implements Runnable, Task {
|
||||||
for (Request request : requests) {
|
for (Request request : requests) {
|
||||||
addRequest(request);
|
addRequest(request);
|
||||||
}
|
}
|
||||||
signalNewUrl();
|
scheduler.signalNewUrl();
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
*
|
|
||||||
* @return isInterrupted
|
|
||||||
*/
|
|
||||||
private boolean waitNewUrl() {
|
|
||||||
// now there may not be any thread live
|
|
||||||
newUrlLock.lock();
|
|
||||||
try {
|
|
||||||
//double check,unnecessary, unless very fast concurrent
|
|
||||||
if (threadPool.getThreadAlive() == 0) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
//wait for amount of time
|
|
||||||
newUrlCondition.await(emptySleepTime, TimeUnit.MILLISECONDS);
|
|
||||||
return false;
|
|
||||||
} catch (InterruptedException e) {
|
|
||||||
// logger.warn("waitNewUrl - interrupted, error {}", e);
|
|
||||||
return true;
|
|
||||||
} finally {
|
|
||||||
newUrlLock.unlock();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private void signalNewUrl() {
|
|
||||||
try {
|
|
||||||
newUrlLock.lock();
|
|
||||||
newUrlCondition.signalAll();
|
|
||||||
} finally {
|
|
||||||
newUrlLock.unlock();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public void start() {
|
public void start() {
|
||||||
runAsync();
|
runAsync();
|
||||||
}
|
}
|
||||||
|
@ -636,6 +598,13 @@ public class Spider implements Runnable, Task {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Stop when all tasks in the queue are completed and all worker threads are also completed
|
||||||
|
*/
|
||||||
|
public void stopWhenComplete(){
|
||||||
|
this.exitWhenComplete = true;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* start with more than one threads
|
* start with more than one threads
|
||||||
*
|
*
|
||||||
|
@ -799,7 +768,7 @@ public class Spider implements Runnable, Task {
|
||||||
}
|
}
|
||||||
|
|
||||||
public Scheduler getScheduler() {
|
public Scheduler getScheduler() {
|
||||||
return scheduler;
|
return scheduler.getScheduler();
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -0,0 +1,59 @@
|
||||||
|
package us.codecraft.webmagic;
|
||||||
|
|
||||||
|
import java.util.concurrent.TimeUnit;
|
||||||
|
import java.util.concurrent.locks.Condition;
|
||||||
|
import java.util.concurrent.locks.ReentrantLock;
|
||||||
|
|
||||||
|
import us.codecraft.webmagic.scheduler.Scheduler;
|
||||||
|
import us.codecraft.webmagic.thread.CountableThreadPool;
|
||||||
|
|
||||||
|
public class SpiderScheduler {
|
||||||
|
private Scheduler scheduler;
|
||||||
|
private final ReentrantLock newUrlLock = new ReentrantLock();
|
||||||
|
private final Condition newUrlCondition = newUrlLock.newCondition();
|
||||||
|
|
||||||
|
public SpiderScheduler(Scheduler scheduler) {
|
||||||
|
this.scheduler = scheduler;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Scheduler getScheduler() {
|
||||||
|
return scheduler;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setScheduler(Scheduler scheduler) {
|
||||||
|
this.scheduler = scheduler;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Request poll(Spider spider) {
|
||||||
|
return scheduler.poll(spider);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void push(Request request, Spider spider) {
|
||||||
|
scheduler.push(request, spider);
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean waitNewUrl(CountableThreadPool threadPool, long emptySleepTime) {
|
||||||
|
newUrlLock.lock();
|
||||||
|
try {
|
||||||
|
if (threadPool.getThreadAlive() == 0) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
newUrlCondition.await(emptySleepTime, TimeUnit.MILLISECONDS);
|
||||||
|
return false;
|
||||||
|
} catch (InterruptedException e) {
|
||||||
|
return true;
|
||||||
|
} finally {
|
||||||
|
newUrlLock.unlock();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void signalNewUrl() {
|
||||||
|
try {
|
||||||
|
newUrlLock.lock();
|
||||||
|
newUrlCondition.signalAll();
|
||||||
|
} finally {
|
||||||
|
newUrlLock.unlock();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -42,7 +42,9 @@ public class HttpUriRequestConverter {
|
||||||
HttpClientContext httpContext = new HttpClientContext();
|
HttpClientContext httpContext = new HttpClientContext();
|
||||||
if (proxy != null && proxy.getUsername() != null) {
|
if (proxy != null && proxy.getUsername() != null) {
|
||||||
AuthState authState = new AuthState();
|
AuthState authState = new AuthState();
|
||||||
authState.update(new BasicScheme(ChallengeState.PROXY), new UsernamePasswordCredentials(proxy.getUsername(), proxy.getPassword()));
|
BasicScheme proxyAuthScheme = new BasicScheme(ChallengeState.PROXY);
|
||||||
|
UsernamePasswordCredentials proxyCredentials = new UsernamePasswordCredentials(proxy.getUsername(), proxy.getPassword());
|
||||||
|
authState.update(proxyAuthScheme, proxyCredentials);
|
||||||
httpContext.setAttribute(HttpClientContext.PROXY_AUTH_STATE, authState);
|
httpContext.setAttribute(HttpClientContext.PROXY_AUTH_STATE, authState);
|
||||||
}
|
}
|
||||||
if (request.getCookies() != null && !request.getCookies().isEmpty()) {
|
if (request.getCookies() != null && !request.getCookies().isEmpty()) {
|
||||||
|
|
|
@ -26,7 +26,6 @@ public class HtmlNode extends AbstractSelectable {
|
||||||
return elements;
|
return elements;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
|
||||||
public Selectable smartContent() {
|
public Selectable smartContent() {
|
||||||
SmartContentSelector smartContentSelector = Selectors.smartContent();
|
SmartContentSelector smartContentSelector = Selectors.smartContent();
|
||||||
return select(smartContentSelector, getSourceTexts());
|
return select(smartContentSelector, getSourceTexts());
|
||||||
|
|
|
@ -42,11 +42,6 @@ public class PlainText extends AbstractSelectable {
|
||||||
throw new UnsupportedOperationException("$ can not apply to plain text. Please check whether you use a previous xpath with attribute select (/@href etc).");
|
throw new UnsupportedOperationException("$ can not apply to plain text. Please check whether you use a previous xpath with attribute select (/@href etc).");
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
|
||||||
public Selectable smartContent() {
|
|
||||||
throw new UnsupportedOperationException("Smart content can not apply to plain text. Please check whether you use a previous xpath with attribute select (/@href etc).");
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Selectable links() {
|
public Selectable links() {
|
||||||
throw new UnsupportedOperationException("Links can not apply to plain text. Please check whether you use a previous xpath with attribute select (/@href etc).");
|
throw new UnsupportedOperationException("Links can not apply to plain text. Please check whether you use a previous xpath with attribute select (/@href etc).");
|
||||||
|
|
|
@ -51,14 +51,6 @@ public interface Selectable {
|
||||||
* @return new Selectable after extract
|
* @return new Selectable after extract
|
||||||
*/
|
*/
|
||||||
public Selectable css(String selector, String attrName);
|
public Selectable css(String selector, String attrName);
|
||||||
|
|
||||||
/**
|
|
||||||
* select smart content with ReadAbility algorithm
|
|
||||||
*
|
|
||||||
* @return content
|
|
||||||
*/
|
|
||||||
public Selectable smartContent();
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* select all links
|
* select all links
|
||||||
*
|
*
|
||||||
|
|
|
@ -21,6 +21,10 @@ public abstract class CharsetUtils {
|
||||||
|
|
||||||
private static Logger logger = LoggerFactory.getLogger(CharsetUtils.class);
|
private static Logger logger = LoggerFactory.getLogger(CharsetUtils.class);
|
||||||
|
|
||||||
|
private CharsetUtils() {
|
||||||
|
throw new AssertionError("No us.codecraft.webmagic.utils.CharsetUtils instances for you!");
|
||||||
|
}
|
||||||
|
|
||||||
public static String detectCharset(String contentType, byte[] contentBytes) throws IOException {
|
public static String detectCharset(String contentType, byte[] contentBytes) throws IOException {
|
||||||
String charset;
|
String charset;
|
||||||
// charset
|
// charset
|
||||||
|
|
|
@ -116,6 +116,10 @@ public class UrlUtils {
|
||||||
private static final Pattern patternForCharset = Pattern.compile("charset\\s*=\\s*['\"]*([^\\s;'\"]*)", Pattern.CASE_INSENSITIVE);
|
private static final Pattern patternForCharset = Pattern.compile("charset\\s*=\\s*['\"]*([^\\s;'\"]*)", Pattern.CASE_INSENSITIVE);
|
||||||
|
|
||||||
public static String getCharset(String contentType) {
|
public static String getCharset(String contentType) {
|
||||||
|
if (contentType == null) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
Matcher matcher = patternForCharset.matcher(contentType);
|
Matcher matcher = patternForCharset.matcher(contentType);
|
||||||
if (matcher.find()) {
|
if (matcher.find()) {
|
||||||
String charset = matcher.group(1);
|
String charset = matcher.group(1);
|
||||||
|
|
|
@ -1,21 +0,0 @@
|
||||||
<?xml version="1.0" encoding="UTF-8"?>
|
|
||||||
<!DOCTYPE log4j:configuration SYSTEM "log4j.dtd">
|
|
||||||
<log4j:configuration xmlns:log4j="http://jakarta.apache.org/log4j/">
|
|
||||||
|
|
||||||
<appender name="stdout" class="org.apache.log4j.ConsoleAppender">
|
|
||||||
<layout class="org.apache.log4j.PatternLayout">
|
|
||||||
<param name="ConversionPattern" value="%d{yy-MM-dd HH:mm:ss,SSS} %-5p %c(%F:%L) ## %m%n" />
|
|
||||||
</layout>
|
|
||||||
</appender>
|
|
||||||
|
|
||||||
<logger name="org.apache" additivity="false">
|
|
||||||
<level value="warn" />
|
|
||||||
<appender-ref ref="stdout" />
|
|
||||||
</logger>
|
|
||||||
|
|
||||||
<root>
|
|
||||||
<level value="info" />
|
|
||||||
<appender-ref ref="stdout" />
|
|
||||||
</root>
|
|
||||||
|
|
||||||
</log4j:configuration>
|
|
|
@ -1,10 +1,14 @@
|
||||||
package us.codecraft.webmagic;
|
package us.codecraft.webmagic;
|
||||||
|
|
||||||
import org.junit.Test;
|
|
||||||
import us.codecraft.webmagic.utils.HttpConstant;
|
|
||||||
|
|
||||||
import static org.assertj.core.api.Assertions.assertThat;
|
import static org.assertj.core.api.Assertions.assertThat;
|
||||||
|
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
|
import us.codecraft.webmagic.utils.HttpConstant;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @author code4crafter@gmail.com
|
* @author code4crafter@gmail.com
|
||||||
* Date: 17/3/11
|
* Date: 17/3/11
|
||||||
|
@ -22,4 +26,28 @@ public class RequestTest {
|
||||||
assertThat(requestA).isNotEqualTo(requestB);
|
assertThat(requestA).isNotEqualTo(requestB);
|
||||||
assertThat(requestA.hashCode()).isNotEqualTo(requestB.hashCode());
|
assertThat(requestA.hashCode()).isNotEqualTo(requestB.hashCode());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testSetExtras() {
|
||||||
|
Request request = new Request();
|
||||||
|
Map<String, Object> extras = Collections.singletonMap("a", "1");
|
||||||
|
request.setExtras(extras);
|
||||||
|
request.putExtra("b", "2");
|
||||||
|
assertThat(request.<String>getExtra("a")).isEqualTo("1");
|
||||||
|
assertThat(request.<String>getExtra("b")).isEqualTo("2");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testGetExtras() {
|
||||||
|
Request request = new Request();
|
||||||
|
request.putExtra("a", "1");
|
||||||
|
assertThat(request.getExtras()).containsEntry("a", "1");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test(expected = UnsupportedOperationException.class)
|
||||||
|
public void testGetExtrasShouldBeUnmodifiable() {
|
||||||
|
Request request = new Request();
|
||||||
|
request.getExtras().put("a", "1");
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,8 +1,12 @@
|
||||||
package us.codecraft.webmagic;
|
package us.codecraft.webmagic;
|
||||||
|
|
||||||
import static org.junit.Assert.assertEquals;
|
import static org.junit.Assert.assertEquals;
|
||||||
|
import static org.junit.Assert.assertTrue;
|
||||||
|
|
||||||
import java.nio.charset.StandardCharsets;
|
import java.nio.charset.StandardCharsets;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
|
||||||
|
@ -14,4 +18,23 @@ public class SiteTest {
|
||||||
assertEquals(StandardCharsets.UTF_8.name(), site.getDefaultCharset());
|
assertEquals(StandardCharsets.UTF_8.name(), site.getDefaultCharset());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void addCookieTest(){
|
||||||
|
Site site=Site.me().setDefaultCharset(StandardCharsets.UTF_8.name());
|
||||||
|
site.addCookie("cookieDefault","cookie-webmagicDefault");
|
||||||
|
String firstDomain="example.com";
|
||||||
|
String secondDomain="exampleCopy.com";
|
||||||
|
site.addCookie(firstDomain, "cookie", "cookie-webmagic");
|
||||||
|
site.addCookie(firstDomain, "cookieCopy", "cookie-webmagicCopy");
|
||||||
|
site.addCookie(secondDomain, "cookie", "cookie-webmagic");
|
||||||
|
Map<String, Map<String, String>> allCookies = site.getAllCookies();
|
||||||
|
List<String> domains=new ArrayList<>();
|
||||||
|
for(String key : allCookies.keySet()){
|
||||||
|
domains.add(key);
|
||||||
|
}
|
||||||
|
assertEquals("cookie-webmagic", allCookies.get(firstDomain).get("cookie"));
|
||||||
|
assertEquals("cookie-webmagicCopy", allCookies.get(firstDomain).get("cookieCopy"));
|
||||||
|
assertEquals("cookie-webmagic", allCookies.get(secondDomain).get("cookie"));
|
||||||
|
assertEquals(2, domains.size());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -40,6 +40,7 @@ import static com.github.dreamhead.moco.Moco.uri;
|
||||||
import static com.github.dreamhead.moco.Moco.with;
|
import static com.github.dreamhead.moco.Moco.with;
|
||||||
import static org.assertj.core.api.Assertions.assertThat;
|
import static org.assertj.core.api.Assertions.assertThat;
|
||||||
import static org.junit.Assert.assertEquals;
|
import static org.junit.Assert.assertEquals;
|
||||||
|
import static org.junit.Assert.assertThrows;
|
||||||
import static org.junit.Assert.assertTrue;
|
import static org.junit.Assert.assertTrue;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -333,5 +334,13 @@ public class HttpClientDownloaderTest {
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void test_no_task_download(){
|
||||||
|
Request request = new Request();
|
||||||
|
request.setUrl("http://127.0.0.1:13423/");
|
||||||
|
HttpClientDownloader httpClientDownloader = new HttpClientDownloader();
|
||||||
|
assertThrows(NullPointerException.class, () -> httpClientDownloader.download(request,null));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -8,19 +8,19 @@ import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
import org.apache.http.HttpHost;
|
import org.apache.http.HttpHost;
|
||||||
import org.junit.BeforeClass;
|
import org.junit.jupiter.api.BeforeAll;
|
||||||
import org.junit.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @author yxssfxwzy@sina.com May 30, 2014
|
* @author yxssfxwzy@sina.com May 30, 2014
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
public class ProxyTest {
|
class ProxyTest {
|
||||||
|
|
||||||
private static List<String[]> httpProxyList = new ArrayList<String[]>();
|
private static List<String[]> httpProxyList = new ArrayList<String[]>();
|
||||||
|
|
||||||
@BeforeClass
|
@BeforeAll
|
||||||
public static void before() {
|
static void before() {
|
||||||
// String[] source = { "0.0.0.1:0", "0.0.0.2:0", "0.0.0.3:0",
|
// String[] source = { "0.0.0.1:0", "0.0.0.2:0", "0.0.0.3:0",
|
||||||
// "0.0.0.4:0" };
|
// "0.0.0.4:0" };
|
||||||
String[] source = { "::0.0.0.1:0", "::0.0.0.2:0", "::0.0.0.3:0", "::0.0.0.4:0" };
|
String[] source = { "::0.0.0.1:0", "::0.0.0.2:0", "::0.0.0.3:0", "::0.0.0.4:0" };
|
||||||
|
@ -48,7 +48,7 @@ public class ProxyTest {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testCreate() {
|
void testCreate() {
|
||||||
Proxy proxy = Proxy.create(URI.create("//127.0.0.1:8080"));
|
Proxy proxy = Proxy.create(URI.create("//127.0.0.1:8080"));
|
||||||
assertNull(proxy.getScheme());
|
assertNull(proxy.getScheme());
|
||||||
assertNull(proxy.getUsername());
|
assertNull(proxy.getUsername());
|
||||||
|
@ -86,7 +86,15 @@ public class ProxyTest {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testToString() {
|
void testEqualsHashCode() {
|
||||||
|
var proxy0 = new Proxy("::1", 1080);
|
||||||
|
var proxy1 = new Proxy("::1", 1080);
|
||||||
|
assertEquals(proxy0, proxy1);
|
||||||
|
assertEquals(proxy0.hashCode(), proxy1.hashCode());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testToString() {
|
||||||
assertEquals("//127.0.0.1:8080", new Proxy("127.0.0.1", 8080).toString());
|
assertEquals("//127.0.0.1:8080", new Proxy("127.0.0.1", 8080).toString());
|
||||||
assertEquals("http://127.0.0.1:8080", new Proxy("127.0.0.1", 8080, "http").toString());
|
assertEquals("http://127.0.0.1:8080", new Proxy("127.0.0.1", 8080, "http").toString());
|
||||||
assertEquals("//username:password@127.0.0.1:8080", new Proxy("127.0.0.1", 8080, "username", "password").toString());
|
assertEquals("//username:password@127.0.0.1:8080", new Proxy("127.0.0.1", 8080, "username", "password").toString());
|
||||||
|
|
|
@ -0,0 +1,59 @@
|
||||||
|
package us.codecraft.webmagic.selector;
|
||||||
|
|
||||||
|
import static org.junit.Assert.assertEquals;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
|
public class AndSelectorTest {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testSelectList() {
|
||||||
|
String htmlContent = "<!DOCTYPE html>\n" +
|
||||||
|
"<html lang=\"en\">\n" +
|
||||||
|
"<head>\n" +
|
||||||
|
" <meta charset=\"UTF-8\">\n" +
|
||||||
|
" <meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\">\n" +
|
||||||
|
" <title>HTML with XPath</title>\n" +
|
||||||
|
"</head>\n" +
|
||||||
|
"<body>\n" +
|
||||||
|
" <div class=\"container\">\n" +
|
||||||
|
" <div class=\"item1\">Item 1</div>\n" +
|
||||||
|
" <div class=\"item2\">Item 2</div>\n" +
|
||||||
|
" </div>\n" +
|
||||||
|
"</body>\n" +
|
||||||
|
"</html>";
|
||||||
|
List<Selector> selectors = new ArrayList<Selector>();
|
||||||
|
selectors.add(new CssSelector("div"));
|
||||||
|
selectors.add(new XpathSelector("//div[@class='item1']"));
|
||||||
|
AndSelector andSelector = new AndSelector(selectors);
|
||||||
|
List<String> result = andSelector.selectList(htmlContent);
|
||||||
|
assertEquals("<div class=\"item1\">\n Item 1\n</div>", result.get(0));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testSelectList_NoResults() {
|
||||||
|
String htmlContent = "<!DOCTYPE html>\n" +
|
||||||
|
"<html lang=\"en\">\n" +
|
||||||
|
"<head>\n" +
|
||||||
|
" <meta charset=\"UTF-8\">\n" +
|
||||||
|
" <meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\">\n" +
|
||||||
|
" <title>HTML with XPath</title>\n" +
|
||||||
|
"</head>\n" +
|
||||||
|
"<body>\n" +
|
||||||
|
" <div class=\"container\">\n" +
|
||||||
|
" <div class=\"item1\">Item 1</div>\n" +
|
||||||
|
" <div class=\"item2\">Item 2</div>\n" +
|
||||||
|
" </div>\n" +
|
||||||
|
"</body>\n" +
|
||||||
|
"</html>";
|
||||||
|
List<Selector> selectors = new ArrayList<Selector>();
|
||||||
|
selectors.add(new CssSelector("div"));
|
||||||
|
selectors.add(new XpathSelector("//div[@class='item']"));
|
||||||
|
AndSelector andSelector = new AndSelector(selectors);
|
||||||
|
List<String> result = andSelector.selectList(htmlContent);
|
||||||
|
assertEquals(0, result.size());
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,39 @@
|
||||||
|
package us.codecraft.webmagic.selector;
|
||||||
|
|
||||||
|
import org.jsoup.Jsoup;
|
||||||
|
import org.jsoup.nodes.Document;
|
||||||
|
import org.jsoup.nodes.Element;
|
||||||
|
import org.jsoup.select.Elements;
|
||||||
|
import org.junit.Test;
|
||||||
|
import org.junit.runner.RunWith;
|
||||||
|
import org.mockito.Mock;
|
||||||
|
import org.mockito.Mockito;
|
||||||
|
import org.mockito.runners.MockitoJUnitRunner;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
import static org.junit.Assert.*;
|
||||||
|
|
||||||
|
public class CssSelectorTest {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testSelectElement() {
|
||||||
|
CssSelector cssSelector = new CssSelector("div");
|
||||||
|
String htmlContent = "<html><head><title>Dummy Page</title></head><body><div id=\"dummyDiv\">Hello World!</div></body></html>";
|
||||||
|
Document doc = Jsoup.parse(htmlContent);
|
||||||
|
Element dummyElement = doc.getElementById("dummyDiv");
|
||||||
|
Element resultElement = cssSelector.selectElement(dummyElement);
|
||||||
|
assertNotNull(resultElement);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testSelectList() {
|
||||||
|
CssSelector cssSelector = new CssSelector("div");
|
||||||
|
String htmlContent = "<html><head><title>Dummy Page</title></head><body><div id=\"dummyDiv\">Hello World!</div></body></html>";
|
||||||
|
Document doc = Jsoup.parse(htmlContent);
|
||||||
|
Element dummyElement = doc.getElementById("dummyDiv");
|
||||||
|
List<String> result = cssSelector.selectList(dummyElement);
|
||||||
|
assertEquals(1, result.size());
|
||||||
|
assertEquals("[<div id=\"dummyDiv\">\n Hello World!\n</div>]", result.toString());
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,44 @@
|
||||||
|
package us.codecraft.webmagic.selector;
|
||||||
|
|
||||||
|
import static org.junit.Assert.assertEquals;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
|
public class OrSelectorTest {
|
||||||
|
@Test
|
||||||
|
public void testSelectList() {
|
||||||
|
String htmlContent = "<!DOCTYPE html>\n" +
|
||||||
|
"<html lang=\"en\">\n" +
|
||||||
|
"<head>\n" +
|
||||||
|
" <meta charset=\"UTF-8\">\n" +
|
||||||
|
" <meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\">\n" +
|
||||||
|
" <title>HTML with XPath</title>\n" +
|
||||||
|
"</head>\n" +
|
||||||
|
"<body>\n" +
|
||||||
|
" <div class=\"container\">\n" +
|
||||||
|
" <div class=\"item1\">Item 1</div>\n" +
|
||||||
|
" <div class=\"item2\">Item 2</div>\n" +
|
||||||
|
" </div>\n" +
|
||||||
|
"</body>\n" +
|
||||||
|
"</html>";
|
||||||
|
String expectedResult = "[<head>\n" +
|
||||||
|
" <meta charset=\"UTF-8\">\n" +
|
||||||
|
" <meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\">\n" +
|
||||||
|
" <title>HTML with XPath</title>\n" +
|
||||||
|
"</head>, <div class=\"item1\">\n" +
|
||||||
|
" Item 1\n" +
|
||||||
|
"</div>, <div class=\"item2\">\n" +
|
||||||
|
" Item 2\n" +
|
||||||
|
"</div>]";
|
||||||
|
List<Selector> selectors = new ArrayList<Selector>();
|
||||||
|
selectors.add(new CssSelector("head"));
|
||||||
|
selectors.add(new XpathSelector("//div[@class='item1']"));
|
||||||
|
selectors.add(new XpathSelector("//div[@class='item2']"));
|
||||||
|
OrSelector orSelector = new OrSelector(selectors);
|
||||||
|
List<String> result = orSelector.selectList(htmlContent);
|
||||||
|
assertEquals(expectedResult, result.toString());
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,16 @@
|
||||||
|
package us.codecraft.webmagic.utils;
|
||||||
|
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertNull;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
class CharsetUtilsTest {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testDetectCharset() throws IOException {
|
||||||
|
assertNull(CharsetUtils.detectCharset(null, new byte[0]));
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -1,5 +1,7 @@
|
||||||
package us.codecraft.webmagic.utils;
|
package us.codecraft.webmagic.utils;
|
||||||
|
|
||||||
|
import static org.junit.Assert.assertNull;
|
||||||
|
|
||||||
import org.junit.Assert;
|
import org.junit.Assert;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
|
||||||
|
@ -43,5 +45,9 @@ public class UrlUtilsTest {
|
||||||
Assert.assertEquals("www.dianping.com",UrlUtils.getDomain(url));
|
Assert.assertEquals("www.dianping.com",UrlUtils.getDomain(url));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testGetCharset() {
|
||||||
|
assertNull(UrlUtils.getCharset(null));
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,21 +0,0 @@
|
||||||
<?xml version="1.0" encoding="UTF-8"?>
|
|
||||||
<!DOCTYPE log4j:configuration SYSTEM "log4j.dtd">
|
|
||||||
<log4j:configuration xmlns:log4j="http://jakarta.apache.org/log4j/">
|
|
||||||
|
|
||||||
<appender name="stdout" class="org.apache.log4j.ConsoleAppender">
|
|
||||||
<layout class="org.apache.log4j.PatternLayout">
|
|
||||||
<param name="ConversionPattern" value="%d{yy-MM-dd HH:mm:ss,SSS} %-5p %c(%F:%L) ## %m%n" />
|
|
||||||
</layout>
|
|
||||||
</appender>
|
|
||||||
|
|
||||||
<logger name="org.apache" additivity="false">
|
|
||||||
<level value="warn" />
|
|
||||||
<appender-ref ref="stdout" />
|
|
||||||
</logger>
|
|
||||||
|
|
||||||
<root>
|
|
||||||
<level value="info" />
|
|
||||||
<appender-ref ref="stdout" />
|
|
||||||
</root>
|
|
||||||
|
|
||||||
</log4j:configuration>
|
|
|
@ -0,0 +1,16 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<Configuration>
|
||||||
|
<Appenders>
|
||||||
|
<Console name="stdout" target="SYSTEM_OUT">
|
||||||
|
<PatternLayout pattern="%d{yy-MM-dd HH:mm:ss,SSS} %-5p %c(%F:%L) ## %m%n" />
|
||||||
|
</Console>
|
||||||
|
</Appenders>
|
||||||
|
<Loggers>
|
||||||
|
<Logger name="org.apache" level="warn" additivity="false">
|
||||||
|
<AppenderRef ref="stdout" />
|
||||||
|
</Logger>
|
||||||
|
<Root level="info">
|
||||||
|
<AppenderRef ref="stdout" />
|
||||||
|
</Root>
|
||||||
|
</Loggers>
|
||||||
|
</Configuration>
|
|
@ -1,14 +1,16 @@
|
||||||
<?xml version="1.0" encoding="UTF-8"?>
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
<project
|
||||||
|
xmlns="http://maven.apache.org/POM/4.0.0"
|
||||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
|
xsi:schemaLocation="
|
||||||
|
http://maven.apache.org/POM/4.0.0
|
||||||
http://maven.apache.org/maven-v4_0_0.xsd">
|
http://maven.apache.org/maven-v4_0_0.xsd">
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
<parent>
|
<parent>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<artifactId>webmagic-parent</artifactId>
|
<artifactId>webmagic</artifactId>
|
||||||
<version>0.10.3</version>
|
<version>1.0.0</version>
|
||||||
</parent>
|
</parent>
|
||||||
|
|
||||||
<artifactId>webmagic-coverage</artifactId>
|
<artifactId>webmagic-coverage</artifactId>
|
||||||
|
|
|
@ -1,15 +1,26 @@
|
||||||
<?xml version="1.0" encoding="UTF-8"?>
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
<project
|
||||||
|
xmlns="http://maven.apache.org/POM/4.0.0"
|
||||||
|
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||||
|
xsi:schemaLocation="
|
||||||
|
http://maven.apache.org/POM/4.0.0
|
||||||
|
http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||||
<parent>
|
<parent>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<artifactId>webmagic-parent</artifactId>
|
<artifactId>webmagic</artifactId>
|
||||||
<version>0.10.3</version>
|
<version>1.0.0</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
<artifactId>webmagic-extension</artifactId>
|
<artifactId>webmagic-extension</artifactId>
|
||||||
|
|
||||||
<dependencies>
|
<dependencies>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.projectlombok</groupId>
|
||||||
|
<artifactId>lombok</artifactId>
|
||||||
|
<version>1.18.32</version>
|
||||||
|
<scope>provided</scope>
|
||||||
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>redis.clients</groupId>
|
<groupId>redis.clients</groupId>
|
||||||
<artifactId>jedis</artifactId>
|
<artifactId>jedis</artifactId>
|
||||||
|
@ -29,10 +40,6 @@
|
||||||
<artifactId>webmagic-core</artifactId>
|
<artifactId>webmagic-core</artifactId>
|
||||||
<version>${project.version}</version>
|
<version>${project.version}</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
|
||||||
<groupId>junit</groupId>
|
|
||||||
<artifactId>junit</artifactId>
|
|
||||||
</dependency>
|
|
||||||
</dependencies>
|
</dependencies>
|
||||||
|
|
||||||
</project>
|
</project>
|
||||||
|
|
|
@ -1,5 +1,9 @@
|
||||||
package us.codecraft.webmagic.model;
|
package us.codecraft.webmagic.model;
|
||||||
|
|
||||||
|
import lombok.Getter;
|
||||||
|
import lombok.Setter;
|
||||||
|
|
||||||
|
import us.codecraft.webmagic.model.sources.Source;
|
||||||
import us.codecraft.webmagic.selector.Selector;
|
import us.codecraft.webmagic.selector.Selector;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -7,18 +11,18 @@ import us.codecraft.webmagic.selector.Selector;
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
* @since 0.2.0
|
* @since 0.2.0
|
||||||
*/
|
*/
|
||||||
class Extractor {
|
public class Extractor {
|
||||||
|
|
||||||
|
@Getter @Setter
|
||||||
protected Selector selector;
|
protected Selector selector;
|
||||||
|
|
||||||
|
@Getter
|
||||||
protected final Source source;
|
protected final Source source;
|
||||||
|
|
||||||
protected final boolean notNull;
|
protected final boolean notNull;
|
||||||
|
|
||||||
protected final boolean multi;
|
protected final boolean multi;
|
||||||
|
|
||||||
static enum Source {Html, Url, RawHtml, RawText}
|
|
||||||
|
|
||||||
public Extractor(Selector selector, Source source, boolean notNull, boolean multi) {
|
public Extractor(Selector selector, Source source, boolean notNull, boolean multi) {
|
||||||
this.selector = selector;
|
this.selector = selector;
|
||||||
this.source = source;
|
this.source = source;
|
||||||
|
@ -26,23 +30,11 @@ class Extractor {
|
||||||
this.multi = multi;
|
this.multi = multi;
|
||||||
}
|
}
|
||||||
|
|
||||||
Selector getSelector() {
|
public boolean isNotNull() {
|
||||||
return selector;
|
|
||||||
}
|
|
||||||
|
|
||||||
Source getSource() {
|
|
||||||
return source;
|
|
||||||
}
|
|
||||||
|
|
||||||
boolean isNotNull() {
|
|
||||||
return notNull;
|
return notNull;
|
||||||
}
|
}
|
||||||
|
|
||||||
boolean isMulti() {
|
public boolean isMulti() {
|
||||||
return multi;
|
return multi;
|
||||||
}
|
}
|
||||||
|
|
||||||
void setSelector(Selector selector) {
|
|
||||||
this.selector = selector;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,58 +1,33 @@
|
||||||
package us.codecraft.webmagic.model;
|
package us.codecraft.webmagic.model;
|
||||||
|
|
||||||
import us.codecraft.webmagic.model.formatter.ObjectFormatter;
|
import us.codecraft.webmagic.model.formatter.ObjectFormatter;
|
||||||
|
import us.codecraft.webmagic.model.sources.Source;
|
||||||
import us.codecraft.webmagic.selector.Selector;
|
import us.codecraft.webmagic.selector.Selector;
|
||||||
|
|
||||||
import java.lang.reflect.Field;
|
import java.lang.reflect.Field;
|
||||||
import java.lang.reflect.Method;
|
import java.lang.reflect.Method;
|
||||||
|
|
||||||
|
import lombok.Getter;
|
||||||
|
import lombok.Setter;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Wrapper of field and extractor.
|
* Wrapper of field and extractor.
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
* @since 0.2.0
|
* @since 0.2.0
|
||||||
*/
|
*/
|
||||||
class FieldExtractor extends Extractor {
|
public class FieldExtractor extends Extractor {
|
||||||
|
|
||||||
|
@Getter
|
||||||
private final Field field;
|
private final Field field;
|
||||||
|
|
||||||
|
@Getter @Setter
|
||||||
private Method setterMethod;
|
private Method setterMethod;
|
||||||
|
|
||||||
|
@Getter @Setter
|
||||||
private ObjectFormatter objectFormatter;
|
private ObjectFormatter objectFormatter;
|
||||||
|
|
||||||
public FieldExtractor(Field field, Selector selector, Source source, boolean notNull, boolean multi) {
|
public FieldExtractor(Field field, Selector selector, Source source, boolean notNull, boolean multi) {
|
||||||
super(selector, source, notNull, multi);
|
super(selector, source, notNull, multi);
|
||||||
this.field = field;
|
this.field = field;
|
||||||
}
|
}
|
||||||
|
|
||||||
Field getField() {
|
|
||||||
return field;
|
|
||||||
}
|
|
||||||
|
|
||||||
Selector getSelector() {
|
|
||||||
return selector;
|
|
||||||
}
|
|
||||||
|
|
||||||
Source getSource() {
|
|
||||||
return source;
|
|
||||||
}
|
|
||||||
|
|
||||||
void setSetterMethod(Method setterMethod) {
|
|
||||||
this.setterMethod = setterMethod;
|
|
||||||
}
|
|
||||||
|
|
||||||
Method getSetterMethod() {
|
|
||||||
return setterMethod;
|
|
||||||
}
|
|
||||||
|
|
||||||
boolean isNotNull() {
|
|
||||||
return notNull;
|
|
||||||
}
|
|
||||||
|
|
||||||
ObjectFormatter getObjectFormatter() {
|
|
||||||
return objectFormatter;
|
|
||||||
}
|
|
||||||
|
|
||||||
void setObjectFormatter(ObjectFormatter objectFormatter) {
|
|
||||||
this.objectFormatter = objectFormatter;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -3,17 +3,21 @@ package us.codecraft.webmagic.model;
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import lombok.Getter;
|
||||||
import us.codecraft.webmagic.Page;
|
import us.codecraft.webmagic.Page;
|
||||||
import us.codecraft.webmagic.model.annotation.*;
|
import us.codecraft.webmagic.model.annotation.*;
|
||||||
import us.codecraft.webmagic.model.formatter.ObjectFormatter;
|
import us.codecraft.webmagic.model.fields.PageField;
|
||||||
import us.codecraft.webmagic.model.formatter.ObjectFormatterBuilder;
|
import us.codecraft.webmagic.model.formatter.ObjectFormatterBuilder;
|
||||||
|
import us.codecraft.webmagic.model.sources.Source;
|
||||||
|
import us.codecraft.webmagic.model.sources.SourceTextExtractor;
|
||||||
|
import us.codecraft.webmagic.model.sources.Source.*;
|
||||||
import us.codecraft.webmagic.selector.*;
|
import us.codecraft.webmagic.selector.*;
|
||||||
import us.codecraft.webmagic.utils.ClassUtils;
|
import us.codecraft.webmagic.utils.ClassUtils;
|
||||||
import us.codecraft.webmagic.utils.ExtractorUtils;
|
import us.codecraft.webmagic.utils.ExtractorUtils;
|
||||||
|
|
||||||
import java.lang.annotation.Annotation;
|
import java.lang.annotation.Annotation;
|
||||||
import java.lang.reflect.Field;
|
import java.lang.reflect.Field;
|
||||||
import java.lang.reflect.InvocationTargetException;
|
|
||||||
import java.lang.reflect.Method;
|
import java.lang.reflect.Method;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
@ -29,14 +33,19 @@ import static us.codecraft.webmagic.model.annotation.ExtractBy.Source.RawText;
|
||||||
*/
|
*/
|
||||||
class PageModelExtractor {
|
class PageModelExtractor {
|
||||||
|
|
||||||
|
@Getter
|
||||||
private List<Pattern> targetUrlPatterns = new ArrayList<Pattern>();
|
private List<Pattern> targetUrlPatterns = new ArrayList<Pattern>();
|
||||||
|
|
||||||
|
@Getter
|
||||||
private Selector targetUrlRegionSelector;
|
private Selector targetUrlRegionSelector;
|
||||||
|
|
||||||
|
@Getter
|
||||||
private List<Pattern> helpUrlPatterns = new ArrayList<Pattern>();
|
private List<Pattern> helpUrlPatterns = new ArrayList<Pattern>();
|
||||||
|
|
||||||
|
@Getter
|
||||||
private Selector helpUrlRegionSelector;
|
private Selector helpUrlRegionSelector;
|
||||||
|
|
||||||
|
@Getter
|
||||||
private Class clazz;
|
private Class clazz;
|
||||||
|
|
||||||
private List<FieldExtractor> fieldExtractors;
|
private List<FieldExtractor> fieldExtractors;
|
||||||
|
@ -86,7 +95,7 @@ class PageModelExtractor {
|
||||||
regexPattern = ".*";
|
regexPattern = ".*";
|
||||||
}
|
}
|
||||||
fieldExtractor = new FieldExtractor(field,
|
fieldExtractor = new FieldExtractor(field,
|
||||||
new RegexSelector(regexPattern), FieldExtractor.Source.Url, extractByUrl.notNull(),
|
new RegexSelector(regexPattern), new Url(), extractByUrl.notNull(),
|
||||||
extractByUrl.multi() || List.class.isAssignableFrom(field.getType()));
|
extractByUrl.multi() || List.class.isAssignableFrom(field.getType()));
|
||||||
Method setterMethod = getSetterMethod(clazz, field);
|
Method setterMethod = getSetterMethod(clazz, field);
|
||||||
if (setterMethod != null) {
|
if (setterMethod != null) {
|
||||||
|
@ -112,7 +121,7 @@ class PageModelExtractor {
|
||||||
default:
|
default:
|
||||||
selector = new AndSelector(ExtractorUtils.getSelectors(extractBies));
|
selector = new AndSelector(ExtractorUtils.getSelectors(extractBies));
|
||||||
}
|
}
|
||||||
fieldExtractor = new FieldExtractor(field, selector, comboExtract.source() == ComboExtract.Source.RawHtml ? FieldExtractor.Source.RawHtml : FieldExtractor.Source.Html,
|
fieldExtractor = new FieldExtractor(field, selector, comboExtract.source() == ComboExtract.Source.RawHtml ? new RawHtml() : new SelectedHtml(),
|
||||||
comboExtract.notNull(), comboExtract.multi() || List.class.isAssignableFrom(field.getType()));
|
comboExtract.notNull(), comboExtract.multi() || List.class.isAssignableFrom(field.getType()));
|
||||||
Method setterMethod = getSetterMethod(clazz, field);
|
Method setterMethod = getSetterMethod(clazz, field);
|
||||||
if (setterMethod != null) {
|
if (setterMethod != null) {
|
||||||
|
@ -127,26 +136,23 @@ class PageModelExtractor {
|
||||||
ExtractBy extractBy = field.getAnnotation(ExtractBy.class);
|
ExtractBy extractBy = field.getAnnotation(ExtractBy.class);
|
||||||
if (extractBy != null) {
|
if (extractBy != null) {
|
||||||
Selector selector = ExtractorUtils.getSelector(extractBy);
|
Selector selector = ExtractorUtils.getSelector(extractBy);
|
||||||
ExtractBy.Source source0 = extractBy.source();
|
ExtractBy.Source extractSource = extractBy.source();
|
||||||
if (extractBy.type()== ExtractBy.Type.JsonPath){
|
if (extractBy.type()== ExtractBy.Type.JsonPath)
|
||||||
source0 = RawText;
|
extractSource = RawText;
|
||||||
}
|
Source source = null;
|
||||||
FieldExtractor.Source source = null;
|
switch (extractSource) {
|
||||||
switch (source0){
|
|
||||||
case RawText:
|
case RawText:
|
||||||
source = FieldExtractor.Source.RawText;
|
source = new RawText();
|
||||||
break;
|
break;
|
||||||
case RawHtml:
|
case RawHtml:
|
||||||
source = FieldExtractor.Source.RawHtml;
|
source = new RawHtml();
|
||||||
break;
|
break;
|
||||||
case SelectedHtml:
|
case SelectedHtml:
|
||||||
source =FieldExtractor.Source.Html;
|
source = new SelectedHtml();
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
source =FieldExtractor.Source.Html;
|
source = new SelectedHtml();
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fieldExtractor = new FieldExtractor(field, selector, source,
|
fieldExtractor = new FieldExtractor(field, selector, source,
|
||||||
extractBy.notNull(), List.class.isAssignableFrom(field.getType()));
|
extractBy.notNull(), List.class.isAssignableFrom(field.getType()));
|
||||||
fieldExtractor.setSetterMethod(getSetterMethod(clazz, field));
|
fieldExtractor.setSetterMethod(getSetterMethod(clazz, field));
|
||||||
|
@ -193,7 +199,7 @@ class PageModelExtractor {
|
||||||
annotation = clazz.getAnnotation(ExtractBy.class);
|
annotation = clazz.getAnnotation(ExtractBy.class);
|
||||||
if (annotation != null) {
|
if (annotation != null) {
|
||||||
ExtractBy extractBy = (ExtractBy) annotation;
|
ExtractBy extractBy = (ExtractBy) annotation;
|
||||||
objectExtractor = new Extractor(new XpathSelector(extractBy.value()), Extractor.Source.Html, extractBy.notNull(), extractBy.multi());
|
objectExtractor = new Extractor(new XpathSelector(extractBy.value()), new SelectedHtml(), extractBy.notNull(), extractBy.multi());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -233,135 +239,15 @@ class PageModelExtractor {
|
||||||
try {
|
try {
|
||||||
o = clazz.newInstance();
|
o = clazz.newInstance();
|
||||||
for (FieldExtractor fieldExtractor : fieldExtractors) {
|
for (FieldExtractor fieldExtractor : fieldExtractors) {
|
||||||
if (fieldExtractor.isMulti()) {
|
PageField field = SourceTextExtractor.getText(page, html, isRaw, fieldExtractor);
|
||||||
List<String> value;
|
if (!field.operation(o, fieldExtractor, logger))
|
||||||
switch (fieldExtractor.getSource()) {
|
|
||||||
case RawHtml:
|
|
||||||
value = page.getHtml().selectDocumentForList(fieldExtractor.getSelector());
|
|
||||||
break;
|
|
||||||
case Html:
|
|
||||||
if (isRaw) {
|
|
||||||
value = page.getHtml().selectDocumentForList(fieldExtractor.getSelector());
|
|
||||||
} else {
|
|
||||||
value = fieldExtractor.getSelector().selectList(html);
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
case Url:
|
|
||||||
value = fieldExtractor.getSelector().selectList(page.getUrl().toString());
|
|
||||||
break;
|
|
||||||
case RawText:
|
|
||||||
value = fieldExtractor.getSelector().selectList(page.getRawText());
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
value = fieldExtractor.getSelector().selectList(html);
|
|
||||||
}
|
|
||||||
if ((value == null || value.size() == 0) && fieldExtractor.isNotNull()) {
|
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
if (fieldExtractor.getObjectFormatter() != null) {
|
if (AfterExtractor.class.isAssignableFrom(clazz))
|
||||||
List<Object> converted = convert(value, fieldExtractor.getObjectFormatter());
|
|
||||||
setField(o, fieldExtractor, converted);
|
|
||||||
} else {
|
|
||||||
setField(o, fieldExtractor, value);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
String value;
|
|
||||||
switch (fieldExtractor.getSource()) {
|
|
||||||
case RawHtml:
|
|
||||||
value = page.getHtml().selectDocument(fieldExtractor.getSelector());
|
|
||||||
break;
|
|
||||||
case Html:
|
|
||||||
if (isRaw) {
|
|
||||||
value = page.getHtml().selectDocument(fieldExtractor.getSelector());
|
|
||||||
} else {
|
|
||||||
value = fieldExtractor.getSelector().select(html);
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
case Url:
|
|
||||||
value = fieldExtractor.getSelector().select(page.getUrl().toString());
|
|
||||||
break;
|
|
||||||
case RawText:
|
|
||||||
value = fieldExtractor.getSelector().select(page.getRawText());
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
value = fieldExtractor.getSelector().select(html);
|
|
||||||
}
|
|
||||||
if (value == null && fieldExtractor.isNotNull()) {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
if (fieldExtractor.getObjectFormatter() != null) {
|
|
||||||
Object converted = convert(value, fieldExtractor.getObjectFormatter());
|
|
||||||
if (converted == null && fieldExtractor.isNotNull()) {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
setField(o, fieldExtractor, converted);
|
|
||||||
} else {
|
|
||||||
setField(o, fieldExtractor, value);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (AfterExtractor.class.isAssignableFrom(clazz)) {
|
|
||||||
((AfterExtractor) o).afterProcess(page);
|
((AfterExtractor) o).afterProcess(page);
|
||||||
}
|
} catch (Exception e) {
|
||||||
} catch (InstantiationException e) {
|
|
||||||
logger.error("extract fail", e);
|
|
||||||
} catch (IllegalAccessException e) {
|
|
||||||
logger.error("extract fail", e);
|
|
||||||
} catch (InvocationTargetException e) {
|
|
||||||
logger.error("extract fail", e);
|
logger.error("extract fail", e);
|
||||||
}
|
}
|
||||||
return o;
|
return o;
|
||||||
}
|
}
|
||||||
|
|
||||||
private Object convert(String value, ObjectFormatter objectFormatter) {
|
|
||||||
try {
|
|
||||||
Object format = objectFormatter.format(value);
|
|
||||||
logger.debug("String {} is converted to {}", value, format);
|
|
||||||
return format;
|
|
||||||
} catch (Exception e) {
|
|
||||||
logger.error("convert " + value + " to " + objectFormatter.clazz() + " error!", e);
|
|
||||||
}
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
private List<Object> convert(List<String> values, ObjectFormatter objectFormatter) {
|
|
||||||
List<Object> objects = new ArrayList<Object>();
|
|
||||||
for (String value : values) {
|
|
||||||
Object converted = convert(value, objectFormatter);
|
|
||||||
if (converted != null) {
|
|
||||||
objects.add(converted);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return objects;
|
|
||||||
}
|
|
||||||
|
|
||||||
private void setField(Object o, FieldExtractor fieldExtractor, Object value) throws IllegalAccessException, InvocationTargetException {
|
|
||||||
if (value == null) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
if (fieldExtractor.getSetterMethod() != null) {
|
|
||||||
fieldExtractor.getSetterMethod().invoke(o, value);
|
|
||||||
}
|
|
||||||
fieldExtractor.getField().set(o, value);
|
|
||||||
}
|
|
||||||
|
|
||||||
Class getClazz() {
|
|
||||||
return clazz;
|
|
||||||
}
|
|
||||||
|
|
||||||
List<Pattern> getTargetUrlPatterns() {
|
|
||||||
return targetUrlPatterns;
|
|
||||||
}
|
|
||||||
|
|
||||||
List<Pattern> getHelpUrlPatterns() {
|
|
||||||
return helpUrlPatterns;
|
|
||||||
}
|
|
||||||
|
|
||||||
Selector getTargetUrlRegionSelector() {
|
|
||||||
return targetUrlRegionSelector;
|
|
||||||
}
|
|
||||||
|
|
||||||
Selector getHelpUrlRegionSelector() {
|
|
||||||
return helpUrlRegionSelector;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,42 @@
|
||||||
|
package us.codecraft.webmagic.model.fields;
|
||||||
|
|
||||||
|
import java.lang.reflect.InvocationTargetException;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
|
||||||
|
import lombok.Getter;
|
||||||
|
import us.codecraft.webmagic.model.FieldExtractor;
|
||||||
|
import us.codecraft.webmagic.model.formatter.ObjectFormatter;
|
||||||
|
|
||||||
|
public class MultipleField extends PageField {
|
||||||
|
@Getter
|
||||||
|
private List<String> fieldNames;
|
||||||
|
|
||||||
|
public MultipleField(List<String> fieldNames) {
|
||||||
|
this.fieldNames = fieldNames;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean operation(Object o, FieldExtractor fieldExtractor, Logger logger) throws IllegalAccessException, InvocationTargetException {
|
||||||
|
if ((this.fieldNames == null || this.fieldNames.size() == 0) && fieldExtractor.isNotNull())
|
||||||
|
return false;
|
||||||
|
if (fieldExtractor.getObjectFormatter() != null) {
|
||||||
|
List<Object> converted = this.convert(this.fieldNames, fieldExtractor.getObjectFormatter(), logger);
|
||||||
|
setField(o, fieldExtractor, converted);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
setField(o, fieldExtractor, this.fieldNames);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
private List<Object> convert(List<String> values, ObjectFormatter objectFormatter, Logger logger) {
|
||||||
|
List<Object> objects = new ArrayList<>();
|
||||||
|
for (String value : values) {
|
||||||
|
Object converted = this.convert(value, objectFormatter, logger);
|
||||||
|
if (converted != null)
|
||||||
|
objects.add(converted);
|
||||||
|
}
|
||||||
|
return objects;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,31 @@
|
||||||
|
package us.codecraft.webmagic.model.fields;
|
||||||
|
|
||||||
|
import java.lang.reflect.InvocationTargetException;
|
||||||
|
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
|
||||||
|
import us.codecraft.webmagic.model.FieldExtractor;
|
||||||
|
import us.codecraft.webmagic.model.formatter.ObjectFormatter;
|
||||||
|
|
||||||
|
public abstract class PageField {
|
||||||
|
public abstract boolean operation(Object o, FieldExtractor fieldExtractor, Logger logger) throws IllegalAccessException, InvocationTargetException;
|
||||||
|
|
||||||
|
protected Object convert(String value, ObjectFormatter objectFormatter, Logger logger) {
|
||||||
|
try {
|
||||||
|
Object format = objectFormatter.format(value);
|
||||||
|
logger.debug("String {} is converted to {}", value, format);
|
||||||
|
return format;
|
||||||
|
} catch (Exception e) {
|
||||||
|
logger.error("convert " + value + " to " + objectFormatter.clazz() + " error!", e);
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
protected void setField(Object o, FieldExtractor fieldExtractor, Object value) throws IllegalAccessException, InvocationTargetException {
|
||||||
|
if (value != null) {
|
||||||
|
if (fieldExtractor.getSetterMethod() != null)
|
||||||
|
fieldExtractor.getSetterMethod().invoke(o, value);
|
||||||
|
fieldExtractor.getField().set(o, value);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,28 @@
|
||||||
|
package us.codecraft.webmagic.model.fields;
|
||||||
|
|
||||||
|
import java.lang.reflect.InvocationTargetException;
|
||||||
|
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
|
||||||
|
import lombok.Getter;
|
||||||
|
import us.codecraft.webmagic.model.FieldExtractor;
|
||||||
|
|
||||||
|
public class SingleField extends PageField {
|
||||||
|
@Getter
|
||||||
|
private String fieldName;
|
||||||
|
|
||||||
|
public SingleField(String fieldName) {
|
||||||
|
this.fieldName = fieldName;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean operation(Object o, FieldExtractor fieldExtractor, Logger logger) throws IllegalAccessException, InvocationTargetException {
|
||||||
|
if (fieldExtractor.getObjectFormatter() != null) {
|
||||||
|
Object converted = this.convert(this.fieldName, fieldExtractor.getObjectFormatter(), logger);
|
||||||
|
if (converted == null && fieldExtractor.isNotNull())
|
||||||
|
return false;
|
||||||
|
setField(o, fieldExtractor, converted);
|
||||||
|
} else
|
||||||
|
setField(o, fieldExtractor, this.fieldName);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,85 @@
|
||||||
|
package us.codecraft.webmagic.model.formatter;
|
||||||
|
|
||||||
|
public interface BasicClassDetector {
|
||||||
|
Class<?> detectBasicClass(Class<?> type);
|
||||||
|
}
|
||||||
|
|
||||||
|
class IntegerClassDetector implements BasicClassDetector {
|
||||||
|
@Override
|
||||||
|
public Class<?> detectBasicClass(Class<?> type) {
|
||||||
|
if (type.equals(Integer.TYPE) || type.equals(Integer.class)) {
|
||||||
|
return Integer.class;
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
class LongClassDetector implements BasicClassDetector {
|
||||||
|
@Override
|
||||||
|
public Class<?> detectBasicClass(Class<?> type) {
|
||||||
|
if (type.equals(Long.TYPE) || type.equals(Long.class)) {
|
||||||
|
return Long.class;
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
class DoubleClassDetector implements BasicClassDetector {
|
||||||
|
@Override
|
||||||
|
public Class<?> detectBasicClass(Class<?> type) {
|
||||||
|
if (type.equals(Double.TYPE) || type.equals(Double.class)) {
|
||||||
|
return Double.class;
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
class FloatClassDetector implements BasicClassDetector {
|
||||||
|
@Override
|
||||||
|
public Class<?> detectBasicClass(Class<?> type) {
|
||||||
|
if (type.equals(Float.TYPE) || type.equals(Float.class)) {
|
||||||
|
return Float.class;
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
class ShortClassDetector implements BasicClassDetector {
|
||||||
|
@Override
|
||||||
|
public Class<?> detectBasicClass(Class<?> type) {
|
||||||
|
if (type.equals(Short.TYPE) || type.equals(Short.class)) {
|
||||||
|
return Short.class;
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
class CharacterClassDetector implements BasicClassDetector {
|
||||||
|
@Override
|
||||||
|
public Class<?> detectBasicClass(Class<?> type) {
|
||||||
|
if (type.equals(Character.TYPE) || type.equals(Character.class)) {
|
||||||
|
return Character.class;
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
class ByteClassDetector implements BasicClassDetector {
|
||||||
|
@Override
|
||||||
|
public Class<?> detectBasicClass(Class<?> type) {
|
||||||
|
if (type.equals(Byte.TYPE) || type.equals(Byte.class)) {
|
||||||
|
return Byte.class;
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
class BooleanClassDetector implements BasicClassDetector {
|
||||||
|
@Override
|
||||||
|
public Class<?> detectBasicClass(Class<?> type) {
|
||||||
|
if (type.equals(Boolean.TYPE) || type.equals(Boolean.class)) {
|
||||||
|
return Boolean.class;
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
|
@ -24,28 +24,24 @@ public abstract class BasicTypeFormatter<T> implements ObjectFormatter<T> {
|
||||||
}
|
}
|
||||||
|
|
||||||
protected abstract T formatTrimmed(String raw) throws Exception;
|
protected abstract T formatTrimmed(String raw) throws Exception;
|
||||||
|
|
||||||
public static final List<Class<? extends ObjectFormatter>> basicTypeFormatters = Arrays.<Class<? extends ObjectFormatter>>asList(IntegerFormatter.class,
|
public static final List<Class<? extends ObjectFormatter>> basicTypeFormatters = Arrays.<Class<? extends ObjectFormatter>>asList(IntegerFormatter.class,
|
||||||
LongFormatter.class, DoubleFormatter.class, FloatFormatter.class, ShortFormatter.class,
|
LongFormatter.class, DoubleFormatter.class, FloatFormatter.class, ShortFormatter.class,
|
||||||
CharactorFormatter.class, ByteFormatter.class, BooleanFormatter.class);
|
CharactorFormatter.class, ByteFormatter.class, BooleanFormatter.class);
|
||||||
|
public static final List<BasicClassDetector> basicClassDetector= Arrays.asList(new IntegerClassDetector(),
|
||||||
|
new LongClassDetector(),
|
||||||
|
new FloatClassDetector(),
|
||||||
|
new DoubleClassDetector(),
|
||||||
|
new ShortClassDetector(),
|
||||||
|
new ByteClassDetector(),
|
||||||
|
new BooleanClassDetector(),
|
||||||
|
new CharacterClassDetector());
|
||||||
|
|
||||||
public static Class<?> detectBasicClass(Class<?> type) {
|
public static Class<?> detectBasicClass(Class<?> type) {
|
||||||
if (type.equals(Integer.TYPE) || type.equals(Integer.class)) {
|
for (BasicClassDetector detector : basicClassDetector) {
|
||||||
return Integer.class;
|
Class<?> detectedClass = detector.detectBasicClass(type);
|
||||||
} else if (type.equals(Long.TYPE) || type.equals(Long.class)) {
|
if (detectedClass != null) {
|
||||||
return Long.class;
|
return detectedClass;
|
||||||
} else if (type.equals(Double.TYPE) || type.equals(Double.class)) {
|
}
|
||||||
return Double.class;
|
|
||||||
} else if (type.equals(Float.TYPE) || type.equals(Float.class)) {
|
|
||||||
return Float.class;
|
|
||||||
} else if (type.equals(Short.TYPE) || type.equals(Short.class)) {
|
|
||||||
return Short.class;
|
|
||||||
} else if (type.equals(Character.TYPE) || type.equals(Character.class)) {
|
|
||||||
return Character.class;
|
|
||||||
} else if (type.equals(Byte.TYPE) || type.equals(Byte.class)) {
|
|
||||||
return Byte.class;
|
|
||||||
} else if (type.equals(Boolean.TYPE) || type.equals(Boolean.class)) {
|
|
||||||
return Boolean.class;
|
|
||||||
}
|
}
|
||||||
return type;
|
return type;
|
||||||
}
|
}
|
||||||
|
@ -146,5 +142,4 @@ public abstract class BasicTypeFormatter<T> implements ObjectFormatter<T> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,68 @@
|
||||||
|
package us.codecraft.webmagic.model.sources;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import us.codecraft.webmagic.Page;
|
||||||
|
import us.codecraft.webmagic.model.FieldExtractor;
|
||||||
|
|
||||||
|
public interface Source {
|
||||||
|
public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor);
|
||||||
|
public List<String> getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor);
|
||||||
|
|
||||||
|
public class RawHtml implements Source {
|
||||||
|
public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
|
||||||
|
return page.getHtml().selectDocument(fieldExtractor.getSelector());
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<String> getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
|
||||||
|
return page.getHtml().selectDocumentForList(fieldExtractor.getSelector());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public class SelectedHtml implements Source {
|
||||||
|
public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
|
||||||
|
if (isRaw)
|
||||||
|
return page.getHtml().selectDocument(fieldExtractor.getSelector());
|
||||||
|
else
|
||||||
|
return fieldExtractor.getSelector().select(html);
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<String> getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
|
||||||
|
if (isRaw)
|
||||||
|
return page.getHtml().selectDocumentForList(fieldExtractor.getSelector());
|
||||||
|
else
|
||||||
|
return fieldExtractor.getSelector().selectList(html);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public class Url implements Source {
|
||||||
|
public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
|
||||||
|
return fieldExtractor.getSelector().select(page.getUrl().toString());
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<String> getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
|
||||||
|
return fieldExtractor.getSelector().selectList(page.getUrl().toString());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public class RawText implements Source {
|
||||||
|
public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
|
||||||
|
return fieldExtractor.getSelector().select(page.getRawText());
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<String> getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
|
||||||
|
return fieldExtractor.getSelector().selectList(page.getRawText());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public class DefaultSource implements Source {
|
||||||
|
public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
|
||||||
|
return fieldExtractor.getSelector().select(html);
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<String> getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
|
||||||
|
return fieldExtractor.getSelector().selectList(html);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
@ -0,0 +1,17 @@
|
||||||
|
package us.codecraft.webmagic.model.sources;
|
||||||
|
|
||||||
|
import us.codecraft.webmagic.Page;
|
||||||
|
import us.codecraft.webmagic.model.FieldExtractor;
|
||||||
|
import us.codecraft.webmagic.model.fields.MultipleField;
|
||||||
|
import us.codecraft.webmagic.model.fields.PageField;
|
||||||
|
import us.codecraft.webmagic.model.fields.SingleField;
|
||||||
|
|
||||||
|
public class SourceTextExtractor {
|
||||||
|
public static PageField getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
|
||||||
|
Source source = fieldExtractor.getSource();
|
||||||
|
if (fieldExtractor.isMulti())
|
||||||
|
return new MultipleField(source.getTextList(page, html, isRaw, fieldExtractor));
|
||||||
|
else
|
||||||
|
return new SingleField(source.getText(page, html, isRaw, fieldExtractor));
|
||||||
|
}
|
||||||
|
}
|
|
@ -102,7 +102,7 @@ public class RedisPriorityScheduler extends RedisScheduler {
|
||||||
}
|
}
|
||||||
|
|
||||||
private void setExtrasInItem(Jedis jedis,Request request, Task task) {
|
private void setExtrasInItem(Jedis jedis,Request request, Task task) {
|
||||||
if (request.getExtras() != null) {
|
if (!request.getExtras().isEmpty()) {
|
||||||
String field = DigestUtils.sha1Hex(request.getUrl());
|
String field = DigestUtils.sha1Hex(request.getUrl());
|
||||||
String value = JSON.toJSONString(request);
|
String value = JSON.toJSONString(request);
|
||||||
jedis.hset(getItemKey(task), field, value);
|
jedis.hset(getItemKey(task), field, value);
|
||||||
|
|
|
@ -84,7 +84,7 @@ public class RedisScheduler extends DuplicateRemovedScheduler implements Monitor
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (request.getExtras() != null && !request.getExtras().isEmpty()) {
|
if (!request.getExtras().isEmpty()) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
if (request.getPriority() != 0L) {
|
if (request.getPriority() != 0L) {
|
||||||
|
|
|
@ -1,21 +0,0 @@
|
||||||
<?xml version="1.0" encoding="UTF-8"?>
|
|
||||||
<!DOCTYPE log4j:configuration SYSTEM "log4j.dtd">
|
|
||||||
<log4j:configuration xmlns:log4j="http://jakarta.apache.org/log4j/">
|
|
||||||
|
|
||||||
<appender name="stdout" class="org.apache.log4j.ConsoleAppender">
|
|
||||||
<layout class="org.apache.log4j.PatternLayout">
|
|
||||||
<param name="ConversionPattern" value="%d{yy-MM-dd HH:mm:ss,SSS} %-5p %c(%F:%L) ## %m%n" />
|
|
||||||
</layout>
|
|
||||||
</appender>
|
|
||||||
|
|
||||||
<logger name="org.apache" additivity="false">
|
|
||||||
<level value="warn" />
|
|
||||||
<appender-ref ref="stdout" />
|
|
||||||
</logger>
|
|
||||||
|
|
||||||
<root>
|
|
||||||
<level value="info" />
|
|
||||||
<appender-ref ref="stdout" />
|
|
||||||
</root>
|
|
||||||
|
|
||||||
</log4j:configuration>
|
|
|
@ -13,7 +13,6 @@ import static org.assertj.core.api.Assertions.assertThat;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @author code4crafter@gmail.com
|
* @author code4crafter@gmail.com
|
||||||
* @date 14-4-5
|
|
||||||
*/
|
*/
|
||||||
public class ConfigurablePageProcessorTest {
|
public class ConfigurablePageProcessorTest {
|
||||||
|
|
||||||
|
|
|
@ -12,7 +12,6 @@ import static org.assertj.core.api.Assertions.assertThat;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @author code4crafter@gmail.com
|
* @author code4crafter@gmail.com
|
||||||
* @date 14-4-4
|
|
||||||
*/
|
*/
|
||||||
public class ModelPageProcessorTest {
|
public class ModelPageProcessorTest {
|
||||||
|
|
||||||
|
|
|
@ -1,21 +0,0 @@
|
||||||
<?xml version="1.0" encoding="UTF-8"?>
|
|
||||||
<!DOCTYPE log4j:configuration SYSTEM "log4j.dtd">
|
|
||||||
<log4j:configuration xmlns:log4j="http://jakarta.apache.org/log4j/">
|
|
||||||
|
|
||||||
<appender name="stdout" class="org.apache.log4j.ConsoleAppender">
|
|
||||||
<layout class="org.apache.log4j.PatternLayout">
|
|
||||||
<param name="ConversionPattern" value="%d{yy-MM-dd HH:mm:ss,SSS} %-5p %c(%F:%L) ## %m%n" />
|
|
||||||
</layout>
|
|
||||||
</appender>
|
|
||||||
|
|
||||||
<logger name="org.apache" additivity="false">
|
|
||||||
<level value="warn" />
|
|
||||||
<appender-ref ref="stdout" />
|
|
||||||
</logger>
|
|
||||||
|
|
||||||
<root>
|
|
||||||
<level value="info" />
|
|
||||||
<appender-ref ref="stdout" />
|
|
||||||
</root>
|
|
||||||
|
|
||||||
</log4j:configuration>
|
|
|
@ -0,0 +1,16 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<Configuration>
|
||||||
|
<Appenders>
|
||||||
|
<Console name="stdout" target="SYSTEM_OUT">
|
||||||
|
<PatternLayout pattern="%d{yy-MM-dd HH:mm:ss,SSS} %-5p %c(%F:%L) ## %m%n" />
|
||||||
|
</Console>
|
||||||
|
</Appenders>
|
||||||
|
<Loggers>
|
||||||
|
<Logger name="org.apache" level="warn" additivity="false">
|
||||||
|
<AppenderRef ref="stdout" />
|
||||||
|
</Logger>
|
||||||
|
<Root level="info">
|
||||||
|
<AppenderRef ref="stdout" />
|
||||||
|
</Root>
|
||||||
|
</Loggers>
|
||||||
|
</Configuration>
|
|
@ -1,9 +1,14 @@
|
||||||
<?xml version="1.0" encoding="UTF-8"?>
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
<project
|
||||||
|
xmlns="http://maven.apache.org/POM/4.0.0"
|
||||||
|
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||||
|
xsi:schemaLocation="
|
||||||
|
http://maven.apache.org/POM/4.0.0
|
||||||
|
http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||||
<parent>
|
<parent>
|
||||||
<artifactId>webmagic-parent</artifactId>
|
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<version>0.10.3</version>
|
<artifactId>webmagic</artifactId>
|
||||||
|
<version>1.0.0</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
@ -20,10 +25,6 @@
|
||||||
<artifactId>webmagic-extension</artifactId>
|
<artifactId>webmagic-extension</artifactId>
|
||||||
<version>${project.version}</version>
|
<version>${project.version}</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
|
||||||
<groupId>junit</groupId>
|
|
||||||
<artifactId>junit</artifactId>
|
|
||||||
</dependency>
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.mapdb</groupId>
|
<groupId>org.mapdb</groupId>
|
||||||
<artifactId>mapdb</artifactId>
|
<artifactId>mapdb</artifactId>
|
||||||
|
@ -42,7 +43,7 @@
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>com.fasterxml.jackson.core</groupId>
|
<groupId>com.fasterxml.jackson.core</groupId>
|
||||||
<artifactId>jackson-databind</artifactId>
|
<artifactId>jackson-databind</artifactId>
|
||||||
<version>2.15.2</version>
|
<version>2.16.0</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
</dependencies>
|
</dependencies>
|
||||||
|
|
||||||
|
|
|
@ -1,26 +0,0 @@
|
||||||
<?xml version="1.0" encoding="UTF-8"?>
|
|
||||||
<!DOCTYPE log4j:configuration SYSTEM "log4j.dtd">
|
|
||||||
<log4j:configuration xmlns:log4j="http://jakarta.apache.org/log4j/">
|
|
||||||
|
|
||||||
<appender name="stdout" class="org.apache.log4j.ConsoleAppender">
|
|
||||||
<layout class="org.apache.log4j.PatternLayout">
|
|
||||||
<param name="ConversionPattern" value="%d{yy-MM-dd HH:mm:ss,SSS} %-5p %c(%F:%L) ## %m%n" />
|
|
||||||
</layout>
|
|
||||||
</appender>
|
|
||||||
|
|
||||||
<logger name="org.springframework" additivity="false">
|
|
||||||
<level value="warn" />
|
|
||||||
<appender-ref ref="stdout" />
|
|
||||||
</logger>
|
|
||||||
|
|
||||||
<logger name="net.sf.ehcache" additivity="false">
|
|
||||||
<level value="warn" />
|
|
||||||
<appender-ref ref="stdout" />
|
|
||||||
</logger>
|
|
||||||
|
|
||||||
<root>
|
|
||||||
<level value="info" />
|
|
||||||
<appender-ref ref="stdout" />
|
|
||||||
</root>
|
|
||||||
|
|
||||||
</log4j:configuration>
|
|
|
@ -0,0 +1,19 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<Configuration>
|
||||||
|
<Appenders>
|
||||||
|
<Console name="stdout" target="SYSTEM_OUT">
|
||||||
|
<PatternLayout pattern="%d{yy-MM-dd HH:mm:ss,SSS} %-5p %c(%F:%L) ## %m%n" />
|
||||||
|
</Console>
|
||||||
|
</Appenders>
|
||||||
|
<Loggers>
|
||||||
|
<Logger name="org.springframework" level="warn" additivity="false">
|
||||||
|
<AppenderRef ref="stdout" />
|
||||||
|
</Logger>
|
||||||
|
<Logger name="net.sf.ehcache" level="warn" additivity="false">
|
||||||
|
<AppenderRef ref="stdout" />
|
||||||
|
</Logger>
|
||||||
|
<Root level="info">
|
||||||
|
<AppenderRef ref="stdout" />
|
||||||
|
</Root>
|
||||||
|
</Loggers>
|
||||||
|
</Configuration>
|
|
@ -1,14 +1,23 @@
|
||||||
<?xml version="1.0" encoding="UTF-8"?>
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
<project
|
||||||
|
xmlns="http://maven.apache.org/POM/4.0.0"
|
||||||
|
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||||
|
xsi:schemaLocation="
|
||||||
|
http://maven.apache.org/POM/4.0.0
|
||||||
|
http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||||
<parent>
|
<parent>
|
||||||
<artifactId>webmagic-parent</artifactId>
|
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<version>0.10.3</version>
|
<artifactId>webmagic</artifactId>
|
||||||
|
<version>1.0.0</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
<artifactId>webmagic-saxon</artifactId>
|
<artifactId>webmagic-saxon</artifactId>
|
||||||
|
|
||||||
|
<properties>
|
||||||
|
<maven.deploy.skip>true</maven.deploy.skip>
|
||||||
|
</properties>
|
||||||
|
|
||||||
<dependencies>
|
<dependencies>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>${project.groupId}</groupId>
|
<groupId>${project.groupId}</groupId>
|
||||||
|
@ -23,23 +32,6 @@
|
||||||
<groupId>net.sf.saxon</groupId>
|
<groupId>net.sf.saxon</groupId>
|
||||||
<artifactId>Saxon-HE</artifactId>
|
<artifactId>Saxon-HE</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
|
||||||
<groupId>junit</groupId>
|
|
||||||
<artifactId>junit</artifactId>
|
|
||||||
</dependency>
|
|
||||||
</dependencies>
|
</dependencies>
|
||||||
|
|
||||||
<build>
|
|
||||||
<plugins>
|
|
||||||
<plugin>
|
|
||||||
<groupId>org.apache.maven.plugins</groupId>
|
|
||||||
<artifactId>maven-deploy-plugin</artifactId>
|
|
||||||
<version>3.0.0-M1</version>
|
|
||||||
<configuration>
|
|
||||||
<skip>true</skip>
|
|
||||||
</configuration>
|
|
||||||
</plugin>
|
|
||||||
</plugins>
|
|
||||||
</build>
|
|
||||||
|
|
||||||
</project>
|
</project>
|
||||||
|
|
|
@ -1,9 +1,14 @@
|
||||||
<?xml version="1.0" encoding="UTF-8"?>
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
<project
|
||||||
|
xmlns="http://maven.apache.org/POM/4.0.0"
|
||||||
|
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||||
|
xsi:schemaLocation="
|
||||||
|
http://maven.apache.org/POM/4.0.0
|
||||||
|
http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||||
<parent>
|
<parent>
|
||||||
<artifactId>webmagic-parent</artifactId>
|
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<version>0.10.3</version>
|
<artifactId>webmagic</artifactId>
|
||||||
|
<version>1.0.0</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
@ -13,6 +18,14 @@
|
||||||
</properties>
|
</properties>
|
||||||
|
|
||||||
<dependencies>
|
<dependencies>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.apache.logging.log4j</groupId>
|
||||||
|
<artifactId>log4j-core</artifactId>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.apache.logging.log4j</groupId>
|
||||||
|
<artifactId>log4j-slf4j2-impl</artifactId>
|
||||||
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.jruby</groupId>
|
<groupId>org.jruby</groupId>
|
||||||
<artifactId>jruby</artifactId>
|
<artifactId>jruby</artifactId>
|
||||||
|
@ -30,25 +43,22 @@
|
||||||
<groupId>commons-cli</groupId>
|
<groupId>commons-cli</groupId>
|
||||||
<artifactId>commons-cli</artifactId>
|
<artifactId>commons-cli</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
|
||||||
<groupId>junit</groupId>
|
|
||||||
<artifactId>junit</artifactId>
|
|
||||||
<scope>test</scope>
|
|
||||||
</dependency>
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>${project.groupId}</groupId>
|
<groupId>${project.groupId}</groupId>
|
||||||
<artifactId>webmagic-core</artifactId>
|
<artifactId>webmagic-core</artifactId>
|
||||||
<version>${project.version}</version>
|
<version>${project.version}</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
|
||||||
<groupId>org.slf4j</groupId>
|
|
||||||
<artifactId>slf4j-log4j12</artifactId>
|
|
||||||
</dependency>
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>${project.groupId}</groupId>
|
<groupId>${project.groupId}</groupId>
|
||||||
<artifactId>webmagic-extension</artifactId>
|
<artifactId>webmagic-extension</artifactId>
|
||||||
<version>${project.version}</version>
|
<version>${project.version}</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.projectlombok</groupId>
|
||||||
|
<artifactId>lombok</artifactId>
|
||||||
|
<version>1.18.32</version>
|
||||||
|
<scope>provided</scope>
|
||||||
|
</dependency>
|
||||||
</dependencies>
|
</dependencies>
|
||||||
|
|
||||||
<build>
|
<build>
|
||||||
|
|
|
@ -0,0 +1,47 @@
|
||||||
|
package us.codecraft.webmagic.scripts;
|
||||||
|
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
import lombok.Getter;
|
||||||
|
import lombok.Setter;
|
||||||
|
import us.codecraft.webmagic.scripts.languages.JRuby;
|
||||||
|
import us.codecraft.webmagic.scripts.languages.Javascript;
|
||||||
|
import us.codecraft.webmagic.scripts.languages.Language;
|
||||||
|
import us.codecraft.webmagic.utils.WMCollections;
|
||||||
|
|
||||||
|
public class Params {
|
||||||
|
@Getter
|
||||||
|
Language language = new Javascript();
|
||||||
|
|
||||||
|
@Getter @Setter
|
||||||
|
String scriptFileName;
|
||||||
|
|
||||||
|
@Getter @Setter
|
||||||
|
List<String> urls;
|
||||||
|
|
||||||
|
@Getter @Setter
|
||||||
|
int thread = 1;
|
||||||
|
|
||||||
|
@Getter @Setter
|
||||||
|
int sleepTime = 1000;
|
||||||
|
|
||||||
|
private static Map<Language, Set<String>> alias;
|
||||||
|
|
||||||
|
public Params() {
|
||||||
|
alias = new HashMap<Language, Set<String>>();
|
||||||
|
alias.put(new Javascript(), WMCollections.<String>newHashSet("js", "javascript", "JavaScript", "JS"));
|
||||||
|
alias.put(new JRuby(), WMCollections.<String>newHashSet("ruby", "jruby", "Ruby", "JRuby"));
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setLanguagefromArg(String arg) {
|
||||||
|
for (Map.Entry<Language, Set<String>> languageSetEntry : alias.entrySet()) {
|
||||||
|
if (languageSetEntry.getValue().contains(arg)) {
|
||||||
|
this.language = languageSetEntry.getKey();
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -1,88 +1,21 @@
|
||||||
package us.codecraft.webmagic.scripts;
|
package us.codecraft.webmagic.scripts;
|
||||||
|
|
||||||
import org.apache.commons.cli.*;
|
import org.apache.commons.cli.*;
|
||||||
import org.apache.log4j.Level;
|
|
||||||
import org.apache.log4j.Logger;
|
|
||||||
import us.codecraft.webmagic.ResultItems;
|
import us.codecraft.webmagic.ResultItems;
|
||||||
import us.codecraft.webmagic.Spider;
|
import us.codecraft.webmagic.Spider;
|
||||||
import us.codecraft.webmagic.Task;
|
import us.codecraft.webmagic.Task;
|
||||||
import us.codecraft.webmagic.pipeline.Pipeline;
|
import us.codecraft.webmagic.pipeline.Pipeline;
|
||||||
|
import us.codecraft.webmagic.scripts.config.CommandLineOption;
|
||||||
import us.codecraft.webmagic.utils.WMCollections;
|
import us.codecraft.webmagic.utils.WMCollections;
|
||||||
|
|
||||||
import java.util.HashMap;
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @author code4crafter@gmail.com
|
* @author code4crafter@gmail.com / FrancoisGib
|
||||||
* @since 0.4.1
|
* @since 0.4.1
|
||||||
*/
|
*/
|
||||||
public class ScriptConsole {
|
public class ScriptConsole {
|
||||||
|
|
||||||
private static class Params {
|
|
||||||
Language language = Language.JavaScript;
|
|
||||||
String scriptFileName;
|
|
||||||
List<String> urls;
|
|
||||||
int thread = 1;
|
|
||||||
int sleepTime = 1000;
|
|
||||||
private static Map<Language, Set<String>> alias = new HashMap<Language, Set<String>>();
|
|
||||||
|
|
||||||
static {
|
|
||||||
alias.put(Language.JavaScript, WMCollections.<String>newHashSet("js", "javascript", "JavaScript", "JS"));
|
|
||||||
alias.put(Language.JRuby, WMCollections.<String>newHashSet("ruby", "jruby", "Ruby", "JRuby"));
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setLanguagefromArg(String arg) {
|
|
||||||
for (Map.Entry<Language, Set<String>> languageSetEntry : alias.entrySet()) {
|
|
||||||
if (languageSetEntry.getValue().contains(arg)) {
|
|
||||||
this.language = languageSetEntry.getKey();
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private Language getLanguage() {
|
|
||||||
return language;
|
|
||||||
}
|
|
||||||
|
|
||||||
private void setLanguage(Language language) {
|
|
||||||
this.language = language;
|
|
||||||
}
|
|
||||||
|
|
||||||
private String getScriptFileName() {
|
|
||||||
return scriptFileName;
|
|
||||||
}
|
|
||||||
|
|
||||||
private void setScriptFileName(String scriptFileName) {
|
|
||||||
this.scriptFileName = scriptFileName;
|
|
||||||
}
|
|
||||||
|
|
||||||
private List<String> getUrls() {
|
|
||||||
return urls;
|
|
||||||
}
|
|
||||||
|
|
||||||
private void setUrls(List<String> urls) {
|
|
||||||
this.urls = urls;
|
|
||||||
}
|
|
||||||
|
|
||||||
private int getThread() {
|
|
||||||
return thread;
|
|
||||||
}
|
|
||||||
|
|
||||||
private void setThread(int thread) {
|
|
||||||
this.thread = thread;
|
|
||||||
}
|
|
||||||
|
|
||||||
private int getSleepTime() {
|
|
||||||
return sleepTime;
|
|
||||||
}
|
|
||||||
|
|
||||||
private void setSleepTime(int sleepTime) {
|
|
||||||
this.sleepTime = sleepTime;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public static void main(String[] args) {
|
public static void main(String[] args) {
|
||||||
Params params = parseCommand(args);
|
Params params = parseCommand(args);
|
||||||
startSpider(params);
|
startSpider(params);
|
||||||
|
@ -140,45 +73,9 @@ public class ScriptConsole {
|
||||||
|
|
||||||
private static Params readOptions(CommandLine commandLine) {
|
private static Params readOptions(CommandLine commandLine) {
|
||||||
Params params = new Params();
|
Params params = new Params();
|
||||||
if (commandLine.hasOption("l")) {
|
List<CommandLineOption> options = CommandLineOption.getAllOptions();
|
||||||
String language = commandLine.getOptionValue("l");
|
for (CommandLineOption option : options)
|
||||||
params.setLanguagefromArg(language);
|
option.addParamOptionIfInCommandLine(params, commandLine);
|
||||||
}
|
|
||||||
if (commandLine.hasOption("f")) {
|
|
||||||
String scriptFilename = commandLine.getOptionValue("f");
|
|
||||||
params.setScriptFileName(scriptFilename);
|
|
||||||
} else {
|
|
||||||
exit();
|
|
||||||
}
|
|
||||||
if (commandLine.hasOption("s")) {
|
|
||||||
Integer sleepTime = Integer.parseInt(commandLine.getOptionValue("s"));
|
|
||||||
params.setSleepTime(sleepTime);
|
|
||||||
}
|
|
||||||
if (commandLine.hasOption("t")) {
|
|
||||||
Integer thread = Integer.parseInt(commandLine.getOptionValue("t"));
|
|
||||||
params.setThread(thread);
|
|
||||||
}
|
|
||||||
if (commandLine.hasOption("g")) {
|
|
||||||
configLogger(commandLine.getOptionValue("g"));
|
|
||||||
}
|
|
||||||
params.setUrls(commandLine.getArgList());
|
|
||||||
return params;
|
return params;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void configLogger(String value) {
|
|
||||||
Logger rootLogger = Logger.getRootLogger();
|
|
||||||
if ("debug".equalsIgnoreCase(value)) {
|
|
||||||
rootLogger.setLevel(Level.DEBUG);
|
|
||||||
} else if ("info".equalsIgnoreCase(value)) {
|
|
||||||
rootLogger.setLevel(Level.INFO);
|
|
||||||
} else if ("warn".equalsIgnoreCase(value)) {
|
|
||||||
rootLogger.setLevel(Level.WARN);
|
|
||||||
} else if ("trace".equalsIgnoreCase(value)) {
|
|
||||||
rootLogger.setLevel(Level.TRACE);
|
|
||||||
} else if ("off".equalsIgnoreCase(value)) {
|
|
||||||
rootLogger.setLevel(Level.OFF);
|
|
||||||
} else if ("error".equalsIgnoreCase(value)) {
|
|
||||||
rootLogger.setLevel(Level.ERROR);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
|
@ -2,6 +2,9 @@ package us.codecraft.webmagic.scripts;
|
||||||
|
|
||||||
import javax.script.ScriptEngine;
|
import javax.script.ScriptEngine;
|
||||||
import javax.script.ScriptEngineManager;
|
import javax.script.ScriptEngineManager;
|
||||||
|
|
||||||
|
import us.codecraft.webmagic.scripts.languages.Language;
|
||||||
|
|
||||||
import java.util.concurrent.LinkedBlockingQueue;
|
import java.util.concurrent.LinkedBlockingQueue;
|
||||||
import java.util.concurrent.atomic.AtomicInteger;
|
import java.util.concurrent.atomic.AtomicInteger;
|
||||||
|
|
||||||
|
@ -11,14 +14,11 @@ import java.util.concurrent.atomic.AtomicInteger;
|
||||||
*/
|
*/
|
||||||
public class ScriptEnginePool {
|
public class ScriptEnginePool {
|
||||||
|
|
||||||
private final int size;
|
|
||||||
|
|
||||||
private final AtomicInteger availableCount;
|
private final AtomicInteger availableCount;
|
||||||
|
|
||||||
private final LinkedBlockingQueue<ScriptEngine> scriptEngines = new LinkedBlockingQueue<ScriptEngine>();
|
private final LinkedBlockingQueue<ScriptEngine> scriptEngines = new LinkedBlockingQueue<ScriptEngine>();
|
||||||
|
|
||||||
public ScriptEnginePool(Language language,int size) {
|
public ScriptEnginePool(Language language,int size) {
|
||||||
this.size = size;
|
|
||||||
this.availableCount = new AtomicInteger(size);
|
this.availableCount = new AtomicInteger(size);
|
||||||
for (int i=0;i<size;i++){
|
for (int i=0;i<size;i++){
|
||||||
ScriptEngineManager manager = new ScriptEngineManager();
|
ScriptEngineManager manager = new ScriptEngineManager();
|
||||||
|
|
|
@ -4,17 +4,14 @@ package us.codecraft.webmagic.scripts;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
import java.nio.charset.Charset;
|
import java.nio.charset.Charset;
|
||||||
import java.util.Iterator;
|
|
||||||
import java.util.Map;
|
|
||||||
import javax.script.ScriptContext;
|
import javax.script.ScriptContext;
|
||||||
import javax.script.ScriptEngine;
|
import javax.script.ScriptEngine;
|
||||||
import javax.script.ScriptException;
|
import javax.script.ScriptException;
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.jruby.RubyHash;
|
|
||||||
import org.python.core.PyDictionary;
|
|
||||||
import us.codecraft.webmagic.Page;
|
import us.codecraft.webmagic.Page;
|
||||||
import us.codecraft.webmagic.Site;
|
import us.codecraft.webmagic.Site;
|
||||||
import us.codecraft.webmagic.processor.PageProcessor;
|
import us.codecraft.webmagic.processor.PageProcessor;
|
||||||
|
import us.codecraft.webmagic.scripts.languages.Language;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @author code4crafter@gmail.com
|
* @author code4crafter@gmail.com
|
||||||
|
@ -55,35 +52,7 @@ public class ScriptProcessor implements PageProcessor {
|
||||||
context.setAttribute("page", page, ScriptContext.ENGINE_SCOPE);
|
context.setAttribute("page", page, ScriptContext.ENGINE_SCOPE);
|
||||||
context.setAttribute("config", site, ScriptContext.ENGINE_SCOPE);
|
context.setAttribute("config", site, ScriptContext.ENGINE_SCOPE);
|
||||||
try {
|
try {
|
||||||
switch (language) {
|
this.language.process(engine, defines, script, page);
|
||||||
case JavaScript:
|
|
||||||
engine.eval(defines + "\n" + script, context);
|
|
||||||
// NativeObject o = (NativeObject) engine.get("result");
|
|
||||||
// if (o != null) {
|
|
||||||
// for (Object o1 : o.getIds()) {
|
|
||||||
// String key = String.valueOf(o1);
|
|
||||||
// page.getResultItems().put(key, NativeObject.getProperty(o, key));
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
break;
|
|
||||||
case JRuby:
|
|
||||||
RubyHash oRuby = (RubyHash) engine.eval(defines + "\n" + script, context);
|
|
||||||
Iterator itruby = oRuby.entrySet().iterator();
|
|
||||||
while (itruby.hasNext()) {
|
|
||||||
Map.Entry pairs = (Map.Entry) itruby.next();
|
|
||||||
page.getResultItems().put(pairs.getKey().toString(), pairs.getValue());
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
case Jython:
|
|
||||||
engine.eval(defines + "\n" + script, context);
|
|
||||||
PyDictionary oJython = (PyDictionary) engine.get("result");
|
|
||||||
Iterator it = oJython.entrySet().iterator();
|
|
||||||
while (it.hasNext()) {
|
|
||||||
Map.Entry pairs = (Map.Entry) it.next();
|
|
||||||
page.getResultItems().put(pairs.getKey().toString(), pairs.getValue());
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
} catch (ScriptException e) {
|
} catch (ScriptException e) {
|
||||||
e.printStackTrace();
|
e.printStackTrace();
|
||||||
}
|
}
|
||||||
|
|
|
@ -7,6 +7,9 @@ import java.io.InputStream;
|
||||||
import java.nio.charset.Charset;
|
import java.nio.charset.Charset;
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
|
|
||||||
|
import us.codecraft.webmagic.scripts.languages.Javascript;
|
||||||
|
import us.codecraft.webmagic.scripts.languages.Language;
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @author code4crafter@gmail.com
|
* @author code4crafter@gmail.com
|
||||||
|
@ -14,7 +17,7 @@ import org.apache.commons.io.IOUtils;
|
||||||
*/
|
*/
|
||||||
public class ScriptProcessorBuilder {
|
public class ScriptProcessorBuilder {
|
||||||
|
|
||||||
private static final Language DefaultLanguage = Language.JavaScript;
|
private static final Language DefaultLanguage = new Javascript();
|
||||||
|
|
||||||
private Language language = DefaultLanguage;
|
private Language language = DefaultLanguage;
|
||||||
|
|
||||||
|
@ -39,7 +42,6 @@ public class ScriptProcessorBuilder {
|
||||||
InputStream resourceAsStream = new FileInputStream(fileName);
|
InputStream resourceAsStream = new FileInputStream(fileName);
|
||||||
this.script = IOUtils.toString(resourceAsStream, Charset.defaultCharset());
|
this.script = IOUtils.toString(resourceAsStream, Charset.defaultCharset());
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
//wrap IOException because I prefer a runtime exception...
|
|
||||||
throw new IllegalArgumentException(e);
|
throw new IllegalArgumentException(e);
|
||||||
}
|
}
|
||||||
return this;
|
return this;
|
||||||
|
@ -50,7 +52,6 @@ public class ScriptProcessorBuilder {
|
||||||
InputStream resourceAsStream = ScriptProcessor.class.getClassLoader().getResourceAsStream(fileName);
|
InputStream resourceAsStream = ScriptProcessor.class.getClassLoader().getResourceAsStream(fileName);
|
||||||
this.script = IOUtils.toString(resourceAsStream, Charset.defaultCharset());
|
this.script = IOUtils.toString(resourceAsStream, Charset.defaultCharset());
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
//wrap IOException because I prefer a runtime exception...
|
|
||||||
throw new IllegalArgumentException(e);
|
throw new IllegalArgumentException(e);
|
||||||
}
|
}
|
||||||
return this;
|
return this;
|
||||||
|
|
|
@ -0,0 +1,82 @@
|
||||||
|
package us.codecraft.webmagic.scripts.config;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.apache.commons.cli.CommandLine;
|
||||||
|
|
||||||
|
import lombok.Getter;
|
||||||
|
import us.codecraft.webmagic.scripts.Params;
|
||||||
|
|
||||||
|
public abstract class CommandLineOption {
|
||||||
|
@Getter
|
||||||
|
char option;
|
||||||
|
|
||||||
|
public CommandLineOption(char option) {
|
||||||
|
this.option = option;
|
||||||
|
}
|
||||||
|
|
||||||
|
protected abstract void addParamOption(Params params, CommandLine commandLine);
|
||||||
|
|
||||||
|
public void addParamOptionIfInCommandLine(Params params, CommandLine commandLine) {
|
||||||
|
if (commandLine.hasOption(this.option))
|
||||||
|
this.addParamOption(params, commandLine);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static List<CommandLineOption> getAllOptions() {
|
||||||
|
return List.of(new OptionL(), new OptionF(), new OptionS(), new OptionT(), new OptionG());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
class OptionL extends CommandLineOption {
|
||||||
|
public OptionL() {
|
||||||
|
super('l');
|
||||||
|
}
|
||||||
|
|
||||||
|
protected void addParamOption(Params params, CommandLine commandLine) {
|
||||||
|
String language = commandLine.getOptionValue("l");
|
||||||
|
params.setLanguagefromArg(language);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
class OptionF extends CommandLineOption {
|
||||||
|
public OptionF() {
|
||||||
|
super('f');
|
||||||
|
}
|
||||||
|
|
||||||
|
protected void addParamOption(Params params, CommandLine commandLine) {
|
||||||
|
String scriptFilename = commandLine.getOptionValue("f");
|
||||||
|
params.setScriptFileName(scriptFilename);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
class OptionS extends CommandLineOption {
|
||||||
|
public OptionS() {
|
||||||
|
super('s');
|
||||||
|
}
|
||||||
|
|
||||||
|
protected void addParamOption(Params params, CommandLine commandLine) {
|
||||||
|
Integer sleepTime = Integer.parseInt(commandLine.getOptionValue("s"));
|
||||||
|
params.setSleepTime(sleepTime);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
class OptionT extends CommandLineOption {
|
||||||
|
public OptionT() {
|
||||||
|
super('t');
|
||||||
|
}
|
||||||
|
|
||||||
|
protected void addParamOption(Params params, CommandLine commandLine) {
|
||||||
|
Integer thread = Integer.parseInt(commandLine.getOptionValue("t"));
|
||||||
|
params.setThread(thread);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
class OptionG extends CommandLineOption {
|
||||||
|
public OptionG() {
|
||||||
|
super('g');
|
||||||
|
}
|
||||||
|
|
||||||
|
protected void addParamOption(Params params, CommandLine commandLine) {
|
||||||
|
ConfigLogger.configLogger(commandLine.getOptionValue("g"));
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,34 @@
|
||||||
|
package us.codecraft.webmagic.scripts.config;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.apache.commons.lang3.tuple.Pair;
|
||||||
|
import org.apache.logging.log4j.Level;
|
||||||
|
import org.apache.logging.log4j.core.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
public class ConfigLogger {
|
||||||
|
/**
|
||||||
|
* Log the config parameter. If the counter is less than the number of available
|
||||||
|
* options then it means that the user entered an option
|
||||||
|
*
|
||||||
|
* @param value The config string
|
||||||
|
*/
|
||||||
|
public static void configLogger(String value) {
|
||||||
|
List<Pair<String, Level>> options = List.of(
|
||||||
|
Pair.of("debug", Level.DEBUG),
|
||||||
|
Pair.of("info", Level.INFO),
|
||||||
|
Pair.of("warn", Level.WARN),
|
||||||
|
Pair.of("trace", Level.TRACE),
|
||||||
|
Pair.of("off", Level.OFF),
|
||||||
|
Pair.of("error", Level.ERROR));
|
||||||
|
Pair<String, Level> option = options.get(0);
|
||||||
|
int i = 1;
|
||||||
|
while (i < options.size() && !option.getLeft().equalsIgnoreCase(value))
|
||||||
|
option = options.get(i++);
|
||||||
|
if (i < options.size()) {
|
||||||
|
Logger rootLogger = (Logger) LoggerFactory.getLogger(org.slf4j.Logger.ROOT_LOGGER_NAME);
|
||||||
|
rootLogger.setLevel(option.getRight());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,26 @@
|
||||||
|
package us.codecraft.webmagic.scripts.languages;
|
||||||
|
|
||||||
|
import java.util.Iterator;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
import javax.script.ScriptEngine;
|
||||||
|
import javax.script.ScriptException;
|
||||||
|
|
||||||
|
import org.jruby.RubyHash;
|
||||||
|
|
||||||
|
import us.codecraft.webmagic.Page;
|
||||||
|
|
||||||
|
public class JRuby extends Language {
|
||||||
|
public JRuby() {
|
||||||
|
super("jruby","ruby/defines.rb","");
|
||||||
|
}
|
||||||
|
|
||||||
|
public void process(ScriptEngine engine, String defines, String script, Page page) throws ScriptException {
|
||||||
|
RubyHash oRuby = (RubyHash) engine.eval(defines + "\n" + script, engine.getContext());
|
||||||
|
Iterator itruby = oRuby.entrySet().iterator();
|
||||||
|
while (itruby.hasNext()) {
|
||||||
|
Map.Entry pairs = (Map.Entry) itruby.next();
|
||||||
|
page.getResultItems().put(pairs.getKey().toString(), pairs.getValue());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,16 @@
|
||||||
|
package us.codecraft.webmagic.scripts.languages;
|
||||||
|
|
||||||
|
import javax.script.ScriptEngine;
|
||||||
|
import javax.script.ScriptException;
|
||||||
|
|
||||||
|
import us.codecraft.webmagic.Page;
|
||||||
|
|
||||||
|
public class Javascript extends Language {
|
||||||
|
public Javascript() {
|
||||||
|
super("javascript","js/defines.js","");
|
||||||
|
}
|
||||||
|
|
||||||
|
public void process(ScriptEngine engine, String defines, String script, Page page) throws ScriptException {
|
||||||
|
engine.eval(defines + "\n" + script, engine.getContext());
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,27 @@
|
||||||
|
package us.codecraft.webmagic.scripts.languages;
|
||||||
|
|
||||||
|
import java.util.Iterator;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
import javax.script.ScriptEngine;
|
||||||
|
import javax.script.ScriptException;
|
||||||
|
|
||||||
|
import org.python.core.PyDictionary;
|
||||||
|
|
||||||
|
import us.codecraft.webmagic.Page;
|
||||||
|
|
||||||
|
public class Jython extends Language {
|
||||||
|
public Jython() {
|
||||||
|
super("jython","python/defines.py","");
|
||||||
|
}
|
||||||
|
|
||||||
|
public void process(ScriptEngine engine, String defines, String script, Page page) throws ScriptException {
|
||||||
|
engine.eval(defines + "\n" + script, engine.getContext());
|
||||||
|
PyDictionary oJython = (PyDictionary) engine.get("result");
|
||||||
|
Iterator it = oJython.entrySet().iterator();
|
||||||
|
while (it.hasNext()) {
|
||||||
|
Map.Entry pairs = (Map.Entry) it.next();
|
||||||
|
page.getResultItems().put(pairs.getKey().toString(), pairs.getValue());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -1,15 +1,18 @@
|
||||||
package us.codecraft.webmagic.scripts;
|
package us.codecraft.webmagic.scripts.languages;
|
||||||
|
|
||||||
|
import javax.script.ScriptEngine;
|
||||||
|
import javax.script.ScriptException;
|
||||||
|
import us.codecraft.webmagic.Page;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @author code4crafter@gmail.com
|
* @author FrancoisGib
|
||||||
*/
|
*/
|
||||||
public enum Language {
|
public abstract class Language {
|
||||||
|
public Language(String engineName, String defineFile, String gatherFile) {
|
||||||
JavaScript("javascript","js/defines.js",""),
|
this.engineName = engineName;
|
||||||
|
this.defineFile = defineFile;
|
||||||
JRuby("jruby","ruby/defines.rb",""),
|
this.gatherFile = gatherFile;
|
||||||
|
}
|
||||||
Jython("jython","python/defines.py","");
|
|
||||||
|
|
||||||
private String engineName;
|
private String engineName;
|
||||||
|
|
||||||
|
@ -17,12 +20,6 @@ public enum Language {
|
||||||
|
|
||||||
private String gatherFile;
|
private String gatherFile;
|
||||||
|
|
||||||
Language(String engineName, String defineFile, String gatherFile) {
|
|
||||||
this.engineName = engineName;
|
|
||||||
this.defineFile = defineFile;
|
|
||||||
this.gatherFile = gatherFile;
|
|
||||||
}
|
|
||||||
|
|
||||||
public String getEngineName() {
|
public String getEngineName() {
|
||||||
return engineName;
|
return engineName;
|
||||||
}
|
}
|
||||||
|
@ -34,4 +31,6 @@ public enum Language {
|
||||||
public String getGatherFile() {
|
public String getGatherFile() {
|
||||||
return gatherFile;
|
return gatherFile;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public abstract void process(ScriptEngine engine, String defines, String script, Page page) throws ScriptException;
|
||||||
}
|
}
|
|
@ -1,21 +0,0 @@
|
||||||
<?xml version="1.0" encoding="UTF-8"?>
|
|
||||||
<!DOCTYPE log4j:configuration SYSTEM "log4j.dtd">
|
|
||||||
<log4j:configuration xmlns:log4j="http://jakarta.apache.org/log4j/">
|
|
||||||
|
|
||||||
<appender name="stdout" class="org.apache.log4j.ConsoleAppender">
|
|
||||||
<layout class="org.apache.log4j.PatternLayout">
|
|
||||||
<param name="ConversionPattern" value="%d{yy-MM-dd HH:mm:ss,SSS} %-5p %c(%F:%L) ## %m%n" />
|
|
||||||
</layout>
|
|
||||||
</appender>
|
|
||||||
|
|
||||||
<logger name="org.apache" additivity="false">
|
|
||||||
<level value="error" />
|
|
||||||
<appender-ref ref="stdout" />
|
|
||||||
</logger>
|
|
||||||
|
|
||||||
<root>
|
|
||||||
<level value="info" />
|
|
||||||
<appender-ref ref="stdout" />
|
|
||||||
</root>
|
|
||||||
|
|
||||||
</log4j:configuration>
|
|
|
@ -2,7 +2,11 @@ package us.codecraft.webmagic.scripts;
|
||||||
|
|
||||||
import org.junit.Ignore;
|
import org.junit.Ignore;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
|
||||||
import us.codecraft.webmagic.Spider;
|
import us.codecraft.webmagic.Spider;
|
||||||
|
import us.codecraft.webmagic.scripts.languages.JRuby;
|
||||||
|
import us.codecraft.webmagic.scripts.languages.Javascript;
|
||||||
|
import us.codecraft.webmagic.scripts.languages.Jython;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @author code4crafter@gmail.com
|
* @author code4crafter@gmail.com
|
||||||
|
@ -13,14 +17,14 @@ public class ScriptProcessorTest {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testJavaScriptProcessor() {
|
public void testJavaScriptProcessor() {
|
||||||
ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().language(Language.JavaScript).scriptFromClassPathFile("js/oschina.js").build();
|
ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().language(new Javascript()).scriptFromClassPathFile("js/oschina.js").build();
|
||||||
pageProcessor.getSite().setSleepTime(0);
|
pageProcessor.getSite().setSleepTime(0);
|
||||||
Spider.create(pageProcessor).addUrl("http://my.oschina.net/flashsword/blog").setSpawnUrl(false).run();
|
Spider.create(pageProcessor).addUrl("http://my.oschina.net/flashsword/blog").setSpawnUrl(false).run();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testRubyProcessor() {
|
public void testRubyProcessor() {
|
||||||
ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().language(Language.JRuby).scriptFromClassPathFile("ruby/oschina.rb").build();
|
ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().language(new JRuby()).scriptFromClassPathFile("ruby/oschina.rb").build();
|
||||||
pageProcessor.getSite().setSleepTime(0);
|
pageProcessor.getSite().setSleepTime(0);
|
||||||
Spider.create(pageProcessor).addUrl("http://my.oschina.net/flashsword/blog").setSpawnUrl(false).run();
|
Spider.create(pageProcessor).addUrl("http://my.oschina.net/flashsword/blog").setSpawnUrl(false).run();
|
||||||
}
|
}
|
||||||
|
@ -28,7 +32,7 @@ public class ScriptProcessorTest {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testPythonProcessor() {
|
public void testPythonProcessor() {
|
||||||
ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().language(Language.Jython).scriptFromClassPathFile("python/oschina.py").build();
|
ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().language(new Jython()).scriptFromClassPathFile("python/oschina.py").build();
|
||||||
pageProcessor.getSite().setSleepTime(0);
|
pageProcessor.getSite().setSleepTime(0);
|
||||||
Spider.create(pageProcessor).addUrl("http://my.oschina.net/flashsword/blog").setSpawnUrl(false).run();
|
Spider.create(pageProcessor).addUrl("http://my.oschina.net/flashsword/blog").setSpawnUrl(false).run();
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,21 +0,0 @@
|
||||||
<?xml version="1.0" encoding="UTF-8"?>
|
|
||||||
<!DOCTYPE log4j:configuration SYSTEM "log4j.dtd">
|
|
||||||
<log4j:configuration xmlns:log4j="http://jakarta.apache.org/log4j/">
|
|
||||||
|
|
||||||
<appender name="stdout" class="org.apache.log4j.ConsoleAppender">
|
|
||||||
<layout class="org.apache.log4j.PatternLayout">
|
|
||||||
<param name="ConversionPattern" value="%d{yy-MM-dd HH:mm:ss,SSS} %-5p %c(%F:%L) ## %m%n" />
|
|
||||||
</layout>
|
|
||||||
</appender>
|
|
||||||
|
|
||||||
<logger name="org.apache" additivity="false">
|
|
||||||
<level value="warn" />
|
|
||||||
<appender-ref ref="stdout" />
|
|
||||||
</logger>
|
|
||||||
|
|
||||||
<root>
|
|
||||||
<level value="debug" />
|
|
||||||
<appender-ref ref="stdout" />
|
|
||||||
</root>
|
|
||||||
|
|
||||||
</log4j:configuration>
|
|
|
@ -0,0 +1,16 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<Configuration>
|
||||||
|
<Appenders>
|
||||||
|
<Console name="stdout" target="SYSTEM_OUT">
|
||||||
|
<PatternLayout pattern="%d{yy-MM-dd HH:mm:ss,SSS} %-5p %c(%F:%L) ## %m%n" />
|
||||||
|
</Console>
|
||||||
|
</Appenders>
|
||||||
|
<Loggers>
|
||||||
|
<Logger name="org.apache" level="warn" additivity="false">
|
||||||
|
<AppenderRef ref="stdout" />
|
||||||
|
</Logger>
|
||||||
|
<Root level="debug">
|
||||||
|
<AppenderRef ref="stdout" />
|
||||||
|
</Root>
|
||||||
|
</Loggers>
|
||||||
|
</Configuration>
|
|
@ -1,9 +1,14 @@
|
||||||
<?xml version="1.0" encoding="UTF-8"?>
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
<project
|
||||||
|
xmlns="http://maven.apache.org/POM/4.0.0"
|
||||||
|
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||||
|
xsi:schemaLocation="
|
||||||
|
http://maven.apache.org/POM/4.0.0
|
||||||
|
http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||||
<parent>
|
<parent>
|
||||||
<artifactId>webmagic-parent</artifactId>
|
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<version>0.10.3</version>
|
<artifactId>webmagic</artifactId>
|
||||||
|
<version>1.0.0</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
@ -23,10 +28,6 @@
|
||||||
<groupId>com.github.detro</groupId>
|
<groupId>com.github.detro</groupId>
|
||||||
<artifactId>phantomjsdriver</artifactId>
|
<artifactId>phantomjsdriver</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
|
||||||
<groupId>junit</groupId>
|
|
||||||
<artifactId>junit</artifactId>
|
|
||||||
</dependency>
|
|
||||||
</dependencies>
|
</dependencies>
|
||||||
|
|
||||||
<build>
|
<build>
|
||||||
|
|
|
@ -1,15 +1,5 @@
|
||||||
package us.codecraft.webmagic.downloader.selenium;
|
package us.codecraft.webmagic.downloader.selenium;
|
||||||
|
|
||||||
import org.openqa.selenium.WebDriver;
|
|
||||||
import org.openqa.selenium.chrome.ChromeDriver;
|
|
||||||
import org.openqa.selenium.firefox.FirefoxDriver;
|
|
||||||
import org.openqa.selenium.phantomjs.PhantomJSDriver;
|
|
||||||
import org.openqa.selenium.phantomjs.PhantomJSDriverService;
|
|
||||||
import org.openqa.selenium.remote.DesiredCapabilities;
|
|
||||||
import org.openqa.selenium.remote.RemoteWebDriver;
|
|
||||||
import org.slf4j.Logger;
|
|
||||||
import org.slf4j.LoggerFactory;
|
|
||||||
|
|
||||||
import java.io.FileReader;
|
import java.io.FileReader;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.net.MalformedURLException;
|
import java.net.MalformedURLException;
|
||||||
|
@ -22,6 +12,18 @@ import java.util.concurrent.BlockingDeque;
|
||||||
import java.util.concurrent.LinkedBlockingDeque;
|
import java.util.concurrent.LinkedBlockingDeque;
|
||||||
import java.util.concurrent.atomic.AtomicInteger;
|
import java.util.concurrent.atomic.AtomicInteger;
|
||||||
|
|
||||||
|
import org.openqa.selenium.WebDriver;
|
||||||
|
import org.openqa.selenium.chrome.ChromeDriver;
|
||||||
|
import org.openqa.selenium.chrome.ChromeOptions;
|
||||||
|
import org.openqa.selenium.firefox.FirefoxDriver;
|
||||||
|
import org.openqa.selenium.firefox.FirefoxOptions;
|
||||||
|
import org.openqa.selenium.phantomjs.PhantomJSDriver;
|
||||||
|
import org.openqa.selenium.phantomjs.PhantomJSDriverService;
|
||||||
|
import org.openqa.selenium.remote.DesiredCapabilities;
|
||||||
|
import org.openqa.selenium.remote.RemoteWebDriver;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
* Date: 13-7-26 <br>
|
* Date: 13-7-26 <br>
|
||||||
|
@ -73,7 +75,6 @@ class WebDriverPool {
|
||||||
|
|
||||||
// Prepare capabilities
|
// Prepare capabilities
|
||||||
sCaps = new DesiredCapabilities();
|
sCaps = new DesiredCapabilities();
|
||||||
sCaps.setJavascriptEnabled(true);
|
|
||||||
sCaps.setCapability("takesScreenshot", false);
|
sCaps.setCapability("takesScreenshot", false);
|
||||||
|
|
||||||
String driver = sConfig.getProperty("driver", DRIVER_PHANTOMJS);
|
String driver = sConfig.getProperty("driver", DRIVER_PHANTOMJS);
|
||||||
|
@ -134,9 +135,9 @@ class WebDriverPool {
|
||||||
sCaps.setBrowserName("phantomjs");
|
sCaps.setBrowserName("phantomjs");
|
||||||
mDriver = new RemoteWebDriver(new URL(driver), sCaps);
|
mDriver = new RemoteWebDriver(new URL(driver), sCaps);
|
||||||
} else if (driver.equals(DRIVER_FIREFOX)) {
|
} else if (driver.equals(DRIVER_FIREFOX)) {
|
||||||
mDriver = new FirefoxDriver(sCaps);
|
mDriver = new FirefoxDriver(new FirefoxOptions(sCaps));
|
||||||
} else if (driver.equals(DRIVER_CHROME)) {
|
} else if (driver.equals(DRIVER_CHROME)) {
|
||||||
mDriver = new ChromeDriver(sCaps);
|
mDriver = new ChromeDriver(new ChromeOptions().merge(sCaps));
|
||||||
} else if (driver.equals(DRIVER_PHANTOMJS)) {
|
} else if (driver.equals(DRIVER_PHANTOMJS)) {
|
||||||
mDriver = new PhantomJSDriver(sCaps);
|
mDriver = new PhantomJSDriver(sCaps);
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,17 +1,18 @@
|
||||||
package us.codecraft.webmagic.downloader;
|
package us.codecraft.webmagic.downloader;
|
||||||
|
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
import org.junit.Ignore;
|
import org.junit.Ignore;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
import org.openqa.selenium.By;
|
import org.openqa.selenium.By;
|
||||||
import org.openqa.selenium.WebDriver;
|
import org.openqa.selenium.WebDriver;
|
||||||
import org.openqa.selenium.WebElement;
|
import org.openqa.selenium.WebElement;
|
||||||
import org.openqa.selenium.chrome.ChromeDriver;
|
import org.openqa.selenium.chrome.ChromeDriver;
|
||||||
|
import org.openqa.selenium.chrome.ChromeOptions;
|
||||||
import org.openqa.selenium.remote.DesiredCapabilities;
|
import org.openqa.selenium.remote.DesiredCapabilities;
|
||||||
|
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.HashMap;
|
|
||||||
import java.util.Map;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
* Date: 13-7-26 <br>
|
* Date: 13-7-26 <br>
|
||||||
|
@ -29,10 +30,10 @@ public class SeleniumTest {
|
||||||
Map<String, Object> preferences = new HashMap<String, Object>();
|
Map<String, Object> preferences = new HashMap<String, Object>();
|
||||||
preferences.put("profile.default_content_settings", contentSettings);
|
preferences.put("profile.default_content_settings", contentSettings);
|
||||||
|
|
||||||
DesiredCapabilities caps = DesiredCapabilities.chrome();
|
DesiredCapabilities caps = new DesiredCapabilities();
|
||||||
caps.setCapability("chrome.prefs", preferences);
|
caps.setCapability("chrome.prefs", preferences);
|
||||||
caps.setCapability("chrome.switches", Arrays.asList("--user-data-dir=/Users/yihua/temp/chrome"));
|
caps.setCapability("chrome.switches", Arrays.asList("--user-data-dir=/Users/yihua/temp/chrome"));
|
||||||
WebDriver webDriver = new ChromeDriver(caps);
|
WebDriver webDriver = new ChromeDriver(new ChromeOptions().merge(caps));
|
||||||
webDriver.get("http://huaban.com/");
|
webDriver.get("http://huaban.com/");
|
||||||
WebElement webElement = webDriver.findElement(By.xpath("/html"));
|
WebElement webElement = webDriver.findElement(By.xpath("/html"));
|
||||||
System.out.println(webElement.getAttribute("outerHTML"));
|
System.out.println(webElement.getAttribute("outerHTML"));
|
||||||
|
|
Loading…
Reference in New Issue