Merge branch 'release/1.0.0'
commit
9d75cce16d
|
@ -1,9 +1,77 @@
|
|||
target
|
||||
*.iml
|
||||
out/
|
||||
.idea
|
||||
.classpath
|
||||
target/
|
||||
pom.xml.tag
|
||||
pom.xml.releaseBackup
|
||||
pom.xml.versionsBackup
|
||||
pom.xml.next
|
||||
release.properties
|
||||
dependency-reduced-pom.xml
|
||||
buildNumber.properties
|
||||
.mvn/timing.properties
|
||||
# https://github.com/takari/maven-wrapper#usage-without-binary-jar
|
||||
.mvn/wrapper/maven-wrapper.jar
|
||||
|
||||
# Eclipse m2e generated files
|
||||
# Eclipse Core
|
||||
.project
|
||||
.settings/
|
||||
# JDT-specific (Eclipse Java Development Tools)
|
||||
.classpath
|
||||
.metadata
|
||||
bin/
|
||||
.myeclipse
|
||||
tmp/
|
||||
*.tmp
|
||||
*.bak
|
||||
*.swp
|
||||
*~.nib
|
||||
local.properties
|
||||
.settings/
|
||||
.loadpath
|
||||
.recommenders
|
||||
|
||||
# External tool builders
|
||||
.externalToolBuilders/
|
||||
|
||||
# Locally stored "Eclipse launch configurations"
|
||||
*.launch
|
||||
|
||||
# PyDev specific (Python IDE for Eclipse)
|
||||
*.pydevproject
|
||||
|
||||
# CDT-specific (C/C++ Development Tooling)
|
||||
.cproject
|
||||
|
||||
# CDT- autotools
|
||||
.autotools
|
||||
|
||||
# Java annotation processor (APT)
|
||||
.factorypath
|
||||
|
||||
# PDT-specific (PHP Development Tools)
|
||||
.buildpath
|
||||
|
||||
# sbteclipse plugin
|
||||
.target
|
||||
|
||||
# Tern plugin
|
||||
.tern-project
|
||||
|
||||
# TeXlipse plugin
|
||||
.texlipse
|
||||
|
||||
# STS (Spring Tool Suite)
|
||||
.springBeans
|
||||
|
||||
# Code Recommenders
|
||||
.recommenders/
|
||||
|
||||
# Annotation Processing
|
||||
.apt_generated/
|
||||
.apt_generated_test/
|
||||
|
||||
# Scala IDE specific (Scala & Java development for Eclipse)
|
||||
.cache-main
|
||||
.scala_dependencies
|
||||
.worksheet
|
||||
|
||||
# Uncomment this line if you wish to ignore the project description file.
|
||||
# Typically, this file would be tracked if it contains build/dependency configurations:
|
||||
#.project
|
||||
|
|
370
pom.xml
370
pom.xml
|
@ -1,14 +1,24 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<groupId>us.codecraft</groupId>
|
||||
<version>0.10.3</version>
|
||||
<project
|
||||
xmlns="http://maven.apache.org/POM/4.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="
|
||||
http://maven.apache.org/POM/4.0.0
|
||||
http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
<parent>
|
||||
<groupId>org.oxerr</groupId>
|
||||
<artifactId>oxerr-parent</artifactId>
|
||||
<version>2.2.1</version>
|
||||
</parent>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<version>1.0.0</version>
|
||||
<packaging>pom</packaging>
|
||||
<properties>
|
||||
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
||||
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
|
||||
<maven.compiler.source>1.8</maven.compiler.source>
|
||||
<maven.compiler.target>1.8</maven.compiler.target>
|
||||
<maven.compiler.source>11</maven.compiler.source>
|
||||
<maven.compiler.target>11</maven.compiler.target>
|
||||
<assertj.version>3.23.1</assertj.version>
|
||||
<commons-cli.version>1.5.0</commons-cli.version>
|
||||
<commons-collections4.version>4.4</commons-collections4.version>
|
||||
|
@ -23,20 +33,21 @@
|
|||
<jedis.version>3.7.1</jedis.version>
|
||||
<jruby.version>9.3.9.0</jruby.version>
|
||||
<json-path.version>2.9.0</json-path.version>
|
||||
<junit.version>4.13.2</junit.version>
|
||||
<junit.version>5.10.2</junit.version>
|
||||
<junit.platform.version>1.10.2</junit.platform.version>
|
||||
<jython.version>2.7.3</jython.version>
|
||||
<log4j.version>1.2.17</log4j.version>
|
||||
<log4j2.version>2.23.1</log4j2.version>
|
||||
<mockito-all.version>2.0.2-beta</mockito-all.version>
|
||||
<moco.version>1.3.0</moco.version>
|
||||
<phantomjsdriver.version>1.2.0</phantomjsdriver.version>
|
||||
<saxon-he.version>11.4</saxon-he.version>
|
||||
<selenium-java.version>3.141.59</selenium-java.version>
|
||||
<saxon-he.version>12.4</saxon-he.version>
|
||||
<selenium-java.version>4.14.1</selenium-java.version>
|
||||
<slf4j.version>2.0.4</slf4j.version>
|
||||
<spring-version>4.0.0.RELEASE</spring-version>
|
||||
<xsoup.version>0.3.5</xsoup.version>
|
||||
</properties>
|
||||
<artifactId>webmagic-parent</artifactId>
|
||||
<name>webmagic-parent</name>
|
||||
<artifactId>webmagic</artifactId>
|
||||
<name>webmagic</name>
|
||||
<description>
|
||||
A crawler framework. It covers the whole lifecycle of a crawler: downloading, URL management, content
|
||||
extraction and persistence. It can simplify the development of a specific crawler.
|
||||
|
@ -77,14 +88,41 @@
|
|||
<module>webmagic-coverage</module>
|
||||
</modules>
|
||||
|
||||
<dependencyManagement>
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>junit</groupId>
|
||||
<artifactId>junit</artifactId>
|
||||
<version>${junit.version}</version>
|
||||
<groupId>org.apache.logging.log4j</groupId>
|
||||
<artifactId>log4j-core</artifactId>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.logging.log4j</groupId>
|
||||
<artifactId>log4j-slf4j2-impl</artifactId>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.junit.jupiter</groupId>
|
||||
<artifactId>junit-jupiter-engine</artifactId>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.junit.vintage</groupId>
|
||||
<artifactId>junit-vintage-engine</artifactId>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.junit.platform</groupId>
|
||||
<artifactId>junit-platform-launcher</artifactId>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.junit.platform</groupId>
|
||||
<artifactId>junit-platform-runner</artifactId>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
|
||||
<dependencyManagement>
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>org.mockito</groupId>
|
||||
<artifactId>mockito-all</artifactId>
|
||||
|
@ -101,6 +139,16 @@
|
|||
<artifactId>httpcore</artifactId>
|
||||
<version>${httpcore.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.logging.log4j</groupId>
|
||||
<artifactId>log4j-core</artifactId>
|
||||
<version>${log4j2.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.logging.log4j</groupId>
|
||||
<artifactId>log4j-slf4j2-impl</artifactId>
|
||||
<version>${log4j2.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.google.guava</groupId>
|
||||
<artifactId>guava</artifactId>
|
||||
|
@ -112,13 +160,28 @@
|
|||
<version>${json-path.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.slf4j</groupId>
|
||||
<artifactId>slf4j-api</artifactId>
|
||||
<version>${slf4j.version}</version>
|
||||
<groupId>org.junit.jupiter</groupId>
|
||||
<artifactId>junit-jupiter-engine</artifactId>
|
||||
<version>${junit.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.junit.vintage</groupId>
|
||||
<artifactId>junit-vintage-engine</artifactId>
|
||||
<version>${junit.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.junit.platform</groupId>
|
||||
<artifactId>junit-platform-launcher</artifactId>
|
||||
<version>${junit.platform.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.junit.platform</groupId>
|
||||
<artifactId>junit-platform-runner</artifactId>
|
||||
<version>${junit.platform.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.slf4j</groupId>
|
||||
<artifactId>slf4j-log4j12</artifactId>
|
||||
<artifactId>slf4j-api</artifactId>
|
||||
<version>${slf4j.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
|
@ -143,11 +206,6 @@
|
|||
</exclusion>
|
||||
</exclusions>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>log4j</groupId>
|
||||
<artifactId>log4j</artifactId>
|
||||
<version>${log4j.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.assertj</groupId>
|
||||
<artifactId>assertj-core</artifactId>
|
||||
|
@ -219,86 +277,10 @@
|
|||
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-enforcer-plugin</artifactId>
|
||||
<version>3.1.0</version>
|
||||
<executions>
|
||||
<execution>
|
||||
<id>enforce-maven</id>
|
||||
<goals>
|
||||
<goal>enforce</goal>
|
||||
</goals>
|
||||
<configuration>
|
||||
<rules>
|
||||
<requireMavenVersion>
|
||||
<version>3.5.0</version>
|
||||
</requireMavenVersion>
|
||||
</rules>
|
||||
</configuration>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-surefire-plugin</artifactId>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-compiler-plugin</artifactId>
|
||||
</plugin>
|
||||
<!--<plugin>-->
|
||||
<!--<groupId>org.apache.maven.plugins</groupId>-->
|
||||
<!--<artifactId>maven-dependency-plugin</artifactId>-->
|
||||
<!--<version>2.8</version>-->
|
||||
<!--<executions>-->
|
||||
<!--<execution>-->
|
||||
<!--<id>copy-dependencies</id>-->
|
||||
<!--<phase>package</phase>-->
|
||||
<!--<goals>-->
|
||||
<!--<goal>copy-dependencies</goal>-->
|
||||
<!--</goals>-->
|
||||
<!--<configuration>-->
|
||||
<!--<outputDirectory>${project.build.directory}/lib</outputDirectory>-->
|
||||
<!--<overWriteReleases>false</overWriteReleases>-->
|
||||
<!--<overWriteSnapshots>false</overWriteSnapshots>-->
|
||||
<!--<overWriteIfNewer>true</overWriteIfNewer>-->
|
||||
<!--</configuration>-->
|
||||
<!--</execution>-->
|
||||
<!--</executions>-->
|
||||
<!--</plugin>-->
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-resources-plugin</artifactId>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-jar-plugin</artifactId>
|
||||
<configuration>
|
||||
<excludes>
|
||||
<exclude>log4j.xml</exclude>
|
||||
</excludes>
|
||||
</configuration>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-source-plugin</artifactId>
|
||||
<version>3.2.1</version>
|
||||
<executions>
|
||||
<execution>
|
||||
<id>attach-sources</id>
|
||||
<goals>
|
||||
<goal>jar</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-javadoc-plugin</artifactId>
|
||||
<version>3.4.1</version>
|
||||
<configuration>
|
||||
<encoding>UTF-8</encoding>
|
||||
<doctitle>WebMagic ${project.version}</doctitle>
|
||||
<locale>en_US</locale>
|
||||
|
||||
|
@ -322,11 +304,6 @@
|
|||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-release-plugin</artifactId>
|
||||
<version>3.0.0-M6</version>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.jacoco</groupId>
|
||||
<artifactId>jacoco-maven-plugin</artifactId>
|
||||
|
@ -355,189 +332,6 @@
|
|||
</configuration>
|
||||
</plugin>
|
||||
</plugins>
|
||||
<pluginManagement>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-clean-plugin</artifactId>
|
||||
<version>3.2.0</version>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-compiler-plugin</artifactId>
|
||||
<version>3.10.1</version>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-deploy-plugin</artifactId>
|
||||
<version>3.0.0</version>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-install-plugin</artifactId>
|
||||
<version>3.0.1</version>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-jar-plugin</artifactId>
|
||||
<version>3.3.0</version>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-jxr-plugin</artifactId>
|
||||
<version>3.3.0</version>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-pmd-plugin</artifactId>
|
||||
<version>3.19.0</version>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-resources-plugin</artifactId>
|
||||
<version>3.3.0</version>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-site-plugin</artifactId>
|
||||
<version>4.0.0-M3</version>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-surefire-plugin</artifactId>
|
||||
<version>3.0.0-M7</version>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-surefire-report-plugin</artifactId>
|
||||
<version>3.0.0-M7</version>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.codehaus.mojo</groupId>
|
||||
<artifactId>taglist-maven-plugin</artifactId>
|
||||
<version>3.0.0</version>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.jacoco</groupId>
|
||||
<artifactId>jacoco-maven-plugin</artifactId>
|
||||
<version>0.8.8</version>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>com.amashchenko.maven.plugin</groupId>
|
||||
<artifactId>gitflow-maven-plugin</artifactId>
|
||||
<version>1.18.0</version>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>com.github.spotbugs</groupId>
|
||||
<artifactId>spotbugs-maven-plugin</artifactId>
|
||||
<version>4.7.2.0</version>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</pluginManagement>
|
||||
</build>
|
||||
|
||||
<reporting>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-javadoc-plugin</artifactId>
|
||||
<configuration>
|
||||
<doclint>none</doclint>
|
||||
</configuration>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-jxr-plugin</artifactId>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-pmd-plugin</artifactId>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-surefire-report-plugin</artifactId>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.codehaus.mojo</groupId>
|
||||
<artifactId>taglist-maven-plugin</artifactId>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>com.github.spotbugs</groupId>
|
||||
<artifactId>spotbugs-maven-plugin</artifactId>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</reporting>
|
||||
|
||||
<profiles>
|
||||
<profile>
|
||||
<id>release</id>
|
||||
<build>
|
||||
<plugins>
|
||||
<!-- Source -->
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-source-plugin</artifactId>
|
||||
<version>3.2.1</version>
|
||||
<executions>
|
||||
<execution>
|
||||
<phase>package</phase>
|
||||
<goals>
|
||||
<goal>jar-no-fork</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
<!-- Javadoc -->
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-javadoc-plugin</artifactId>
|
||||
<version>3.4.1</version>
|
||||
<executions>
|
||||
<execution>
|
||||
<phase>package</phase>
|
||||
<goals>
|
||||
<goal>jar</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
<!-- GPG -->
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-gpg-plugin</artifactId>
|
||||
<version>3.0.1</version>
|
||||
<executions>
|
||||
<execution>
|
||||
<phase>verify</phase>
|
||||
<goals>
|
||||
<goal>sign</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.sonatype.plugins</groupId>
|
||||
<artifactId>nexus-staging-maven-plugin</artifactId>
|
||||
<version>1.6.13</version>
|
||||
<extensions>true</extensions>
|
||||
<configuration>
|
||||
<serverId>sonatype-nexus-staging</serverId>
|
||||
<nexusUrl>https://oss.sonatype.org/</nexusUrl>
|
||||
<autoReleaseAfterClose>true</autoReleaseAfterClose>
|
||||
</configuration>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
<distributionManagement>
|
||||
<snapshotRepository>
|
||||
<id>sonatype-nexus-snapshots</id>
|
||||
<url>https://oss.sonatype.org/content/repositories/snapshots/</url>
|
||||
</snapshotRepository>
|
||||
<repository>
|
||||
<id>sonatype-nexus-staging</id>
|
||||
<url>https://oss.sonatype.org/service/local/staging/deploy/maven2/</url>
|
||||
</repository>
|
||||
</distributionManagement>
|
||||
</profile>
|
||||
</profiles>
|
||||
</project>
|
||||
|
|
|
@ -1,9 +1,14 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<project
|
||||
xmlns="http://maven.apache.org/POM/4.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="
|
||||
http://maven.apache.org/POM/4.0.0
|
||||
http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<parent>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>webmagic-parent</artifactId>
|
||||
<version>0.10.3</version>
|
||||
<artifactId>webmagic</artifactId>
|
||||
<version>1.0.0</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
@ -15,11 +20,6 @@
|
|||
<artifactId>httpclient</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>junit</groupId>
|
||||
<artifactId>junit</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.commons</groupId>
|
||||
<artifactId>commons-lang3</artifactId>
|
||||
|
@ -45,12 +45,6 @@
|
|||
<artifactId>mockito-all</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.slf4j</groupId>
|
||||
<artifactId>slf4j-log4j12</artifactId>
|
||||
<optional>true</optional>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.commons</groupId>
|
||||
<artifactId>commons-collections4</artifactId>
|
||||
|
|
|
@ -71,6 +71,7 @@ public class Page {
|
|||
* Returns a {@link Page} whose {@link #downloadSuccess} is {@code false},
|
||||
* and {@link #request} is specified.
|
||||
*
|
||||
* @param request the {@link Request}.
|
||||
* @return the page.
|
||||
* @since 0.10.0
|
||||
*/
|
||||
|
|
|
@ -1,13 +1,14 @@
|
|||
package us.codecraft.webmagic;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import us.codecraft.webmagic.downloader.Downloader;
|
||||
import us.codecraft.webmagic.model.HttpRequestBody;
|
||||
import us.codecraft.webmagic.utils.Experimental;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Object contains url to crawl.<br>
|
||||
* It contains some additional information.<br>
|
||||
|
@ -35,7 +36,7 @@ public class Request implements Serializable {
|
|||
/**
|
||||
* Store additional information in extras.
|
||||
*/
|
||||
private Map<String, Object> extras;
|
||||
private Map<String, Object> extras = new HashMap<>();
|
||||
|
||||
/**
|
||||
* cookies for current url, if not set use Site's cookies
|
||||
|
@ -93,9 +94,6 @@ public class Request implements Serializable {
|
|||
}
|
||||
|
||||
public <T> Request putExtra(String key, T value) {
|
||||
if (extras == null) {
|
||||
extras = new HashMap<String, Object>();
|
||||
}
|
||||
extras.put(key, value);
|
||||
return this;
|
||||
}
|
||||
|
@ -105,11 +103,11 @@ public class Request implements Serializable {
|
|||
}
|
||||
|
||||
public Map<String, Object> getExtras() {
|
||||
return extras;
|
||||
return Collections.unmodifiableMap(extras);
|
||||
}
|
||||
|
||||
public Request setExtras(Map<String, Object> extras) {
|
||||
this.extras = extras;
|
||||
this.extras.putAll(extras);
|
||||
return this;
|
||||
}
|
||||
|
||||
|
|
|
@ -9,11 +9,8 @@ import java.util.Date;
|
|||
import java.util.List;
|
||||
import java.util.UUID;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
import java.util.concurrent.atomic.AtomicLong;
|
||||
import java.util.concurrent.locks.Condition;
|
||||
import java.util.concurrent.locks.ReentrantLock;
|
||||
import org.apache.commons.collections4.CollectionUtils;
|
||||
import org.apache.commons.lang3.SerializationUtils;
|
||||
import org.slf4j.Logger;
|
||||
|
@ -76,7 +73,7 @@ public class Spider implements Runnable, Task {
|
|||
|
||||
protected String uuid;
|
||||
|
||||
protected Scheduler scheduler = new QueueScheduler();
|
||||
protected SpiderScheduler scheduler;
|
||||
|
||||
protected Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
|
@ -88,7 +85,7 @@ public class Spider implements Runnable, Task {
|
|||
|
||||
protected AtomicInteger stat = new AtomicInteger(STAT_INIT);
|
||||
|
||||
protected boolean exitWhenComplete = true;
|
||||
protected volatile boolean exitWhenComplete = true;
|
||||
|
||||
protected final static int STAT_INIT = 0;
|
||||
|
||||
|
@ -100,10 +97,6 @@ public class Spider implements Runnable, Task {
|
|||
|
||||
protected boolean destroyWhenExit = true;
|
||||
|
||||
private ReentrantLock newUrlLock = new ReentrantLock();
|
||||
|
||||
private Condition newUrlCondition = newUrlLock.newCondition();
|
||||
|
||||
private List<SpiderListener> spiderListeners;
|
||||
|
||||
private final AtomicLong pageCount = new AtomicLong(0);
|
||||
|
@ -131,6 +124,7 @@ public class Spider implements Runnable, Task {
|
|||
public Spider(PageProcessor pageProcessor) {
|
||||
this.pageProcessor = pageProcessor;
|
||||
this.site = pageProcessor.getSite();
|
||||
this.scheduler = new SpiderScheduler(new QueueScheduler());
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -186,15 +180,15 @@ public class Spider implements Runnable, Task {
|
|||
/**
|
||||
* set scheduler for Spider
|
||||
*
|
||||
* @param scheduler scheduler
|
||||
* @param updateScheduler scheduler
|
||||
* @return this
|
||||
* @see Scheduler
|
||||
* @since 0.2.1
|
||||
*/
|
||||
public Spider setScheduler(Scheduler scheduler) {
|
||||
public Spider setScheduler(Scheduler updateScheduler) {
|
||||
checkIfRunning();
|
||||
Scheduler oldScheduler = this.scheduler;
|
||||
this.scheduler = scheduler;
|
||||
SpiderScheduler oldScheduler = this.scheduler;
|
||||
scheduler.setScheduler(updateScheduler);
|
||||
if (oldScheduler != null) {
|
||||
Request request;
|
||||
while ((request = oldScheduler.poll(this)) != null) {
|
||||
|
@ -333,8 +327,8 @@ public class Spider implements Runnable, Task {
|
|||
}
|
||||
} else {
|
||||
// wait until new url added,
|
||||
if (waitNewUrl()) {
|
||||
//if interrupted
|
||||
if (scheduler.waitNewUrl(threadPool, emptySleepTime)) {
|
||||
// if interrupted
|
||||
break;
|
||||
}
|
||||
continue;
|
||||
|
@ -353,7 +347,7 @@ public class Spider implements Runnable, Task {
|
|||
logger.error("process request " + request + " error", e);
|
||||
} finally {
|
||||
pageCount.incrementAndGet();
|
||||
signalNewUrl();
|
||||
scheduler.signalNewUrl();
|
||||
}
|
||||
}
|
||||
});
|
||||
|
@ -536,7 +530,7 @@ public class Spider implements Runnable, Task {
|
|||
for (String url : urls) {
|
||||
addRequest(new Request(url));
|
||||
}
|
||||
signalNewUrl();
|
||||
scheduler.signalNewUrl();
|
||||
return this;
|
||||
}
|
||||
|
||||
|
@ -588,42 +582,10 @@ public class Spider implements Runnable, Task {
|
|||
for (Request request : requests) {
|
||||
addRequest(request);
|
||||
}
|
||||
signalNewUrl();
|
||||
scheduler.signalNewUrl();
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @return isInterrupted
|
||||
*/
|
||||
private boolean waitNewUrl() {
|
||||
// now there may not be any thread live
|
||||
newUrlLock.lock();
|
||||
try {
|
||||
//double check,unnecessary, unless very fast concurrent
|
||||
if (threadPool.getThreadAlive() == 0) {
|
||||
return false;
|
||||
}
|
||||
//wait for amount of time
|
||||
newUrlCondition.await(emptySleepTime, TimeUnit.MILLISECONDS);
|
||||
return false;
|
||||
} catch (InterruptedException e) {
|
||||
// logger.warn("waitNewUrl - interrupted, error {}", e);
|
||||
return true;
|
||||
} finally {
|
||||
newUrlLock.unlock();
|
||||
}
|
||||
}
|
||||
|
||||
private void signalNewUrl() {
|
||||
try {
|
||||
newUrlLock.lock();
|
||||
newUrlCondition.signalAll();
|
||||
} finally {
|
||||
newUrlLock.unlock();
|
||||
}
|
||||
}
|
||||
|
||||
public void start() {
|
||||
runAsync();
|
||||
}
|
||||
|
@ -636,6 +598,13 @@ public class Spider implements Runnable, Task {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Stop when all tasks in the queue are completed and all worker threads are also completed
|
||||
*/
|
||||
public void stopWhenComplete(){
|
||||
this.exitWhenComplete = true;
|
||||
}
|
||||
|
||||
/**
|
||||
* start with more than one threads
|
||||
*
|
||||
|
@ -799,7 +768,7 @@ public class Spider implements Runnable, Task {
|
|||
}
|
||||
|
||||
public Scheduler getScheduler() {
|
||||
return scheduler;
|
||||
return scheduler.getScheduler();
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -0,0 +1,59 @@
|
|||
package us.codecraft.webmagic;
|
||||
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.concurrent.locks.Condition;
|
||||
import java.util.concurrent.locks.ReentrantLock;
|
||||
|
||||
import us.codecraft.webmagic.scheduler.Scheduler;
|
||||
import us.codecraft.webmagic.thread.CountableThreadPool;
|
||||
|
||||
public class SpiderScheduler {
|
||||
private Scheduler scheduler;
|
||||
private final ReentrantLock newUrlLock = new ReentrantLock();
|
||||
private final Condition newUrlCondition = newUrlLock.newCondition();
|
||||
|
||||
public SpiderScheduler(Scheduler scheduler) {
|
||||
this.scheduler = scheduler;
|
||||
}
|
||||
|
||||
public Scheduler getScheduler() {
|
||||
return scheduler;
|
||||
}
|
||||
|
||||
public void setScheduler(Scheduler scheduler) {
|
||||
this.scheduler = scheduler;
|
||||
}
|
||||
|
||||
public Request poll(Spider spider) {
|
||||
return scheduler.poll(spider);
|
||||
}
|
||||
|
||||
public void push(Request request, Spider spider) {
|
||||
scheduler.push(request, spider);
|
||||
}
|
||||
|
||||
public boolean waitNewUrl(CountableThreadPool threadPool, long emptySleepTime) {
|
||||
newUrlLock.lock();
|
||||
try {
|
||||
if (threadPool.getThreadAlive() == 0) {
|
||||
return false;
|
||||
}
|
||||
newUrlCondition.await(emptySleepTime, TimeUnit.MILLISECONDS);
|
||||
return false;
|
||||
} catch (InterruptedException e) {
|
||||
return true;
|
||||
} finally {
|
||||
newUrlLock.unlock();
|
||||
}
|
||||
}
|
||||
|
||||
public void signalNewUrl() {
|
||||
try {
|
||||
newUrlLock.lock();
|
||||
newUrlCondition.signalAll();
|
||||
} finally {
|
||||
newUrlLock.unlock();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@ -42,7 +42,9 @@ public class HttpUriRequestConverter {
|
|||
HttpClientContext httpContext = new HttpClientContext();
|
||||
if (proxy != null && proxy.getUsername() != null) {
|
||||
AuthState authState = new AuthState();
|
||||
authState.update(new BasicScheme(ChallengeState.PROXY), new UsernamePasswordCredentials(proxy.getUsername(), proxy.getPassword()));
|
||||
BasicScheme proxyAuthScheme = new BasicScheme(ChallengeState.PROXY);
|
||||
UsernamePasswordCredentials proxyCredentials = new UsernamePasswordCredentials(proxy.getUsername(), proxy.getPassword());
|
||||
authState.update(proxyAuthScheme, proxyCredentials);
|
||||
httpContext.setAttribute(HttpClientContext.PROXY_AUTH_STATE, authState);
|
||||
}
|
||||
if (request.getCookies() != null && !request.getCookies().isEmpty()) {
|
||||
|
|
|
@ -26,7 +26,6 @@ public class HtmlNode extends AbstractSelectable {
|
|||
return elements;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Selectable smartContent() {
|
||||
SmartContentSelector smartContentSelector = Selectors.smartContent();
|
||||
return select(smartContentSelector, getSourceTexts());
|
||||
|
|
|
@ -42,11 +42,6 @@ public class PlainText extends AbstractSelectable {
|
|||
throw new UnsupportedOperationException("$ can not apply to plain text. Please check whether you use a previous xpath with attribute select (/@href etc).");
|
||||
}
|
||||
|
||||
@Override
|
||||
public Selectable smartContent() {
|
||||
throw new UnsupportedOperationException("Smart content can not apply to plain text. Please check whether you use a previous xpath with attribute select (/@href etc).");
|
||||
}
|
||||
|
||||
@Override
|
||||
public Selectable links() {
|
||||
throw new UnsupportedOperationException("Links can not apply to plain text. Please check whether you use a previous xpath with attribute select (/@href etc).");
|
||||
|
|
|
@ -51,14 +51,6 @@ public interface Selectable {
|
|||
* @return new Selectable after extract
|
||||
*/
|
||||
public Selectable css(String selector, String attrName);
|
||||
|
||||
/**
|
||||
* select smart content with ReadAbility algorithm
|
||||
*
|
||||
* @return content
|
||||
*/
|
||||
public Selectable smartContent();
|
||||
|
||||
/**
|
||||
* select all links
|
||||
*
|
||||
|
|
|
@ -21,6 +21,10 @@ public abstract class CharsetUtils {
|
|||
|
||||
private static Logger logger = LoggerFactory.getLogger(CharsetUtils.class);
|
||||
|
||||
private CharsetUtils() {
|
||||
throw new AssertionError("No us.codecraft.webmagic.utils.CharsetUtils instances for you!");
|
||||
}
|
||||
|
||||
public static String detectCharset(String contentType, byte[] contentBytes) throws IOException {
|
||||
String charset;
|
||||
// charset
|
||||
|
|
|
@ -116,6 +116,10 @@ public class UrlUtils {
|
|||
private static final Pattern patternForCharset = Pattern.compile("charset\\s*=\\s*['\"]*([^\\s;'\"]*)", Pattern.CASE_INSENSITIVE);
|
||||
|
||||
public static String getCharset(String contentType) {
|
||||
if (contentType == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
Matcher matcher = patternForCharset.matcher(contentType);
|
||||
if (matcher.find()) {
|
||||
String charset = matcher.group(1);
|
||||
|
|
|
@ -1,21 +0,0 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE log4j:configuration SYSTEM "log4j.dtd">
|
||||
<log4j:configuration xmlns:log4j="http://jakarta.apache.org/log4j/">
|
||||
|
||||
<appender name="stdout" class="org.apache.log4j.ConsoleAppender">
|
||||
<layout class="org.apache.log4j.PatternLayout">
|
||||
<param name="ConversionPattern" value="%d{yy-MM-dd HH:mm:ss,SSS} %-5p %c(%F:%L) ## %m%n" />
|
||||
</layout>
|
||||
</appender>
|
||||
|
||||
<logger name="org.apache" additivity="false">
|
||||
<level value="warn" />
|
||||
<appender-ref ref="stdout" />
|
||||
</logger>
|
||||
|
||||
<root>
|
||||
<level value="info" />
|
||||
<appender-ref ref="stdout" />
|
||||
</root>
|
||||
|
||||
</log4j:configuration>
|
|
@ -1,10 +1,14 @@
|
|||
package us.codecraft.webmagic;
|
||||
|
||||
import org.junit.Test;
|
||||
import us.codecraft.webmagic.utils.HttpConstant;
|
||||
|
||||
import static org.assertj.core.api.Assertions.assertThat;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.Map;
|
||||
|
||||
import org.junit.Test;
|
||||
|
||||
import us.codecraft.webmagic.utils.HttpConstant;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com
|
||||
* Date: 17/3/11
|
||||
|
@ -22,4 +26,28 @@ public class RequestTest {
|
|||
assertThat(requestA).isNotEqualTo(requestB);
|
||||
assertThat(requestA.hashCode()).isNotEqualTo(requestB.hashCode());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSetExtras() {
|
||||
Request request = new Request();
|
||||
Map<String, Object> extras = Collections.singletonMap("a", "1");
|
||||
request.setExtras(extras);
|
||||
request.putExtra("b", "2");
|
||||
assertThat(request.<String>getExtra("a")).isEqualTo("1");
|
||||
assertThat(request.<String>getExtra("b")).isEqualTo("2");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testGetExtras() {
|
||||
Request request = new Request();
|
||||
request.putExtra("a", "1");
|
||||
assertThat(request.getExtras()).containsEntry("a", "1");
|
||||
}
|
||||
|
||||
@Test(expected = UnsupportedOperationException.class)
|
||||
public void testGetExtrasShouldBeUnmodifiable() {
|
||||
Request request = new Request();
|
||||
request.getExtras().put("a", "1");
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,8 +1,12 @@
|
|||
package us.codecraft.webmagic;
|
||||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import org.junit.Test;
|
||||
|
||||
|
@ -14,4 +18,23 @@ public class SiteTest {
|
|||
assertEquals(StandardCharsets.UTF_8.name(), site.getDefaultCharset());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void addCookieTest(){
|
||||
Site site=Site.me().setDefaultCharset(StandardCharsets.UTF_8.name());
|
||||
site.addCookie("cookieDefault","cookie-webmagicDefault");
|
||||
String firstDomain="example.com";
|
||||
String secondDomain="exampleCopy.com";
|
||||
site.addCookie(firstDomain, "cookie", "cookie-webmagic");
|
||||
site.addCookie(firstDomain, "cookieCopy", "cookie-webmagicCopy");
|
||||
site.addCookie(secondDomain, "cookie", "cookie-webmagic");
|
||||
Map<String, Map<String, String>> allCookies = site.getAllCookies();
|
||||
List<String> domains=new ArrayList<>();
|
||||
for(String key : allCookies.keySet()){
|
||||
domains.add(key);
|
||||
}
|
||||
assertEquals("cookie-webmagic", allCookies.get(firstDomain).get("cookie"));
|
||||
assertEquals("cookie-webmagicCopy", allCookies.get(firstDomain).get("cookieCopy"));
|
||||
assertEquals("cookie-webmagic", allCookies.get(secondDomain).get("cookie"));
|
||||
assertEquals(2, domains.size());
|
||||
}
|
||||
}
|
||||
|
|
|
@ -40,6 +40,7 @@ import static com.github.dreamhead.moco.Moco.uri;
|
|||
import static com.github.dreamhead.moco.Moco.with;
|
||||
import static org.assertj.core.api.Assertions.assertThat;
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertThrows;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
|
||||
/**
|
||||
|
@ -333,5 +334,13 @@ public class HttpClientDownloaderTest {
|
|||
});
|
||||
}
|
||||
|
||||
@Test
|
||||
public void test_no_task_download(){
|
||||
Request request = new Request();
|
||||
request.setUrl("http://127.0.0.1:13423/");
|
||||
HttpClientDownloader httpClientDownloader = new HttpClientDownloader();
|
||||
assertThrows(NullPointerException.class, () -> httpClientDownloader.download(request,null));
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
|
@ -8,19 +8,19 @@ import java.util.ArrayList;
|
|||
import java.util.List;
|
||||
|
||||
import org.apache.http.HttpHost;
|
||||
import org.junit.BeforeClass;
|
||||
import org.junit.Test;
|
||||
import org.junit.jupiter.api.BeforeAll;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
/**
|
||||
* @author yxssfxwzy@sina.com May 30, 2014
|
||||
*
|
||||
*/
|
||||
public class ProxyTest {
|
||||
class ProxyTest {
|
||||
|
||||
private static List<String[]> httpProxyList = new ArrayList<String[]>();
|
||||
|
||||
@BeforeClass
|
||||
public static void before() {
|
||||
@BeforeAll
|
||||
static void before() {
|
||||
// String[] source = { "0.0.0.1:0", "0.0.0.2:0", "0.0.0.3:0",
|
||||
// "0.0.0.4:0" };
|
||||
String[] source = { "::0.0.0.1:0", "::0.0.0.2:0", "::0.0.0.3:0", "::0.0.0.4:0" };
|
||||
|
@ -48,7 +48,7 @@ public class ProxyTest {
|
|||
}
|
||||
|
||||
@Test
|
||||
public void testCreate() {
|
||||
void testCreate() {
|
||||
Proxy proxy = Proxy.create(URI.create("//127.0.0.1:8080"));
|
||||
assertNull(proxy.getScheme());
|
||||
assertNull(proxy.getUsername());
|
||||
|
@ -86,7 +86,15 @@ public class ProxyTest {
|
|||
}
|
||||
|
||||
@Test
|
||||
public void testToString() {
|
||||
void testEqualsHashCode() {
|
||||
var proxy0 = new Proxy("::1", 1080);
|
||||
var proxy1 = new Proxy("::1", 1080);
|
||||
assertEquals(proxy0, proxy1);
|
||||
assertEquals(proxy0.hashCode(), proxy1.hashCode());
|
||||
}
|
||||
|
||||
@Test
|
||||
void testToString() {
|
||||
assertEquals("//127.0.0.1:8080", new Proxy("127.0.0.1", 8080).toString());
|
||||
assertEquals("http://127.0.0.1:8080", new Proxy("127.0.0.1", 8080, "http").toString());
|
||||
assertEquals("//username:password@127.0.0.1:8080", new Proxy("127.0.0.1", 8080, "username", "password").toString());
|
||||
|
|
|
@ -0,0 +1,59 @@
|
|||
package us.codecraft.webmagic.selector;
|
||||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.junit.Test;
|
||||
|
||||
public class AndSelectorTest {
|
||||
|
||||
@Test
|
||||
public void testSelectList() {
|
||||
String htmlContent = "<!DOCTYPE html>\n" +
|
||||
"<html lang=\"en\">\n" +
|
||||
"<head>\n" +
|
||||
" <meta charset=\"UTF-8\">\n" +
|
||||
" <meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\">\n" +
|
||||
" <title>HTML with XPath</title>\n" +
|
||||
"</head>\n" +
|
||||
"<body>\n" +
|
||||
" <div class=\"container\">\n" +
|
||||
" <div class=\"item1\">Item 1</div>\n" +
|
||||
" <div class=\"item2\">Item 2</div>\n" +
|
||||
" </div>\n" +
|
||||
"</body>\n" +
|
||||
"</html>";
|
||||
List<Selector> selectors = new ArrayList<Selector>();
|
||||
selectors.add(new CssSelector("div"));
|
||||
selectors.add(new XpathSelector("//div[@class='item1']"));
|
||||
AndSelector andSelector = new AndSelector(selectors);
|
||||
List<String> result = andSelector.selectList(htmlContent);
|
||||
assertEquals("<div class=\"item1\">\n Item 1\n</div>", result.get(0));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSelectList_NoResults() {
|
||||
String htmlContent = "<!DOCTYPE html>\n" +
|
||||
"<html lang=\"en\">\n" +
|
||||
"<head>\n" +
|
||||
" <meta charset=\"UTF-8\">\n" +
|
||||
" <meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\">\n" +
|
||||
" <title>HTML with XPath</title>\n" +
|
||||
"</head>\n" +
|
||||
"<body>\n" +
|
||||
" <div class=\"container\">\n" +
|
||||
" <div class=\"item1\">Item 1</div>\n" +
|
||||
" <div class=\"item2\">Item 2</div>\n" +
|
||||
" </div>\n" +
|
||||
"</body>\n" +
|
||||
"</html>";
|
||||
List<Selector> selectors = new ArrayList<Selector>();
|
||||
selectors.add(new CssSelector("div"));
|
||||
selectors.add(new XpathSelector("//div[@class='item']"));
|
||||
AndSelector andSelector = new AndSelector(selectors);
|
||||
List<String> result = andSelector.selectList(htmlContent);
|
||||
assertEquals(0, result.size());
|
||||
}
|
||||
}
|
|
@ -0,0 +1,39 @@
|
|||
package us.codecraft.webmagic.selector;
|
||||
|
||||
import org.jsoup.Jsoup;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.jsoup.nodes.Element;
|
||||
import org.jsoup.select.Elements;
|
||||
import org.junit.Test;
|
||||
import org.junit.runner.RunWith;
|
||||
import org.mockito.Mock;
|
||||
import org.mockito.Mockito;
|
||||
import org.mockito.runners.MockitoJUnitRunner;
|
||||
|
||||
import java.util.List;
|
||||
import static org.junit.Assert.*;
|
||||
|
||||
public class CssSelectorTest {
|
||||
|
||||
@Test
|
||||
public void testSelectElement() {
|
||||
CssSelector cssSelector = new CssSelector("div");
|
||||
String htmlContent = "<html><head><title>Dummy Page</title></head><body><div id=\"dummyDiv\">Hello World!</div></body></html>";
|
||||
Document doc = Jsoup.parse(htmlContent);
|
||||
Element dummyElement = doc.getElementById("dummyDiv");
|
||||
Element resultElement = cssSelector.selectElement(dummyElement);
|
||||
assertNotNull(resultElement);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSelectList() {
|
||||
CssSelector cssSelector = new CssSelector("div");
|
||||
String htmlContent = "<html><head><title>Dummy Page</title></head><body><div id=\"dummyDiv\">Hello World!</div></body></html>";
|
||||
Document doc = Jsoup.parse(htmlContent);
|
||||
Element dummyElement = doc.getElementById("dummyDiv");
|
||||
List<String> result = cssSelector.selectList(dummyElement);
|
||||
assertEquals(1, result.size());
|
||||
assertEquals("[<div id=\"dummyDiv\">\n Hello World!\n</div>]", result.toString());
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,44 @@
|
|||
package us.codecraft.webmagic.selector;
|
||||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.junit.Test;
|
||||
|
||||
public class OrSelectorTest {
|
||||
@Test
|
||||
public void testSelectList() {
|
||||
String htmlContent = "<!DOCTYPE html>\n" +
|
||||
"<html lang=\"en\">\n" +
|
||||
"<head>\n" +
|
||||
" <meta charset=\"UTF-8\">\n" +
|
||||
" <meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\">\n" +
|
||||
" <title>HTML with XPath</title>\n" +
|
||||
"</head>\n" +
|
||||
"<body>\n" +
|
||||
" <div class=\"container\">\n" +
|
||||
" <div class=\"item1\">Item 1</div>\n" +
|
||||
" <div class=\"item2\">Item 2</div>\n" +
|
||||
" </div>\n" +
|
||||
"</body>\n" +
|
||||
"</html>";
|
||||
String expectedResult = "[<head>\n" +
|
||||
" <meta charset=\"UTF-8\">\n" +
|
||||
" <meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\">\n" +
|
||||
" <title>HTML with XPath</title>\n" +
|
||||
"</head>, <div class=\"item1\">\n" +
|
||||
" Item 1\n" +
|
||||
"</div>, <div class=\"item2\">\n" +
|
||||
" Item 2\n" +
|
||||
"</div>]";
|
||||
List<Selector> selectors = new ArrayList<Selector>();
|
||||
selectors.add(new CssSelector("head"));
|
||||
selectors.add(new XpathSelector("//div[@class='item1']"));
|
||||
selectors.add(new XpathSelector("//div[@class='item2']"));
|
||||
OrSelector orSelector = new OrSelector(selectors);
|
||||
List<String> result = orSelector.selectList(htmlContent);
|
||||
assertEquals(expectedResult, result.toString());
|
||||
}
|
||||
}
|
|
@ -0,0 +1,16 @@
|
|||
package us.codecraft.webmagic.utils;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertNull;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
class CharsetUtilsTest {
|
||||
|
||||
@Test
|
||||
void testDetectCharset() throws IOException {
|
||||
assertNull(CharsetUtils.detectCharset(null, new byte[0]));
|
||||
}
|
||||
|
||||
}
|
|
@ -1,5 +1,7 @@
|
|||
package us.codecraft.webmagic.utils;
|
||||
|
||||
import static org.junit.Assert.assertNull;
|
||||
|
||||
import org.junit.Assert;
|
||||
import org.junit.Test;
|
||||
|
||||
|
@ -43,5 +45,9 @@ public class UrlUtilsTest {
|
|||
Assert.assertEquals("www.dianping.com",UrlUtils.getDomain(url));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testGetCharset() {
|
||||
assertNull(UrlUtils.getCharset(null));
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,21 +0,0 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE log4j:configuration SYSTEM "log4j.dtd">
|
||||
<log4j:configuration xmlns:log4j="http://jakarta.apache.org/log4j/">
|
||||
|
||||
<appender name="stdout" class="org.apache.log4j.ConsoleAppender">
|
||||
<layout class="org.apache.log4j.PatternLayout">
|
||||
<param name="ConversionPattern" value="%d{yy-MM-dd HH:mm:ss,SSS} %-5p %c(%F:%L) ## %m%n" />
|
||||
</layout>
|
||||
</appender>
|
||||
|
||||
<logger name="org.apache" additivity="false">
|
||||
<level value="warn" />
|
||||
<appender-ref ref="stdout" />
|
||||
</logger>
|
||||
|
||||
<root>
|
||||
<level value="info" />
|
||||
<appender-ref ref="stdout" />
|
||||
</root>
|
||||
|
||||
</log4j:configuration>
|
|
@ -0,0 +1,16 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<Configuration>
|
||||
<Appenders>
|
||||
<Console name="stdout" target="SYSTEM_OUT">
|
||||
<PatternLayout pattern="%d{yy-MM-dd HH:mm:ss,SSS} %-5p %c(%F:%L) ## %m%n" />
|
||||
</Console>
|
||||
</Appenders>
|
||||
<Loggers>
|
||||
<Logger name="org.apache" level="warn" additivity="false">
|
||||
<AppenderRef ref="stdout" />
|
||||
</Logger>
|
||||
<Root level="info">
|
||||
<AppenderRef ref="stdout" />
|
||||
</Root>
|
||||
</Loggers>
|
||||
</Configuration>
|
|
@ -1,14 +1,16 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
||||
<project
|
||||
xmlns="http://maven.apache.org/POM/4.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
|
||||
xsi:schemaLocation="
|
||||
http://maven.apache.org/POM/4.0.0
|
||||
http://maven.apache.org/maven-v4_0_0.xsd">
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
<parent>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>webmagic-parent</artifactId>
|
||||
<version>0.10.3</version>
|
||||
<artifactId>webmagic</artifactId>
|
||||
<version>1.0.0</version>
|
||||
</parent>
|
||||
|
||||
<artifactId>webmagic-coverage</artifactId>
|
||||
|
|
|
@ -1,15 +1,26 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<project
|
||||
xmlns="http://maven.apache.org/POM/4.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="
|
||||
http://maven.apache.org/POM/4.0.0
|
||||
http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<parent>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>webmagic-parent</artifactId>
|
||||
<version>0.10.3</version>
|
||||
<artifactId>webmagic</artifactId>
|
||||
<version>1.0.0</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
<artifactId>webmagic-extension</artifactId>
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>org.projectlombok</groupId>
|
||||
<artifactId>lombok</artifactId>
|
||||
<version>1.18.32</version>
|
||||
<scope>provided</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>redis.clients</groupId>
|
||||
<artifactId>jedis</artifactId>
|
||||
|
@ -29,10 +40,6 @@
|
|||
<artifactId>webmagic-core</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>junit</groupId>
|
||||
<artifactId>junit</artifactId>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
|
||||
</project>
|
||||
|
|
|
@ -1,5 +1,9 @@
|
|||
package us.codecraft.webmagic.model;
|
||||
|
||||
import lombok.Getter;
|
||||
import lombok.Setter;
|
||||
|
||||
import us.codecraft.webmagic.model.sources.Source;
|
||||
import us.codecraft.webmagic.selector.Selector;
|
||||
|
||||
/**
|
||||
|
@ -7,18 +11,18 @@ import us.codecraft.webmagic.selector.Selector;
|
|||
* @author code4crafter@gmail.com <br>
|
||||
* @since 0.2.0
|
||||
*/
|
||||
class Extractor {
|
||||
public class Extractor {
|
||||
|
||||
@Getter @Setter
|
||||
protected Selector selector;
|
||||
|
||||
@Getter
|
||||
protected final Source source;
|
||||
|
||||
protected final boolean notNull;
|
||||
|
||||
protected final boolean multi;
|
||||
|
||||
static enum Source {Html, Url, RawHtml, RawText}
|
||||
|
||||
public Extractor(Selector selector, Source source, boolean notNull, boolean multi) {
|
||||
this.selector = selector;
|
||||
this.source = source;
|
||||
|
@ -26,23 +30,11 @@ class Extractor {
|
|||
this.multi = multi;
|
||||
}
|
||||
|
||||
Selector getSelector() {
|
||||
return selector;
|
||||
}
|
||||
|
||||
Source getSource() {
|
||||
return source;
|
||||
}
|
||||
|
||||
boolean isNotNull() {
|
||||
public boolean isNotNull() {
|
||||
return notNull;
|
||||
}
|
||||
|
||||
boolean isMulti() {
|
||||
public boolean isMulti() {
|
||||
return multi;
|
||||
}
|
||||
|
||||
void setSelector(Selector selector) {
|
||||
this.selector = selector;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,58 +1,33 @@
|
|||
package us.codecraft.webmagic.model;
|
||||
|
||||
import us.codecraft.webmagic.model.formatter.ObjectFormatter;
|
||||
import us.codecraft.webmagic.model.sources.Source;
|
||||
import us.codecraft.webmagic.selector.Selector;
|
||||
|
||||
import java.lang.reflect.Field;
|
||||
import java.lang.reflect.Method;
|
||||
|
||||
import lombok.Getter;
|
||||
import lombok.Setter;
|
||||
|
||||
/**
|
||||
* Wrapper of field and extractor.
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* @since 0.2.0
|
||||
*/
|
||||
class FieldExtractor extends Extractor {
|
||||
public class FieldExtractor extends Extractor {
|
||||
|
||||
@Getter
|
||||
private final Field field;
|
||||
|
||||
@Getter @Setter
|
||||
private Method setterMethod;
|
||||
|
||||
@Getter @Setter
|
||||
private ObjectFormatter objectFormatter;
|
||||
|
||||
public FieldExtractor(Field field, Selector selector, Source source, boolean notNull, boolean multi) {
|
||||
super(selector, source, notNull, multi);
|
||||
this.field = field;
|
||||
}
|
||||
|
||||
Field getField() {
|
||||
return field;
|
||||
}
|
||||
|
||||
Selector getSelector() {
|
||||
return selector;
|
||||
}
|
||||
|
||||
Source getSource() {
|
||||
return source;
|
||||
}
|
||||
|
||||
void setSetterMethod(Method setterMethod) {
|
||||
this.setterMethod = setterMethod;
|
||||
}
|
||||
|
||||
Method getSetterMethod() {
|
||||
return setterMethod;
|
||||
}
|
||||
|
||||
boolean isNotNull() {
|
||||
return notNull;
|
||||
}
|
||||
|
||||
ObjectFormatter getObjectFormatter() {
|
||||
return objectFormatter;
|
||||
}
|
||||
|
||||
void setObjectFormatter(ObjectFormatter objectFormatter) {
|
||||
this.objectFormatter = objectFormatter;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -3,17 +3,21 @@ package us.codecraft.webmagic.model;
|
|||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import lombok.Getter;
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.model.annotation.*;
|
||||
import us.codecraft.webmagic.model.formatter.ObjectFormatter;
|
||||
import us.codecraft.webmagic.model.fields.PageField;
|
||||
import us.codecraft.webmagic.model.formatter.ObjectFormatterBuilder;
|
||||
import us.codecraft.webmagic.model.sources.Source;
|
||||
import us.codecraft.webmagic.model.sources.SourceTextExtractor;
|
||||
import us.codecraft.webmagic.model.sources.Source.*;
|
||||
import us.codecraft.webmagic.selector.*;
|
||||
import us.codecraft.webmagic.utils.ClassUtils;
|
||||
import us.codecraft.webmagic.utils.ExtractorUtils;
|
||||
|
||||
import java.lang.annotation.Annotation;
|
||||
import java.lang.reflect.Field;
|
||||
import java.lang.reflect.InvocationTargetException;
|
||||
import java.lang.reflect.Method;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
@ -29,14 +33,19 @@ import static us.codecraft.webmagic.model.annotation.ExtractBy.Source.RawText;
|
|||
*/
|
||||
class PageModelExtractor {
|
||||
|
||||
@Getter
|
||||
private List<Pattern> targetUrlPatterns = new ArrayList<Pattern>();
|
||||
|
||||
@Getter
|
||||
private Selector targetUrlRegionSelector;
|
||||
|
||||
@Getter
|
||||
private List<Pattern> helpUrlPatterns = new ArrayList<Pattern>();
|
||||
|
||||
@Getter
|
||||
private Selector helpUrlRegionSelector;
|
||||
|
||||
@Getter
|
||||
private Class clazz;
|
||||
|
||||
private List<FieldExtractor> fieldExtractors;
|
||||
|
@ -86,7 +95,7 @@ class PageModelExtractor {
|
|||
regexPattern = ".*";
|
||||
}
|
||||
fieldExtractor = new FieldExtractor(field,
|
||||
new RegexSelector(regexPattern), FieldExtractor.Source.Url, extractByUrl.notNull(),
|
||||
new RegexSelector(regexPattern), new Url(), extractByUrl.notNull(),
|
||||
extractByUrl.multi() || List.class.isAssignableFrom(field.getType()));
|
||||
Method setterMethod = getSetterMethod(clazz, field);
|
||||
if (setterMethod != null) {
|
||||
|
@ -112,7 +121,7 @@ class PageModelExtractor {
|
|||
default:
|
||||
selector = new AndSelector(ExtractorUtils.getSelectors(extractBies));
|
||||
}
|
||||
fieldExtractor = new FieldExtractor(field, selector, comboExtract.source() == ComboExtract.Source.RawHtml ? FieldExtractor.Source.RawHtml : FieldExtractor.Source.Html,
|
||||
fieldExtractor = new FieldExtractor(field, selector, comboExtract.source() == ComboExtract.Source.RawHtml ? new RawHtml() : new SelectedHtml(),
|
||||
comboExtract.notNull(), comboExtract.multi() || List.class.isAssignableFrom(field.getType()));
|
||||
Method setterMethod = getSetterMethod(clazz, field);
|
||||
if (setterMethod != null) {
|
||||
|
@ -127,26 +136,23 @@ class PageModelExtractor {
|
|||
ExtractBy extractBy = field.getAnnotation(ExtractBy.class);
|
||||
if (extractBy != null) {
|
||||
Selector selector = ExtractorUtils.getSelector(extractBy);
|
||||
ExtractBy.Source source0 = extractBy.source();
|
||||
if (extractBy.type()== ExtractBy.Type.JsonPath){
|
||||
source0 = RawText;
|
||||
}
|
||||
FieldExtractor.Source source = null;
|
||||
switch (source0){
|
||||
ExtractBy.Source extractSource = extractBy.source();
|
||||
if (extractBy.type()== ExtractBy.Type.JsonPath)
|
||||
extractSource = RawText;
|
||||
Source source = null;
|
||||
switch (extractSource) {
|
||||
case RawText:
|
||||
source = FieldExtractor.Source.RawText;
|
||||
source = new RawText();
|
||||
break;
|
||||
case RawHtml:
|
||||
source = FieldExtractor.Source.RawHtml;
|
||||
source = new RawHtml();
|
||||
break;
|
||||
case SelectedHtml:
|
||||
source =FieldExtractor.Source.Html;
|
||||
source = new SelectedHtml();
|
||||
break;
|
||||
default:
|
||||
source =FieldExtractor.Source.Html;
|
||||
|
||||
source = new SelectedHtml();
|
||||
}
|
||||
|
||||
fieldExtractor = new FieldExtractor(field, selector, source,
|
||||
extractBy.notNull(), List.class.isAssignableFrom(field.getType()));
|
||||
fieldExtractor.setSetterMethod(getSetterMethod(clazz, field));
|
||||
|
@ -193,7 +199,7 @@ class PageModelExtractor {
|
|||
annotation = clazz.getAnnotation(ExtractBy.class);
|
||||
if (annotation != null) {
|
||||
ExtractBy extractBy = (ExtractBy) annotation;
|
||||
objectExtractor = new Extractor(new XpathSelector(extractBy.value()), Extractor.Source.Html, extractBy.notNull(), extractBy.multi());
|
||||
objectExtractor = new Extractor(new XpathSelector(extractBy.value()), new SelectedHtml(), extractBy.notNull(), extractBy.multi());
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -233,135 +239,15 @@ class PageModelExtractor {
|
|||
try {
|
||||
o = clazz.newInstance();
|
||||
for (FieldExtractor fieldExtractor : fieldExtractors) {
|
||||
if (fieldExtractor.isMulti()) {
|
||||
List<String> value;
|
||||
switch (fieldExtractor.getSource()) {
|
||||
case RawHtml:
|
||||
value = page.getHtml().selectDocumentForList(fieldExtractor.getSelector());
|
||||
break;
|
||||
case Html:
|
||||
if (isRaw) {
|
||||
value = page.getHtml().selectDocumentForList(fieldExtractor.getSelector());
|
||||
} else {
|
||||
value = fieldExtractor.getSelector().selectList(html);
|
||||
}
|
||||
break;
|
||||
case Url:
|
||||
value = fieldExtractor.getSelector().selectList(page.getUrl().toString());
|
||||
break;
|
||||
case RawText:
|
||||
value = fieldExtractor.getSelector().selectList(page.getRawText());
|
||||
break;
|
||||
default:
|
||||
value = fieldExtractor.getSelector().selectList(html);
|
||||
}
|
||||
if ((value == null || value.size() == 0) && fieldExtractor.isNotNull()) {
|
||||
PageField field = SourceTextExtractor.getText(page, html, isRaw, fieldExtractor);
|
||||
if (!field.operation(o, fieldExtractor, logger))
|
||||
return null;
|
||||
}
|
||||
if (fieldExtractor.getObjectFormatter() != null) {
|
||||
List<Object> converted = convert(value, fieldExtractor.getObjectFormatter());
|
||||
setField(o, fieldExtractor, converted);
|
||||
} else {
|
||||
setField(o, fieldExtractor, value);
|
||||
}
|
||||
} else {
|
||||
String value;
|
||||
switch (fieldExtractor.getSource()) {
|
||||
case RawHtml:
|
||||
value = page.getHtml().selectDocument(fieldExtractor.getSelector());
|
||||
break;
|
||||
case Html:
|
||||
if (isRaw) {
|
||||
value = page.getHtml().selectDocument(fieldExtractor.getSelector());
|
||||
} else {
|
||||
value = fieldExtractor.getSelector().select(html);
|
||||
}
|
||||
break;
|
||||
case Url:
|
||||
value = fieldExtractor.getSelector().select(page.getUrl().toString());
|
||||
break;
|
||||
case RawText:
|
||||
value = fieldExtractor.getSelector().select(page.getRawText());
|
||||
break;
|
||||
default:
|
||||
value = fieldExtractor.getSelector().select(html);
|
||||
}
|
||||
if (value == null && fieldExtractor.isNotNull()) {
|
||||
return null;
|
||||
}
|
||||
if (fieldExtractor.getObjectFormatter() != null) {
|
||||
Object converted = convert(value, fieldExtractor.getObjectFormatter());
|
||||
if (converted == null && fieldExtractor.isNotNull()) {
|
||||
return null;
|
||||
}
|
||||
setField(o, fieldExtractor, converted);
|
||||
} else {
|
||||
setField(o, fieldExtractor, value);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (AfterExtractor.class.isAssignableFrom(clazz)) {
|
||||
if (AfterExtractor.class.isAssignableFrom(clazz))
|
||||
((AfterExtractor) o).afterProcess(page);
|
||||
}
|
||||
} catch (InstantiationException e) {
|
||||
logger.error("extract fail", e);
|
||||
} catch (IllegalAccessException e) {
|
||||
logger.error("extract fail", e);
|
||||
} catch (InvocationTargetException e) {
|
||||
} catch (Exception e) {
|
||||
logger.error("extract fail", e);
|
||||
}
|
||||
return o;
|
||||
}
|
||||
|
||||
private Object convert(String value, ObjectFormatter objectFormatter) {
|
||||
try {
|
||||
Object format = objectFormatter.format(value);
|
||||
logger.debug("String {} is converted to {}", value, format);
|
||||
return format;
|
||||
} catch (Exception e) {
|
||||
logger.error("convert " + value + " to " + objectFormatter.clazz() + " error!", e);
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
private List<Object> convert(List<String> values, ObjectFormatter objectFormatter) {
|
||||
List<Object> objects = new ArrayList<Object>();
|
||||
for (String value : values) {
|
||||
Object converted = convert(value, objectFormatter);
|
||||
if (converted != null) {
|
||||
objects.add(converted);
|
||||
}
|
||||
}
|
||||
return objects;
|
||||
}
|
||||
|
||||
private void setField(Object o, FieldExtractor fieldExtractor, Object value) throws IllegalAccessException, InvocationTargetException {
|
||||
if (value == null) {
|
||||
return;
|
||||
}
|
||||
if (fieldExtractor.getSetterMethod() != null) {
|
||||
fieldExtractor.getSetterMethod().invoke(o, value);
|
||||
}
|
||||
fieldExtractor.getField().set(o, value);
|
||||
}
|
||||
|
||||
Class getClazz() {
|
||||
return clazz;
|
||||
}
|
||||
|
||||
List<Pattern> getTargetUrlPatterns() {
|
||||
return targetUrlPatterns;
|
||||
}
|
||||
|
||||
List<Pattern> getHelpUrlPatterns() {
|
||||
return helpUrlPatterns;
|
||||
}
|
||||
|
||||
Selector getTargetUrlRegionSelector() {
|
||||
return targetUrlRegionSelector;
|
||||
}
|
||||
|
||||
Selector getHelpUrlRegionSelector() {
|
||||
return helpUrlRegionSelector;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,42 @@
|
|||
package us.codecraft.webmagic.model.fields;
|
||||
|
||||
import java.lang.reflect.InvocationTargetException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.slf4j.Logger;
|
||||
|
||||
import lombok.Getter;
|
||||
import us.codecraft.webmagic.model.FieldExtractor;
|
||||
import us.codecraft.webmagic.model.formatter.ObjectFormatter;
|
||||
|
||||
public class MultipleField extends PageField {
|
||||
@Getter
|
||||
private List<String> fieldNames;
|
||||
|
||||
public MultipleField(List<String> fieldNames) {
|
||||
this.fieldNames = fieldNames;
|
||||
}
|
||||
|
||||
public boolean operation(Object o, FieldExtractor fieldExtractor, Logger logger) throws IllegalAccessException, InvocationTargetException {
|
||||
if ((this.fieldNames == null || this.fieldNames.size() == 0) && fieldExtractor.isNotNull())
|
||||
return false;
|
||||
if (fieldExtractor.getObjectFormatter() != null) {
|
||||
List<Object> converted = this.convert(this.fieldNames, fieldExtractor.getObjectFormatter(), logger);
|
||||
setField(o, fieldExtractor, converted);
|
||||
}
|
||||
else
|
||||
setField(o, fieldExtractor, this.fieldNames);
|
||||
return true;
|
||||
}
|
||||
|
||||
private List<Object> convert(List<String> values, ObjectFormatter objectFormatter, Logger logger) {
|
||||
List<Object> objects = new ArrayList<>();
|
||||
for (String value : values) {
|
||||
Object converted = this.convert(value, objectFormatter, logger);
|
||||
if (converted != null)
|
||||
objects.add(converted);
|
||||
}
|
||||
return objects;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,31 @@
|
|||
package us.codecraft.webmagic.model.fields;
|
||||
|
||||
import java.lang.reflect.InvocationTargetException;
|
||||
|
||||
import org.slf4j.Logger;
|
||||
|
||||
import us.codecraft.webmagic.model.FieldExtractor;
|
||||
import us.codecraft.webmagic.model.formatter.ObjectFormatter;
|
||||
|
||||
public abstract class PageField {
|
||||
public abstract boolean operation(Object o, FieldExtractor fieldExtractor, Logger logger) throws IllegalAccessException, InvocationTargetException;
|
||||
|
||||
protected Object convert(String value, ObjectFormatter objectFormatter, Logger logger) {
|
||||
try {
|
||||
Object format = objectFormatter.format(value);
|
||||
logger.debug("String {} is converted to {}", value, format);
|
||||
return format;
|
||||
} catch (Exception e) {
|
||||
logger.error("convert " + value + " to " + objectFormatter.clazz() + " error!", e);
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
protected void setField(Object o, FieldExtractor fieldExtractor, Object value) throws IllegalAccessException, InvocationTargetException {
|
||||
if (value != null) {
|
||||
if (fieldExtractor.getSetterMethod() != null)
|
||||
fieldExtractor.getSetterMethod().invoke(o, value);
|
||||
fieldExtractor.getField().set(o, value);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,28 @@
|
|||
package us.codecraft.webmagic.model.fields;
|
||||
|
||||
import java.lang.reflect.InvocationTargetException;
|
||||
|
||||
import org.slf4j.Logger;
|
||||
|
||||
import lombok.Getter;
|
||||
import us.codecraft.webmagic.model.FieldExtractor;
|
||||
|
||||
public class SingleField extends PageField {
|
||||
@Getter
|
||||
private String fieldName;
|
||||
|
||||
public SingleField(String fieldName) {
|
||||
this.fieldName = fieldName;
|
||||
}
|
||||
|
||||
public boolean operation(Object o, FieldExtractor fieldExtractor, Logger logger) throws IllegalAccessException, InvocationTargetException {
|
||||
if (fieldExtractor.getObjectFormatter() != null) {
|
||||
Object converted = this.convert(this.fieldName, fieldExtractor.getObjectFormatter(), logger);
|
||||
if (converted == null && fieldExtractor.isNotNull())
|
||||
return false;
|
||||
setField(o, fieldExtractor, converted);
|
||||
} else
|
||||
setField(o, fieldExtractor, this.fieldName);
|
||||
return true;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,85 @@
|
|||
package us.codecraft.webmagic.model.formatter;
|
||||
|
||||
/**
 * Detector that maps a given type to a class.
 */
public interface BasicClassDetector {
|
||||
/**
 * Inspects {@code type} and returns the class detected for it. The
 * implementations declared in this file return the matching wrapper class
 * (for example {@code Integer.class}) when {@code type} is the corresponding
 * primitive or wrapper type, and {@code null} otherwise.
 */
Class<?> detectBasicClass(Class<?> type);
|
||||
}
|
||||
|
||||
class IntegerClassDetector implements BasicClassDetector {
|
||||
@Override
|
||||
public Class<?> detectBasicClass(Class<?> type) {
|
||||
if (type.equals(Integer.TYPE) || type.equals(Integer.class)) {
|
||||
return Integer.class;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
class LongClassDetector implements BasicClassDetector {
|
||||
@Override
|
||||
public Class<?> detectBasicClass(Class<?> type) {
|
||||
if (type.equals(Long.TYPE) || type.equals(Long.class)) {
|
||||
return Long.class;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
class DoubleClassDetector implements BasicClassDetector {
|
||||
@Override
|
||||
public Class<?> detectBasicClass(Class<?> type) {
|
||||
if (type.equals(Double.TYPE) || type.equals(Double.class)) {
|
||||
return Double.class;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
class FloatClassDetector implements BasicClassDetector {
|
||||
@Override
|
||||
public Class<?> detectBasicClass(Class<?> type) {
|
||||
if (type.equals(Float.TYPE) || type.equals(Float.class)) {
|
||||
return Float.class;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
class ShortClassDetector implements BasicClassDetector {
|
||||
@Override
|
||||
public Class<?> detectBasicClass(Class<?> type) {
|
||||
if (type.equals(Short.TYPE) || type.equals(Short.class)) {
|
||||
return Short.class;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
class CharacterClassDetector implements BasicClassDetector {
|
||||
@Override
|
||||
public Class<?> detectBasicClass(Class<?> type) {
|
||||
if (type.equals(Character.TYPE) || type.equals(Character.class)) {
|
||||
return Character.class;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
class ByteClassDetector implements BasicClassDetector {
|
||||
@Override
|
||||
public Class<?> detectBasicClass(Class<?> type) {
|
||||
if (type.equals(Byte.TYPE) || type.equals(Byte.class)) {
|
||||
return Byte.class;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
class BooleanClassDetector implements BasicClassDetector {
|
||||
@Override
|
||||
public Class<?> detectBasicClass(Class<?> type) {
|
||||
if (type.equals(Boolean.TYPE) || type.equals(Boolean.class)) {
|
||||
return Boolean.class;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
}
|
|
@ -24,28 +24,24 @@ public abstract class BasicTypeFormatter<T> implements ObjectFormatter<T> {
|
|||
}
|
||||
|
||||
protected abstract T formatTrimmed(String raw) throws Exception;
|
||||
|
||||
public static final List<Class<? extends ObjectFormatter>> basicTypeFormatters = Arrays.<Class<? extends ObjectFormatter>>asList(IntegerFormatter.class,
|
||||
LongFormatter.class, DoubleFormatter.class, FloatFormatter.class, ShortFormatter.class,
|
||||
CharactorFormatter.class, ByteFormatter.class, BooleanFormatter.class);
|
||||
public static final List<BasicClassDetector> basicClassDetector= Arrays.asList(new IntegerClassDetector(),
|
||||
new LongClassDetector(),
|
||||
new FloatClassDetector(),
|
||||
new DoubleClassDetector(),
|
||||
new ShortClassDetector(),
|
||||
new ByteClassDetector(),
|
||||
new BooleanClassDetector(),
|
||||
new CharacterClassDetector());
|
||||
|
||||
public static Class<?> detectBasicClass(Class<?> type) {
|
||||
if (type.equals(Integer.TYPE) || type.equals(Integer.class)) {
|
||||
return Integer.class;
|
||||
} else if (type.equals(Long.TYPE) || type.equals(Long.class)) {
|
||||
return Long.class;
|
||||
} else if (type.equals(Double.TYPE) || type.equals(Double.class)) {
|
||||
return Double.class;
|
||||
} else if (type.equals(Float.TYPE) || type.equals(Float.class)) {
|
||||
return Float.class;
|
||||
} else if (type.equals(Short.TYPE) || type.equals(Short.class)) {
|
||||
return Short.class;
|
||||
} else if (type.equals(Character.TYPE) || type.equals(Character.class)) {
|
||||
return Character.class;
|
||||
} else if (type.equals(Byte.TYPE) || type.equals(Byte.class)) {
|
||||
return Byte.class;
|
||||
} else if (type.equals(Boolean.TYPE) || type.equals(Boolean.class)) {
|
||||
return Boolean.class;
|
||||
for (BasicClassDetector detector : basicClassDetector) {
|
||||
Class<?> detectedClass = detector.detectBasicClass(type);
|
||||
if (detectedClass != null) {
|
||||
return detectedClass;
|
||||
}
|
||||
}
|
||||
return type;
|
||||
}
|
||||
|
@ -146,5 +142,4 @@ public abstract class BasicTypeFormatter<T> implements ObjectFormatter<T> {
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
|
@ -0,0 +1,68 @@
|
|||
package us.codecraft.webmagic.model.sources;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.model.FieldExtractor;
|
||||
|
||||
public interface Source {
|
||||
public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor);
|
||||
public List<String> getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor);
|
||||
|
||||
public class RawHtml implements Source {
|
||||
public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
|
||||
return page.getHtml().selectDocument(fieldExtractor.getSelector());
|
||||
}
|
||||
|
||||
public List<String> getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
|
||||
return page.getHtml().selectDocumentForList(fieldExtractor.getSelector());
|
||||
}
|
||||
}
|
||||
|
||||
public class SelectedHtml implements Source {
|
||||
public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
|
||||
if (isRaw)
|
||||
return page.getHtml().selectDocument(fieldExtractor.getSelector());
|
||||
else
|
||||
return fieldExtractor.getSelector().select(html);
|
||||
}
|
||||
|
||||
public List<String> getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
|
||||
if (isRaw)
|
||||
return page.getHtml().selectDocumentForList(fieldExtractor.getSelector());
|
||||
else
|
||||
return fieldExtractor.getSelector().selectList(html);
|
||||
}
|
||||
}
|
||||
|
||||
public class Url implements Source {
|
||||
public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
|
||||
return fieldExtractor.getSelector().select(page.getUrl().toString());
|
||||
}
|
||||
|
||||
public List<String> getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
|
||||
return fieldExtractor.getSelector().selectList(page.getUrl().toString());
|
||||
}
|
||||
}
|
||||
|
||||
public class RawText implements Source {
|
||||
public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
|
||||
return fieldExtractor.getSelector().select(page.getRawText());
|
||||
}
|
||||
|
||||
public List<String> getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
|
||||
return fieldExtractor.getSelector().selectList(page.getRawText());
|
||||
}
|
||||
}
|
||||
|
||||
public class DefaultSource implements Source {
|
||||
public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
|
||||
return fieldExtractor.getSelector().select(html);
|
||||
}
|
||||
|
||||
public List<String> getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
|
||||
return fieldExtractor.getSelector().selectList(html);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,17 @@
|
|||
package us.codecraft.webmagic.model.sources;
|
||||
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.model.FieldExtractor;
|
||||
import us.codecraft.webmagic.model.fields.MultipleField;
|
||||
import us.codecraft.webmagic.model.fields.PageField;
|
||||
import us.codecraft.webmagic.model.fields.SingleField;
|
||||
|
||||
public class SourceTextExtractor {
|
||||
public static PageField getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
|
||||
Source source = fieldExtractor.getSource();
|
||||
if (fieldExtractor.isMulti())
|
||||
return new MultipleField(source.getTextList(page, html, isRaw, fieldExtractor));
|
||||
else
|
||||
return new SingleField(source.getText(page, html, isRaw, fieldExtractor));
|
||||
}
|
||||
}
|
|
@ -102,7 +102,7 @@ public class RedisPriorityScheduler extends RedisScheduler {
|
|||
}
|
||||
|
||||
private void setExtrasInItem(Jedis jedis,Request request, Task task) {
|
||||
if (request.getExtras() != null) {
|
||||
if (!request.getExtras().isEmpty()) {
|
||||
String field = DigestUtils.sha1Hex(request.getUrl());
|
||||
String value = JSON.toJSONString(request);
|
||||
jedis.hset(getItemKey(task), field, value);
|
||||
|
|
|
@ -84,7 +84,7 @@ public class RedisScheduler extends DuplicateRemovedScheduler implements Monitor
|
|||
return true;
|
||||
}
|
||||
|
||||
if (request.getExtras() != null && !request.getExtras().isEmpty()) {
|
||||
if (!request.getExtras().isEmpty()) {
|
||||
return true;
|
||||
}
|
||||
if (request.getPriority() != 0L) {
|
||||
|
|
|
@ -1,21 +0,0 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE log4j:configuration SYSTEM "log4j.dtd">
|
||||
<log4j:configuration xmlns:log4j="http://jakarta.apache.org/log4j/">
|
||||
|
||||
<appender name="stdout" class="org.apache.log4j.ConsoleAppender">
|
||||
<layout class="org.apache.log4j.PatternLayout">
|
||||
<param name="ConversionPattern" value="%d{yy-MM-dd HH:mm:ss,SSS} %-5p %c(%F:%L) ## %m%n" />
|
||||
</layout>
|
||||
</appender>
|
||||
|
||||
<logger name="org.apache" additivity="false">
|
||||
<level value="warn" />
|
||||
<appender-ref ref="stdout" />
|
||||
</logger>
|
||||
|
||||
<root>
|
||||
<level value="info" />
|
||||
<appender-ref ref="stdout" />
|
||||
</root>
|
||||
|
||||
</log4j:configuration>
|
|
@ -13,7 +13,6 @@ import static org.assertj.core.api.Assertions.assertThat;
|
|||
|
||||
/**
|
||||
* @author code4crafter@gmail.com
|
||||
* @date 14-4-5
|
||||
*/
|
||||
public class ConfigurablePageProcessorTest {
|
||||
|
||||
|
|
|
@ -12,7 +12,6 @@ import static org.assertj.core.api.Assertions.assertThat;
|
|||
|
||||
/**
|
||||
* @author code4crafter@gmail.com
|
||||
* @date 14-4-4
|
||||
*/
|
||||
public class ModelPageProcessorTest {
|
||||
|
||||
|
|
|
@ -1,21 +0,0 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE log4j:configuration SYSTEM "log4j.dtd">
|
||||
<log4j:configuration xmlns:log4j="http://jakarta.apache.org/log4j/">
|
||||
|
||||
<appender name="stdout" class="org.apache.log4j.ConsoleAppender">
|
||||
<layout class="org.apache.log4j.PatternLayout">
|
||||
<param name="ConversionPattern" value="%d{yy-MM-dd HH:mm:ss,SSS} %-5p %c(%F:%L) ## %m%n" />
|
||||
</layout>
|
||||
</appender>
|
||||
|
||||
<logger name="org.apache" additivity="false">
|
||||
<level value="warn" />
|
||||
<appender-ref ref="stdout" />
|
||||
</logger>
|
||||
|
||||
<root>
|
||||
<level value="info" />
|
||||
<appender-ref ref="stdout" />
|
||||
</root>
|
||||
|
||||
</log4j:configuration>
|
|
@ -0,0 +1,16 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<Configuration>
|
||||
<Appenders>
|
||||
<Console name="stdout" target="SYSTEM_OUT">
|
||||
<PatternLayout pattern="%d{yy-MM-dd HH:mm:ss,SSS} %-5p %c(%F:%L) ## %m%n" />
|
||||
</Console>
|
||||
</Appenders>
|
||||
<Loggers>
|
||||
<Logger name="org.apache" level="warn" additivity="false">
|
||||
<AppenderRef ref="stdout" />
|
||||
</Logger>
|
||||
<Root level="info">
|
||||
<AppenderRef ref="stdout" />
|
||||
</Root>
|
||||
</Loggers>
|
||||
</Configuration>
|
|
@ -1,9 +1,14 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<project
|
||||
xmlns="http://maven.apache.org/POM/4.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="
|
||||
http://maven.apache.org/POM/4.0.0
|
||||
http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<parent>
|
||||
<artifactId>webmagic-parent</artifactId>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<version>0.10.3</version>
|
||||
<artifactId>webmagic</artifactId>
|
||||
<version>1.0.0</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
@ -20,10 +25,6 @@
|
|||
<artifactId>webmagic-extension</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>junit</groupId>
|
||||
<artifactId>junit</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.mapdb</groupId>
|
||||
<artifactId>mapdb</artifactId>
|
||||
|
@ -42,7 +43,7 @@
|
|||
<dependency>
|
||||
<groupId>com.fasterxml.jackson.core</groupId>
|
||||
<artifactId>jackson-databind</artifactId>
|
||||
<version>2.15.2</version>
|
||||
<version>2.16.0</version>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
|
||||
|
|
|
@ -1,26 +0,0 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE log4j:configuration SYSTEM "log4j.dtd">
|
||||
<log4j:configuration xmlns:log4j="http://jakarta.apache.org/log4j/">
|
||||
|
||||
<appender name="stdout" class="org.apache.log4j.ConsoleAppender">
|
||||
<layout class="org.apache.log4j.PatternLayout">
|
||||
<param name="ConversionPattern" value="%d{yy-MM-dd HH:mm:ss,SSS} %-5p %c(%F:%L) ## %m%n" />
|
||||
</layout>
|
||||
</appender>
|
||||
|
||||
<logger name="org.springframework" additivity="false">
|
||||
<level value="warn" />
|
||||
<appender-ref ref="stdout" />
|
||||
</logger>
|
||||
|
||||
<logger name="net.sf.ehcache" additivity="false">
|
||||
<level value="warn" />
|
||||
<appender-ref ref="stdout" />
|
||||
</logger>
|
||||
|
||||
<root>
|
||||
<level value="info" />
|
||||
<appender-ref ref="stdout" />
|
||||
</root>
|
||||
|
||||
</log4j:configuration>
|
|
@ -0,0 +1,19 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<Configuration>
|
||||
<Appenders>
|
||||
<Console name="stdout" target="SYSTEM_OUT">
|
||||
<PatternLayout pattern="%d{yy-MM-dd HH:mm:ss,SSS} %-5p %c(%F:%L) ## %m%n" />
|
||||
</Console>
|
||||
</Appenders>
|
||||
<Loggers>
|
||||
<Logger name="org.springframework" level="warn" additivity="false">
|
||||
<AppenderRef ref="stdout" />
|
||||
</Logger>
|
||||
<Logger name="net.sf.ehcache" level="warn" additivity="false">
|
||||
<AppenderRef ref="stdout" />
|
||||
</Logger>
|
||||
<Root level="info">
|
||||
<AppenderRef ref="stdout" />
|
||||
</Root>
|
||||
</Loggers>
|
||||
</Configuration>
|
|
@ -1,14 +1,23 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<project
|
||||
xmlns="http://maven.apache.org/POM/4.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="
|
||||
http://maven.apache.org/POM/4.0.0
|
||||
http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<parent>
|
||||
<artifactId>webmagic-parent</artifactId>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<version>0.10.3</version>
|
||||
<artifactId>webmagic</artifactId>
|
||||
<version>1.0.0</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
<artifactId>webmagic-saxon</artifactId>
|
||||
|
||||
<properties>
|
||||
<maven.deploy.skip>true</maven.deploy.skip>
|
||||
</properties>
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>${project.groupId}</groupId>
|
||||
|
@ -23,23 +32,6 @@
|
|||
<groupId>net.sf.saxon</groupId>
|
||||
<artifactId>Saxon-HE</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>junit</groupId>
|
||||
<artifactId>junit</artifactId>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-deploy-plugin</artifactId>
|
||||
<version>3.0.0-M1</version>
|
||||
<configuration>
|
||||
<skip>true</skip>
|
||||
</configuration>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
|
||||
</project>
|
||||
|
|
|
@ -1,9 +1,14 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<project
|
||||
xmlns="http://maven.apache.org/POM/4.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="
|
||||
http://maven.apache.org/POM/4.0.0
|
||||
http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<parent>
|
||||
<artifactId>webmagic-parent</artifactId>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<version>0.10.3</version>
|
||||
<artifactId>webmagic</artifactId>
|
||||
<version>1.0.0</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
@ -13,6 +18,14 @@
|
|||
</properties>
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>org.apache.logging.log4j</groupId>
|
||||
<artifactId>log4j-core</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.logging.log4j</groupId>
|
||||
<artifactId>log4j-slf4j2-impl</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.jruby</groupId>
|
||||
<artifactId>jruby</artifactId>
|
||||
|
@ -30,25 +43,22 @@
|
|||
<groupId>commons-cli</groupId>
|
||||
<artifactId>commons-cli</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>junit</groupId>
|
||||
<artifactId>junit</artifactId>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>${project.groupId}</groupId>
|
||||
<artifactId>webmagic-core</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.slf4j</groupId>
|
||||
<artifactId>slf4j-log4j12</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>${project.groupId}</groupId>
|
||||
<artifactId>webmagic-extension</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.projectlombok</groupId>
|
||||
<artifactId>lombok</artifactId>
|
||||
<version>1.18.32</version>
|
||||
<scope>provided</scope>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
|
||||
<build>
|
||||
|
|
|
@ -0,0 +1,47 @@
|
|||
package us.codecraft.webmagic.scripts;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import lombok.Getter;
|
||||
import lombok.Setter;
|
||||
import us.codecraft.webmagic.scripts.languages.JRuby;
|
||||
import us.codecraft.webmagic.scripts.languages.Javascript;
|
||||
import us.codecraft.webmagic.scripts.languages.Language;
|
||||
import us.codecraft.webmagic.utils.WMCollections;
|
||||
|
||||
public class Params {
|
||||
@Getter
|
||||
Language language = new Javascript();
|
||||
|
||||
@Getter @Setter
|
||||
String scriptFileName;
|
||||
|
||||
@Getter @Setter
|
||||
List<String> urls;
|
||||
|
||||
@Getter @Setter
|
||||
int thread = 1;
|
||||
|
||||
@Getter @Setter
|
||||
int sleepTime = 1000;
|
||||
|
||||
private static Map<Language, Set<String>> alias;
|
||||
|
||||
public Params() {
|
||||
alias = new HashMap<Language, Set<String>>();
|
||||
alias.put(new Javascript(), WMCollections.<String>newHashSet("js", "javascript", "JavaScript", "JS"));
|
||||
alias.put(new JRuby(), WMCollections.<String>newHashSet("ruby", "jruby", "Ruby", "JRuby"));
|
||||
}
|
||||
|
||||
public void setLanguagefromArg(String arg) {
|
||||
for (Map.Entry<Language, Set<String>> languageSetEntry : alias.entrySet()) {
|
||||
if (languageSetEntry.getValue().contains(arg)) {
|
||||
this.language = languageSetEntry.getKey();
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -1,88 +1,21 @@
|
|||
package us.codecraft.webmagic.scripts;
|
||||
|
||||
import org.apache.commons.cli.*;
|
||||
import org.apache.log4j.Level;
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
import us.codecraft.webmagic.ResultItems;
|
||||
import us.codecraft.webmagic.Spider;
|
||||
import us.codecraft.webmagic.Task;
|
||||
import us.codecraft.webmagic.pipeline.Pipeline;
|
||||
import us.codecraft.webmagic.scripts.config.CommandLineOption;
|
||||
import us.codecraft.webmagic.utils.WMCollections;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com
|
||||
* @author code4crafter@gmail.com / FrancoisGib
|
||||
* @since 0.4.1
|
||||
*/
|
||||
public class ScriptConsole {
|
||||
|
||||
private static class Params {
|
||||
Language language = Language.JavaScript;
|
||||
String scriptFileName;
|
||||
List<String> urls;
|
||||
int thread = 1;
|
||||
int sleepTime = 1000;
|
||||
private static Map<Language, Set<String>> alias = new HashMap<Language, Set<String>>();
|
||||
|
||||
static {
|
||||
alias.put(Language.JavaScript, WMCollections.<String>newHashSet("js", "javascript", "JavaScript", "JS"));
|
||||
alias.put(Language.JRuby, WMCollections.<String>newHashSet("ruby", "jruby", "Ruby", "JRuby"));
|
||||
}
|
||||
|
||||
public void setLanguagefromArg(String arg) {
|
||||
for (Map.Entry<Language, Set<String>> languageSetEntry : alias.entrySet()) {
|
||||
if (languageSetEntry.getValue().contains(arg)) {
|
||||
this.language = languageSetEntry.getKey();
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private Language getLanguage() {
|
||||
return language;
|
||||
}
|
||||
|
||||
private void setLanguage(Language language) {
|
||||
this.language = language;
|
||||
}
|
||||
|
||||
private String getScriptFileName() {
|
||||
return scriptFileName;
|
||||
}
|
||||
|
||||
private void setScriptFileName(String scriptFileName) {
|
||||
this.scriptFileName = scriptFileName;
|
||||
}
|
||||
|
||||
private List<String> getUrls() {
|
||||
return urls;
|
||||
}
|
||||
|
||||
private void setUrls(List<String> urls) {
|
||||
this.urls = urls;
|
||||
}
|
||||
|
||||
private int getThread() {
|
||||
return thread;
|
||||
}
|
||||
|
||||
private void setThread(int thread) {
|
||||
this.thread = thread;
|
||||
}
|
||||
|
||||
private int getSleepTime() {
|
||||
return sleepTime;
|
||||
}
|
||||
|
||||
private void setSleepTime(int sleepTime) {
|
||||
this.sleepTime = sleepTime;
|
||||
}
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
Params params = parseCommand(args);
|
||||
startSpider(params);
|
||||
|
@ -140,45 +73,9 @@ public class ScriptConsole {
|
|||
|
||||
private static Params readOptions(CommandLine commandLine) {
|
||||
Params params = new Params();
|
||||
if (commandLine.hasOption("l")) {
|
||||
String language = commandLine.getOptionValue("l");
|
||||
params.setLanguagefromArg(language);
|
||||
}
|
||||
if (commandLine.hasOption("f")) {
|
||||
String scriptFilename = commandLine.getOptionValue("f");
|
||||
params.setScriptFileName(scriptFilename);
|
||||
} else {
|
||||
exit();
|
||||
}
|
||||
if (commandLine.hasOption("s")) {
|
||||
Integer sleepTime = Integer.parseInt(commandLine.getOptionValue("s"));
|
||||
params.setSleepTime(sleepTime);
|
||||
}
|
||||
if (commandLine.hasOption("t")) {
|
||||
Integer thread = Integer.parseInt(commandLine.getOptionValue("t"));
|
||||
params.setThread(thread);
|
||||
}
|
||||
if (commandLine.hasOption("g")) {
|
||||
configLogger(commandLine.getOptionValue("g"));
|
||||
}
|
||||
params.setUrls(commandLine.getArgList());
|
||||
List<CommandLineOption> options = CommandLineOption.getAllOptions();
|
||||
for (CommandLineOption option : options)
|
||||
option.addParamOptionIfInCommandLine(params, commandLine);
|
||||
return params;
|
||||
}
|
||||
|
||||
private static void configLogger(String value) {
|
||||
Logger rootLogger = Logger.getRootLogger();
|
||||
if ("debug".equalsIgnoreCase(value)) {
|
||||
rootLogger.setLevel(Level.DEBUG);
|
||||
} else if ("info".equalsIgnoreCase(value)) {
|
||||
rootLogger.setLevel(Level.INFO);
|
||||
} else if ("warn".equalsIgnoreCase(value)) {
|
||||
rootLogger.setLevel(Level.WARN);
|
||||
} else if ("trace".equalsIgnoreCase(value)) {
|
||||
rootLogger.setLevel(Level.TRACE);
|
||||
} else if ("off".equalsIgnoreCase(value)) {
|
||||
rootLogger.setLevel(Level.OFF);
|
||||
} else if ("error".equalsIgnoreCase(value)) {
|
||||
rootLogger.setLevel(Level.ERROR);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -2,6 +2,9 @@ package us.codecraft.webmagic.scripts;
|
|||
|
||||
import javax.script.ScriptEngine;
|
||||
import javax.script.ScriptEngineManager;
|
||||
|
||||
import us.codecraft.webmagic.scripts.languages.Language;
|
||||
|
||||
import java.util.concurrent.LinkedBlockingQueue;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
|
||||
|
@ -11,14 +14,11 @@ import java.util.concurrent.atomic.AtomicInteger;
|
|||
*/
|
||||
public class ScriptEnginePool {
|
||||
|
||||
private final int size;
|
||||
|
||||
private final AtomicInteger availableCount;
|
||||
|
||||
private final LinkedBlockingQueue<ScriptEngine> scriptEngines = new LinkedBlockingQueue<ScriptEngine>();
|
||||
|
||||
public ScriptEnginePool(Language language,int size) {
|
||||
this.size = size;
|
||||
this.availableCount = new AtomicInteger(size);
|
||||
for (int i=0;i<size;i++){
|
||||
ScriptEngineManager manager = new ScriptEngineManager();
|
||||
|
|
|
@ -4,17 +4,14 @@ package us.codecraft.webmagic.scripts;
|
|||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.nio.charset.Charset;
|
||||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
import javax.script.ScriptContext;
|
||||
import javax.script.ScriptEngine;
|
||||
import javax.script.ScriptException;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.jruby.RubyHash;
|
||||
import org.python.core.PyDictionary;
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.processor.PageProcessor;
|
||||
import us.codecraft.webmagic.scripts.languages.Language;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com
|
||||
|
@ -55,35 +52,7 @@ public class ScriptProcessor implements PageProcessor {
|
|||
context.setAttribute("page", page, ScriptContext.ENGINE_SCOPE);
|
||||
context.setAttribute("config", site, ScriptContext.ENGINE_SCOPE);
|
||||
try {
|
||||
switch (language) {
|
||||
case JavaScript:
|
||||
engine.eval(defines + "\n" + script, context);
|
||||
// NativeObject o = (NativeObject) engine.get("result");
|
||||
// if (o != null) {
|
||||
// for (Object o1 : o.getIds()) {
|
||||
// String key = String.valueOf(o1);
|
||||
// page.getResultItems().put(key, NativeObject.getProperty(o, key));
|
||||
// }
|
||||
// }
|
||||
break;
|
||||
case JRuby:
|
||||
RubyHash oRuby = (RubyHash) engine.eval(defines + "\n" + script, context);
|
||||
Iterator itruby = oRuby.entrySet().iterator();
|
||||
while (itruby.hasNext()) {
|
||||
Map.Entry pairs = (Map.Entry) itruby.next();
|
||||
page.getResultItems().put(pairs.getKey().toString(), pairs.getValue());
|
||||
}
|
||||
break;
|
||||
case Jython:
|
||||
engine.eval(defines + "\n" + script, context);
|
||||
PyDictionary oJython = (PyDictionary) engine.get("result");
|
||||
Iterator it = oJython.entrySet().iterator();
|
||||
while (it.hasNext()) {
|
||||
Map.Entry pairs = (Map.Entry) it.next();
|
||||
page.getResultItems().put(pairs.getKey().toString(), pairs.getValue());
|
||||
}
|
||||
break;
|
||||
}
|
||||
this.language.process(engine, defines, script, page);
|
||||
} catch (ScriptException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
|
|
|
@ -7,6 +7,9 @@ import java.io.InputStream;
|
|||
import java.nio.charset.Charset;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
|
||||
import us.codecraft.webmagic.scripts.languages.Javascript;
|
||||
import us.codecraft.webmagic.scripts.languages.Language;
|
||||
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com
|
||||
|
@ -14,7 +17,7 @@ import org.apache.commons.io.IOUtils;
|
|||
*/
|
||||
public class ScriptProcessorBuilder {
|
||||
|
||||
private static final Language DefaultLanguage = Language.JavaScript;
|
||||
private static final Language DefaultLanguage = new Javascript();
|
||||
|
||||
private Language language = DefaultLanguage;
|
||||
|
||||
|
@ -39,7 +42,6 @@ public class ScriptProcessorBuilder {
|
|||
InputStream resourceAsStream = new FileInputStream(fileName);
|
||||
this.script = IOUtils.toString(resourceAsStream, Charset.defaultCharset());
|
||||
} catch (IOException e) {
|
||||
//wrap IOException because I prefer a runtime exception...
|
||||
throw new IllegalArgumentException(e);
|
||||
}
|
||||
return this;
|
||||
|
@ -50,7 +52,6 @@ public class ScriptProcessorBuilder {
|
|||
InputStream resourceAsStream = ScriptProcessor.class.getClassLoader().getResourceAsStream(fileName);
|
||||
this.script = IOUtils.toString(resourceAsStream, Charset.defaultCharset());
|
||||
} catch (IOException e) {
|
||||
//wrap IOException because I prefer a runtime exception...
|
||||
throw new IllegalArgumentException(e);
|
||||
}
|
||||
return this;
|
||||
|
|
|
@ -0,0 +1,82 @@
|
|||
package us.codecraft.webmagic.scripts.config;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.commons.cli.CommandLine;
|
||||
|
||||
import lombok.Getter;
|
||||
import us.codecraft.webmagic.scripts.Params;
|
||||
|
||||
public abstract class CommandLineOption {
|
||||
@Getter
|
||||
char option;
|
||||
|
||||
public CommandLineOption(char option) {
|
||||
this.option = option;
|
||||
}
|
||||
|
||||
protected abstract void addParamOption(Params params, CommandLine commandLine);
|
||||
|
||||
public void addParamOptionIfInCommandLine(Params params, CommandLine commandLine) {
|
||||
if (commandLine.hasOption(this.option))
|
||||
this.addParamOption(params, commandLine);
|
||||
}
|
||||
|
||||
public static List<CommandLineOption> getAllOptions() {
|
||||
return List.of(new OptionL(), new OptionF(), new OptionS(), new OptionT(), new OptionG());
|
||||
}
|
||||
}
|
||||
|
||||
class OptionL extends CommandLineOption {
|
||||
public OptionL() {
|
||||
super('l');
|
||||
}
|
||||
|
||||
protected void addParamOption(Params params, CommandLine commandLine) {
|
||||
String language = commandLine.getOptionValue("l");
|
||||
params.setLanguagefromArg(language);
|
||||
}
|
||||
}
|
||||
|
||||
class OptionF extends CommandLineOption {
|
||||
public OptionF() {
|
||||
super('f');
|
||||
}
|
||||
|
||||
protected void addParamOption(Params params, CommandLine commandLine) {
|
||||
String scriptFilename = commandLine.getOptionValue("f");
|
||||
params.setScriptFileName(scriptFilename);
|
||||
}
|
||||
}
|
||||
|
||||
class OptionS extends CommandLineOption {
|
||||
public OptionS() {
|
||||
super('s');
|
||||
}
|
||||
|
||||
protected void addParamOption(Params params, CommandLine commandLine) {
|
||||
Integer sleepTime = Integer.parseInt(commandLine.getOptionValue("s"));
|
||||
params.setSleepTime(sleepTime);
|
||||
}
|
||||
}
|
||||
|
||||
class OptionT extends CommandLineOption {
|
||||
public OptionT() {
|
||||
super('t');
|
||||
}
|
||||
|
||||
protected void addParamOption(Params params, CommandLine commandLine) {
|
||||
Integer thread = Integer.parseInt(commandLine.getOptionValue("t"));
|
||||
params.setThread(thread);
|
||||
}
|
||||
}
|
||||
|
||||
class OptionG extends CommandLineOption {
|
||||
public OptionG() {
|
||||
super('g');
|
||||
}
|
||||
|
||||
protected void addParamOption(Params params, CommandLine commandLine) {
|
||||
ConfigLogger.configLogger(commandLine.getOptionValue("g"));
|
||||
}
|
||||
}
|
|
@ -0,0 +1,34 @@
|
|||
package us.codecraft.webmagic.scripts.config;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.commons.lang3.tuple.Pair;
|
||||
import org.apache.logging.log4j.Level;
|
||||
import org.apache.logging.log4j.core.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
public class ConfigLogger {
|
||||
/**
|
||||
* Log the config parameter. If the counter is less than the number of available
|
||||
* options then it means that the user entered an option
|
||||
*
|
||||
* @param value The config string
|
||||
*/
|
||||
public static void configLogger(String value) {
|
||||
List<Pair<String, Level>> options = List.of(
|
||||
Pair.of("debug", Level.DEBUG),
|
||||
Pair.of("info", Level.INFO),
|
||||
Pair.of("warn", Level.WARN),
|
||||
Pair.of("trace", Level.TRACE),
|
||||
Pair.of("off", Level.OFF),
|
||||
Pair.of("error", Level.ERROR));
|
||||
Pair<String, Level> option = options.get(0);
|
||||
int i = 1;
|
||||
while (i < options.size() && !option.getLeft().equalsIgnoreCase(value))
|
||||
option = options.get(i++);
|
||||
if (i < options.size()) {
|
||||
Logger rootLogger = (Logger) LoggerFactory.getLogger(org.slf4j.Logger.ROOT_LOGGER_NAME);
|
||||
rootLogger.setLevel(option.getRight());
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,26 @@
|
|||
package us.codecraft.webmagic.scripts.languages;
|
||||
|
||||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
|
||||
import javax.script.ScriptEngine;
|
||||
import javax.script.ScriptException;
|
||||
|
||||
import org.jruby.RubyHash;
|
||||
|
||||
import us.codecraft.webmagic.Page;
|
||||
|
||||
public class JRuby extends Language {
|
||||
public JRuby() {
|
||||
super("jruby","ruby/defines.rb","");
|
||||
}
|
||||
|
||||
public void process(ScriptEngine engine, String defines, String script, Page page) throws ScriptException {
|
||||
RubyHash oRuby = (RubyHash) engine.eval(defines + "\n" + script, engine.getContext());
|
||||
Iterator itruby = oRuby.entrySet().iterator();
|
||||
while (itruby.hasNext()) {
|
||||
Map.Entry pairs = (Map.Entry) itruby.next();
|
||||
page.getResultItems().put(pairs.getKey().toString(), pairs.getValue());
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,16 @@
|
|||
package us.codecraft.webmagic.scripts.languages;
|
||||
|
||||
import javax.script.ScriptEngine;
|
||||
import javax.script.ScriptException;
|
||||
|
||||
import us.codecraft.webmagic.Page;
|
||||
|
||||
public class Javascript extends Language {
|
||||
public Javascript() {
|
||||
super("javascript","js/defines.js","");
|
||||
}
|
||||
|
||||
public void process(ScriptEngine engine, String defines, String script, Page page) throws ScriptException {
|
||||
engine.eval(defines + "\n" + script, engine.getContext());
|
||||
}
|
||||
}
|
|
@ -0,0 +1,27 @@
|
|||
package us.codecraft.webmagic.scripts.languages;
|
||||
|
||||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
|
||||
import javax.script.ScriptEngine;
|
||||
import javax.script.ScriptException;
|
||||
|
||||
import org.python.core.PyDictionary;
|
||||
|
||||
import us.codecraft.webmagic.Page;
|
||||
|
||||
public class Jython extends Language {
|
||||
public Jython() {
|
||||
super("jython","python/defines.py","");
|
||||
}
|
||||
|
||||
public void process(ScriptEngine engine, String defines, String script, Page page) throws ScriptException {
|
||||
engine.eval(defines + "\n" + script, engine.getContext());
|
||||
PyDictionary oJython = (PyDictionary) engine.get("result");
|
||||
Iterator it = oJython.entrySet().iterator();
|
||||
while (it.hasNext()) {
|
||||
Map.Entry pairs = (Map.Entry) it.next();
|
||||
page.getResultItems().put(pairs.getKey().toString(), pairs.getValue());
|
||||
}
|
||||
}
|
||||
}
|
|
@ -1,15 +1,18 @@
|
|||
package us.codecraft.webmagic.scripts;
|
||||
package us.codecraft.webmagic.scripts.languages;
|
||||
|
||||
import javax.script.ScriptEngine;
|
||||
import javax.script.ScriptException;
|
||||
import us.codecraft.webmagic.Page;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com
|
||||
* @author FrancoisGib
|
||||
*/
|
||||
public enum Language {
|
||||
|
||||
JavaScript("javascript","js/defines.js",""),
|
||||
|
||||
JRuby("jruby","ruby/defines.rb",""),
|
||||
|
||||
Jython("jython","python/defines.py","");
|
||||
public abstract class Language {
|
||||
public Language(String engineName, String defineFile, String gatherFile) {
|
||||
this.engineName = engineName;
|
||||
this.defineFile = defineFile;
|
||||
this.gatherFile = gatherFile;
|
||||
}
|
||||
|
||||
private String engineName;
|
||||
|
||||
|
@ -17,12 +20,6 @@ public enum Language {
|
|||
|
||||
private String gatherFile;
|
||||
|
||||
Language(String engineName, String defineFile, String gatherFile) {
|
||||
this.engineName = engineName;
|
||||
this.defineFile = defineFile;
|
||||
this.gatherFile = gatherFile;
|
||||
}
|
||||
|
||||
public String getEngineName() {
|
||||
return engineName;
|
||||
}
|
||||
|
@ -34,4 +31,6 @@ public enum Language {
|
|||
public String getGatherFile() {
|
||||
return gatherFile;
|
||||
}
|
||||
|
||||
public abstract void process(ScriptEngine engine, String defines, String script, Page page) throws ScriptException;
|
||||
}
|
|
@ -1,21 +0,0 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE log4j:configuration SYSTEM "log4j.dtd">
|
||||
<log4j:configuration xmlns:log4j="http://jakarta.apache.org/log4j/">
|
||||
|
||||
<appender name="stdout" class="org.apache.log4j.ConsoleAppender">
|
||||
<layout class="org.apache.log4j.PatternLayout">
|
||||
<param name="ConversionPattern" value="%d{yy-MM-dd HH:mm:ss,SSS} %-5p %c(%F:%L) ## %m%n" />
|
||||
</layout>
|
||||
</appender>
|
||||
|
||||
<logger name="org.apache" additivity="false">
|
||||
<level value="error" />
|
||||
<appender-ref ref="stdout" />
|
||||
</logger>
|
||||
|
||||
<root>
|
||||
<level value="info" />
|
||||
<appender-ref ref="stdout" />
|
||||
</root>
|
||||
|
||||
</log4j:configuration>
|
|
@ -2,7 +2,11 @@ package us.codecraft.webmagic.scripts;
|
|||
|
||||
import org.junit.Ignore;
|
||||
import org.junit.Test;
|
||||
|
||||
import us.codecraft.webmagic.Spider;
|
||||
import us.codecraft.webmagic.scripts.languages.JRuby;
|
||||
import us.codecraft.webmagic.scripts.languages.Javascript;
|
||||
import us.codecraft.webmagic.scripts.languages.Jython;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com
|
||||
|
@ -13,14 +17,14 @@ public class ScriptProcessorTest {
|
|||
|
||||
@Test
|
||||
public void testJavaScriptProcessor() {
|
||||
ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().language(Language.JavaScript).scriptFromClassPathFile("js/oschina.js").build();
|
||||
ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().language(new Javascript()).scriptFromClassPathFile("js/oschina.js").build();
|
||||
pageProcessor.getSite().setSleepTime(0);
|
||||
Spider.create(pageProcessor).addUrl("http://my.oschina.net/flashsword/blog").setSpawnUrl(false).run();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testRubyProcessor() {
|
||||
ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().language(Language.JRuby).scriptFromClassPathFile("ruby/oschina.rb").build();
|
||||
ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().language(new JRuby()).scriptFromClassPathFile("ruby/oschina.rb").build();
|
||||
pageProcessor.getSite().setSleepTime(0);
|
||||
Spider.create(pageProcessor).addUrl("http://my.oschina.net/flashsword/blog").setSpawnUrl(false).run();
|
||||
}
|
||||
|
@ -28,7 +32,7 @@ public class ScriptProcessorTest {
|
|||
|
||||
@Test
|
||||
public void testPythonProcessor() {
|
||||
ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().language(Language.Jython).scriptFromClassPathFile("python/oschina.py").build();
|
||||
ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().language(new Jython()).scriptFromClassPathFile("python/oschina.py").build();
|
||||
pageProcessor.getSite().setSleepTime(0);
|
||||
Spider.create(pageProcessor).addUrl("http://my.oschina.net/flashsword/blog").setSpawnUrl(false).run();
|
||||
}
|
||||
|
|
|
@ -1,21 +0,0 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE log4j:configuration SYSTEM "log4j.dtd">
|
||||
<log4j:configuration xmlns:log4j="http://jakarta.apache.org/log4j/">
|
||||
|
||||
<appender name="stdout" class="org.apache.log4j.ConsoleAppender">
|
||||
<layout class="org.apache.log4j.PatternLayout">
|
||||
<param name="ConversionPattern" value="%d{yy-MM-dd HH:mm:ss,SSS} %-5p %c(%F:%L) ## %m%n" />
|
||||
</layout>
|
||||
</appender>
|
||||
|
||||
<logger name="org.apache" additivity="false">
|
||||
<level value="warn" />
|
||||
<appender-ref ref="stdout" />
|
||||
</logger>
|
||||
|
||||
<root>
|
||||
<level value="debug" />
|
||||
<appender-ref ref="stdout" />
|
||||
</root>
|
||||
|
||||
</log4j:configuration>
|
|
@ -0,0 +1,16 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<Configuration>
|
||||
<Appenders>
|
||||
<Console name="stdout" target="SYSTEM_OUT">
|
||||
<PatternLayout pattern="%d{yy-MM-dd HH:mm:ss,SSS} %-5p %c(%F:%L) ## %m%n" />
|
||||
</Console>
|
||||
</Appenders>
|
||||
<Loggers>
|
||||
<Logger name="org.apache" level="warn" additivity="false">
|
||||
<AppenderRef ref="stdout" />
|
||||
</Logger>
|
||||
<Root level="debug">
|
||||
<AppenderRef ref="stdout" />
|
||||
</Root>
|
||||
</Loggers>
|
||||
</Configuration>
|
|
@ -1,9 +1,14 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<project
|
||||
xmlns="http://maven.apache.org/POM/4.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="
|
||||
http://maven.apache.org/POM/4.0.0
|
||||
http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<parent>
|
||||
<artifactId>webmagic-parent</artifactId>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<version>0.10.3</version>
|
||||
<artifactId>webmagic</artifactId>
|
||||
<version>1.0.0</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
@ -23,10 +28,6 @@
|
|||
<groupId>com.github.detro</groupId>
|
||||
<artifactId>phantomjsdriver</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>junit</groupId>
|
||||
<artifactId>junit</artifactId>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
|
||||
<build>
|
||||
|
|
|
@ -1,15 +1,5 @@
|
|||
package us.codecraft.webmagic.downloader.selenium;
|
||||
|
||||
import org.openqa.selenium.WebDriver;
|
||||
import org.openqa.selenium.chrome.ChromeDriver;
|
||||
import org.openqa.selenium.firefox.FirefoxDriver;
|
||||
import org.openqa.selenium.phantomjs.PhantomJSDriver;
|
||||
import org.openqa.selenium.phantomjs.PhantomJSDriverService;
|
||||
import org.openqa.selenium.remote.DesiredCapabilities;
|
||||
import org.openqa.selenium.remote.RemoteWebDriver;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.FileReader;
|
||||
import java.io.IOException;
|
||||
import java.net.MalformedURLException;
|
||||
|
@ -22,6 +12,18 @@ import java.util.concurrent.BlockingDeque;
|
|||
import java.util.concurrent.LinkedBlockingDeque;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
|
||||
import org.openqa.selenium.WebDriver;
|
||||
import org.openqa.selenium.chrome.ChromeDriver;
|
||||
import org.openqa.selenium.chrome.ChromeOptions;
|
||||
import org.openqa.selenium.firefox.FirefoxDriver;
|
||||
import org.openqa.selenium.firefox.FirefoxOptions;
|
||||
import org.openqa.selenium.phantomjs.PhantomJSDriver;
|
||||
import org.openqa.selenium.phantomjs.PhantomJSDriverService;
|
||||
import org.openqa.selenium.remote.DesiredCapabilities;
|
||||
import org.openqa.selenium.remote.RemoteWebDriver;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-7-26 <br>
|
||||
|
@ -73,7 +75,6 @@ class WebDriverPool {
|
|||
|
||||
// Prepare capabilities
|
||||
sCaps = new DesiredCapabilities();
|
||||
sCaps.setJavascriptEnabled(true);
|
||||
sCaps.setCapability("takesScreenshot", false);
|
||||
|
||||
String driver = sConfig.getProperty("driver", DRIVER_PHANTOMJS);
|
||||
|
@ -134,9 +135,9 @@ class WebDriverPool {
|
|||
sCaps.setBrowserName("phantomjs");
|
||||
mDriver = new RemoteWebDriver(new URL(driver), sCaps);
|
||||
} else if (driver.equals(DRIVER_FIREFOX)) {
|
||||
mDriver = new FirefoxDriver(sCaps);
|
||||
mDriver = new FirefoxDriver(new FirefoxOptions(sCaps));
|
||||
} else if (driver.equals(DRIVER_CHROME)) {
|
||||
mDriver = new ChromeDriver(sCaps);
|
||||
mDriver = new ChromeDriver(new ChromeOptions().merge(sCaps));
|
||||
} else if (driver.equals(DRIVER_PHANTOMJS)) {
|
||||
mDriver = new PhantomJSDriver(sCaps);
|
||||
}
|
||||
|
|
|
@ -1,17 +1,18 @@
|
|||
package us.codecraft.webmagic.downloader;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.junit.Ignore;
|
||||
import org.junit.Test;
|
||||
import org.openqa.selenium.By;
|
||||
import org.openqa.selenium.WebDriver;
|
||||
import org.openqa.selenium.WebElement;
|
||||
import org.openqa.selenium.chrome.ChromeDriver;
|
||||
import org.openqa.selenium.chrome.ChromeOptions;
|
||||
import org.openqa.selenium.remote.DesiredCapabilities;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-7-26 <br>
|
||||
|
@ -29,10 +30,10 @@ public class SeleniumTest {
|
|||
Map<String, Object> preferences = new HashMap<String, Object>();
|
||||
preferences.put("profile.default_content_settings", contentSettings);
|
||||
|
||||
DesiredCapabilities caps = DesiredCapabilities.chrome();
|
||||
DesiredCapabilities caps = new DesiredCapabilities();
|
||||
caps.setCapability("chrome.prefs", preferences);
|
||||
caps.setCapability("chrome.switches", Arrays.asList("--user-data-dir=/Users/yihua/temp/chrome"));
|
||||
WebDriver webDriver = new ChromeDriver(caps);
|
||||
WebDriver webDriver = new ChromeDriver(new ChromeOptions().merge(caps));
|
||||
webDriver.get("http://huaban.com/");
|
||||
WebElement webElement = webDriver.findElement(By.xpath("/html"));
|
||||
System.out.println(webElement.getAttribute("outerHTML"));
|
||||
|
|
Loading…
Reference in New Issue