complete javadoc
parent
db3cbf6ca5
commit
9c5716a543
4
pom.xml
4
pom.xml
|
@ -6,7 +6,7 @@
|
||||||
<version>7</version>
|
<version>7</version>
|
||||||
</parent>
|
</parent>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<version>0.5.3-SNAPSHOT</version>
|
<version>0.5.3</version>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
<packaging>pom</packaging>
|
<packaging>pom</packaging>
|
||||||
<properties>
|
<properties>
|
||||||
|
@ -38,7 +38,7 @@
|
||||||
<connection>scm:git:git@github.com:code4craft/webmagic.git</connection>
|
<connection>scm:git:git@github.com:code4craft/webmagic.git</connection>
|
||||||
<developerConnection>scm:git:git@github.com:code4craft/webmagic.git</developerConnection>
|
<developerConnection>scm:git:git@github.com:code4craft/webmagic.git</developerConnection>
|
||||||
<url>git@github.com:code4craft/webmagic.git</url>
|
<url>git@github.com:code4craft/webmagic.git</url>
|
||||||
<tag>HEAD</tag>
|
<tag>webmagic-parent-0.5.3</tag>
|
||||||
</scm>
|
</scm>
|
||||||
<licenses>
|
<licenses>
|
||||||
<license>
|
<license>
|
||||||
|
|
|
@ -0,0 +1,317 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||||
|
<parent>
|
||||||
|
<groupId>org.sonatype.oss</groupId>
|
||||||
|
<artifactId>oss-parent</artifactId>
|
||||||
|
<version>7</version>
|
||||||
|
</parent>
|
||||||
|
<groupId>us.codecraft</groupId>
|
||||||
|
<version>0.5.3-SNAPSHOT</version>
|
||||||
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
<packaging>pom</packaging>
|
||||||
|
<properties>
|
||||||
|
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
||||||
|
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
|
||||||
|
<spring-version>4.0.0.RELEASE</spring-version>
|
||||||
|
|
||||||
|
</properties>
|
||||||
|
<artifactId>webmagic-parent</artifactId>
|
||||||
|
<name>webmagic-parent</name>
|
||||||
|
<description>
|
||||||
|
A crawler framework. It covers the whole lifecycle of crawler: downloading, url management, content
|
||||||
|
extraction and persistence. It can simplify the development of a specific crawler.
|
||||||
|
</description>
|
||||||
|
<url>https://github.com/code4craft/webmagic/</url>
|
||||||
|
<developers>
|
||||||
|
<developer>
|
||||||
|
<id>code4craft</id>
|
||||||
|
<name>Yihua huang</name>
|
||||||
|
<email>code4crafer@gmail.com</email>
|
||||||
|
</developer>
|
||||||
|
<developer>
|
||||||
|
<id>yuany</id>
|
||||||
|
<name>Ligang Yao</name>
|
||||||
|
<email>ligang.yao@answers.com</email>
|
||||||
|
</developer>
|
||||||
|
</developers>
|
||||||
|
<scm>
|
||||||
|
<connection>scm:git:git@github.com:code4craft/webmagic.git</connection>
|
||||||
|
<developerConnection>scm:git:git@github.com:code4craft/webmagic.git</developerConnection>
|
||||||
|
<url>git@github.com:code4craft/webmagic.git</url>
|
||||||
|
<tag>HEAD</tag>
|
||||||
|
</scm>
|
||||||
|
<licenses>
|
||||||
|
<license>
|
||||||
|
<name>Apache License, Version 2.0</name>
|
||||||
|
<url>http://www.apache.org/licenses/LICENSE-2.0</url>
|
||||||
|
</license>
|
||||||
|
</licenses>
|
||||||
|
|
||||||
|
<modules>
|
||||||
|
<module>webmagic-core</module>
|
||||||
|
<module>webmagic-extension/</module>
|
||||||
|
<module>webmagic-scripts/</module>
|
||||||
|
<module>webmagic-selenium</module>
|
||||||
|
<module>webmagic-saxon</module>
|
||||||
|
<module>webmagic-samples</module>
|
||||||
|
<!--<module>webmagic-avalon</module>-->
|
||||||
|
</modules>
|
||||||
|
|
||||||
|
<dependencyManagement>
|
||||||
|
<dependencies>
|
||||||
|
<dependency>
|
||||||
|
<groupId>junit</groupId>
|
||||||
|
<artifactId>junit</artifactId>
|
||||||
|
<version>4.11</version>
|
||||||
|
<scope>test</scope>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.apache.httpcomponents</groupId>
|
||||||
|
<artifactId>httpclient</artifactId>
|
||||||
|
<version>4.3.3</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>com.google.guava</groupId>
|
||||||
|
<artifactId>guava</artifactId>
|
||||||
|
<version>15.0</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.slf4j</groupId>
|
||||||
|
<artifactId>slf4j-api</artifactId>
|
||||||
|
<version>1.7.6</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.slf4j</groupId>
|
||||||
|
<artifactId>slf4j-log4j12</artifactId>
|
||||||
|
<version>1.7.6</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>us.codecraft</groupId>
|
||||||
|
<artifactId>xsoup</artifactId>
|
||||||
|
<version>0.3.1</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>com.alibaba</groupId>
|
||||||
|
<artifactId>fastjson</artifactId>
|
||||||
|
<version>1.1.37</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>com.github.dreamhead</groupId>
|
||||||
|
<artifactId>moco-core</artifactId>
|
||||||
|
<version>0.9.1</version>
|
||||||
|
<scope>test</scope>
|
||||||
|
<exclusions>
|
||||||
|
<exclusion>
|
||||||
|
<groupId>org.slf4j</groupId>
|
||||||
|
<artifactId>slf4j-simple</artifactId>
|
||||||
|
</exclusion>
|
||||||
|
</exclusions>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>log4j</groupId>
|
||||||
|
<artifactId>log4j</artifactId>
|
||||||
|
<version>1.2.17</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.assertj</groupId>
|
||||||
|
<artifactId>assertj-core</artifactId>
|
||||||
|
<version>1.5.0</version>
|
||||||
|
<scope>test</scope>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.apache.commons</groupId>
|
||||||
|
<artifactId>commons-lang3</artifactId>
|
||||||
|
<version>3.1</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>commons-collections</groupId>
|
||||||
|
<artifactId>commons-collections</artifactId>
|
||||||
|
<version>3.2.1</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.apache.commons</groupId>
|
||||||
|
<artifactId>commons-io</artifactId>
|
||||||
|
<version>1.3.2</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.jsoup</groupId>
|
||||||
|
<artifactId>jsoup</artifactId>
|
||||||
|
<version>1.8.3</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.mockito</groupId>
|
||||||
|
<artifactId>mockito-all</artifactId>
|
||||||
|
<version>1.9.5</version>
|
||||||
|
<scope>test</scope>
|
||||||
|
</dependency>
|
||||||
|
</dependencies>
|
||||||
|
</dependencyManagement>
|
||||||
|
|
||||||
|
<build>
|
||||||
|
<plugins>
|
||||||
|
<plugin>
|
||||||
|
<groupId>org.apache.maven.plugins</groupId>
|
||||||
|
<artifactId>maven-surefire-plugin</artifactId>
|
||||||
|
<version>2.18</version>
|
||||||
|
<configuration>
|
||||||
|
<forkMode>pertest</forkMode>
|
||||||
|
<argLine>-Xms1024m -Xmx1024m -Xss1m </argLine>
|
||||||
|
</configuration>
|
||||||
|
</plugin>
|
||||||
|
<plugin>
|
||||||
|
<groupId>org.apache.maven.plugins</groupId>
|
||||||
|
<artifactId>maven-compiler-plugin</artifactId>
|
||||||
|
<version>3.1</version>
|
||||||
|
<configuration>
|
||||||
|
<source>1.6</source>
|
||||||
|
<target>1.6</target>
|
||||||
|
<encoding>UTF-8</encoding>
|
||||||
|
</configuration>
|
||||||
|
</plugin>
|
||||||
|
<!--<plugin>-->
|
||||||
|
<!--<groupId>org.apache.maven.plugins</groupId>-->
|
||||||
|
<!--<artifactId>maven-dependency-plugin</artifactId>-->
|
||||||
|
<!--<version>2.8</version>-->
|
||||||
|
<!--<executions>-->
|
||||||
|
<!--<execution>-->
|
||||||
|
<!--<id>copy-dependencies</id>-->
|
||||||
|
<!--<phase>package</phase>-->
|
||||||
|
<!--<goals>-->
|
||||||
|
<!--<goal>copy-dependencies</goal>-->
|
||||||
|
<!--</goals>-->
|
||||||
|
<!--<configuration>-->
|
||||||
|
<!--<outputDirectory>${project.build.directory}/lib</outputDirectory>-->
|
||||||
|
<!--<overWriteReleases>false</overWriteReleases>-->
|
||||||
|
<!--<overWriteSnapshots>false</overWriteSnapshots>-->
|
||||||
|
<!--<overWriteIfNewer>true</overWriteIfNewer>-->
|
||||||
|
<!--</configuration>-->
|
||||||
|
<!--</execution>-->
|
||||||
|
<!--</executions>-->
|
||||||
|
<!--</plugin>-->
|
||||||
|
<plugin>
|
||||||
|
<groupId>org.apache.maven.plugins</groupId>
|
||||||
|
<artifactId>maven-resources-plugin</artifactId>
|
||||||
|
<version>2.6</version>
|
||||||
|
<configuration>
|
||||||
|
<encoding>UTF-8</encoding>
|
||||||
|
</configuration>
|
||||||
|
</plugin>
|
||||||
|
<plugin>
|
||||||
|
<groupId>org.apache.maven.plugins</groupId>
|
||||||
|
<artifactId>maven-jar-plugin</artifactId>
|
||||||
|
<configuration>
|
||||||
|
<excludes>
|
||||||
|
<exclude>log4j.xml</exclude>
|
||||||
|
</excludes>
|
||||||
|
</configuration>
|
||||||
|
</plugin>
|
||||||
|
<plugin>
|
||||||
|
<groupId>org.apache.maven.plugins</groupId>
|
||||||
|
<artifactId>maven-source-plugin</artifactId>
|
||||||
|
<version>2.2.1</version>
|
||||||
|
<executions>
|
||||||
|
<execution>
|
||||||
|
<id>attach-sources</id>
|
||||||
|
<goals>
|
||||||
|
<goal>jar</goal>
|
||||||
|
</goals>
|
||||||
|
</execution>
|
||||||
|
</executions>
|
||||||
|
</plugin>
|
||||||
|
<plugin>
|
||||||
|
<groupId>org.apache.maven.plugins</groupId>
|
||||||
|
<artifactId>maven-javadoc-plugin</artifactId>
|
||||||
|
<version>2.9.1</version>
|
||||||
|
<configuration>
|
||||||
|
<encoding>UTF-8</encoding>
|
||||||
|
</configuration>
|
||||||
|
<executions>
|
||||||
|
<execution>
|
||||||
|
<id>attach-javadocs</id>
|
||||||
|
<goals>
|
||||||
|
<goal>jar</goal>
|
||||||
|
</goals>
|
||||||
|
</execution>
|
||||||
|
</executions>
|
||||||
|
</plugin>
|
||||||
|
<plugin>
|
||||||
|
<groupId>org.apache.maven.plugins</groupId>
|
||||||
|
<artifactId>maven-release-plugin</artifactId>
|
||||||
|
<version>2.4.1</version>
|
||||||
|
</plugin>
|
||||||
|
</plugins>
|
||||||
|
</build>
|
||||||
|
|
||||||
|
<profiles>
|
||||||
|
<profile>
|
||||||
|
<id>release</id>
|
||||||
|
<build>
|
||||||
|
<plugins>
|
||||||
|
<!-- Source -->
|
||||||
|
<plugin>
|
||||||
|
<groupId>org.apache.maven.plugins</groupId>
|
||||||
|
<artifactId>maven-source-plugin</artifactId>
|
||||||
|
<version>2.2.1</version>
|
||||||
|
<executions>
|
||||||
|
<execution>
|
||||||
|
<phase>package</phase>
|
||||||
|
<goals>
|
||||||
|
<goal>jar-no-fork</goal>
|
||||||
|
</goals>
|
||||||
|
</execution>
|
||||||
|
</executions>
|
||||||
|
</plugin>
|
||||||
|
<!-- Javadoc -->
|
||||||
|
<plugin>
|
||||||
|
<groupId>org.apache.maven.plugins</groupId>
|
||||||
|
<artifactId>maven-javadoc-plugin</artifactId>
|
||||||
|
<version>2.9.1</version>
|
||||||
|
<executions>
|
||||||
|
<execution>
|
||||||
|
<phase>package</phase>
|
||||||
|
<goals>
|
||||||
|
<goal>jar</goal>
|
||||||
|
</goals>
|
||||||
|
</execution>
|
||||||
|
</executions>
|
||||||
|
</plugin>
|
||||||
|
<!-- GPG -->
|
||||||
|
<plugin>
|
||||||
|
<groupId>org.apache.maven.plugins</groupId>
|
||||||
|
<artifactId>maven-gpg-plugin</artifactId>
|
||||||
|
<version>1.5</version>
|
||||||
|
<executions>
|
||||||
|
<execution>
|
||||||
|
<phase>verify</phase>
|
||||||
|
<goals>
|
||||||
|
<goal>sign</goal>
|
||||||
|
</goals>
|
||||||
|
</execution>
|
||||||
|
</executions>
|
||||||
|
</plugin>
|
||||||
|
<plugin>
|
||||||
|
<groupId>org.sonatype.plugins</groupId>
|
||||||
|
<artifactId>nexus-staging-maven-plugin</artifactId>
|
||||||
|
<version>1.6</version>
|
||||||
|
<extensions>true</extensions>
|
||||||
|
<configuration>
|
||||||
|
<serverId>sonatype-nexus-staging</serverId>
|
||||||
|
<nexusUrl>https://oss.sonatype.org/</nexusUrl>
|
||||||
|
<autoReleaseAfterClose>true</autoReleaseAfterClose>
|
||||||
|
</configuration>
|
||||||
|
</plugin>
|
||||||
|
</plugins>
|
||||||
|
</build>
|
||||||
|
<distributionManagement>
|
||||||
|
<snapshotRepository>
|
||||||
|
<id>sonatype-nexus-snapshots</id>
|
||||||
|
<url>https://oss.sonatype.org/content/repositories/snapshots/</url>
|
||||||
|
</snapshotRepository>
|
||||||
|
<repository>
|
||||||
|
<id>sonatype-nexus-staging</id>
|
||||||
|
<url>https://oss.sonatype.org/service/local/staging/deploy/maven2/</url>
|
||||||
|
</repository>
|
||||||
|
</distributionManagement>
|
||||||
|
</profile>
|
||||||
|
</profiles>
|
||||||
|
</project>
|
|
@ -0,0 +1,36 @@
|
||||||
|
#release configuration
|
||||||
|
#Thu Jan 21 17:59:05 CST 2016
|
||||||
|
project.dev.us.codecraft\:webmagic-selenium=0.5.4-SNAPSHOT
|
||||||
|
scm.commentPrefix=[maven-release-plugin]
|
||||||
|
project.rel.us.codecraft\:webmagic-samples=0.5.3
|
||||||
|
project.rel.us.codecraft\:webmagic-selenium=0.5.3
|
||||||
|
project.rel.us.codecraft\:webmagic-saxon=0.5.3
|
||||||
|
pushChanges=true
|
||||||
|
project.dev.us.codecraft\:webmagic-extension=0.5.4-SNAPSHOT
|
||||||
|
project.scm.us.codecraft\:webmagic-selenium.empty=true
|
||||||
|
project.scm.us.codecraft\:webmagic-parent.developerConnection=scm\:git\:git@github.com\:code4craft/webmagic.git
|
||||||
|
project.rel.us.codecraft\:webmagic-core=0.5.3
|
||||||
|
project.scm.us.codecraft\:webmagic-scripts.empty=true
|
||||||
|
project.rel.us.codecraft\:webmagic-extension=0.5.3
|
||||||
|
project.rel.us.codecraft\:webmagic-parent=0.5.3
|
||||||
|
scm.tag=webmagic-parent-0.5.3
|
||||||
|
remoteTagging=true
|
||||||
|
project.scm.us.codecraft\:webmagic-parent.tag=HEAD
|
||||||
|
project.dev.us.codecraft\:webmagic-scripts=0.5.4-SNAPSHOT
|
||||||
|
exec.additionalArguments=-Psonatype-oss-release -P development
|
||||||
|
project.dev.us.codecraft\:webmagic-core=0.5.4-SNAPSHOT
|
||||||
|
project.scm.us.codecraft\:webmagic-saxon.empty=true
|
||||||
|
project.scm.us.codecraft\:webmagic-extension.empty=true
|
||||||
|
scm.url=scm\:git\:git@github.com\:code4craft/webmagic.git
|
||||||
|
scm.tagNameFormat=@{project.artifactId}-@{project.version}
|
||||||
|
project.scm.us.codecraft\:webmagic-parent.url=git@github.com\:code4craft/webmagic.git
|
||||||
|
preparationGoals=clean verify
|
||||||
|
project.dev.us.codecraft\:webmagic-saxon=0.5.4-SNAPSHOT
|
||||||
|
project.rel.us.codecraft\:webmagic-scripts=0.5.3
|
||||||
|
project.scm.us.codecraft\:webmagic-core.empty=true
|
||||||
|
project.scm.us.codecraft\:webmagic-parent.connection=scm\:git\:git@github.com\:code4craft/webmagic.git
|
||||||
|
project.dev.us.codecraft\:webmagic-samples=0.5.4-SNAPSHOT
|
||||||
|
exec.snapshotReleasePluginAllowed=false
|
||||||
|
project.dev.us.codecraft\:webmagic-parent=0.5.4-SNAPSHOT
|
||||||
|
project.scm.us.codecraft\:webmagic-samples.empty=true
|
||||||
|
completedPhase=generate-release-poms
|
|
@ -3,7 +3,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<artifactId>webmagic-parent</artifactId>
|
<artifactId>webmagic-parent</artifactId>
|
||||||
<version>0.5.3-SNAPSHOT</version>
|
<version>0.5.3</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,86 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||||
|
<parent>
|
||||||
|
<groupId>us.codecraft</groupId>
|
||||||
|
<artifactId>webmagic-parent</artifactId>
|
||||||
|
<version>0.5.3-SNAPSHOT</version>
|
||||||
|
</parent>
|
||||||
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
<artifactId>webmagic-core</artifactId>
|
||||||
|
|
||||||
|
<dependencies>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.apache.httpcomponents</groupId>
|
||||||
|
<artifactId>httpclient</artifactId>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>junit</groupId>
|
||||||
|
<artifactId>junit</artifactId>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>com.google.guava</groupId>
|
||||||
|
<artifactId>guava</artifactId>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.apache.commons</groupId>
|
||||||
|
<artifactId>commons-lang3</artifactId>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>us.codecraft</groupId>
|
||||||
|
<artifactId>xsoup</artifactId>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>com.github.dreamhead</groupId>
|
||||||
|
<artifactId>moco-core</artifactId>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.slf4j</groupId>
|
||||||
|
<artifactId>slf4j-api</artifactId>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.slf4j</groupId>
|
||||||
|
<artifactId>slf4j-log4j12</artifactId>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>commons-collections</groupId>
|
||||||
|
<artifactId>commons-collections</artifactId>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.assertj</groupId>
|
||||||
|
<artifactId>assertj-core</artifactId>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.jsoup</groupId>
|
||||||
|
<artifactId>jsoup</artifactId>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.apache.commons</groupId>
|
||||||
|
<artifactId>commons-io</artifactId>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>com.jayway.jsonpath</groupId>
|
||||||
|
<artifactId>json-path</artifactId>
|
||||||
|
<version>0.8.1</version>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>com.alibaba</groupId>
|
||||||
|
<artifactId>fastjson</artifactId>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
|
</dependencies>
|
||||||
|
|
||||||
|
</project>
|
|
@ -56,8 +56,8 @@ public class Page {
|
||||||
/**
|
/**
|
||||||
* store extract results
|
* store extract results
|
||||||
*
|
*
|
||||||
* @param key
|
* @param key key
|
||||||
* @param field
|
* @param field field
|
||||||
*/
|
*/
|
||||||
public void putField(String key, Object field) {
|
public void putField(String key, Object field) {
|
||||||
resultItems.put(key, field);
|
resultItems.put(key, field);
|
||||||
|
@ -89,7 +89,7 @@ public class Page {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @param html
|
* @param html html
|
||||||
* @deprecated since 0.4.0
|
* @deprecated since 0.4.0
|
||||||
* The html is parsed only on the first call to {@link #getHtml()}, so use {@link #setRawText(String)} instead.
|
* The html is parsed only on the first call to {@link #getHtml()}, so use {@link #setRawText(String)} instead.
|
||||||
*/
|
*/
|
||||||
|
@ -104,7 +104,7 @@ public class Page {
|
||||||
/**
|
/**
|
||||||
* add urls to fetch
|
* add urls to fetch
|
||||||
*
|
*
|
||||||
* @param requests
|
* @param requests requests
|
||||||
*/
|
*/
|
||||||
public void addTargetRequests(List<String> requests) {
|
public void addTargetRequests(List<String> requests) {
|
||||||
synchronized (targetRequests) {
|
synchronized (targetRequests) {
|
||||||
|
@ -121,7 +121,8 @@ public class Page {
|
||||||
/**
|
/**
|
||||||
* add urls to fetch
|
* add urls to fetch
|
||||||
*
|
*
|
||||||
* @param requests
|
* @param requests requests
|
||||||
|
* @param priority priority
|
||||||
*/
|
*/
|
||||||
public void addTargetRequests(List<String> requests, long priority) {
|
public void addTargetRequests(List<String> requests, long priority) {
|
||||||
synchronized (targetRequests) {
|
synchronized (targetRequests) {
|
||||||
|
@ -138,7 +139,7 @@ public class Page {
|
||||||
/**
|
/**
|
||||||
* add url to fetch
|
* add url to fetch
|
||||||
*
|
*
|
||||||
* @param requestString
|
* @param requestString requestString
|
||||||
*/
|
*/
|
||||||
public void addTargetRequest(String requestString) {
|
public void addTargetRequest(String requestString) {
|
||||||
if (StringUtils.isBlank(requestString) || requestString.equals("#")) {
|
if (StringUtils.isBlank(requestString) || requestString.equals("#")) {
|
||||||
|
@ -153,7 +154,7 @@ public class Page {
|
||||||
/**
|
/**
|
||||||
* add requests to fetch
|
* add requests to fetch
|
||||||
*
|
*
|
||||||
* @param request
|
* @param request request
|
||||||
*/
|
*/
|
||||||
public void addTargetRequest(Request request) {
|
public void addTargetRequest(Request request) {
|
||||||
synchronized (targetRequests) {
|
synchronized (targetRequests) {
|
||||||
|
|
|
@ -53,7 +53,7 @@ public class Request implements Serializable {
|
||||||
* Need a scheduler supporting priority.<br>
|
* Need a scheduler supporting priority.<br>
|
||||||
* @see us.codecraft.webmagic.scheduler.PriorityScheduler
|
* @see us.codecraft.webmagic.scheduler.PriorityScheduler
|
||||||
*
|
*
|
||||||
* @param priority
|
* @param priority priority
|
||||||
* @return this
|
* @return this
|
||||||
*/
|
*/
|
||||||
@Experimental
|
@Experimental
|
||||||
|
|
|
@ -81,8 +81,8 @@ public class Site {
|
||||||
/**
|
/**
|
||||||
* Add a cookie with domain {@link #getDomain()}
|
* Add a cookie with domain {@link #getDomain()}
|
||||||
*
|
*
|
||||||
* @param name
|
* @param name name
|
||||||
* @param value
|
* @param value value
|
||||||
* @return this
|
* @return this
|
||||||
*/
|
*/
|
||||||
public Site addCookie(String name, String value) {
|
public Site addCookie(String name, String value) {
|
||||||
|
@ -93,9 +93,9 @@ public class Site {
|
||||||
/**
|
/**
|
||||||
* Add a cookie with specific domain.
|
* Add a cookie with specific domain.
|
||||||
*
|
*
|
||||||
* @param domain
|
* @param domain domain
|
||||||
* @param name
|
* @param name name
|
||||||
* @param value
|
* @param value value
|
||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
public Site addCookie(String domain, String name, String value) {
|
public Site addCookie(String domain, String name, String value) {
|
||||||
|
@ -153,7 +153,7 @@ public class Site {
|
||||||
/**
|
/**
|
||||||
* set the domain of site.
|
* set the domain of site.
|
||||||
*
|
*
|
||||||
* @param domain
|
* @param domain domain
|
||||||
* @return this
|
* @return this
|
||||||
*/
|
*/
|
||||||
public Site setDomain(String domain) {
|
public Site setDomain(String domain) {
|
||||||
|
@ -165,7 +165,7 @@ public class Site {
|
||||||
* Set charset of page manually.<br>
|
* Set charset of page manually.<br>
|
||||||
* When charset is not set or set to null, it can be auto detected by Http header.
|
* When charset is not set or set to null, it can be auto detected by Http header.
|
||||||
*
|
*
|
||||||
* @param charset
|
* @param charset charset
|
||||||
* @return this
|
* @return this
|
||||||
*/
|
*/
|
||||||
public Site setCharset(String charset) {
|
public Site setCharset(String charset) {
|
||||||
|
@ -189,7 +189,7 @@ public class Site {
|
||||||
/**
|
/**
|
||||||
* set timeout for downloader in ms
|
* set timeout for downloader in ms
|
||||||
*
|
*
|
||||||
* @param timeOut
|
* @param timeOut timeOut
|
||||||
*/
|
*/
|
||||||
public Site setTimeOut(int timeOut) {
|
public Site setTimeOut(int timeOut) {
|
||||||
this.timeOut = timeOut;
|
this.timeOut = timeOut;
|
||||||
|
@ -202,7 +202,7 @@ public class Site {
|
||||||
* {200} by default.<br>
|
* {200} by default.<br>
|
||||||
* It does not necessarily need to be set.<br>
|
* It does not necessarily need to be set.<br>
|
||||||
*
|
*
|
||||||
* @param acceptStatCode
|
* @param acceptStatCode acceptStatCode
|
||||||
* @return this
|
* @return this
|
||||||
*/
|
*/
|
||||||
public Site setAcceptStatCode(Set<Integer> acceptStatCode) {
|
public Site setAcceptStatCode(Set<Integer> acceptStatCode) {
|
||||||
|
@ -239,7 +239,7 @@ public class Site {
|
||||||
* Add a url to start url.<br>
|
* Add a url to start url.<br>
|
||||||
* Because urls are more a Spider's property than Site, move it to {@link Spider#addUrl(String...)}
|
* Because urls are more a Spider's property than Site, move it to {@link Spider#addUrl(String...)}
|
||||||
*
|
*
|
||||||
* @param startUrl
|
* @param startUrl startUrl
|
||||||
* @return this
|
* @return this
|
||||||
* @see Spider#addUrl(String...)
|
* @see Spider#addUrl(String...)
|
||||||
* @deprecated
|
* @deprecated
|
||||||
|
@ -252,7 +252,7 @@ public class Site {
|
||||||
* Add a url to start url.<br>
|
* Add a url to start url.<br>
|
||||||
* Because urls are more a Spider's property than Site, move it to {@link Spider#addRequest(Request...)}
|
* Because urls are more a Spider's property than Site, move it to {@link Spider#addRequest(Request...)}
|
||||||
*
|
*
|
||||||
* @param startRequest
|
* @param startRequest startRequest
|
||||||
* @return this
|
* @return this
|
||||||
* @see Spider#addRequest(Request...)
|
* @see Spider#addRequest(Request...)
|
||||||
* @deprecated
|
* @deprecated
|
||||||
|
@ -269,7 +269,7 @@ public class Site {
|
||||||
* Set the interval between the processing of two pages.<br>
|
* Set the interval between the processing of two pages.<br>
|
||||||
* Time unit is micro seconds.<br>
|
* Time unit is micro seconds.<br>
|
||||||
*
|
*
|
||||||
* @param sleepTime
|
* @param sleepTime sleepTime
|
||||||
* @return this
|
* @return this
|
||||||
*/
|
*/
|
||||||
public Site setSleepTime(int sleepTime) {
|
public Site setSleepTime(int sleepTime) {
|
||||||
|
@ -349,7 +349,7 @@ public class Site {
|
||||||
/**
|
/**
|
||||||
* set up httpProxy for this site
|
* set up httpProxy for this site
|
||||||
*
|
*
|
||||||
* @param httpProxy
|
* @param httpProxy httpProxy
|
||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
public Site setHttpProxy(HttpHost httpProxy) {
|
public Site setHttpProxy(HttpHost httpProxy) {
|
||||||
|
@ -368,7 +368,7 @@ public class Site {
|
||||||
/**
|
/**
|
||||||
* Set the retry sleep time used when a download fails, 1000 by default. <br>
|
* Set the retry sleep time used when a download fails, 1000 by default. <br>
|
||||||
*
|
*
|
||||||
* @param retrySleepTime
|
* @param retrySleepTime retrySleepTime
|
||||||
*/
|
*/
|
||||||
public Site setRetrySleepTime(int retrySleepTime) {
|
public Site setRetrySleepTime(int retrySleepTime) {
|
||||||
this.retrySleepTime = retrySleepTime;
|
this.retrySleepTime = retrySleepTime;
|
||||||
|
@ -379,7 +379,7 @@ public class Site {
|
||||||
* Whether use gzip. <br>
|
* Whether use gzip. <br>
|
||||||
* Default is true, you can set it to false to disable gzip.
|
* Default is true, you can set it to false to disable gzip.
|
||||||
*
|
*
|
||||||
* @param useGzip
|
* @param useGzip useGzip
|
||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
public Site setUseGzip(boolean useGzip) {
|
public Site setUseGzip(boolean useGzip) {
|
||||||
|
|
|
@ -111,7 +111,7 @@ public class Spider implements Runnable, Task {
|
||||||
/**
|
/**
|
||||||
* create a spider with pageProcessor.
|
* create a spider with pageProcessor.
|
||||||
*
|
*
|
||||||
* @param pageProcessor
|
* @param pageProcessor pageProcessor
|
||||||
* @return new spider
|
* @return new spider
|
||||||
* @see PageProcessor
|
* @see PageProcessor
|
||||||
*/
|
*/
|
||||||
|
@ -122,7 +122,7 @@ public class Spider implements Runnable, Task {
|
||||||
/**
|
/**
|
||||||
* create a spider with pageProcessor.
|
* create a spider with pageProcessor.
|
||||||
*
|
*
|
||||||
* @param pageProcessor
|
* @param pageProcessor pageProcessor
|
||||||
*/
|
*/
|
||||||
public Spider(PageProcessor pageProcessor) {
|
public Spider(PageProcessor pageProcessor) {
|
||||||
this.pageProcessor = pageProcessor;
|
this.pageProcessor = pageProcessor;
|
||||||
|
@ -134,7 +134,7 @@ public class Spider implements Runnable, Task {
|
||||||
* Set startUrls of Spider.<br>
|
* Set startUrls of Spider.<br>
|
||||||
* Prior to startUrls of Site.
|
* Prior to startUrls of Site.
|
||||||
*
|
*
|
||||||
* @param startUrls
|
* @param startUrls startUrls
|
||||||
* @return this
|
* @return this
|
||||||
*/
|
*/
|
||||||
public Spider startUrls(List<String> startUrls) {
|
public Spider startUrls(List<String> startUrls) {
|
||||||
|
@ -147,7 +147,7 @@ public class Spider implements Runnable, Task {
|
||||||
* Set startUrls of Spider.<br>
|
* Set startUrls of Spider.<br>
|
||||||
* Prior to startUrls of Site.
|
* Prior to startUrls of Site.
|
||||||
*
|
*
|
||||||
* @param startRequests
|
* @param startRequests startRequests
|
||||||
* @return this
|
* @return this
|
||||||
*/
|
*/
|
||||||
public Spider startRequest(List<Request> startRequests) {
|
public Spider startRequest(List<Request> startRequests) {
|
||||||
|
@ -160,7 +160,7 @@ public class Spider implements Runnable, Task {
|
||||||
* Set an uuid for spider.<br>
|
* Set an uuid for spider.<br>
|
||||||
* Default uuid is domain of site.<br>
|
* Default uuid is domain of site.<br>
|
||||||
*
|
*
|
||||||
* @param uuid
|
* @param uuid uuid
|
||||||
* @return this
|
* @return this
|
||||||
*/
|
*/
|
||||||
public Spider setUUID(String uuid) {
|
public Spider setUUID(String uuid) {
|
||||||
|
@ -171,7 +171,7 @@ public class Spider implements Runnable, Task {
|
||||||
/**
|
/**
|
||||||
* set scheduler for Spider
|
* set scheduler for Spider
|
||||||
*
|
*
|
||||||
* @param scheduler
|
* @param scheduler scheduler
|
||||||
* @return this
|
* @return this
|
||||||
* @deprecated
|
* @deprecated
|
||||||
* @see #setScheduler(us.codecraft.webmagic.scheduler.Scheduler)
|
* @see #setScheduler(us.codecraft.webmagic.scheduler.Scheduler)
|
||||||
|
@ -183,7 +183,7 @@ public class Spider implements Runnable, Task {
|
||||||
/**
|
/**
|
||||||
* set scheduler for Spider
|
* set scheduler for Spider
|
||||||
*
|
*
|
||||||
* @param scheduler
|
* @param scheduler scheduler
|
||||||
* @return this
|
* @return this
|
||||||
* @see Scheduler
|
* @see Scheduler
|
||||||
* @since 0.2.1
|
* @since 0.2.1
|
||||||
|
@ -204,7 +204,7 @@ public class Spider implements Runnable, Task {
|
||||||
/**
|
/**
|
||||||
* add a pipeline for Spider
|
* add a pipeline for Spider
|
||||||
*
|
*
|
||||||
* @param pipeline
|
* @param pipeline pipeline
|
||||||
* @return this
|
* @return this
|
||||||
* @see #addPipeline(us.codecraft.webmagic.pipeline.Pipeline)
|
* @see #addPipeline(us.codecraft.webmagic.pipeline.Pipeline)
|
||||||
* @deprecated
|
* @deprecated
|
||||||
|
@ -216,7 +216,7 @@ public class Spider implements Runnable, Task {
|
||||||
/**
|
/**
|
||||||
* add a pipeline for Spider
|
* add a pipeline for Spider
|
||||||
*
|
*
|
||||||
* @param pipeline
|
* @param pipeline pipeline
|
||||||
* @return this
|
* @return this
|
||||||
* @see Pipeline
|
* @see Pipeline
|
||||||
* @since 0.2.1
|
* @since 0.2.1
|
||||||
|
@ -230,7 +230,7 @@ public class Spider implements Runnable, Task {
|
||||||
/**
|
/**
|
||||||
* set pipelines for Spider
|
* set pipelines for Spider
|
||||||
*
|
*
|
||||||
* @param pipelines
|
* @param pipelines pipelines
|
||||||
* @return this
|
* @return this
|
||||||
* @see Pipeline
|
* @see Pipeline
|
||||||
* @since 0.4.1
|
* @since 0.4.1
|
||||||
|
@ -254,7 +254,7 @@ public class Spider implements Runnable, Task {
|
||||||
/**
|
/**
|
||||||
* set the downloader of spider
|
* set the downloader of spider
|
||||||
*
|
*
|
||||||
* @param downloader
|
* @param downloader downloader
|
||||||
* @return this
|
* @return this
|
||||||
* @see #setDownloader(us.codecraft.webmagic.downloader.Downloader)
|
* @see #setDownloader(us.codecraft.webmagic.downloader.Downloader)
|
||||||
* @deprecated
|
* @deprecated
|
||||||
|
@ -266,7 +266,7 @@ public class Spider implements Runnable, Task {
|
||||||
/**
|
/**
|
||||||
* set the downloader of spider
|
* set the downloader of spider
|
||||||
*
|
*
|
||||||
* @param downloader
|
* @param downloader downloader
|
||||||
* @return this
|
* @return this
|
||||||
* @see Downloader
|
* @see Downloader
|
||||||
*/
|
*/
|
||||||
|
@ -468,7 +468,7 @@ public class Spider implements Runnable, Task {
|
||||||
/**
|
/**
|
||||||
* Add urls to crawl. <br/>
|
* Add urls to crawl. <br/>
|
||||||
*
|
*
|
||||||
* @param urls
|
* @param urls urls
|
||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
public Spider addUrl(String... urls) {
|
public Spider addUrl(String... urls) {
|
||||||
|
@ -482,7 +482,7 @@ public class Spider implements Runnable, Task {
|
||||||
/**
|
/**
|
||||||
* Download urls synchronously.
|
* Download urls synchronously.
|
||||||
*
|
*
|
||||||
* @param urls
|
* @param urls urls
|
||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
public <T> List<T> getAll(Collection<String> urls) {
|
public <T> List<T> getAll(Collection<String> urls) {
|
||||||
|
@ -517,7 +517,7 @@ public class Spider implements Runnable, Task {
|
||||||
/**
|
/**
|
||||||
* Add urls with information to crawl.<br/>
|
* Add urls with information to crawl.<br/>
|
||||||
*
|
*
|
||||||
* @param requests
|
* @param requests requests
|
||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
public Spider addRequest(Request... requests) {
|
public Spider addRequest(Request... requests) {
|
||||||
|
@ -567,7 +567,7 @@ public class Spider implements Runnable, Task {
|
||||||
/**
|
/**
|
||||||
* start with more than one thread
|
* start with more than one thread
|
||||||
*
|
*
|
||||||
* @param threadNum
|
* @param threadNum threadNum
|
||||||
* @return this
|
* @return this
|
||||||
*/
|
*/
|
||||||
public Spider thread(int threadNum) {
|
public Spider thread(int threadNum) {
|
||||||
|
@ -582,7 +582,7 @@ public class Spider implements Runnable, Task {
|
||||||
/**
|
/**
|
||||||
* start with more than one thread
|
* start with more than one thread
|
||||||
*
|
*
|
||||||
* @param threadNum
|
* @param threadNum threadNum
|
||||||
* @return this
|
* @return this
|
||||||
*/
|
*/
|
||||||
public Spider thread(ExecutorService executorService, int threadNum) {
|
public Spider thread(ExecutorService executorService, int threadNum) {
|
||||||
|
@ -603,7 +603,7 @@ public class Spider implements Runnable, Task {
|
||||||
* True: exit when all urls of the site are downloaded. <br/>
|
* True: exit when all urls of the site are downloaded. <br/>
|
||||||
* False: do not exit until stop() is called manually.<br/>
|
* False: do not exit until stop() is called manually.<br/>
|
||||||
*
|
*
|
||||||
* @param exitWhenComplete
|
* @param exitWhenComplete exitWhenComplete
|
||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
public Spider setExitWhenComplete(boolean exitWhenComplete) {
|
public Spider setExitWhenComplete(boolean exitWhenComplete) {
|
||||||
|
@ -679,7 +679,7 @@ public class Spider implements Runnable, Task {
|
||||||
* Add urls to download when it is true, and just download seed urls when it is false. <br>
|
* Add urls to download when it is true, and just download seed urls when it is false. <br>
|
||||||
* DO NOT set it unless you know what it means!
|
* DO NOT set it unless you know what it means!
|
||||||
*
|
*
|
||||||
* @param spawnUrl
|
* @param spawnUrl spawnUrl
|
||||||
* @return
|
* @return
|
||||||
* @since 0.4.0
|
* @since 0.4.0
|
||||||
*/
|
*/
|
||||||
|
|
|
@ -16,7 +16,7 @@ public abstract class AbstractDownloader implements Downloader {
|
||||||
/**
|
/**
|
||||||
* A simple method to download a url.
|
* A simple method to download a url.
|
||||||
*
|
*
|
||||||
* @param url
|
* @param url url
|
||||||
* @return html
|
* @return html
|
||||||
*/
|
*/
|
||||||
public Html download(String url) {
|
public Html download(String url) {
|
||||||
|
@ -26,7 +26,8 @@ public abstract class AbstractDownloader implements Downloader {
|
||||||
/**
|
/**
|
||||||
* A simple method to download a url.
|
* A simple method to download a url.
|
||||||
*
|
*
|
||||||
* @param url
|
* @param url url
|
||||||
|
* @param charset charset
|
||||||
* @return html
|
* @return html
|
||||||
*/
|
*/
|
||||||
public Html download(String url, String charset) {
|
public Html download(String url, String charset) {
|
||||||
|
|
|
@ -17,8 +17,8 @@ public interface Downloader {
|
||||||
/**
|
/**
|
||||||
* Downloads web pages and stores them in a Page object.
|
* Downloads web pages and stores them in a Page object.
|
||||||
*
|
*
|
||||||
* @param request
|
* @param request request
|
||||||
* @param task
|
* @param task task
|
||||||
* @return page
|
* @return page
|
||||||
*/
|
*/
|
||||||
public Page download(Request request, Task task);
|
public Page download(Request request, Task task);
|
||||||
|
|
|
@ -17,8 +17,8 @@ public interface Pipeline {
|
||||||
/**
|
/**
|
||||||
* Process extracted results.
|
* Process extracted results.
|
||||||
*
|
*
|
||||||
* @param resultItems
|
* @param resultItems resultItems
|
||||||
* @param task
|
* @param task task
|
||||||
*/
|
*/
|
||||||
public void process(ResultItems resultItems, Task task);
|
public void process(ResultItems resultItems, Task task);
|
||||||
}
|
}
|
||||||
|
|
|
@ -22,7 +22,7 @@ public interface PageProcessor {
|
||||||
/**
|
/**
|
||||||
* process the page, extract urls to fetch, extract the data and store
|
* process the page, extract urls to fetch, extract the data and store
|
||||||
*
|
*
|
||||||
* @param page
|
* @param page page
|
||||||
*/
|
*/
|
||||||
public void process(Page page);
|
public void process(Page page);
|
||||||
|
|
||||||
|
|
|
@ -17,8 +17,8 @@ public interface Scheduler {
|
||||||
/**
|
/**
|
||||||
* add a url to fetch
|
* add a url to fetch
|
||||||
*
|
*
|
||||||
* @param request
|
* @param request request
|
||||||
* @param task
|
* @param task task
|
||||||
*/
|
*/
|
||||||
public void push(Request request, Task task);
|
public void push(Request request, Task task);
|
||||||
|
|
||||||
|
|
|
@ -13,21 +13,21 @@ public interface DuplicateRemover {
|
||||||
*
|
*
|
||||||
* Check whether the request is duplicate.
|
* Check whether the request is duplicate.
|
||||||
*
|
*
|
||||||
* @param request
|
* @param request request
|
||||||
* @param task
|
* @param task task
|
||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
public boolean isDuplicate(Request request, Task task);
|
public boolean isDuplicate(Request request, Task task);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Reset duplicate check.
|
* Reset duplicate check.
|
||||||
* @param task
|
* @param task task
|
||||||
*/
|
*/
|
||||||
public void resetDuplicateCheck(Task task);
|
public void resetDuplicateCheck(Task task);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Get TotalRequestsCount for monitor.
|
* Get TotalRequestsCount for monitor.
|
||||||
* @param task
|
* @param task task
|
||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
public int getTotalRequestsCount(Task task);
|
public int getTotalRequestsCount(Task task);
|
||||||
|
|
|
@ -16,7 +16,7 @@ public interface ElementSelector {
|
||||||
* Extract single result in text.<br>
|
* Extract single result in text.<br>
|
||||||
* If there is more than one result, only the first will be chosen.
|
* If there is more than one result, only the first will be chosen.
|
||||||
*
|
*
|
||||||
* @param element
|
* @param element element
|
||||||
* @return result
|
* @return result
|
||||||
*/
|
*/
|
||||||
public String select(Element element);
|
public String select(Element element);
|
||||||
|
@ -24,7 +24,7 @@ public interface ElementSelector {
|
||||||
/**
|
/**
|
||||||
* Extract all results in text.<br>
|
* Extract all results in text.<br>
|
||||||
*
|
*
|
||||||
* @param element
|
* @param element element
|
||||||
* @return results
|
* @return results
|
||||||
*/
|
*/
|
||||||
public List<String> selectList(Element element);
|
public List<String> selectList(Element element);
|
||||||
|
|
|
@ -68,7 +68,7 @@ public class Html extends HtmlNode {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @param selector
|
* @param selector selector
|
||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
public String selectDocument(Selector selector) {
|
public String selectDocument(Selector selector) {
|
||||||
|
|
|
@ -59,7 +59,7 @@ public class HtmlNode extends AbstractSelectable {
|
||||||
/**
|
/**
|
||||||
* select elements
|
* select elements
|
||||||
*
|
*
|
||||||
* @param elementSelector
|
* @param elementSelector elementSelector
|
||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
protected Selectable selectElements(BaseElementSelector elementSelector) {
|
protected Selectable selectElements(BaseElementSelector elementSelector) {
|
||||||
|
@ -89,8 +89,8 @@ public class HtmlNode extends AbstractSelectable {
|
||||||
* Only document can be selected
|
* Only document can be selected
|
||||||
* See: https://github.com/code4craft/webmagic/issues/113
|
* See: https://github.com/code4craft/webmagic/issues/113
|
||||||
*
|
*
|
||||||
* @param elementIterator
|
* @param elementIterator elementIterator
|
||||||
* @param element
|
* @param element element
|
||||||
*/
|
*/
|
||||||
private Element checkElementAndConvert(ListIterator<Element> elementIterator) {
|
private Element checkElementAndConvert(ListIterator<Element> elementIterator) {
|
||||||
Element element = elementIterator.next();
|
Element element = elementIterator.next();
|
||||||
|
|
|
@ -22,7 +22,7 @@ public class Json extends PlainText {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* remove padding for JSONP
|
* remove padding for JSONP
|
||||||
* @param padding
|
* @param padding padding
|
||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
public Json removePadding(String padding) {
|
public Json removePadding(String padding) {
|
||||||
|
|
|
@ -13,7 +13,7 @@ public interface Selectable {
|
||||||
/**
|
/**
|
||||||
* select list with xpath
|
* select list with xpath
|
||||||
*
|
*
|
||||||
* @param xpath
|
* @param xpath xpath
|
||||||
* @return new Selectable after extract
|
* @return new Selectable after extract
|
||||||
*/
|
*/
|
||||||
public Selectable xpath(String xpath);
|
public Selectable xpath(String xpath);
|
||||||
|
@ -69,7 +69,7 @@ public interface Selectable {
|
||||||
/**
|
/**
|
||||||
* select list with regex, default group is group 1
|
* select list with regex, default group is group 1
|
||||||
*
|
*
|
||||||
* @param regex
|
* @param regex regex
|
||||||
* @return new Selectable after extract
|
* @return new Selectable after extract
|
||||||
*/
|
*/
|
||||||
public Selectable regex(String regex);
|
public Selectable regex(String regex);
|
||||||
|
@ -77,8 +77,8 @@ public interface Selectable {
|
||||||
/**
|
/**
|
||||||
* select list with regex
|
* select list with regex
|
||||||
*
|
*
|
||||||
* @param regex
|
* @param regex regex
|
||||||
* @param group
|
* @param group group
|
||||||
* @return new Selectable after extract
|
* @return new Selectable after extract
|
||||||
*/
|
*/
|
||||||
public Selectable regex(String regex, int group);
|
public Selectable regex(String regex, int group);
|
||||||
|
@ -86,8 +86,8 @@ public interface Selectable {
|
||||||
/**
|
/**
|
||||||
* replace with regex
|
* replace with regex
|
||||||
*
|
*
|
||||||
* @param regex
|
* @param regex regex
|
||||||
* @param replacement
|
* @param replacement replacement
|
||||||
* @return new Selectable after extract
|
* @return new Selectable after extract
|
||||||
*/
|
*/
|
||||||
public Selectable replace(String regex, String replacement);
|
public Selectable replace(String regex, String replacement);
|
||||||
|
@ -123,7 +123,7 @@ public interface Selectable {
|
||||||
/**
|
/**
|
||||||
* extract by JSON Path expression
|
* extract by JSON Path expression
|
||||||
*
|
*
|
||||||
* @param jsonPath
|
* @param jsonPath jsonPath
|
||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
public Selectable jsonPath(String jsonPath);
|
public Selectable jsonPath(String jsonPath);
|
||||||
|
@ -131,7 +131,7 @@ public interface Selectable {
|
||||||
/**
|
/**
|
||||||
* extract by custom selector
|
* extract by custom selector
|
||||||
*
|
*
|
||||||
* @param selector
|
* @param selector selector
|
||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
public Selectable select(Selector selector);
|
public Selectable select(Selector selector);
|
||||||
|
@ -139,7 +139,7 @@ public interface Selectable {
|
||||||
/**
|
/**
|
||||||
* extract by custom selector
|
* extract by custom selector
|
||||||
*
|
*
|
||||||
* @param selector
|
* @param selector selector
|
||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
public Selectable selectList(Selector selector);
|
public Selectable selectList(Selector selector);
|
||||||
|
|
|
@ -14,7 +14,7 @@ public interface Selector {
|
||||||
* Extract single result in text.<br>
|
* Extract single result in text.<br>
|
||||||
* If there is more than one result, only the first will be chosen.
|
* If there is more than one result, only the first will be chosen.
|
||||||
*
|
*
|
||||||
* @param text
|
* @param text text
|
||||||
* @return result
|
* @return result
|
||||||
*/
|
*/
|
||||||
public String select(String text);
|
public String select(String text);
|
||||||
|
@ -22,7 +22,7 @@ public interface Selector {
|
||||||
/**
|
/**
|
||||||
* Extract all results in text.<br>
|
* Extract all results in text.<br>
|
||||||
*
|
*
|
||||||
* @param text
|
* @param text text
|
||||||
* @return results
|
* @return results
|
||||||
*/
|
*/
|
||||||
public List<String> selectList(String text);
|
public List<String> selectList(String text);
|
||||||
|
|
|
@ -25,8 +25,8 @@ public class UrlUtils {
|
||||||
* <p/>
|
* <p/>
|
||||||
* Borrowed from Jsoup.
|
* Borrowed from Jsoup.
|
||||||
*
|
*
|
||||||
* @param url
|
* @param url url
|
||||||
* @param refer
|
* @param refer refer
|
||||||
* @return canonicalizeUrl
|
* @return canonicalizeUrl
|
||||||
*/
|
*/
|
||||||
public static String canonicalizeUrl(String url, String refer) {
|
public static String canonicalizeUrl(String url, String refer) {
|
||||||
|
@ -51,7 +51,7 @@ public class UrlUtils {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
*
|
*
|
||||||
* @param url
|
* @param url url
|
||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
public static String encodeIllegalCharacterInUrl(String url) {
|
public static String encodeIllegalCharacterInUrl(String url) {
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<artifactId>webmagic-parent</artifactId>
|
<artifactId>webmagic-parent</artifactId>
|
||||||
<version>0.5.3-SNAPSHOT</version>
|
<version>0.5.3</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,29 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||||
|
<parent>
|
||||||
|
<groupId>us.codecraft</groupId>
|
||||||
|
<artifactId>webmagic-parent</artifactId>
|
||||||
|
<version>0.5.3-SNAPSHOT</version>
|
||||||
|
</parent>
|
||||||
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
<artifactId>webmagic-extension</artifactId>
|
||||||
|
|
||||||
|
<dependencies>
|
||||||
|
<dependency>
|
||||||
|
<groupId>redis.clients</groupId>
|
||||||
|
<artifactId>jedis</artifactId>
|
||||||
|
<version>2.0.0</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>us.codecraft</groupId>
|
||||||
|
<artifactId>webmagic-core</artifactId>
|
||||||
|
<version>${project.version}</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>junit</groupId>
|
||||||
|
<artifactId>junit</artifactId>
|
||||||
|
</dependency>
|
||||||
|
</dependencies>
|
||||||
|
|
||||||
|
</project>
|
|
@ -38,7 +38,7 @@ public interface MultiPageModel {
|
||||||
/**
|
/**
|
||||||
* Combine multiPageModels to a whole object.
|
* Combine multiPageModels to a whole object.
|
||||||
*
|
*
|
||||||
* @param multiPageModel
|
* @param multiPageModel multiPageModel
|
||||||
* @return multiPageModel combined
|
* @return multiPageModel combined
|
||||||
*/
|
*/
|
||||||
public MultiPageModel combine(MultiPageModel multiPageModel);
|
public MultiPageModel combine(MultiPageModel multiPageModel);
|
||||||
|
|
|
@ -12,7 +12,7 @@ public interface RequestMatcher {
|
||||||
* Check whether to process the page.<br>
|
* Check whether to process the page.<br>
|
||||||
* Please DO NOT change page status in this method.
|
* Please DO NOT change page status in this method.
|
||||||
*
|
*
|
||||||
* @param page
|
* @param page page
|
||||||
*
|
*
|
||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
|
|
|
@ -11,7 +11,7 @@ public interface SubPageProcessor extends RequestMatcher {
|
||||||
/**
|
/**
|
||||||
* process the page, extract urls to fetch, extract the data and store
|
* process the page, extract urls to fetch, extract the data and store
|
||||||
*
|
*
|
||||||
* @param page
|
* @param page page
|
||||||
*
|
*
|
||||||
* @return whether continue to match
|
* @return whether continue to match
|
||||||
*/
|
*/
|
||||||
|
|
|
@ -12,8 +12,8 @@ public interface SubPipeline extends RequestMatcher {
|
||||||
/**
|
/**
|
||||||
* process the page, extract urls to fetch, extract the data and store
|
* process the page, extract urls to fetch, extract the data and store
|
||||||
*
|
*
|
||||||
* @param page
|
* @param page page
|
||||||
* @param task
|
* @param task task
|
||||||
* @return whether continue to match
|
* @return whether continue to match
|
||||||
*/
|
*/
|
||||||
public MatchOther processResult(ResultItems resultItems, Task task);
|
public MatchOther processResult(ResultItems resultItems, Task task);
|
||||||
|
|
|
@ -60,9 +60,9 @@ public class OOSpider<T> extends Spider {
|
||||||
/**
|
/**
|
||||||
* create a spider
|
* create a spider
|
||||||
*
|
*
|
||||||
* @param site
|
* @param site site
|
||||||
* @param pageModelPipeline
|
* @param pageModelPipeline pageModelPipeline
|
||||||
* @param pageModels
|
* @param pageModels pageModels
|
||||||
*/
|
*/
|
||||||
public OOSpider(Site site, PageModelPipeline pageModelPipeline, Class... pageModels) {
|
public OOSpider(Site site, PageModelPipeline pageModelPipeline, Class... pageModels) {
|
||||||
this(ModelPageProcessor.create(site, pageModels));
|
this(ModelPageProcessor.create(site, pageModels));
|
||||||
|
|
|
@ -42,7 +42,7 @@ public class SpiderMonitor {
|
||||||
/**
|
/**
|
||||||
* Register spider for monitor.
|
* Register spider for monitor.
|
||||||
*
|
*
|
||||||
* @param spiders
|
* @param spiders spiders
|
||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
public synchronized SpiderMonitor register(Spider... spiders) throws JMException {
|
public synchronized SpiderMonitor register(Spider... spiders) throws JMException {
|
||||||
|
|
|
@ -30,7 +30,7 @@ public class DoubleKeyMap<K1, K2, V> extends MultiKeyMapBase {
|
||||||
/**
|
/**
|
||||||
* init map with protoMapClass
|
* init map with protoMapClass
|
||||||
*
|
*
|
||||||
* @param protoMapClass
|
* @param protoMapClass protoMapClass
|
||||||
*/
|
*/
|
||||||
@SuppressWarnings("rawtypes")
|
@SuppressWarnings("rawtypes")
|
||||||
public DoubleKeyMap(Map<K1, Map<K2, V>> map, Class<? extends Map> protoMapClass) {
|
public DoubleKeyMap(Map<K1, Map<K2, V>> map, Class<? extends Map> protoMapClass) {
|
||||||
|
@ -40,7 +40,7 @@ public class DoubleKeyMap<K1, K2, V> extends MultiKeyMapBase {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @param key
|
* @param key key
|
||||||
* @return map
|
* @return map
|
||||||
*/
|
*/
|
||||||
public Map<K2, V> get(K1 key) {
|
public Map<K2, V> get(K1 key) {
|
||||||
|
@ -48,8 +48,8 @@ public class DoubleKeyMap<K1, K2, V> extends MultiKeyMapBase {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @param key1
|
* @param key1 key1
|
||||||
* @param key2
|
* @param key2 key2
|
||||||
* @return value
|
* @return value
|
||||||
*/
|
*/
|
||||||
public V get(K1 key1, K2 key2) {
|
public V get(K1 key1, K2 key2) {
|
||||||
|
@ -61,8 +61,8 @@ public class DoubleKeyMap<K1, K2, V> extends MultiKeyMapBase {
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @param key1
|
* @param key1 key1
|
||||||
* @param submap
|
* @param submap submap
|
||||||
* @return value
|
* @return value
|
||||||
*/
|
*/
|
||||||
public V put(K1 key1, Map<K2, V> submap) {
|
public V put(K1 key1, Map<K2, V> submap) {
|
||||||
|
@ -70,9 +70,9 @@ public class DoubleKeyMap<K1, K2, V> extends MultiKeyMapBase {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @param key1
|
* @param key1 key1
|
||||||
* @param key2
|
* @param key2 key2
|
||||||
* @param value
|
* @param value value
|
||||||
* @return value
|
* @return value
|
||||||
*/
|
*/
|
||||||
public synchronized V put(K1 key1, K2 key2, V value) {
|
public synchronized V put(K1 key1, K2 key2, V value) {
|
||||||
|
@ -84,8 +84,8 @@ public class DoubleKeyMap<K1, K2, V> extends MultiKeyMapBase {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @param key1
|
* @param key1 key1
|
||||||
* @param key2
|
* @param key2 key2
|
||||||
* @return value
|
* @return value
|
||||||
*/
|
*/
|
||||||
public synchronized V remove(K1 key1, K2 key2) {
|
public synchronized V remove(K1 key1, K2 key2) {
|
||||||
|
@ -100,7 +100,7 @@ public class DoubleKeyMap<K1, K2, V> extends MultiKeyMapBase {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @param key1
|
* @param key1 key1
|
||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
public Map<K2, V> remove(K1 key1) {
|
public Map<K2, V> remove(K1 key1) {
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<artifactId>webmagic-parent</artifactId>
|
<artifactId>webmagic-parent</artifactId>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<version>0.5.3-SNAPSHOT</version>
|
<version>0.5.3</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,73 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||||
|
<parent>
|
||||||
|
<artifactId>webmagic-parent</artifactId>
|
||||||
|
<groupId>us.codecraft</groupId>
|
||||||
|
<version>0.5.3-SNAPSHOT</version>
|
||||||
|
</parent>
|
||||||
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
<artifactId>webmagic-samples</artifactId>
|
||||||
|
|
||||||
|
<dependencies>
|
||||||
|
<dependency>
|
||||||
|
<groupId>us.codecraft</groupId>
|
||||||
|
<artifactId>webmagic-core</artifactId>
|
||||||
|
<version>${project.version}</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>us.codecraft</groupId>
|
||||||
|
<artifactId>webmagic-extension</artifactId>
|
||||||
|
<version>${project.version}</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>junit</groupId>
|
||||||
|
<artifactId>junit</artifactId>
|
||||||
|
</dependency>
|
||||||
|
</dependencies>
|
||||||
|
|
||||||
|
<build>
|
||||||
|
<plugins>
|
||||||
|
<plugin>
|
||||||
|
<artifactId>maven-deploy-plugin</artifactId>
|
||||||
|
<configuration>
|
||||||
|
<skip>true</skip>
|
||||||
|
</configuration>
|
||||||
|
</plugin>
|
||||||
|
<plugin>
|
||||||
|
<groupId>org.apache.maven.plugins</groupId>
|
||||||
|
<artifactId>maven-dependency-plugin</artifactId>
|
||||||
|
<executions>
|
||||||
|
<execution>
|
||||||
|
<id>copy-dependencies</id>
|
||||||
|
<phase>package</phase>
|
||||||
|
<goals>
|
||||||
|
<goal>copy-dependencies</goal>
|
||||||
|
</goals>
|
||||||
|
<configuration>
|
||||||
|
<outputDirectory>${project.build.directory}/lib</outputDirectory>
|
||||||
|
<overWriteReleases>false</overWriteReleases>
|
||||||
|
<overWriteSnapshots>false</overWriteSnapshots>
|
||||||
|
<overWriteIfNewer>true</overWriteIfNewer>
|
||||||
|
</configuration>
|
||||||
|
</execution>
|
||||||
|
</executions>
|
||||||
|
</plugin>
|
||||||
|
<plugin>
|
||||||
|
<groupId>org.apache.maven.plugins</groupId>
|
||||||
|
<artifactId>maven-jar-plugin</artifactId>
|
||||||
|
<version>2.4</version>
|
||||||
|
<configuration>
|
||||||
|
<archive>
|
||||||
|
<manifest>
|
||||||
|
<addClasspath>true</addClasspath>
|
||||||
|
<classpathPrefix>./lib/</classpathPrefix>
|
||||||
|
<mainClass>us.codecraft.webmagic.main.QuickStarter</mainClass>
|
||||||
|
</manifest>
|
||||||
|
</archive>
|
||||||
|
</configuration>
|
||||||
|
</plugin>
|
||||||
|
</plugins>
|
||||||
|
</build>
|
||||||
|
|
||||||
|
</project>
|
|
@ -3,7 +3,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<artifactId>webmagic-parent</artifactId>
|
<artifactId>webmagic-parent</artifactId>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<version>0.5.3-SNAPSHOT</version>
|
<version>0.5.3</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,45 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||||
|
<parent>
|
||||||
|
<artifactId>webmagic-parent</artifactId>
|
||||||
|
<groupId>us.codecraft</groupId>
|
||||||
|
<version>0.5.3-SNAPSHOT</version>
|
||||||
|
</parent>
|
||||||
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
<artifactId>webmagic-saxon</artifactId>
|
||||||
|
|
||||||
|
<dependencies>
|
||||||
|
<dependency>
|
||||||
|
<groupId>us.codecraft</groupId>
|
||||||
|
<artifactId>webmagic-core</artifactId>
|
||||||
|
<version>${project.version}</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>net.sourceforge.htmlcleaner</groupId>
|
||||||
|
<artifactId>htmlcleaner</artifactId>
|
||||||
|
<version>2.5</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>net.sf.saxon</groupId>
|
||||||
|
<artifactId>Saxon-HE</artifactId>
|
||||||
|
<version>9.5.1-1</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>junit</groupId>
|
||||||
|
<artifactId>junit</artifactId>
|
||||||
|
</dependency>
|
||||||
|
</dependencies>
|
||||||
|
|
||||||
|
<build>
|
||||||
|
<plugins>
|
||||||
|
<plugin>
|
||||||
|
<artifactId>maven-deploy-plugin</artifactId>
|
||||||
|
<configuration>
|
||||||
|
<skip>true</skip>
|
||||||
|
</configuration>
|
||||||
|
</plugin>
|
||||||
|
</plugins>
|
||||||
|
</build>
|
||||||
|
|
||||||
|
</project>
|
|
@ -3,7 +3,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<artifactId>webmagic-parent</artifactId>
|
<artifactId>webmagic-parent</artifactId>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<version>0.5.3-SNAPSHOT</version>
|
<version>0.5.3</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,79 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||||
|
<parent>
|
||||||
|
<artifactId>webmagic-parent</artifactId>
|
||||||
|
<groupId>us.codecraft</groupId>
|
||||||
|
<version>0.5.3-SNAPSHOT</version>
|
||||||
|
</parent>
|
||||||
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
<groupId>us.codecraft</groupId>
|
||||||
|
<artifactId>webmagic-scripts</artifactId>
|
||||||
|
|
||||||
|
<dependencies>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.jruby</groupId>
|
||||||
|
<artifactId>jruby</artifactId>
|
||||||
|
<version>1.7.6</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency><groupId>org.python</groupId>
|
||||||
|
<artifactId>jython</artifactId>
|
||||||
|
<version>2.5.3</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>commons-cli</groupId>
|
||||||
|
<artifactId>commons-cli</artifactId>
|
||||||
|
<version>1.2</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>junit</groupId>
|
||||||
|
<artifactId>junit</artifactId>
|
||||||
|
<scope>test</scope>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>us.codecraft</groupId>
|
||||||
|
<artifactId>webmagic-core</artifactId>
|
||||||
|
<version>${project.version}</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>us.codecraft</groupId>
|
||||||
|
<artifactId>webmagic-extension</artifactId>
|
||||||
|
<version>${project.version}</version>
|
||||||
|
</dependency>
|
||||||
|
</dependencies>
|
||||||
|
|
||||||
|
<build>
|
||||||
|
<plugins>
|
||||||
|
<plugin>
|
||||||
|
<artifactId>maven-compiler-plugin</artifactId>
|
||||||
|
<configuration>
|
||||||
|
<source>1.6</source>
|
||||||
|
<target>1.6</target>
|
||||||
|
<encoding>UTF-8</encoding>
|
||||||
|
</configuration>
|
||||||
|
</plugin>
|
||||||
|
<plugin>
|
||||||
|
<groupId>org.apache.maven.plugins</groupId>
|
||||||
|
<artifactId>maven-resources-plugin</artifactId>
|
||||||
|
<configuration>
|
||||||
|
<encoding>UTF-8</encoding>
|
||||||
|
</configuration>
|
||||||
|
</plugin>
|
||||||
|
<plugin>
|
||||||
|
<groupId>org.apache.maven.plugins</groupId>
|
||||||
|
<artifactId>maven-jar-plugin</artifactId>
|
||||||
|
<configuration>
|
||||||
|
<archive>
|
||||||
|
<manifest>
|
||||||
|
<addClasspath>true</addClasspath>
|
||||||
|
<classpathPrefix>./lib/</classpathPrefix>
|
||||||
|
<mainClass>us.codecraft.webmagic.scripts.ScriptConsole</mainClass>
|
||||||
|
</manifest>
|
||||||
|
</archive>
|
||||||
|
</configuration>
|
||||||
|
</plugin>
|
||||||
|
</plugins>
|
||||||
|
</build>
|
||||||
|
|
||||||
|
|
||||||
|
</project>
|
|
@ -1,10 +1,9 @@
|
||||||
<?xml version="1.0" encoding="UTF-8"?>
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
|
||||||
<parent>
|
<parent>
|
||||||
<artifactId>webmagic-parent</artifactId>
|
<artifactId>webmagic-parent</artifactId>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<version>0.5.3-SNAPSHOT</version>
|
<version>0.5.3</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,49 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||||
|
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||||
|
<parent>
|
||||||
|
<artifactId>webmagic-parent</artifactId>
|
||||||
|
<groupId>us.codecraft</groupId>
|
||||||
|
<version>0.5.3-SNAPSHOT</version>
|
||||||
|
</parent>
|
||||||
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
<artifactId>webmagic-selenium</artifactId>
|
||||||
|
|
||||||
|
<dependencies>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.seleniumhq.selenium</groupId>
|
||||||
|
<artifactId>selenium-java</artifactId>
|
||||||
|
<version>2.46.0</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>us.codecraft</groupId>
|
||||||
|
<artifactId>webmagic-core</artifactId>
|
||||||
|
<version>${project.version}</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>com.github.detro</groupId>
|
||||||
|
<artifactId>phantomjsdriver</artifactId>
|
||||||
|
<version>1.2.0</version>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>junit</groupId>
|
||||||
|
<artifactId>junit</artifactId>
|
||||||
|
</dependency>
|
||||||
|
</dependencies>
|
||||||
|
|
||||||
|
<build>
|
||||||
|
<plugins>
|
||||||
|
<plugin>
|
||||||
|
<artifactId>maven-deploy-plugin</artifactId>
|
||||||
|
<configuration>
|
||||||
|
<skip>true</skip>
|
||||||
|
</configuration>
|
||||||
|
</plugin>
|
||||||
|
</plugins>
|
||||||
|
</build>
|
||||||
|
|
||||||
|
</project>
|
|
@ -42,7 +42,7 @@ public class SeleniumDownloader implements Downloader, Closeable {
|
||||||
/**
|
/**
|
||||||
* 新建
|
* 新建
|
||||||
*
|
*
|
||||||
* @param chromeDriverPath
|
* @param chromeDriverPath chromeDriverPath
|
||||||
*/
|
*/
|
||||||
public SeleniumDownloader(String chromeDriverPath) {
|
public SeleniumDownloader(String chromeDriverPath) {
|
||||||
System.getProperties().setProperty("webdriver.chrome.driver",
|
System.getProperties().setProperty("webdriver.chrome.driver",
|
||||||
|
@ -62,7 +62,7 @@ public class SeleniumDownloader implements Downloader, Closeable {
|
||||||
/**
|
/**
|
||||||
* set sleep time to wait until load success
|
* set sleep time to wait until load success
|
||||||
*
|
*
|
||||||
* @param sleepTime
|
* @param sleepTime sleepTime
|
||||||
* @return this
|
* @return this
|
||||||
*/
|
*/
|
||||||
public SeleniumDownloader setSleepTime(int sleepTime) {
|
public SeleniumDownloader setSleepTime(int sleepTime) {
|
||||||
|
|
|
@ -141,7 +141,7 @@ class WebDriverPool {
|
||||||
* check whether input is a valid URL
|
* check whether input is a valid URL
|
||||||
*
|
*
|
||||||
* @author bob.li.0718@gmail.com
|
* @author bob.li.0718@gmail.com
|
||||||
* @param urlString
|
* @param urlString urlString
|
||||||
* @return true means yes, otherwise no.
|
* @return true means yes, otherwise no.
|
||||||
*/
|
*/
|
||||||
private boolean isUrl(String urlString) {
|
private boolean isUrl(String urlString) {
|
||||||
|
|
Loading…
Reference in New Issue