Merge branch 'release/0.7.6'

master
Sutra Zhou 2022-10-24 00:10:29 +08:00
commit dc7218eba3
31 changed files with 484 additions and 335 deletions

124
pom.xml
View File

@ -1,7 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?> <?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<groupId>us.codecraft</groupId> <groupId>us.codecraft</groupId>
<version>0.7.5</version> <version>0.7.6</version>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<packaging>pom</packaging> <packaging>pom</packaging>
<properties> <properties>
@ -9,7 +9,31 @@
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding> <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
<maven.compiler.source>1.8</maven.compiler.source> <maven.compiler.source>1.8</maven.compiler.source>
<maven.compiler.target>1.8</maven.compiler.target> <maven.compiler.target>1.8</maven.compiler.target>
<assertj.version>3.23.1</assertj.version>
<commons-cli.version>1.5.0</commons-cli.version>
<commons-collections4.version>4.4</commons-collections4.version>
<commons-io.version>2.11.0</commons-io.version>
<commons-lang3.version>3.12.0</commons-lang3.version>
<fastjson.version>2.0.14.graal</fastjson.version>
<groovy-all.version>3.0.13</groovy-all.version>
<guava.version>31.1-jre</guava.version>
<htmlcleaner.version>2.26</htmlcleaner.version>
<httpclient.version>4.5.13</httpclient.version>
<httpcore.version>4.4.15</httpcore.version>
<jedis.version>3.7.1</jedis.version>
<jruby.version>9.3.8.0</jruby.version>
<json-path.version>2.7.0</json-path.version>
<junit.version>4.13.2</junit.version>
<jython.version>2.7.3</jython.version>
<log4j.version>1.2.17</log4j.version>
<mockito-all.version>2.0.2-beta</mockito-all.version>
<moco.version>1.3.0</moco.version>
<phantomjsdriver.version>1.2.0</phantomjsdriver.version>
<saxon-he.version>11.4</saxon-he.version>
<selenium-java.version>3.141.59</selenium-java.version>
<slf4j.version>2.0.3</slf4j.version>
<spring-version>4.0.0.RELEASE</spring-version> <spring-version>4.0.0.RELEASE</spring-version>
<xsoup.version>0.3.5</xsoup.version>
</properties> </properties>
<artifactId>webmagic-parent</artifactId> <artifactId>webmagic-parent</artifactId>
<name>webmagic-parent</name> <name>webmagic-parent</name>
@ -58,59 +82,59 @@
<dependency> <dependency>
<groupId>junit</groupId> <groupId>junit</groupId>
<artifactId>junit</artifactId> <artifactId>junit</artifactId>
<version>4.13.1</version> <version>${junit.version}</version>
<scope>test</scope> <scope>test</scope>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.mockito</groupId> <groupId>org.mockito</groupId>
<artifactId>mockito-all</artifactId> <artifactId>mockito-all</artifactId>
<version>1.10.19</version> <version>${mockito-all.version}</version>
<scope>test</scope> <scope>test</scope>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.apache.httpcomponents</groupId> <groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId> <artifactId>httpclient</artifactId>
<version>4.5.13</version> <version>${httpclient.version}</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.apache.httpcomponents</groupId> <groupId>org.apache.httpcomponents</groupId>
<artifactId>httpcore</artifactId> <artifactId>httpcore</artifactId>
<version>4.4.14</version> <version>${httpcore.version}</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>com.google.guava</groupId> <groupId>com.google.guava</groupId>
<artifactId>guava</artifactId> <artifactId>guava</artifactId>
<version>30.1-jre</version> <version>${guava.version}</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>com.jayway.jsonpath</groupId> <groupId>com.jayway.jsonpath</groupId>
<artifactId>json-path</artifactId> <artifactId>json-path</artifactId>
<version>2.6.0</version> <version>${json-path.version}</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.slf4j</groupId> <groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId> <artifactId>slf4j-api</artifactId>
<version>1.7.30</version> <version>${slf4j.version}</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.slf4j</groupId> <groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId> <artifactId>slf4j-log4j12</artifactId>
<version>1.7.30</version> <version>${slf4j.version}</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>us.codecraft</groupId> <groupId>us.codecraft</groupId>
<artifactId>xsoup</artifactId> <artifactId>xsoup</artifactId>
<version>0.3.4</version> <version>${xsoup.version}</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>com.alibaba</groupId> <groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId> <artifactId>fastjson</artifactId>
<version>1.2.83</version> <version>${fastjson.version}</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>com.github.dreamhead</groupId> <groupId>com.github.dreamhead</groupId>
<artifactId>moco-core</artifactId> <artifactId>moco-core</artifactId>
<version>1.3.0</version> <version>${moco.version}</version>
<scope>test</scope> <scope>test</scope>
<exclusions> <exclusions>
<exclusion> <exclusion>
@ -122,73 +146,73 @@
<dependency> <dependency>
<groupId>log4j</groupId> <groupId>log4j</groupId>
<artifactId>log4j</artifactId> <artifactId>log4j</artifactId>
<version>1.2.17</version> <version>${log4j.version}</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.assertj</groupId> <groupId>org.assertj</groupId>
<artifactId>assertj-core</artifactId> <artifactId>assertj-core</artifactId>
<version>3.18.1</version> <version>${assertj.version}</version>
<scope>test</scope> <scope>test</scope>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.apache.commons</groupId> <groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId> <artifactId>commons-lang3</artifactId>
<version>3.11</version> <version>${commons-lang3.version}</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>commons-collections</groupId> <groupId>org.apache.commons</groupId>
<artifactId>commons-collections</artifactId> <artifactId>commons-collections4</artifactId>
<version>3.2.2</version> <version>${commons-collections4.version}</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>commons-io</groupId> <groupId>commons-io</groupId>
<artifactId>commons-io</artifactId> <artifactId>commons-io</artifactId>
<version>2.8.0</version> <version>${commons-io.version}</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.codehaus.groovy</groupId> <groupId>org.codehaus.groovy</groupId>
<artifactId>groovy-all</artifactId> <artifactId>groovy-all</artifactId>
<version>3.0.7</version> <version>${groovy-all.version}</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.jruby</groupId> <groupId>org.jruby</groupId>
<artifactId>jruby</artifactId> <artifactId>jruby</artifactId>
<version>9.3.0.0</version> <version>${jruby.version}</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.python</groupId> <groupId>org.python</groupId>
<artifactId>jython</artifactId> <artifactId>jython</artifactId>
<version>2.7.2</version> <version>${jython.version}</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.seleniumhq.selenium</groupId> <groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-java</artifactId> <artifactId>selenium-java</artifactId>
<version>4.0.0</version> <version>${selenium-java.version}</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>net.sf.saxon</groupId> <groupId>net.sf.saxon</groupId>
<artifactId>Saxon-HE</artifactId> <artifactId>Saxon-HE</artifactId>
<version>10.3</version> <version>${saxon-he.version}</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>net.sourceforge.htmlcleaner</groupId> <groupId>net.sourceforge.htmlcleaner</groupId>
<artifactId>htmlcleaner</artifactId> <artifactId>htmlcleaner</artifactId>
<version>2.26</version> <version>${htmlcleaner.version}</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>com.github.detro</groupId> <groupId>com.github.detro</groupId>
<artifactId>phantomjsdriver</artifactId> <artifactId>phantomjsdriver</artifactId>
<version>1.2.0</version> <version>${phantomjsdriver.version}</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>commons-cli</groupId> <groupId>commons-cli</groupId>
<artifactId>commons-cli</artifactId> <artifactId>commons-cli</artifactId>
<version>1.4</version> <version>${commons-cli.version}</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>redis.clients</groupId> <groupId>redis.clients</groupId>
<artifactId>jedis</artifactId> <artifactId>jedis</artifactId>
<version>3.6.0</version> <version>${jedis.version}</version>
</dependency> </dependency>
</dependencies> </dependencies>
</dependencyManagement> </dependencyManagement>
@ -198,7 +222,7 @@
<plugin> <plugin>
<groupId>org.apache.maven.plugins</groupId> <groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-enforcer-plugin</artifactId> <artifactId>maven-enforcer-plugin</artifactId>
<version>3.0.0-M3</version> <version>3.1.0</version>
<executions> <executions>
<execution> <execution>
<id>enforce-maven</id> <id>enforce-maven</id>
@ -272,7 +296,7 @@
<plugin> <plugin>
<groupId>org.apache.maven.plugins</groupId> <groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-javadoc-plugin</artifactId> <artifactId>maven-javadoc-plugin</artifactId>
<version>3.2.0</version> <version>3.4.1</version>
<configuration> <configuration>
<encoding>UTF-8</encoding> <encoding>UTF-8</encoding>
<doctitle>WebMagic ${project.version}</doctitle> <doctitle>WebMagic ${project.version}</doctitle>
@ -301,7 +325,7 @@
<plugin> <plugin>
<groupId>org.apache.maven.plugins</groupId> <groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-release-plugin</artifactId> <artifactId>maven-release-plugin</artifactId>
<version>3.0.0-M1</version> <version>3.0.0-M6</version>
</plugin> </plugin>
<plugin> <plugin>
<groupId>org.jacoco</groupId> <groupId>org.jacoco</groupId>
@ -336,77 +360,77 @@
<plugin> <plugin>
<groupId>org.apache.maven.plugins</groupId> <groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-clean-plugin</artifactId> <artifactId>maven-clean-plugin</artifactId>
<version>3.1.0</version> <version>3.2.0</version>
</plugin> </plugin>
<plugin> <plugin>
<groupId>org.apache.maven.plugins</groupId> <groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId> <artifactId>maven-compiler-plugin</artifactId>
<version>3.8.1</version> <version>3.10.1</version>
</plugin> </plugin>
<plugin> <plugin>
<groupId>org.apache.maven.plugins</groupId> <groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-deploy-plugin</artifactId> <artifactId>maven-deploy-plugin</artifactId>
<version>3.0.0-M1</version> <version>3.0.0</version>
</plugin> </plugin>
<plugin> <plugin>
<groupId>org.apache.maven.plugins</groupId> <groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-install-plugin</artifactId> <artifactId>maven-install-plugin</artifactId>
<version>3.0.0-M1</version> <version>3.0.1</version>
</plugin> </plugin>
<plugin> <plugin>
<groupId>org.apache.maven.plugins</groupId> <groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId> <artifactId>maven-jar-plugin</artifactId>
<version>3.2.0</version> <version>3.3.0</version>
</plugin> </plugin>
<plugin> <plugin>
<groupId>org.apache.maven.plugins</groupId> <groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jxr-plugin</artifactId> <artifactId>maven-jxr-plugin</artifactId>
<version>3.1.1</version> <version>3.3.0</version>
</plugin> </plugin>
<plugin> <plugin>
<groupId>org.apache.maven.plugins</groupId> <groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-pmd-plugin</artifactId> <artifactId>maven-pmd-plugin</artifactId>
<version>3.14.0</version> <version>3.19.0</version>
</plugin> </plugin>
<plugin> <plugin>
<groupId>org.apache.maven.plugins</groupId> <groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-resources-plugin</artifactId> <artifactId>maven-resources-plugin</artifactId>
<version>3.2.0</version> <version>3.3.0</version>
</plugin> </plugin>
<plugin> <plugin>
<groupId>org.apache.maven.plugins</groupId> <groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-site-plugin</artifactId> <artifactId>maven-site-plugin</artifactId>
<version>3.9.1</version> <version>4.0.0-M3</version>
</plugin> </plugin>
<plugin> <plugin>
<groupId>org.apache.maven.plugins</groupId> <groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId> <artifactId>maven-surefire-plugin</artifactId>
<version>3.0.0-M5</version> <version>3.0.0-M7</version>
</plugin> </plugin>
<plugin> <plugin>
<groupId>org.apache.maven.plugins</groupId> <groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-report-plugin</artifactId> <artifactId>maven-surefire-report-plugin</artifactId>
<version>3.0.0-M5</version> <version>3.0.0-M7</version>
</plugin> </plugin>
<plugin> <plugin>
<groupId>org.codehaus.mojo</groupId> <groupId>org.codehaus.mojo</groupId>
<artifactId>taglist-maven-plugin</artifactId> <artifactId>taglist-maven-plugin</artifactId>
<version>2.4</version> <version>3.0.0</version>
</plugin> </plugin>
<plugin> <plugin>
<groupId>org.jacoco</groupId> <groupId>org.jacoco</groupId>
<artifactId>jacoco-maven-plugin</artifactId> <artifactId>jacoco-maven-plugin</artifactId>
<version>0.8.7</version> <version>0.8.8</version>
</plugin> </plugin>
<plugin> <plugin>
<groupId>com.amashchenko.maven.plugin</groupId> <groupId>com.amashchenko.maven.plugin</groupId>
<artifactId>gitflow-maven-plugin</artifactId> <artifactId>gitflow-maven-plugin</artifactId>
<version>1.15.0</version> <version>1.18.0</version>
</plugin> </plugin>
<plugin> <plugin>
<groupId>com.github.spotbugs</groupId> <groupId>com.github.spotbugs</groupId>
<artifactId>spotbugs-maven-plugin</artifactId> <artifactId>spotbugs-maven-plugin</artifactId>
<version>4.2.3</version> <version>4.7.2.0</version>
</plugin> </plugin>
</plugins> </plugins>
</pluginManagement> </pluginManagement>
@ -453,7 +477,7 @@
<plugin> <plugin>
<groupId>org.apache.maven.plugins</groupId> <groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-source-plugin</artifactId> <artifactId>maven-source-plugin</artifactId>
<version>2.2.1</version> <version>3.2.1</version>
<executions> <executions>
<execution> <execution>
<phase>package</phase> <phase>package</phase>
@ -467,7 +491,7 @@
<plugin> <plugin>
<groupId>org.apache.maven.plugins</groupId> <groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-javadoc-plugin</artifactId> <artifactId>maven-javadoc-plugin</artifactId>
<version>2.9.1</version> <version>3.4.1</version>
<executions> <executions>
<execution> <execution>
<phase>package</phase> <phase>package</phase>
@ -481,7 +505,7 @@
<plugin> <plugin>
<groupId>org.apache.maven.plugins</groupId> <groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-gpg-plugin</artifactId> <artifactId>maven-gpg-plugin</artifactId>
<version>1.6</version> <version>3.0.1</version>
<executions> <executions>
<execution> <execution>
<phase>verify</phase> <phase>verify</phase>
@ -494,7 +518,7 @@
<plugin> <plugin>
<groupId>org.sonatype.plugins</groupId> <groupId>org.sonatype.plugins</groupId>
<artifactId>nexus-staging-maven-plugin</artifactId> <artifactId>nexus-staging-maven-plugin</artifactId>
<version>1.6.8</version> <version>1.6.13</version>
<extensions>true</extensions> <extensions>true</extensions>
<configuration> <configuration>
<serverId>sonatype-nexus-staging</serverId> <serverId>sonatype-nexus-staging</serverId>

View File

@ -3,7 +3,7 @@
<parent> <parent>
<groupId>us.codecraft</groupId> <groupId>us.codecraft</groupId>
<artifactId>webmagic-parent</artifactId> <artifactId>webmagic-parent</artifactId>
<version>0.7.5</version> <version>0.7.6</version>
</parent> </parent>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
@ -52,8 +52,8 @@
</dependency> </dependency>
<dependency> <dependency>
<groupId>commons-collections</groupId> <groupId>org.apache.commons</groupId>
<artifactId>commons-collections</artifactId> <artifactId>commons-collections4</artifactId>
</dependency> </dependency>
<dependency> <dependency>

View File

@ -1,6 +1,20 @@
package us.codecraft.webmagic; package us.codecraft.webmagic;
import org.apache.commons.collections.CollectionUtils;
import java.io.Closeable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.List;
import java.util.UUID;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.ReentrantLock;
import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.lang3.SerializationUtils; import org.apache.commons.lang3.SerializationUtils;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@ -17,16 +31,6 @@ import us.codecraft.webmagic.thread.CountableThreadPool;
import us.codecraft.webmagic.utils.UrlUtils; import us.codecraft.webmagic.utils.UrlUtils;
import us.codecraft.webmagic.utils.WMCollections; import us.codecraft.webmagic.utils.WMCollections;
import java.io.Closeable;
import java.io.IOException;
import java.util.*;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.ReentrantLock;
/** /**
* Entrance of a crawler.<br> * Entrance of a crawler.<br>
* A spider contains four modules: Downloader, Scheduler, PageProcessor and * A spider contains four modules: Downloader, Scheduler, PageProcessor and
@ -106,7 +110,7 @@ public class Spider implements Runnable, Task {
private Date startTime; private Date startTime;
private int emptySleepTime = 30000; private long emptySleepTime = 30000;
/** /**
* create a spider with pageProcessor. * create a spider with pageProcessor.
@ -306,15 +310,37 @@ public class Spider implements Runnable, Task {
checkRunningStat(); checkRunningStat();
initComponent(); initComponent();
logger.info("Spider {} started!", getUUID()); logger.info("Spider {} started!", getUUID());
// an interrupt won't necessarily be detected
while (!Thread.currentThread().isInterrupted() && stat.get() == STAT_RUNNING) { while (!Thread.currentThread().isInterrupted() && stat.get() == STAT_RUNNING) {
final Request request = scheduler.poll(this); Request poll = scheduler.poll(this);
if (request == null) { if (poll == null) {
if (threadPool.getThreadAlive() == 0 && exitWhenComplete) { if (threadPool.getThreadAlive() == 0) {
//no alive thread anymore , try again
poll = scheduler.poll(this);
if (poll == null) {
if (exitWhenComplete) {
break;
} else {
// wait
try {
Thread.sleep(emptySleepTime);
continue;
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
break; break;
} }
// wait until new url added }
waitNewUrl(); }
} else { } else {
// wait until new url added
if (waitNewUrl())
//if interrupted
break;
continue;
}
}
final Request request = poll;
//this may swallow the interruption
threadPool.execute(new Runnable() { threadPool.execute(new Runnable() {
@Override @Override
public void run() { public void run() {
@ -331,7 +357,6 @@ public class Spider implements Runnable, Task {
} }
}); });
} }
}
stat.set(STAT_STOPPED); stat.set(STAT_STOPPED);
// release some resources // release some resources
if (destroyWhenExit) { if (destroyWhenExit) {
@ -469,6 +494,7 @@ public class Spider implements Runnable, Task {
Thread.sleep(time); Thread.sleep(time);
} catch (InterruptedException e) { } catch (InterruptedException e) {
logger.error("Thread interrupted when sleep",e); logger.error("Thread interrupted when sleep",e);
Thread.currentThread().interrupt();
} }
} }
@ -565,16 +591,24 @@ public class Spider implements Runnable, Task {
return this; return this;
} }
private void waitNewUrl() { /**
*
* @return isInterrupted
*/
private boolean waitNewUrl() {
// now there may not be any thread live
newUrlLock.lock(); newUrlLock.lock();
try { try {
//double check //double check, unnecessary, unless very fast concurrent
if (threadPool.getThreadAlive() == 0 && exitWhenComplete) { if (threadPool.getThreadAlive() == 0) {
return; return false;
} }
//wait for amount of time
newUrlCondition.await(emptySleepTime, TimeUnit.MILLISECONDS); newUrlCondition.await(emptySleepTime, TimeUnit.MILLISECONDS);
return false;
} catch (InterruptedException e) { } catch (InterruptedException e) {
logger.warn("waitNewUrl - interrupted, error {}", e); // logger.warn("waitNewUrl - interrupted, error {}", e);
return true;
} finally { } finally {
newUrlLock.unlock(); newUrlLock.unlock();
} }
@ -772,7 +806,10 @@ public class Spider implements Runnable, Task {
* *
* @param emptySleepTime In MILLISECONDS. * @param emptySleepTime In MILLISECONDS.
*/ */
public void setEmptySleepTime(int emptySleepTime) { public void setEmptySleepTime(long emptySleepTime) {
if(emptySleepTime<=0){
throw new IllegalArgumentException("emptySleepTime should be more than zero!");
}
this.emptySleepTime = emptySleepTime; this.emptySleepTime = emptySleepTime;
} }
} }

View File

@ -8,13 +8,14 @@ package us.codecraft.webmagic;
*/ */
public interface SpiderListener { public interface SpiderListener {
public void onSuccess(Request request); void onSuccess(Request request);
/** /**
* @deprecated Use {@link #onError(Request, Exception)} instead. * @deprecated Use {@link #onError(Request, Exception)} instead.
*/ */
@Deprecated @Deprecated
public void onError(Request request); default void onError(Request request) {
}
default void onError(Request request, Exception e) { default void onError(Request request, Exception e) {
this.onError(request); this.onError(request);

View File

@ -3,6 +3,7 @@ package us.codecraft.webmagic.downloader;
import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.selector.Html; import us.codecraft.webmagic.selector.Html;
/** /**
@ -35,10 +36,26 @@ public abstract class AbstractDownloader implements Downloader {
return (Html) page.getHtml(); return (Html) page.getHtml();
} }
@Deprecated
protected void onSuccess(Request request) { protected void onSuccess(Request request) {
} }
/**
* @since 0.7.6
*/
protected void onSuccess(Request request, Task task) {
this.onSuccess(request);
}
@Deprecated
protected void onError(Request request) { protected void onError(Request request) {
} }
/**
* @since 0.7.6
*/
protected void onError(Request request, Task task, Throwable e) {
this.onError(request);
}
} }

View File

@ -82,12 +82,12 @@ public class HttpClientDownloader extends AbstractDownloader {
try { try {
httpResponse = httpClient.execute(requestContext.getHttpUriRequest(), requestContext.getHttpClientContext()); httpResponse = httpClient.execute(requestContext.getHttpUriRequest(), requestContext.getHttpClientContext());
page = handleResponse(request, request.getCharset() != null ? request.getCharset() : task.getSite().getCharset(), httpResponse, task); page = handleResponse(request, request.getCharset() != null ? request.getCharset() : task.getSite().getCharset(), httpResponse, task);
onSuccess(request); onSuccess(request, task);
logger.info("downloading page success {}", request.getUrl()); logger.info("downloading page success {}", request.getUrl());
return page; return page;
} catch (IOException e) { } catch (IOException e) {
logger.warn("download page {} error", request.getUrl(), e); logger.warn("download page {} error", request.getUrl(), e);
onError(request); onError(request, task, e);
return page; return page;
} finally { } finally {
if (httpResponse != null) { if (httpResponse != null) {

View File

@ -4,13 +4,16 @@ import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Site;
/** /**
* Interface to be implemented to customize a crawler.<br> * Interface to be implemented to customize a crawler.
* <br> *
* <p>
* In PageProcessor, you can customize: * In PageProcessor, you can customize:
* <br> * </p>
* start urls and other settings in {@link Site}<br> * <ul>
* how the urls to fetch are detected <br> * <li>start URLs and other settings in {@link Site}</li>
* how the data are extracted and stored <br> * <li>how the URLs to fetch are detected</li>
* <li>how the data are extracted and stored</li>
* </ul>
* *
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* @see Site * @see Site
@ -20,17 +23,20 @@ import us.codecraft.webmagic.Site;
public interface PageProcessor { public interface PageProcessor {
/** /**
* process the page, extract urls to fetch, extract the data and store * Processes the page, extracts URLs to fetch, extracts the data and stores it.
* *
* @param page page * @param page page
*/ */
public void process(Page page); void process(Page page);
/** /**
* get the site settings * Returns the site settings.
* *
* @return site * @return site
* @see Site * @see Site
*/ */
public Site getSite(); default Site getSite() {
return Site.me();
}
} }

View File

@ -1,9 +1,9 @@
package us.codecraft.webmagic.selector; package us.codecraft.webmagic.selector;
import org.apache.commons.collections.CollectionUtils;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
import org.apache.commons.collections4.CollectionUtils;
/** /**
* @author code4crafer@gmail.com * @author code4crafer@gmail.com

View File

@ -1,6 +1,7 @@
package us.codecraft.webmagic.selector; package us.codecraft.webmagic.selector;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element; import org.jsoup.nodes.Element;
import java.util.ArrayList; import java.util.ArrayList;
@ -11,11 +12,24 @@ import java.util.List;
* @since 0.3.0 * @since 0.3.0
*/ */
public abstract class BaseElementSelector implements Selector, ElementSelector { public abstract class BaseElementSelector implements Selector, ElementSelector {
private Document parse(String text) {
if (text == null) {
return null;
}
// Jsoup could not parse <tr></tr> or <td></td> tag directly
// https://stackoverflow.com/questions/63607740/jsoup-couldnt-parse-tr-tag
if ((text.startsWith("<tr>") && text.endsWith("</tr>"))
|| (text.startsWith("<td>") && text.endsWith("</td>"))) {
text = "<table>" + text + "</table>";
}
return Jsoup.parse(text);
}
@Override @Override
public String select(String text) { public String select(String text) {
if (text != null) { if (text != null) {
return select(Jsoup.parse(text)); return select(parse(text));
} }
return null; return null;
} }
@ -23,7 +37,7 @@ public abstract class BaseElementSelector implements Selector, ElementSelector {
@Override @Override
public List<String> selectList(String text) { public List<String> selectList(String text) {
if (text != null) { if (text != null) {
return selectList(Jsoup.parse(text)); return selectList(parse(text));
} else { } else {
return new ArrayList<String>(); return new ArrayList<String>();
} }
@ -31,14 +45,14 @@ public abstract class BaseElementSelector implements Selector, ElementSelector {
public Element selectElement(String text) { public Element selectElement(String text) {
if (text != null) { if (text != null) {
return selectElement(Jsoup.parse(text)); return selectElement(parse(text));
} }
return null; return null;
} }
public List<Element> selectElements(String text) { public List<Element> selectElements(String text) {
if (text != null) { if (text != null) {
return selectElements(Jsoup.parse(text)); return selectElements(parse(text));
} else { } else {
return new ArrayList<Element>(); return new ArrayList<Element>();
} }

View File

@ -1,14 +1,14 @@
package us.codecraft.webmagic.selector; package us.codecraft.webmagic.selector;
import org.apache.commons.collections.CollectionUtils;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.collections4.CollectionUtils;
import org.jsoup.nodes.Element; import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node; import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode; import org.jsoup.nodes.TextNode;
import org.jsoup.select.Elements; import org.jsoup.select.Elements;
import java.util.ArrayList;
import java.util.List;
/** /**
* CSS selector. Based on Jsoup. * CSS selector. Based on Jsoup.
* *

View File

@ -1,11 +1,11 @@
package us.codecraft.webmagic.selector; package us.codecraft.webmagic.selector;
import com.alibaba.fastjson.JSON;
import com.jayway.jsonpath.JsonPath;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import com.alibaba.fastjson.JSON;
import com.jayway.jsonpath.JsonPath;
/** /**
* JsonPath selector.<br> * JsonPath selector.<br>
@ -16,15 +16,20 @@ import java.util.Map;
*/ */
public class JsonPathSelector implements Selector { public class JsonPathSelector implements Selector {
private String jsonPathStr; private final String jsonPathStr;
private JsonPath jsonPath; private final JsonPath jsonPath;
public JsonPathSelector(String jsonPathStr) { public JsonPathSelector(String jsonPathStr) {
this.jsonPathStr = jsonPathStr; this.jsonPathStr = jsonPathStr;
this.jsonPath = JsonPath.compile(this.jsonPathStr); this.jsonPath = JsonPath.compile(this.jsonPathStr);
} }
@SuppressWarnings("unused")
public String getJsonPathStr() {
return jsonPathStr;
}
@Override @Override
public String select(String text) { public String select(String text) {
Object object = jsonPath.read(text); Object object = jsonPath.read(text);
@ -32,8 +37,8 @@ public class JsonPathSelector implements Selector {
return null; return null;
} }
if (object instanceof List) { if (object instanceof List) {
List list = (List) object; List<?> list = (List<?>) object;
if (list != null && list.size() > 0) { if (list.size() > 0) {
return toString(list.iterator().next()); return toString(list.iterator().next());
} }
} }
@ -49,8 +54,9 @@ public class JsonPathSelector implements Selector {
} }
@Override @Override
@SuppressWarnings("unchecked")
public List<String> selectList(String text) { public List<String> selectList(String text) {
List<String> list = new ArrayList<String>(); List<String> list = new ArrayList<>();
Object object = jsonPath.read(text); Object object = jsonPath.read(text);
if (object == null) { if (object == null) {
return list; return list;

View File

@ -1,12 +1,12 @@
package us.codecraft.webmagic.selector; package us.codecraft.webmagic.selector;
import org.apache.commons.collections.CollectionUtils;
import java.util.List;
import org.apache.commons.collections4.CollectionUtils;
import org.jsoup.nodes.Element; import org.jsoup.nodes.Element;
import us.codecraft.xsoup.XPathEvaluator; import us.codecraft.xsoup.XPathEvaluator;
import us.codecraft.xsoup.Xsoup; import us.codecraft.xsoup.Xsoup;
import java.util.List;
/** /**
* XPath selector based on Xsoup.<br> * XPath selector based on Xsoup.<br>
* *

View File

@ -1,9 +1,10 @@
package us.codecraft.webmagic.downloader; package us.codecraft.webmagic.downloader;
import com.github.dreamhead.moco.HttpServer;
import com.github.dreamhead.moco.Runnable; import java.io.IOException;
import com.github.dreamhead.moco.Runner; import java.io.UnsupportedEncodingException;
import org.apache.commons.collections.map.HashedMap; import java.util.Map;
import org.apache.commons.collections4.map.HashedMap;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpUriRequest; import org.apache.http.client.methods.HttpUriRequest;
@ -11,6 +12,9 @@ import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients; import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils; import org.apache.http.util.EntityUtils;
import org.junit.Test; import org.junit.Test;
import com.github.dreamhead.moco.HttpServer;
import com.github.dreamhead.moco.Runnable;
import com.github.dreamhead.moco.Runner;
import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Site;
@ -21,12 +25,19 @@ import us.codecraft.webmagic.proxy.SimpleProxyProvider;
import us.codecraft.webmagic.selector.Html; import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.utils.CharsetUtils; import us.codecraft.webmagic.utils.CharsetUtils;
import us.codecraft.webmagic.utils.HttpConstant; import us.codecraft.webmagic.utils.HttpConstant;
import static com.github.dreamhead.moco.Moco.and;
import java.io.IOException; import static com.github.dreamhead.moco.Moco.by;
import java.io.UnsupportedEncodingException; import static com.github.dreamhead.moco.Moco.cookie;
import java.util.Map; import static com.github.dreamhead.moco.Moco.eq;
import static com.github.dreamhead.moco.Moco.form;
import static com.github.dreamhead.moco.Moco.*; import static com.github.dreamhead.moco.Moco.header;
import static com.github.dreamhead.moco.Moco.httpServer;
import static com.github.dreamhead.moco.Moco.method;
import static com.github.dreamhead.moco.Moco.not;
import static com.github.dreamhead.moco.Moco.query;
import static com.github.dreamhead.moco.Moco.text;
import static com.github.dreamhead.moco.Moco.uri;
import static com.github.dreamhead.moco.Moco.with;
import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThat;
import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue; import static org.junit.Assert.assertTrue;

View File

@ -1,13 +1,15 @@
package us.codecraft.webmagic.downloader; package us.codecraft.webmagic.downloader;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task; import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.selector.PlainText; import us.codecraft.webmagic.selector.PlainText;
import java.io.IOException;
import java.io.InputStream;
/** /**
* @author code4crafter@gmail.com * @author code4crafter@gmail.com
@ -19,7 +21,7 @@ public class MockGithubDownloader implements Downloader {
Page page = new Page(); Page page = new Page();
InputStream resourceAsStream = this.getClass().getResourceAsStream("/html/mock-github.html"); InputStream resourceAsStream = this.getClass().getResourceAsStream("/html/mock-github.html");
try { try {
page.setRawText(IOUtils.toString(resourceAsStream)); page.setRawText(IOUtils.toString(resourceAsStream, Charset.defaultCharset()));
} catch (IOException e) { } catch (IOException e) {
e.printStackTrace(); e.printStackTrace();
} }

View File

@ -0,0 +1,40 @@
package us.codecraft.webmagic.processor;
import static org.junit.Assert.assertEquals;
import org.junit.Test;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
public class PageProcessorTest {
@Test
public void testGetSite() {
Site actualSite = new PageProcessor() {
@Override
public void process(Page page) {
}
}.getSite();
assertEquals(Site.me(), actualSite);
actualSite = new PageProcessor() {
@Override
public void process(Page page) {
}
@Override
public Site getSite() {
return Site.me().setTimeOut(123);
};
}.getSite();
assertEquals(Site.me().setTimeOut(123), actualSite);
}
}

View File

@ -8,7 +8,7 @@
<parent> <parent>
<groupId>us.codecraft</groupId> <groupId>us.codecraft</groupId>
<artifactId>webmagic-parent</artifactId> <artifactId>webmagic-parent</artifactId>
<version>0.7.5</version> <version>0.7.6</version>
</parent> </parent>
<artifactId>webmagic-coverage</artifactId> <artifactId>webmagic-coverage</artifactId>

View File

@ -3,7 +3,7 @@
<parent> <parent>
<groupId>us.codecraft</groupId> <groupId>us.codecraft</groupId>
<artifactId>webmagic-parent</artifactId> <artifactId>webmagic-parent</artifactId>
<version>0.7.5</version> <version>0.7.6</version>
</parent> </parent>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>

View File

@ -16,21 +16,17 @@ import java.io.*;
* @version 0.5.3 * @version 0.5.3
*/ */
public class PhantomJSDownloader extends AbstractDownloader { public class PhantomJSDownloader extends AbstractDownloader {
private static final Logger logger = LoggerFactory.getLogger(PhantomJSDownloader.class);
private static Logger logger = LoggerFactory.getLogger(PhantomJSDownloader.class);
private static String crawlJsPath; private static String crawlJsPath;
private static String phantomJsCommand = "phantomjs"; // default private static String phantomJsCommand = "phantomjs"; // default
private int retryNum;
private int threadNum;
public PhantomJSDownloader() { public PhantomJSDownloader() {
this.initPhantomjsCrawlPath(); this.initPhantomjsCrawlPath();
} }
/** /**
* phantomjs * phantomjs
* * <p>
* example: * example:
* phantomjs.exe windows * phantomjs.exe windows
* phantomjs --ignore-ssl-errors=yes https * phantomjs --ignore-ssl-errors=yes https
@ -69,7 +65,7 @@ public class PhantomJSDownloader extends AbstractDownloader {
* -- crawl.js end * -- crawl.js end
* </pre> * </pre>
* js使 * js使
* * <p>
* example: * example:
* new PhantomJSDownloader("/your/path/phantomjs", "/your/path/crawl.js"); * new PhantomJSDownloader("/your/path/phantomjs", "/your/path/crawl.js");
* *
@ -82,7 +78,8 @@ public class PhantomJSDownloader extends AbstractDownloader {
} }
private void initPhantomjsCrawlPath() { private void initPhantomjsCrawlPath() {
PhantomJSDownloader.crawlJsPath = new File(this.getClass().getResource("/").getPath()).getPath() + System.getProperty("file.separator") + "crawl.js "; PhantomJSDownloader.crawlJsPath = new File(this.getClass().getResource("/").getPath()).getPath()
+ System.getProperty("file.separator") + "crawl.js ";
} }
@Override @Override
@ -90,61 +87,41 @@ public class PhantomJSDownloader extends AbstractDownloader {
if (logger.isInfoEnabled()) { if (logger.isInfoEnabled()) {
logger.info("downloading page: " + request.getUrl()); logger.info("downloading page: " + request.getUrl());
} }
String content = getPage(request);
if (content.contains("HTTP request failed")) {
for (int i = 1; i <= getRetryNum(); i++) {
content = getPage(request);
if (!content.contains("HTTP request failed")) {
break;
}
}
if (content.contains("HTTP request failed")) {
//when failed
Page page = new Page();
page.setRequest(request);
return page;
}
}
Page page = new Page(); Page page = Page.fail();
try {
String content = getPage(request);
if (!content.contains("HTTP request failed")) {
page.setDownloadSuccess(true);
page.setRawText(content); page.setRawText(content);
page.setUrl(new PlainText(request.getUrl())); page.setUrl(new PlainText(request.getUrl()));
page.setRequest(request); page.setRequest(request);
page.setStatusCode(200); page.setStatusCode(200);
}
onSuccess(request, task);
} catch (Exception e) {
onError(request, task, e);
logger.warn("download page {} error", request.getUrl(), e);
}
return page; return page;
} }
@Override @Override
public void setThread(int threadNum) { public void setThread(int threadNum) {
this.threadNum = threadNum; // ignore
} }
protected String getPage(Request request) { protected String getPage(Request request) throws Exception {
try {
String url = request.getUrl(); String url = request.getUrl();
Runtime runtime = Runtime.getRuntime(); Runtime runtime = Runtime.getRuntime();
Process process = runtime.exec(phantomJsCommand + " " + crawlJsPath + " " + url); Process process = runtime.exec(phantomJsCommand + " " + crawlJsPath + " " + url);
InputStream is = process.getInputStream(); InputStream is = process.getInputStream();
BufferedReader br = new BufferedReader(new InputStreamReader(is)); BufferedReader br = new BufferedReader(new InputStreamReader(is));
StringBuffer stringBuffer = new StringBuffer(); StringBuilder builder = new StringBuilder();
String line; String line;
while ((line = br.readLine()) != null) { while ((line = br.readLine()) != null) {
stringBuffer.append(line).append("\n"); builder.append(line).append("\n");
} }
return stringBuffer.toString(); return builder.toString();
} catch (IOException e) {
e.printStackTrace();
}
return null;
}
public int getRetryNum() {
return retryNum;
}
public PhantomJSDownloader setRetryNum(int retryNum) {
this.retryNum = retryNum;
return this;
} }
} }

View File

@ -1,11 +1,13 @@
package us.codecraft.webmagic.model; package us.codecraft.webmagic.model;
import java.io.IOException;
import java.nio.charset.Charset;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.selector.PlainText; import us.codecraft.webmagic.selector.PlainText;
import java.io.IOException;
/** /**
* @author code4crafter@gmail.com * @author code4crafter@gmail.com
@ -16,7 +18,7 @@ public class PageMocker {
public Page getMockJsonPage() throws IOException { public Page getMockJsonPage() throws IOException {
Page page = new Page(); Page page = new Page();
page.setRawText(IOUtils.toString(PageMocker.class.getClassLoader().getResourceAsStream("json/mock-githubrepo.json"))); page.setRawText(IOUtils.toString(PageMocker.class.getClassLoader().getResourceAsStream("json/mock-githubrepo.json"), Charset.defaultCharset()));
page.setRequest(new Request("https://api.github.com/repos/code4craft/webmagic")); page.setRequest(new Request("https://api.github.com/repos/code4craft/webmagic"));
page.setUrl(new PlainText("https://api.github.com/repos/code4craft/webmagic")); page.setUrl(new PlainText("https://api.github.com/repos/code4craft/webmagic"));
return page; return page;
@ -24,7 +26,7 @@ public class PageMocker {
public Page getMockPage() throws IOException { public Page getMockPage() throws IOException {
Page page = new Page(); Page page = new Page();
page.setRawText(IOUtils.toString(PageMocker.class.getClassLoader().getResourceAsStream("html/mock-webmagic.html"))); page.setRawText(IOUtils.toString(PageMocker.class.getClassLoader().getResourceAsStream("html/mock-webmagic.html"), Charset.defaultCharset()));
page.setRequest(new Request("http://webmagic.io/list/0")); page.setRequest(new Request("http://webmagic.io/list/0"));
page.setUrl(new PlainText("http://webmagic.io/list/0")); page.setUrl(new PlainText("http://webmagic.io/list/0"));
return page; return page;

View File

@ -3,7 +3,7 @@
<parent> <parent>
<artifactId>webmagic-parent</artifactId> <artifactId>webmagic-parent</artifactId>
<groupId>us.codecraft</groupId> <groupId>us.codecraft</groupId>
<version>0.7.5</version> <version>0.7.6</version>
</parent> </parent>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>

View File

@ -1,14 +1,14 @@
package us.codecraft.webmagic.samples; package us.codecraft.webmagic.samples;
import org.apache.commons.collections.CollectionUtils;
import java.util.List;
import org.apache.commons.collections4.CollectionUtils;
import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.JsonPathSelector; import us.codecraft.webmagic.selector.JsonPathSelector;
import java.util.List;
/** /**
* @author code4crafter@gmail.com * @author code4crafter@gmail.com
* @since 0.5.0 * @since 0.5.0

View File

@ -1,6 +1,6 @@
package us.codecraft.webmagic.samples; package us.codecraft.webmagic.samples;
import org.apache.commons.collections.CollectionUtils; import org.apache.commons.collections4.CollectionUtils;
import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.Spider;

View File

@ -36,7 +36,7 @@ public class PhantomJSPageProcessor implements PageProcessor {
} }
public static void main(String[] args) throws Exception { public static void main(String[] args) throws Exception {
PhantomJSDownloader phantomDownloader = new PhantomJSDownloader().setRetryNum(3); PhantomJSDownloader phantomDownloader = new PhantomJSDownloader();
CollectorPipeline<ResultItems> collectorPipeline = new ResultItemsCollectorPipeline(); CollectorPipeline<ResultItems> collectorPipeline = new ResultItemsCollectorPipeline();

View File

@ -3,7 +3,7 @@
<parent> <parent>
<artifactId>webmagic-parent</artifactId> <artifactId>webmagic-parent</artifactId>
<groupId>us.codecraft</groupId> <groupId>us.codecraft</groupId>
<version>0.7.5</version> <version>0.7.6</version>
</parent> </parent>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>

View File

@ -75,6 +75,7 @@ public class Xpath2Selector implements Selector {
private XPath2NamespaceContext() { private XPath2NamespaceContext() {
put("fn", NamespaceConstant.FN); put("fn", NamespaceConstant.FN);
put("xslt", NamespaceConstant.XSLT); put("xslt", NamespaceConstant.XSLT);
put("xhtml", NamespaceConstant.XHTML);
} }
@Override @Override

View File

@ -1376,7 +1376,7 @@ public class XpathSelectorTest {
@Test @Test
public void testXpath2Selector() { public void testXpath2Selector() {
Xpath2Selector xpath2Selector = new Xpath2Selector("//a/@href"); Xpath2Selector xpath2Selector = new Xpath2Selector("//xhtml:a/@href");
String select = xpath2Selector.select(html); String select = xpath2Selector.select(html);
Assert.assertEquals("http://www.oschina.net/", select); Assert.assertEquals("http://www.oschina.net/", select);

View File

@ -3,7 +3,7 @@
<parent> <parent>
<artifactId>webmagic-parent</artifactId> <artifactId>webmagic-parent</artifactId>
<groupId>us.codecraft</groupId> <groupId>us.codecraft</groupId>
<version>0.7.5</version> <version>0.7.6</version>
</parent> </parent>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>

View File

@ -1,5 +1,14 @@
package us.codecraft.webmagic.scripts; package us.codecraft.webmagic.scripts;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.Iterator;
import java.util.Map;
import javax.script.ScriptContext;
import javax.script.ScriptEngine;
import javax.script.ScriptException;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.jruby.RubyHash; import org.jruby.RubyHash;
import org.python.core.PyDictionary; import org.python.core.PyDictionary;
@ -7,14 +16,6 @@ import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.processor.PageProcessor;
import javax.script.ScriptContext;
import javax.script.ScriptEngine;
import javax.script.ScriptException;
import java.io.IOException;
import java.io.InputStream;
import java.util.Iterator;
import java.util.Map;
/** /**
* @author code4crafter@gmail.com * @author code4crafter@gmail.com
* @since 0.4.1 * @since 0.4.1
@ -39,7 +40,7 @@ public class ScriptProcessor implements PageProcessor {
enginePool = new ScriptEnginePool(language, threadNum); enginePool = new ScriptEnginePool(language, threadNum);
InputStream resourceAsStream = this.getClass().getClassLoader().getResourceAsStream(language.getDefineFile()); InputStream resourceAsStream = this.getClass().getClassLoader().getResourceAsStream(language.getDefineFile());
try { try {
defines = IOUtils.toString(resourceAsStream); defines = IOUtils.toString(resourceAsStream, Charset.defaultCharset());
} catch (IOException e) { } catch (IOException e) {
throw new IllegalArgumentException(e); throw new IllegalArgumentException(e);
} }

View File

@ -1,10 +1,12 @@
package us.codecraft.webmagic.scripts; package us.codecraft.webmagic.scripts;
import org.apache.commons.io.IOUtils;
import java.io.FileInputStream; import java.io.FileInputStream;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.nio.charset.Charset;
import org.apache.commons.io.IOUtils;
/** /**
* @author code4crafter@gmail.com * @author code4crafter@gmail.com
@ -35,7 +37,7 @@ public class ScriptProcessorBuilder {
public ScriptProcessorBuilder scriptFromFile(String fileName) { public ScriptProcessorBuilder scriptFromFile(String fileName) {
try { try {
InputStream resourceAsStream = new FileInputStream(fileName); InputStream resourceAsStream = new FileInputStream(fileName);
this.script = IOUtils.toString(resourceAsStream); this.script = IOUtils.toString(resourceAsStream, Charset.defaultCharset());
} catch (IOException e) { } catch (IOException e) {
//wrap IOException because I prefer a runtime exception... //wrap IOException because I prefer a runtime exception...
throw new IllegalArgumentException(e); throw new IllegalArgumentException(e);
@ -46,7 +48,7 @@ public class ScriptProcessorBuilder {
public ScriptProcessorBuilder scriptFromClassPathFile(String fileName) { public ScriptProcessorBuilder scriptFromClassPathFile(String fileName) {
try { try {
InputStream resourceAsStream = ScriptProcessor.class.getClassLoader().getResourceAsStream(fileName); InputStream resourceAsStream = ScriptProcessor.class.getClassLoader().getResourceAsStream(fileName);
this.script = IOUtils.toString(resourceAsStream); this.script = IOUtils.toString(resourceAsStream, Charset.defaultCharset());
} catch (IOException e) { } catch (IOException e) {
//wrap IOException because I prefer a runtime exception... //wrap IOException because I prefer a runtime exception...
throw new IllegalArgumentException(e); throw new IllegalArgumentException(e);

View File

@ -3,7 +3,7 @@
<parent> <parent>
<artifactId>webmagic-parent</artifactId> <artifactId>webmagic-parent</artifactId>
<groupId>us.codecraft</groupId> <groupId>us.codecraft</groupId>
<version>0.7.5</version> <version>0.7.6</version>
</parent> </parent>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>

View File

@ -11,7 +11,7 @@ import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task; import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.downloader.Downloader; import us.codecraft.webmagic.downloader.AbstractDownloader;
import us.codecraft.webmagic.selector.Html; import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.PlainText; import us.codecraft.webmagic.selector.PlainText;
@ -27,7 +27,7 @@ import java.util.Map;
* Date: 13-7-26 <br> * Date: 13-7-26 <br>
* Time: 1:37 <br> * Time: 1:37 <br>
*/ */
public class SeleniumDownloader implements Downloader, Closeable { public class SeleniumDownloader extends AbstractDownloader implements Closeable {
private volatile WebDriverPool webDriverPool; private volatile WebDriverPool webDriverPool;
@ -73,17 +73,17 @@ public class SeleniumDownloader implements Downloader, Closeable {
@Override @Override
public Page download(Request request, Task task) { public Page download(Request request, Task task) {
checkInit(); checkInit();
WebDriver webDriver; WebDriver webDriver = null;
Page page = Page.fail();
try { try {
webDriver = webDriverPool.get(); webDriver = webDriverPool.get();
} catch (InterruptedException e) {
logger.warn("interrupted", e);
return null;
}
logger.info("downloading page " + request.getUrl()); logger.info("downloading page " + request.getUrl());
webDriver.get(request.getUrl()); webDriver.get(request.getUrl());
try { try {
if (sleepTime > 0) {
Thread.sleep(sleepTime); Thread.sleep(sleepTime);
}
} catch (InterruptedException e) { } catch (InterruptedException e) {
e.printStackTrace(); e.printStackTrace();
} }
@ -106,12 +106,20 @@ public class SeleniumDownloader implements Downloader, Closeable {
WebElement webElement = webDriver.findElement(By.xpath("/html")); WebElement webElement = webDriver.findElement(By.xpath("/html"));
String content = webElement.getAttribute("outerHTML"); String content = webElement.getAttribute("outerHTML");
Page page = new Page(); page.setDownloadSuccess(true);
page.setRawText(content); page.setRawText(content);
page.setHtml(new Html(content, request.getUrl())); page.setHtml(new Html(content, request.getUrl()));
page.setUrl(new PlainText(request.getUrl())); page.setUrl(new PlainText(request.getUrl()));
page.setRequest(request); page.setRequest(request);
onSuccess(request, task);
} catch (Exception e) {
logger.warn("download page {} error", request.getUrl(), e);
onError(request, task, e);
} finally {
if (webDriver != null) {
webDriverPool.returnToPool(webDriver); webDriverPool.returnToPool(webDriver);
}
}
return page; return page;
} }