Merge branch 'release/0.7.6'
commit
dc7218eba3
124
pom.xml
124
pom.xml
|
@ -1,7 +1,7 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<groupId>us.codecraft</groupId>
|
||||
<version>0.7.5</version>
|
||||
<version>0.7.6</version>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
<packaging>pom</packaging>
|
||||
<properties>
|
||||
|
@ -9,7 +9,31 @@
|
|||
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
|
||||
<maven.compiler.source>1.8</maven.compiler.source>
|
||||
<maven.compiler.target>1.8</maven.compiler.target>
|
||||
<assertj.version>3.23.1</assertj.version>
|
||||
<commons-cli.version>1.5.0</commons-cli.version>
|
||||
<commons-collections4.version>4.4</commons-collections4.version>
|
||||
<commons-io.version>2.11.0</commons-io.version>
|
||||
<commons-lang3.version>3.12.0</commons-lang3.version>
|
||||
<fastjson.version>2.0.14.graal</fastjson.version>
|
||||
<groovy-all.version>3.0.13</groovy-all.version>
|
||||
<guava.version>31.1-jre</guava.version>
|
||||
<htmlcleaner.version>2.26</htmlcleaner.version>
|
||||
<httpclient.version>4.5.13</httpclient.version>
|
||||
<httpcore.version>4.4.15</httpcore.version>
|
||||
<jedis.version>3.7.1</jedis.version>
|
||||
<jruby.version>9.3.8.0</jruby.version>
|
||||
<json-path.version>2.7.0</json-path.version>
|
||||
<junit.version>4.13.2</junit.version>
|
||||
<jython.version>2.7.3</jython.version>
|
||||
<log4j.version>1.2.17</log4j.version>
|
||||
<mockito-all.version>2.0.2-beta</mockito-all.version>
|
||||
<moco.version>1.3.0</moco.version>
|
||||
<phantomjsdriver.version>1.2.0</phantomjsdriver.version>
|
||||
<saxon-he.version>11.4</saxon-he.version>
|
||||
<selenium-java.version>3.141.59</selenium-java.version>
|
||||
<slf4j.version>2.0.3</slf4j.version>
|
||||
<spring-version>4.0.0.RELEASE</spring-version>
|
||||
<xsoup.version>0.3.5</xsoup.version>
|
||||
</properties>
|
||||
<artifactId>webmagic-parent</artifactId>
|
||||
<name>webmagic-parent</name>
|
||||
|
@ -58,59 +82,59 @@
|
|||
<dependency>
|
||||
<groupId>junit</groupId>
|
||||
<artifactId>junit</artifactId>
|
||||
<version>4.13.1</version>
|
||||
<version>${junit.version}</version>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.mockito</groupId>
|
||||
<artifactId>mockito-all</artifactId>
|
||||
<version>1.10.19</version>
|
||||
<version>${mockito-all.version}</version>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.httpcomponents</groupId>
|
||||
<artifactId>httpclient</artifactId>
|
||||
<version>4.5.13</version>
|
||||
<version>${httpclient.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.httpcomponents</groupId>
|
||||
<artifactId>httpcore</artifactId>
|
||||
<version>4.4.14</version>
|
||||
<version>${httpcore.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.google.guava</groupId>
|
||||
<artifactId>guava</artifactId>
|
||||
<version>30.1-jre</version>
|
||||
<version>${guava.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.jayway.jsonpath</groupId>
|
||||
<artifactId>json-path</artifactId>
|
||||
<version>2.6.0</version>
|
||||
<version>${json-path.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.slf4j</groupId>
|
||||
<artifactId>slf4j-api</artifactId>
|
||||
<version>1.7.30</version>
|
||||
<version>${slf4j.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.slf4j</groupId>
|
||||
<artifactId>slf4j-log4j12</artifactId>
|
||||
<version>1.7.30</version>
|
||||
<version>${slf4j.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>xsoup</artifactId>
|
||||
<version>0.3.4</version>
|
||||
<version>${xsoup.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.alibaba</groupId>
|
||||
<artifactId>fastjson</artifactId>
|
||||
<version>1.2.83</version>
|
||||
<version>${fastjson.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.github.dreamhead</groupId>
|
||||
<artifactId>moco-core</artifactId>
|
||||
<version>1.3.0</version>
|
||||
<version>${moco.version}</version>
|
||||
<scope>test</scope>
|
||||
<exclusions>
|
||||
<exclusion>
|
||||
|
@ -122,73 +146,73 @@
|
|||
<dependency>
|
||||
<groupId>log4j</groupId>
|
||||
<artifactId>log4j</artifactId>
|
||||
<version>1.2.17</version>
|
||||
<version>${log4j.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.assertj</groupId>
|
||||
<artifactId>assertj-core</artifactId>
|
||||
<version>3.18.1</version>
|
||||
<version>${assertj.version}</version>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.commons</groupId>
|
||||
<artifactId>commons-lang3</artifactId>
|
||||
<version>3.11</version>
|
||||
<version>${commons-lang3.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>commons-collections</groupId>
|
||||
<artifactId>commons-collections</artifactId>
|
||||
<version>3.2.2</version>
|
||||
<groupId>org.apache.commons</groupId>
|
||||
<artifactId>commons-collections4</artifactId>
|
||||
<version>${commons-collections4.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>commons-io</groupId>
|
||||
<artifactId>commons-io</artifactId>
|
||||
<version>2.8.0</version>
|
||||
<version>${commons-io.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.codehaus.groovy</groupId>
|
||||
<artifactId>groovy-all</artifactId>
|
||||
<version>3.0.7</version>
|
||||
<version>${groovy-all.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.jruby</groupId>
|
||||
<artifactId>jruby</artifactId>
|
||||
<version>9.3.0.0</version>
|
||||
<version>${jruby.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.python</groupId>
|
||||
<artifactId>jython</artifactId>
|
||||
<version>2.7.2</version>
|
||||
<version>${jython.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.seleniumhq.selenium</groupId>
|
||||
<artifactId>selenium-java</artifactId>
|
||||
<version>4.0.0</version>
|
||||
<version>${selenium-java.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>net.sf.saxon</groupId>
|
||||
<artifactId>Saxon-HE</artifactId>
|
||||
<version>10.3</version>
|
||||
<version>${saxon-he.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>net.sourceforge.htmlcleaner</groupId>
|
||||
<artifactId>htmlcleaner</artifactId>
|
||||
<version>2.26</version>
|
||||
<version>${htmlcleaner.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.github.detro</groupId>
|
||||
<artifactId>phantomjsdriver</artifactId>
|
||||
<version>1.2.0</version>
|
||||
<version>${phantomjsdriver.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>commons-cli</groupId>
|
||||
<artifactId>commons-cli</artifactId>
|
||||
<version>1.4</version>
|
||||
<version>${commons-cli.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>redis.clients</groupId>
|
||||
<artifactId>jedis</artifactId>
|
||||
<version>3.6.0</version>
|
||||
<version>${jedis.version}</version>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
</dependencyManagement>
|
||||
|
@ -198,7 +222,7 @@
|
|||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-enforcer-plugin</artifactId>
|
||||
<version>3.0.0-M3</version>
|
||||
<version>3.1.0</version>
|
||||
<executions>
|
||||
<execution>
|
||||
<id>enforce-maven</id>
|
||||
|
@ -272,7 +296,7 @@
|
|||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-javadoc-plugin</artifactId>
|
||||
<version>3.2.0</version>
|
||||
<version>3.4.1</version>
|
||||
<configuration>
|
||||
<encoding>UTF-8</encoding>
|
||||
<doctitle>WebMagic ${project.version}</doctitle>
|
||||
|
@ -301,7 +325,7 @@
|
|||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-release-plugin</artifactId>
|
||||
<version>3.0.0-M1</version>
|
||||
<version>3.0.0-M6</version>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.jacoco</groupId>
|
||||
|
@ -336,77 +360,77 @@
|
|||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-clean-plugin</artifactId>
|
||||
<version>3.1.0</version>
|
||||
<version>3.2.0</version>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-compiler-plugin</artifactId>
|
||||
<version>3.8.1</version>
|
||||
<version>3.10.1</version>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-deploy-plugin</artifactId>
|
||||
<version>3.0.0-M1</version>
|
||||
<version>3.0.0</version>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-install-plugin</artifactId>
|
||||
<version>3.0.0-M1</version>
|
||||
<version>3.0.1</version>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-jar-plugin</artifactId>
|
||||
<version>3.2.0</version>
|
||||
<version>3.3.0</version>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-jxr-plugin</artifactId>
|
||||
<version>3.1.1</version>
|
||||
<version>3.3.0</version>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-pmd-plugin</artifactId>
|
||||
<version>3.14.0</version>
|
||||
<version>3.19.0</version>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-resources-plugin</artifactId>
|
||||
<version>3.2.0</version>
|
||||
<version>3.3.0</version>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-site-plugin</artifactId>
|
||||
<version>3.9.1</version>
|
||||
<version>4.0.0-M3</version>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-surefire-plugin</artifactId>
|
||||
<version>3.0.0-M5</version>
|
||||
<version>3.0.0-M7</version>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-surefire-report-plugin</artifactId>
|
||||
<version>3.0.0-M5</version>
|
||||
<version>3.0.0-M7</version>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.codehaus.mojo</groupId>
|
||||
<artifactId>taglist-maven-plugin</artifactId>
|
||||
<version>2.4</version>
|
||||
<version>3.0.0</version>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.jacoco</groupId>
|
||||
<artifactId>jacoco-maven-plugin</artifactId>
|
||||
<version>0.8.7</version>
|
||||
<version>0.8.8</version>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>com.amashchenko.maven.plugin</groupId>
|
||||
<artifactId>gitflow-maven-plugin</artifactId>
|
||||
<version>1.15.0</version>
|
||||
<version>1.18.0</version>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>com.github.spotbugs</groupId>
|
||||
<artifactId>spotbugs-maven-plugin</artifactId>
|
||||
<version>4.2.3</version>
|
||||
<version>4.7.2.0</version>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</pluginManagement>
|
||||
|
@ -453,7 +477,7 @@
|
|||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-source-plugin</artifactId>
|
||||
<version>2.2.1</version>
|
||||
<version>3.2.1</version>
|
||||
<executions>
|
||||
<execution>
|
||||
<phase>package</phase>
|
||||
|
@ -467,7 +491,7 @@
|
|||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-javadoc-plugin</artifactId>
|
||||
<version>2.9.1</version>
|
||||
<version>3.4.1</version>
|
||||
<executions>
|
||||
<execution>
|
||||
<phase>package</phase>
|
||||
|
@ -481,7 +505,7 @@
|
|||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-gpg-plugin</artifactId>
|
||||
<version>1.6</version>
|
||||
<version>3.0.1</version>
|
||||
<executions>
|
||||
<execution>
|
||||
<phase>verify</phase>
|
||||
|
@ -494,7 +518,7 @@
|
|||
<plugin>
|
||||
<groupId>org.sonatype.plugins</groupId>
|
||||
<artifactId>nexus-staging-maven-plugin</artifactId>
|
||||
<version>1.6.8</version>
|
||||
<version>1.6.13</version>
|
||||
<extensions>true</extensions>
|
||||
<configuration>
|
||||
<serverId>sonatype-nexus-staging</serverId>
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
<parent>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>webmagic-parent</artifactId>
|
||||
<version>0.7.5</version>
|
||||
<version>0.7.6</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
@ -52,8 +52,8 @@
|
|||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>commons-collections</groupId>
|
||||
<artifactId>commons-collections</artifactId>
|
||||
<groupId>org.apache.commons</groupId>
|
||||
<artifactId>commons-collections4</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
|
|
|
@ -1,6 +1,20 @@
|
|||
package us.codecraft.webmagic;
|
||||
|
||||
import org.apache.commons.collections.CollectionUtils;
|
||||
|
||||
import java.io.Closeable;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Date;
|
||||
import java.util.List;
|
||||
import java.util.UUID;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
import java.util.concurrent.atomic.AtomicLong;
|
||||
import java.util.concurrent.locks.Condition;
|
||||
import java.util.concurrent.locks.ReentrantLock;
|
||||
import org.apache.commons.collections4.CollectionUtils;
|
||||
import org.apache.commons.lang3.SerializationUtils;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
@ -17,16 +31,6 @@ import us.codecraft.webmagic.thread.CountableThreadPool;
|
|||
import us.codecraft.webmagic.utils.UrlUtils;
|
||||
import us.codecraft.webmagic.utils.WMCollections;
|
||||
|
||||
import java.io.Closeable;
|
||||
import java.io.IOException;
|
||||
import java.util.*;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
import java.util.concurrent.atomic.AtomicLong;
|
||||
import java.util.concurrent.locks.Condition;
|
||||
import java.util.concurrent.locks.ReentrantLock;
|
||||
|
||||
/**
|
||||
* Entrance of a crawler.<br>
|
||||
* A spider contains four modules: Downloader, Scheduler, PageProcessor and
|
||||
|
@ -106,7 +110,7 @@ public class Spider implements Runnable, Task {
|
|||
|
||||
private Date startTime;
|
||||
|
||||
private int emptySleepTime = 30000;
|
||||
private long emptySleepTime = 30000;
|
||||
|
||||
/**
|
||||
* create a spider with pageProcessor.
|
||||
|
@ -305,32 +309,53 @@ public class Spider implements Runnable, Task {
|
|||
public void run() {
|
||||
checkRunningStat();
|
||||
initComponent();
|
||||
logger.info("Spider {} started!",getUUID());
|
||||
logger.info("Spider {} started!", getUUID());
|
||||
// interrupt won't be necessarily detected
|
||||
while (!Thread.currentThread().isInterrupted() && stat.get() == STAT_RUNNING) {
|
||||
final Request request = scheduler.poll(this);
|
||||
if (request == null) {
|
||||
if (threadPool.getThreadAlive() == 0 && exitWhenComplete) {
|
||||
break;
|
||||
}
|
||||
// wait until new url added
|
||||
waitNewUrl();
|
||||
} else {
|
||||
threadPool.execute(new Runnable() {
|
||||
@Override
|
||||
public void run() {
|
||||
try {
|
||||
processRequest(request);
|
||||
onSuccess(request);
|
||||
} catch (Exception e) {
|
||||
onError(request, e);
|
||||
logger.error("process request " + request + " error", e);
|
||||
} finally {
|
||||
pageCount.incrementAndGet();
|
||||
signalNewUrl();
|
||||
Request poll = scheduler.poll(this);
|
||||
if (poll == null) {
|
||||
if (threadPool.getThreadAlive() == 0) {
|
||||
//no alive thread anymore , try again
|
||||
poll = scheduler.poll(this);
|
||||
if (poll == null) {
|
||||
if (exitWhenComplete) {
|
||||
break;
|
||||
} else {
|
||||
// wait
|
||||
try {
|
||||
Thread.sleep(emptySleepTime);
|
||||
continue;
|
||||
} catch (InterruptedException e) {
|
||||
Thread.currentThread().interrupt();
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
} else {
|
||||
// wait until new url added,
|
||||
if (waitNewUrl())
|
||||
//if interrupted
|
||||
break;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
final Request request = poll;
|
||||
//this may swallow the interruption
|
||||
threadPool.execute(new Runnable() {
|
||||
@Override
|
||||
public void run() {
|
||||
try {
|
||||
processRequest(request);
|
||||
onSuccess(request);
|
||||
} catch (Exception e) {
|
||||
onError(request, e);
|
||||
logger.error("process request " + request + " error", e);
|
||||
} finally {
|
||||
pageCount.incrementAndGet();
|
||||
signalNewUrl();
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
stat.set(STAT_STOPPED);
|
||||
// release some resources
|
||||
|
@ -469,6 +494,7 @@ public class Spider implements Runnable, Task {
|
|||
Thread.sleep(time);
|
||||
} catch (InterruptedException e) {
|
||||
logger.error("Thread interrupted when sleep",e);
|
||||
Thread.currentThread().interrupt();
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -565,16 +591,24 @@ public class Spider implements Runnable, Task {
|
|||
return this;
|
||||
}
|
||||
|
||||
private void waitNewUrl() {
|
||||
/**
|
||||
*
|
||||
* @return isInterrupted
|
||||
*/
|
||||
private boolean waitNewUrl() {
|
||||
// now there may not be any thread live
|
||||
newUrlLock.lock();
|
||||
try {
|
||||
//double check
|
||||
if (threadPool.getThreadAlive() == 0 && exitWhenComplete) {
|
||||
return;
|
||||
//double check,unnecessary, unless very fast concurrent
|
||||
if (threadPool.getThreadAlive() == 0) {
|
||||
return false;
|
||||
}
|
||||
//wait for amount of time
|
||||
newUrlCondition.await(emptySleepTime, TimeUnit.MILLISECONDS);
|
||||
return false;
|
||||
} catch (InterruptedException e) {
|
||||
logger.warn("waitNewUrl - interrupted, error {}", e);
|
||||
// logger.warn("waitNewUrl - interrupted, error {}", e);
|
||||
return true;
|
||||
} finally {
|
||||
newUrlLock.unlock();
|
||||
}
|
||||
|
@ -772,7 +806,10 @@ public class Spider implements Runnable, Task {
|
|||
*
|
||||
* @param emptySleepTime In MILLISECONDS.
|
||||
*/
|
||||
public void setEmptySleepTime(int emptySleepTime) {
|
||||
public void setEmptySleepTime(long emptySleepTime) {
|
||||
if(emptySleepTime<=0){
|
||||
throw new IllegalArgumentException("emptySleepTime should be more than zero!");
|
||||
}
|
||||
this.emptySleepTime = emptySleepTime;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -8,13 +8,14 @@ package us.codecraft.webmagic;
|
|||
*/
|
||||
public interface SpiderListener {
|
||||
|
||||
public void onSuccess(Request request);
|
||||
void onSuccess(Request request);
|
||||
|
||||
/**
|
||||
* @deprecated Use {@link #onError(Request, Exception)} instead.
|
||||
*/
|
||||
@Deprecated
|
||||
public void onError(Request request);
|
||||
default void onError(Request request) {
|
||||
}
|
||||
|
||||
default void onError(Request request, Exception e) {
|
||||
this.onError(request);
|
||||
|
|
|
@ -3,6 +3,7 @@ package us.codecraft.webmagic.downloader;
|
|||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.Request;
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.Task;
|
||||
import us.codecraft.webmagic.selector.Html;
|
||||
|
||||
/**
|
||||
|
@ -26,7 +27,7 @@ public abstract class AbstractDownloader implements Downloader {
|
|||
/**
|
||||
* A simple method to download a url.
|
||||
*
|
||||
* @param url url
|
||||
* @param url url
|
||||
* @param charset charset
|
||||
* @return html
|
||||
*/
|
||||
|
@ -35,10 +36,26 @@ public abstract class AbstractDownloader implements Downloader {
|
|||
return (Html) page.getHtml();
|
||||
}
|
||||
|
||||
@Deprecated
|
||||
protected void onSuccess(Request request) {
|
||||
}
|
||||
|
||||
/**
|
||||
* @since 0.7.6
|
||||
*/
|
||||
protected void onSuccess(Request request, Task task) {
|
||||
this.onSuccess(request);
|
||||
}
|
||||
|
||||
@Deprecated
|
||||
protected void onError(Request request) {
|
||||
}
|
||||
|
||||
/**
|
||||
* @since 0.7.6
|
||||
*/
|
||||
protected void onError(Request request, Task task, Throwable e) {
|
||||
this.onError(request);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -82,12 +82,12 @@ public class HttpClientDownloader extends AbstractDownloader {
|
|||
try {
|
||||
httpResponse = httpClient.execute(requestContext.getHttpUriRequest(), requestContext.getHttpClientContext());
|
||||
page = handleResponse(request, request.getCharset() != null ? request.getCharset() : task.getSite().getCharset(), httpResponse, task);
|
||||
onSuccess(request);
|
||||
onSuccess(request, task);
|
||||
logger.info("downloading page success {}", request.getUrl());
|
||||
return page;
|
||||
} catch (IOException e) {
|
||||
logger.warn("download page {} error", request.getUrl(), e);
|
||||
onError(request);
|
||||
onError(request, task, e);
|
||||
return page;
|
||||
} finally {
|
||||
if (httpResponse != null) {
|
||||
|
@ -110,7 +110,7 @@ public class HttpClientDownloader extends AbstractDownloader {
|
|||
String contentType = httpResponse.getEntity().getContentType() == null ? "" : httpResponse.getEntity().getContentType().getValue();
|
||||
Page page = new Page();
|
||||
page.setBytes(bytes);
|
||||
if (!request.isBinaryContent()){
|
||||
if (!request.isBinaryContent()) {
|
||||
if (charset == null) {
|
||||
charset = getHtmlCharset(contentType, bytes);
|
||||
}
|
||||
|
|
|
@ -4,13 +4,16 @@ import us.codecraft.webmagic.Page;
|
|||
import us.codecraft.webmagic.Site;
|
||||
|
||||
/**
|
||||
* Interface to be implemented to customize a crawler.<br>
|
||||
* <br>
|
||||
* Interface to be implemented to customize a crawler.
|
||||
*
|
||||
* <p>
|
||||
* In PageProcessor, you can customize:
|
||||
* <br>
|
||||
* start urls and other settings in {@link Site}<br>
|
||||
* how the urls to fetch are detected <br>
|
||||
* how the data are extracted and stored <br>
|
||||
* </p>
|
||||
* <ul>
|
||||
* <li>start URLs and other settings in {@link Site}</li>
|
||||
* <li>how the URLs to fetch are detected</li>
|
||||
* <li>how the data are extracted and stored</li>
|
||||
* </ul>
|
||||
*
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* @see Site
|
||||
|
@ -20,17 +23,20 @@ import us.codecraft.webmagic.Site;
|
|||
public interface PageProcessor {
|
||||
|
||||
/**
|
||||
* process the page, extract urls to fetch, extract the data and store
|
||||
* Processes the page, extract URLs to fetch, extract the data and store.
|
||||
*
|
||||
* @param page page
|
||||
*/
|
||||
public void process(Page page);
|
||||
void process(Page page);
|
||||
|
||||
/**
|
||||
* get the site settings
|
||||
* Returns the site settings.
|
||||
*
|
||||
* @return site
|
||||
* @see Site
|
||||
*/
|
||||
public Site getSite();
|
||||
default Site getSite() {
|
||||
return Site.me();
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,9 +1,9 @@
|
|||
package us.codecraft.webmagic.selector;
|
||||
|
||||
import org.apache.commons.collections.CollectionUtils;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import org.apache.commons.collections4.CollectionUtils;
|
||||
|
||||
/**
|
||||
* @author code4crafer@gmail.com
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
package us.codecraft.webmagic.selector;
|
||||
|
||||
import org.jsoup.Jsoup;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.jsoup.nodes.Element;
|
||||
|
||||
import java.util.ArrayList;
|
||||
|
@ -11,11 +12,24 @@ import java.util.List;
|
|||
* @since 0.3.0
|
||||
*/
|
||||
public abstract class BaseElementSelector implements Selector, ElementSelector {
|
||||
private Document parse(String text) {
|
||||
if (text == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
// Jsoup could not parse <tr></tr> or <td></td> tag directly
|
||||
// https://stackoverflow.com/questions/63607740/jsoup-couldnt-parse-tr-tag
|
||||
if ((text.startsWith("<tr>") && text.endsWith("</tr>"))
|
||||
|| (text.startsWith("<td>") && text.endsWith("</td>"))) {
|
||||
text = "<table>" + text + "</table>";
|
||||
}
|
||||
return Jsoup.parse(text);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String select(String text) {
|
||||
if (text != null) {
|
||||
return select(Jsoup.parse(text));
|
||||
return select(parse(text));
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
@ -23,7 +37,7 @@ public abstract class BaseElementSelector implements Selector, ElementSelector {
|
|||
@Override
|
||||
public List<String> selectList(String text) {
|
||||
if (text != null) {
|
||||
return selectList(Jsoup.parse(text));
|
||||
return selectList(parse(text));
|
||||
} else {
|
||||
return new ArrayList<String>();
|
||||
}
|
||||
|
@ -31,14 +45,14 @@ public abstract class BaseElementSelector implements Selector, ElementSelector {
|
|||
|
||||
public Element selectElement(String text) {
|
||||
if (text != null) {
|
||||
return selectElement(Jsoup.parse(text));
|
||||
return selectElement(parse(text));
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
public List<Element> selectElements(String text) {
|
||||
if (text != null) {
|
||||
return selectElements(Jsoup.parse(text));
|
||||
return selectElements(parse(text));
|
||||
} else {
|
||||
return new ArrayList<Element>();
|
||||
}
|
||||
|
|
|
@ -1,14 +1,14 @@
|
|||
package us.codecraft.webmagic.selector;
|
||||
|
||||
import org.apache.commons.collections.CollectionUtils;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import org.apache.commons.collections4.CollectionUtils;
|
||||
import org.jsoup.nodes.Element;
|
||||
import org.jsoup.nodes.Node;
|
||||
import org.jsoup.nodes.TextNode;
|
||||
import org.jsoup.select.Elements;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* CSS selector. Based on Jsoup.
|
||||
*
|
||||
|
|
|
@ -1,11 +1,11 @@
|
|||
package us.codecraft.webmagic.selector;
|
||||
|
||||
import com.alibaba.fastjson.JSON;
|
||||
import com.jayway.jsonpath.JsonPath;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import com.alibaba.fastjson.JSON;
|
||||
import com.jayway.jsonpath.JsonPath;
|
||||
|
||||
/**
|
||||
* JsonPath selector.<br>
|
||||
|
@ -16,15 +16,20 @@ import java.util.Map;
|
|||
*/
|
||||
public class JsonPathSelector implements Selector {
|
||||
|
||||
private String jsonPathStr;
|
||||
private final String jsonPathStr;
|
||||
|
||||
private JsonPath jsonPath;
|
||||
private final JsonPath jsonPath;
|
||||
|
||||
public JsonPathSelector(String jsonPathStr) {
|
||||
this.jsonPathStr = jsonPathStr;
|
||||
this.jsonPath = JsonPath.compile(this.jsonPathStr);
|
||||
}
|
||||
|
||||
@SuppressWarnings("unused")
|
||||
public String getJsonPathStr() {
|
||||
return jsonPathStr;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String select(String text) {
|
||||
Object object = jsonPath.read(text);
|
||||
|
@ -32,8 +37,8 @@ public class JsonPathSelector implements Selector {
|
|||
return null;
|
||||
}
|
||||
if (object instanceof List) {
|
||||
List list = (List) object;
|
||||
if (list != null && list.size() > 0) {
|
||||
List<?> list = (List<?>) object;
|
||||
if (list.size() > 0) {
|
||||
return toString(list.iterator().next());
|
||||
}
|
||||
}
|
||||
|
@ -49,8 +54,9 @@ public class JsonPathSelector implements Selector {
|
|||
}
|
||||
|
||||
@Override
|
||||
@SuppressWarnings("unchecked")
|
||||
public List<String> selectList(String text) {
|
||||
List<String> list = new ArrayList<String>();
|
||||
List<String> list = new ArrayList<>();
|
||||
Object object = jsonPath.read(text);
|
||||
if (object == null) {
|
||||
return list;
|
||||
|
|
|
@ -1,12 +1,12 @@
|
|||
package us.codecraft.webmagic.selector;
|
||||
|
||||
import org.apache.commons.collections.CollectionUtils;
|
||||
|
||||
import java.util.List;
|
||||
import org.apache.commons.collections4.CollectionUtils;
|
||||
import org.jsoup.nodes.Element;
|
||||
import us.codecraft.xsoup.XPathEvaluator;
|
||||
import us.codecraft.xsoup.Xsoup;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* XPath selector based on Xsoup.<br>
|
||||
*
|
||||
|
|
|
@ -1,9 +1,10 @@
|
|||
package us.codecraft.webmagic.downloader;
|
||||
|
||||
import com.github.dreamhead.moco.HttpServer;
|
||||
import com.github.dreamhead.moco.Runnable;
|
||||
import com.github.dreamhead.moco.Runner;
|
||||
import org.apache.commons.collections.map.HashedMap;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.UnsupportedEncodingException;
|
||||
import java.util.Map;
|
||||
import org.apache.commons.collections4.map.HashedMap;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.http.client.methods.CloseableHttpResponse;
|
||||
import org.apache.http.client.methods.HttpUriRequest;
|
||||
|
@ -11,6 +12,9 @@ import org.apache.http.impl.client.CloseableHttpClient;
|
|||
import org.apache.http.impl.client.HttpClients;
|
||||
import org.apache.http.util.EntityUtils;
|
||||
import org.junit.Test;
|
||||
import com.github.dreamhead.moco.HttpServer;
|
||||
import com.github.dreamhead.moco.Runnable;
|
||||
import com.github.dreamhead.moco.Runner;
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.Request;
|
||||
import us.codecraft.webmagic.Site;
|
||||
|
@ -21,12 +25,19 @@ import us.codecraft.webmagic.proxy.SimpleProxyProvider;
|
|||
import us.codecraft.webmagic.selector.Html;
|
||||
import us.codecraft.webmagic.utils.CharsetUtils;
|
||||
import us.codecraft.webmagic.utils.HttpConstant;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.UnsupportedEncodingException;
|
||||
import java.util.Map;
|
||||
|
||||
import static com.github.dreamhead.moco.Moco.*;
|
||||
import static com.github.dreamhead.moco.Moco.and;
|
||||
import static com.github.dreamhead.moco.Moco.by;
|
||||
import static com.github.dreamhead.moco.Moco.cookie;
|
||||
import static com.github.dreamhead.moco.Moco.eq;
|
||||
import static com.github.dreamhead.moco.Moco.form;
|
||||
import static com.github.dreamhead.moco.Moco.header;
|
||||
import static com.github.dreamhead.moco.Moco.httpServer;
|
||||
import static com.github.dreamhead.moco.Moco.method;
|
||||
import static com.github.dreamhead.moco.Moco.not;
|
||||
import static com.github.dreamhead.moco.Moco.query;
|
||||
import static com.github.dreamhead.moco.Moco.text;
|
||||
import static com.github.dreamhead.moco.Moco.uri;
|
||||
import static com.github.dreamhead.moco.Moco.with;
|
||||
import static org.assertj.core.api.Assertions.assertThat;
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
|
|
|
@ -1,13 +1,15 @@
|
|||
package us.codecraft.webmagic.downloader;
|
||||
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.nio.charset.Charset;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.Request;
|
||||
import us.codecraft.webmagic.Task;
|
||||
import us.codecraft.webmagic.selector.PlainText;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com
|
||||
|
@ -19,7 +21,7 @@ public class MockGithubDownloader implements Downloader {
|
|||
Page page = new Page();
|
||||
InputStream resourceAsStream = this.getClass().getResourceAsStream("/html/mock-github.html");
|
||||
try {
|
||||
page.setRawText(IOUtils.toString(resourceAsStream));
|
||||
page.setRawText(IOUtils.toString(resourceAsStream, Charset.defaultCharset()));
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
|
|
|
@ -0,0 +1,40 @@
|
|||
package us.codecraft.webmagic.processor;
|
||||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
|
||||
import org.junit.Test;
|
||||
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.Site;
|
||||
|
||||
public class PageProcessorTest {
|
||||
|
||||
@Test
|
||||
public void testGetSite() {
|
||||
Site actualSite = new PageProcessor() {
|
||||
|
||||
@Override
|
||||
public void process(Page page) {
|
||||
}
|
||||
|
||||
}.getSite();
|
||||
|
||||
assertEquals(Site.me(), actualSite);
|
||||
|
||||
actualSite = new PageProcessor() {
|
||||
|
||||
@Override
|
||||
public void process(Page page) {
|
||||
}
|
||||
|
||||
@Override
|
||||
public Site getSite() {
|
||||
return Site.me().setTimeOut(123);
|
||||
};
|
||||
|
||||
}.getSite();
|
||||
|
||||
assertEquals(Site.me().setTimeOut(123), actualSite);
|
||||
}
|
||||
|
||||
}
|
|
@ -8,7 +8,7 @@
|
|||
<parent>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>webmagic-parent</artifactId>
|
||||
<version>0.7.5</version>
|
||||
<version>0.7.6</version>
|
||||
</parent>
|
||||
|
||||
<artifactId>webmagic-coverage</artifactId>
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
<parent>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>webmagic-parent</artifactId>
|
||||
<version>0.7.5</version>
|
||||
<version>0.7.6</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
|
|
@ -16,25 +16,21 @@ import java.io.*;
|
|||
* @version 0.5.3
|
||||
*/
|
||||
public class PhantomJSDownloader extends AbstractDownloader {
|
||||
|
||||
private static Logger logger = LoggerFactory.getLogger(PhantomJSDownloader.class);
|
||||
private static final Logger logger = LoggerFactory.getLogger(PhantomJSDownloader.class);
|
||||
private static String crawlJsPath;
|
||||
private static String phantomJsCommand = "phantomjs"; // default
|
||||
|
||||
private int retryNum;
|
||||
private int threadNum;
|
||||
|
||||
public PhantomJSDownloader() {
|
||||
this.initPhantomjsCrawlPath();
|
||||
}
|
||||
|
||||
/**
|
||||
* 添加新的构造函数,支持phantomjs自定义命令
|
||||
*
|
||||
* <p>
|
||||
* example:
|
||||
* phantomjs.exe 支持windows环境
|
||||
* phantomjs --ignore-ssl-errors=yes 忽略抓取地址是https时的一些错误
|
||||
* /usr/local/bin/phantomjs 命令的绝对路径,避免因系统环境变量引起的IOException
|
||||
* phantomjs.exe 支持windows环境
|
||||
* phantomjs --ignore-ssl-errors=yes 忽略抓取地址是https时的一些错误
|
||||
* /usr/local/bin/phantomjs 命令的绝对路径,避免因系统环境变量引起的IOException
|
||||
*
|
||||
* @param phantomJsCommand phantomJsCommand
|
||||
*/
|
||||
|
@ -69,20 +65,21 @@ public class PhantomJSDownloader extends AbstractDownloader {
|
|||
* -- crawl.js end
|
||||
* </pre>
|
||||
* 具体项目时可以将以上js代码复制下来使用
|
||||
*
|
||||
* <p>
|
||||
* example:
|
||||
* new PhantomJSDownloader("/your/path/phantomjs", "/your/path/crawl.js");
|
||||
* new PhantomJSDownloader("/your/path/phantomjs", "/your/path/crawl.js");
|
||||
*
|
||||
* @param phantomJsCommand phantomJsCommand
|
||||
* @param crawlJsPath crawlJsPath
|
||||
* @param crawlJsPath crawlJsPath
|
||||
*/
|
||||
public PhantomJSDownloader(String phantomJsCommand, String crawlJsPath) {
|
||||
PhantomJSDownloader.phantomJsCommand = phantomJsCommand;
|
||||
PhantomJSDownloader.crawlJsPath = crawlJsPath;
|
||||
PhantomJSDownloader.phantomJsCommand = phantomJsCommand;
|
||||
PhantomJSDownloader.crawlJsPath = crawlJsPath;
|
||||
}
|
||||
|
||||
private void initPhantomjsCrawlPath() {
|
||||
PhantomJSDownloader.crawlJsPath = new File(this.getClass().getResource("/").getPath()).getPath() + System.getProperty("file.separator") + "crawl.js ";
|
||||
PhantomJSDownloader.crawlJsPath = new File(this.getClass().getResource("/").getPath()).getPath()
|
||||
+ System.getProperty("file.separator") + "crawl.js ";
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -90,61 +87,41 @@ public class PhantomJSDownloader extends AbstractDownloader {
|
|||
if (logger.isInfoEnabled()) {
|
||||
logger.info("downloading page: " + request.getUrl());
|
||||
}
|
||||
String content = getPage(request);
|
||||
if (content.contains("HTTP request failed")) {
|
||||
for (int i = 1; i <= getRetryNum(); i++) {
|
||||
content = getPage(request);
|
||||
if (!content.contains("HTTP request failed")) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (content.contains("HTTP request failed")) {
|
||||
//when failed
|
||||
Page page = new Page();
|
||||
page.setRequest(request);
|
||||
return page;
|
||||
}
|
||||
}
|
||||
|
||||
Page page = new Page();
|
||||
page.setRawText(content);
|
||||
page.setUrl(new PlainText(request.getUrl()));
|
||||
page.setRequest(request);
|
||||
page.setStatusCode(200);
|
||||
Page page = Page.fail();
|
||||
try {
|
||||
String content = getPage(request);
|
||||
if (!content.contains("HTTP request failed")) {
|
||||
page.setDownloadSuccess(true);
|
||||
page.setRawText(content);
|
||||
page.setUrl(new PlainText(request.getUrl()));
|
||||
page.setRequest(request);
|
||||
page.setStatusCode(200);
|
||||
}
|
||||
onSuccess(request, task);
|
||||
} catch (Exception e) {
|
||||
onError(request, task, e);
|
||||
logger.warn("download page {} error", request.getUrl(), e);
|
||||
}
|
||||
return page;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setThread(int threadNum) {
|
||||
this.threadNum = threadNum;
|
||||
// ignore
|
||||
}
|
||||
|
||||
protected String getPage(Request request) {
|
||||
try {
|
||||
String url = request.getUrl();
|
||||
Runtime runtime = Runtime.getRuntime();
|
||||
Process process = runtime.exec(phantomJsCommand + " " + crawlJsPath + " " + url);
|
||||
InputStream is = process.getInputStream();
|
||||
BufferedReader br = new BufferedReader(new InputStreamReader(is));
|
||||
StringBuffer stringBuffer = new StringBuffer();
|
||||
String line;
|
||||
while ((line = br.readLine()) != null) {
|
||||
stringBuffer.append(line).append("\n");
|
||||
}
|
||||
return stringBuffer.toString();
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
protected String getPage(Request request) throws Exception {
|
||||
String url = request.getUrl();
|
||||
Runtime runtime = Runtime.getRuntime();
|
||||
Process process = runtime.exec(phantomJsCommand + " " + crawlJsPath + " " + url);
|
||||
InputStream is = process.getInputStream();
|
||||
BufferedReader br = new BufferedReader(new InputStreamReader(is));
|
||||
StringBuilder builder = new StringBuilder();
|
||||
String line;
|
||||
while ((line = br.readLine()) != null) {
|
||||
builder.append(line).append("\n");
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
public int getRetryNum() {
|
||||
return retryNum;
|
||||
}
|
||||
|
||||
public PhantomJSDownloader setRetryNum(int retryNum) {
|
||||
this.retryNum = retryNum;
|
||||
return this;
|
||||
return builder.toString();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,11 +1,13 @@
|
|||
package us.codecraft.webmagic.model;
|
||||
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.charset.Charset;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.Request;
|
||||
import us.codecraft.webmagic.selector.PlainText;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com
|
||||
|
@ -16,7 +18,7 @@ public class PageMocker {
|
|||
|
||||
public Page getMockJsonPage() throws IOException {
|
||||
Page page = new Page();
|
||||
page.setRawText(IOUtils.toString(PageMocker.class.getClassLoader().getResourceAsStream("json/mock-githubrepo.json")));
|
||||
page.setRawText(IOUtils.toString(PageMocker.class.getClassLoader().getResourceAsStream("json/mock-githubrepo.json"), Charset.defaultCharset()));
|
||||
page.setRequest(new Request("https://api.github.com/repos/code4craft/webmagic"));
|
||||
page.setUrl(new PlainText("https://api.github.com/repos/code4craft/webmagic"));
|
||||
return page;
|
||||
|
@ -24,7 +26,7 @@ public class PageMocker {
|
|||
|
||||
public Page getMockPage() throws IOException {
|
||||
Page page = new Page();
|
||||
page.setRawText(IOUtils.toString(PageMocker.class.getClassLoader().getResourceAsStream("html/mock-webmagic.html")));
|
||||
page.setRawText(IOUtils.toString(PageMocker.class.getClassLoader().getResourceAsStream("html/mock-webmagic.html"), Charset.defaultCharset()));
|
||||
page.setRequest(new Request("http://webmagic.io/list/0"));
|
||||
page.setUrl(new PlainText("http://webmagic.io/list/0"));
|
||||
return page;
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
<parent>
|
||||
<artifactId>webmagic-parent</artifactId>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<version>0.7.5</version>
|
||||
<version>0.7.6</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
|
|
@ -1,14 +1,14 @@
|
|||
package us.codecraft.webmagic.samples;
|
||||
|
||||
import org.apache.commons.collections.CollectionUtils;
|
||||
|
||||
import java.util.List;
|
||||
import org.apache.commons.collections4.CollectionUtils;
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.Spider;
|
||||
import us.codecraft.webmagic.processor.PageProcessor;
|
||||
import us.codecraft.webmagic.selector.JsonPathSelector;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com
|
||||
* @since 0.5.0
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
package us.codecraft.webmagic.samples;
|
||||
|
||||
import org.apache.commons.collections.CollectionUtils;
|
||||
import org.apache.commons.collections4.CollectionUtils;
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.Spider;
|
||||
|
|
|
@ -36,7 +36,7 @@ public class PhantomJSPageProcessor implements PageProcessor {
|
|||
}
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
PhantomJSDownloader phantomDownloader = new PhantomJSDownloader().setRetryNum(3);
|
||||
PhantomJSDownloader phantomDownloader = new PhantomJSDownloader();
|
||||
|
||||
CollectorPipeline<ResultItems> collectorPipeline = new ResultItemsCollectorPipeline();
|
||||
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
<parent>
|
||||
<artifactId>webmagic-parent</artifactId>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<version>0.7.5</version>
|
||||
<version>0.7.6</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
|
|
@ -75,6 +75,7 @@ public class Xpath2Selector implements Selector {
|
|||
private XPath2NamespaceContext() {
|
||||
put("fn", NamespaceConstant.FN);
|
||||
put("xslt", NamespaceConstant.XSLT);
|
||||
put("xhtml", NamespaceConstant.XHTML);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -1376,7 +1376,7 @@ public class XpathSelectorTest {
|
|||
|
||||
@Test
|
||||
public void testXpath2Selector() {
|
||||
Xpath2Selector xpath2Selector = new Xpath2Selector("//a/@href");
|
||||
Xpath2Selector xpath2Selector = new Xpath2Selector("//xhtml:a/@href");
|
||||
String select = xpath2Selector.select(html);
|
||||
Assert.assertEquals("http://www.oschina.net/", select);
|
||||
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
<parent>
|
||||
<artifactId>webmagic-parent</artifactId>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<version>0.7.5</version>
|
||||
<version>0.7.6</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
|
|
@ -1,5 +1,14 @@
|
|||
package us.codecraft.webmagic.scripts;
|
||||
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.nio.charset.Charset;
|
||||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
import javax.script.ScriptContext;
|
||||
import javax.script.ScriptEngine;
|
||||
import javax.script.ScriptException;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.jruby.RubyHash;
|
||||
import org.python.core.PyDictionary;
|
||||
|
@ -7,14 +16,6 @@ import us.codecraft.webmagic.Page;
|
|||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.processor.PageProcessor;
|
||||
|
||||
import javax.script.ScriptContext;
|
||||
import javax.script.ScriptEngine;
|
||||
import javax.script.ScriptException;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com
|
||||
* @since 0.4.1
|
||||
|
@ -39,7 +40,7 @@ public class ScriptProcessor implements PageProcessor {
|
|||
enginePool = new ScriptEnginePool(language, threadNum);
|
||||
InputStream resourceAsStream = this.getClass().getClassLoader().getResourceAsStream(language.getDefineFile());
|
||||
try {
|
||||
defines = IOUtils.toString(resourceAsStream);
|
||||
defines = IOUtils.toString(resourceAsStream, Charset.defaultCharset());
|
||||
} catch (IOException e) {
|
||||
throw new IllegalArgumentException(e);
|
||||
}
|
||||
|
|
|
@ -1,10 +1,12 @@
|
|||
package us.codecraft.webmagic.scripts;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.nio.charset.Charset;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com
|
||||
|
@ -35,7 +37,7 @@ public class ScriptProcessorBuilder {
|
|||
public ScriptProcessorBuilder scriptFromFile(String fileName) {
|
||||
try {
|
||||
InputStream resourceAsStream = new FileInputStream(fileName);
|
||||
this.script = IOUtils.toString(resourceAsStream);
|
||||
this.script = IOUtils.toString(resourceAsStream, Charset.defaultCharset());
|
||||
} catch (IOException e) {
|
||||
//wrap IOException because I prefer a runtime exception...
|
||||
throw new IllegalArgumentException(e);
|
||||
|
@ -46,7 +48,7 @@ public class ScriptProcessorBuilder {
|
|||
public ScriptProcessorBuilder scriptFromClassPathFile(String fileName) {
|
||||
try {
|
||||
InputStream resourceAsStream = ScriptProcessor.class.getClassLoader().getResourceAsStream(fileName);
|
||||
this.script = IOUtils.toString(resourceAsStream);
|
||||
this.script = IOUtils.toString(resourceAsStream, Charset.defaultCharset());
|
||||
} catch (IOException e) {
|
||||
//wrap IOException because I prefer a runtime exception...
|
||||
throw new IllegalArgumentException(e);
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
<parent>
|
||||
<artifactId>webmagic-parent</artifactId>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<version>0.7.5</version>
|
||||
<version>0.7.6</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
|
|
@ -11,7 +11,7 @@ import us.codecraft.webmagic.Page;
|
|||
import us.codecraft.webmagic.Request;
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.Task;
|
||||
import us.codecraft.webmagic.downloader.Downloader;
|
||||
import us.codecraft.webmagic.downloader.AbstractDownloader;
|
||||
import us.codecraft.webmagic.selector.Html;
|
||||
import us.codecraft.webmagic.selector.PlainText;
|
||||
|
||||
|
@ -24,112 +24,120 @@ import java.util.Map;
|
|||
* 需要下载Selenium driver支持。<br>
|
||||
*
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-7-26 <br>
|
||||
* Time: 下午1:37 <br>
|
||||
* Date: 13-7-26 <br>
|
||||
* Time: 下午1:37 <br>
|
||||
*/
|
||||
public class SeleniumDownloader implements Downloader, Closeable {
|
||||
public class SeleniumDownloader extends AbstractDownloader implements Closeable {
|
||||
|
||||
private volatile WebDriverPool webDriverPool;
|
||||
private volatile WebDriverPool webDriverPool;
|
||||
|
||||
private Logger logger = LoggerFactory.getLogger(getClass());
|
||||
private Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
private int sleepTime = 0;
|
||||
private int sleepTime = 0;
|
||||
|
||||
private int poolSize = 1;
|
||||
private int poolSize = 1;
|
||||
|
||||
private static final String DRIVER_PHANTOMJS = "phantomjs";
|
||||
private static final String DRIVER_PHANTOMJS = "phantomjs";
|
||||
|
||||
/**
|
||||
* 新建
|
||||
*
|
||||
* @param chromeDriverPath chromeDriverPath
|
||||
*/
|
||||
public SeleniumDownloader(String chromeDriverPath) {
|
||||
System.getProperties().setProperty("webdriver.chrome.driver",
|
||||
chromeDriverPath);
|
||||
}
|
||||
/**
|
||||
* 新建
|
||||
*
|
||||
* @param chromeDriverPath chromeDriverPath
|
||||
*/
|
||||
public SeleniumDownloader(String chromeDriverPath) {
|
||||
System.getProperties().setProperty("webdriver.chrome.driver",
|
||||
chromeDriverPath);
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructor without any filed. Construct PhantomJS browser
|
||||
*
|
||||
* @author bob.li.0718@gmail.com
|
||||
*/
|
||||
public SeleniumDownloader() {
|
||||
// System.setProperty("phantomjs.binary.path",
|
||||
// "/Users/Bingo/Downloads/phantomjs-1.9.7-macosx/bin/phantomjs");
|
||||
}
|
||||
/**
|
||||
* Constructor without any filed. Construct PhantomJS browser
|
||||
*
|
||||
* @author bob.li.0718@gmail.com
|
||||
*/
|
||||
public SeleniumDownloader() {
|
||||
// System.setProperty("phantomjs.binary.path",
|
||||
// "/Users/Bingo/Downloads/phantomjs-1.9.7-macosx/bin/phantomjs");
|
||||
}
|
||||
|
||||
/**
|
||||
* set sleep time to wait until load success
|
||||
*
|
||||
* @param sleepTime sleepTime
|
||||
* @return this
|
||||
*/
|
||||
public SeleniumDownloader setSleepTime(int sleepTime) {
|
||||
this.sleepTime = sleepTime;
|
||||
return this;
|
||||
}
|
||||
/**
|
||||
* set sleep time to wait until load success
|
||||
*
|
||||
* @param sleepTime sleepTime
|
||||
* @return this
|
||||
*/
|
||||
public SeleniumDownloader setSleepTime(int sleepTime) {
|
||||
this.sleepTime = sleepTime;
|
||||
return this;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Page download(Request request, Task task) {
|
||||
checkInit();
|
||||
WebDriver webDriver;
|
||||
try {
|
||||
webDriver = webDriverPool.get();
|
||||
} catch (InterruptedException e) {
|
||||
logger.warn("interrupted", e);
|
||||
return null;
|
||||
}
|
||||
logger.info("downloading page " + request.getUrl());
|
||||
webDriver.get(request.getUrl());
|
||||
try {
|
||||
Thread.sleep(sleepTime);
|
||||
} catch (InterruptedException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
WebDriver.Options manage = webDriver.manage();
|
||||
Site site = task.getSite();
|
||||
if (site.getCookies() != null) {
|
||||
for (Map.Entry<String, String> cookieEntry : site.getCookies()
|
||||
.entrySet()) {
|
||||
Cookie cookie = new Cookie(cookieEntry.getKey(),
|
||||
cookieEntry.getValue());
|
||||
manage.addCookie(cookie);
|
||||
}
|
||||
}
|
||||
@Override
|
||||
public Page download(Request request, Task task) {
|
||||
checkInit();
|
||||
WebDriver webDriver = null;
|
||||
Page page = Page.fail();
|
||||
try {
|
||||
webDriver = webDriverPool.get();
|
||||
|
||||
/*
|
||||
* TODO You can add mouse event or other processes
|
||||
*
|
||||
* @author: bob.li.0718@gmail.com
|
||||
*/
|
||||
logger.info("downloading page " + request.getUrl());
|
||||
webDriver.get(request.getUrl());
|
||||
try {
|
||||
if (sleepTime > 0) {
|
||||
Thread.sleep(sleepTime);
|
||||
}
|
||||
} catch (InterruptedException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
WebDriver.Options manage = webDriver.manage();
|
||||
Site site = task.getSite();
|
||||
if (site.getCookies() != null) {
|
||||
for (Map.Entry<String, String> cookieEntry : site.getCookies()
|
||||
.entrySet()) {
|
||||
Cookie cookie = new Cookie(cookieEntry.getKey(),
|
||||
cookieEntry.getValue());
|
||||
manage.addCookie(cookie);
|
||||
}
|
||||
}
|
||||
|
||||
WebElement webElement = webDriver.findElement(By.xpath("/html"));
|
||||
String content = webElement.getAttribute("outerHTML");
|
||||
Page page = new Page();
|
||||
page.setRawText(content);
|
||||
page.setHtml(new Html(content, request.getUrl()));
|
||||
page.setUrl(new PlainText(request.getUrl()));
|
||||
page.setRequest(request);
|
||||
webDriverPool.returnToPool(webDriver);
|
||||
return page;
|
||||
}
|
||||
/*
|
||||
* TODO You can add mouse event or other processes
|
||||
*
|
||||
* @author: bob.li.0718@gmail.com
|
||||
*/
|
||||
|
||||
private void checkInit() {
|
||||
if (webDriverPool == null) {
|
||||
synchronized (this) {
|
||||
webDriverPool = new WebDriverPool(poolSize);
|
||||
}
|
||||
}
|
||||
}
|
||||
WebElement webElement = webDriver.findElement(By.xpath("/html"));
|
||||
String content = webElement.getAttribute("outerHTML");
|
||||
page.setDownloadSuccess(true);
|
||||
page.setRawText(content);
|
||||
page.setHtml(new Html(content, request.getUrl()));
|
||||
page.setUrl(new PlainText(request.getUrl()));
|
||||
page.setRequest(request);
|
||||
onSuccess(request, task);
|
||||
} catch (Exception e) {
|
||||
logger.warn("download page {} error", request.getUrl(), e);
|
||||
onError(request, task, e);
|
||||
} finally {
|
||||
if (webDriver != null) {
|
||||
webDriverPool.returnToPool(webDriver);
|
||||
}
|
||||
}
|
||||
return page;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setThread(int thread) {
|
||||
this.poolSize = thread;
|
||||
}
|
||||
private void checkInit() {
|
||||
if (webDriverPool == null) {
|
||||
synchronized (this) {
|
||||
webDriverPool = new WebDriverPool(poolSize);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
webDriverPool.closeAll();
|
||||
}
|
||||
@Override
|
||||
public void setThread(int thread) {
|
||||
this.poolSize = thread;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
webDriverPool.closeAll();
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue