Merge branch 'release/0.8.0'
commit
43ce1a0db9
10
pom.xml
10
pom.xml
|
@ -1,7 +1,7 @@
|
||||||
<?xml version="1.0" encoding="UTF-8"?>
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<version>0.7.6</version>
|
<version>0.8.0</version>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
<packaging>pom</packaging>
|
<packaging>pom</packaging>
|
||||||
<properties>
|
<properties>
|
||||||
|
@ -14,14 +14,14 @@
|
||||||
<commons-collections4.version>4.4</commons-collections4.version>
|
<commons-collections4.version>4.4</commons-collections4.version>
|
||||||
<commons-io.version>2.11.0</commons-io.version>
|
<commons-io.version>2.11.0</commons-io.version>
|
||||||
<commons-lang3.version>3.12.0</commons-lang3.version>
|
<commons-lang3.version>3.12.0</commons-lang3.version>
|
||||||
<fastjson.version>2.0.14.graal</fastjson.version>
|
<fastjson.version>2.0.19.graal</fastjson.version>
|
||||||
<groovy-all.version>3.0.13</groovy-all.version>
|
<groovy-all.version>3.0.13</groovy-all.version>
|
||||||
<guava.version>31.1-jre</guava.version>
|
<guava.version>31.1-jre</guava.version>
|
||||||
<htmlcleaner.version>2.26</htmlcleaner.version>
|
<htmlcleaner.version>2.26</htmlcleaner.version>
|
||||||
<httpclient.version>4.5.13</httpclient.version>
|
<httpclient.version>4.5.13</httpclient.version>
|
||||||
<httpcore.version>4.4.15</httpcore.version>
|
<httpcore.version>4.4.15</httpcore.version>
|
||||||
<jedis.version>3.7.1</jedis.version>
|
<jedis.version>3.7.1</jedis.version>
|
||||||
<jruby.version>9.3.8.0</jruby.version>
|
<jruby.version>9.3.9.0</jruby.version>
|
||||||
<json-path.version>2.7.0</json-path.version>
|
<json-path.version>2.7.0</json-path.version>
|
||||||
<junit.version>4.13.2</junit.version>
|
<junit.version>4.13.2</junit.version>
|
||||||
<jython.version>2.7.3</jython.version>
|
<jython.version>2.7.3</jython.version>
|
||||||
|
@ -31,7 +31,7 @@
|
||||||
<phantomjsdriver.version>1.2.0</phantomjsdriver.version>
|
<phantomjsdriver.version>1.2.0</phantomjsdriver.version>
|
||||||
<saxon-he.version>11.4</saxon-he.version>
|
<saxon-he.version>11.4</saxon-he.version>
|
||||||
<selenium-java.version>3.141.59</selenium-java.version>
|
<selenium-java.version>3.141.59</selenium-java.version>
|
||||||
<slf4j.version>2.0.3</slf4j.version>
|
<slf4j.version>2.0.4</slf4j.version>
|
||||||
<spring-version>4.0.0.RELEASE</spring-version>
|
<spring-version>4.0.0.RELEASE</spring-version>
|
||||||
<xsoup.version>0.3.5</xsoup.version>
|
<xsoup.version>0.3.5</xsoup.version>
|
||||||
</properties>
|
</properties>
|
||||||
|
@ -232,7 +232,7 @@
|
||||||
<configuration>
|
<configuration>
|
||||||
<rules>
|
<rules>
|
||||||
<requireMavenVersion>
|
<requireMavenVersion>
|
||||||
<version>3.3.9</version>
|
<version>3.5.0</version>
|
||||||
</requireMavenVersion>
|
</requireMavenVersion>
|
||||||
</rules>
|
</rules>
|
||||||
</configuration>
|
</configuration>
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<artifactId>webmagic-parent</artifactId>
|
<artifactId>webmagic-parent</artifactId>
|
||||||
<version>0.7.6</version>
|
<version>0.8.0</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
|
|
@ -333,9 +333,10 @@ public class Spider implements Runnable, Task {
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// wait until new url added,
|
// wait until new url added,
|
||||||
if (waitNewUrl())
|
if (waitNewUrl()) {
|
||||||
//if interrupted
|
//if interrupted
|
||||||
break;
|
break;
|
||||||
|
}
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -805,11 +806,13 @@ public class Spider implements Runnable, Task {
|
||||||
* Set wait time when no url is polled.<br><br>
|
* Set wait time when no url is polled.<br><br>
|
||||||
*
|
*
|
||||||
* @param emptySleepTime In MILLISECONDS.
|
* @param emptySleepTime In MILLISECONDS.
|
||||||
|
* @return this
|
||||||
*/
|
*/
|
||||||
public void setEmptySleepTime(long emptySleepTime) {
|
public Spider setEmptySleepTime(long emptySleepTime) {
|
||||||
if(emptySleepTime<=0){
|
if(emptySleepTime<=0){
|
||||||
throw new IllegalArgumentException("emptySleepTime should be more than zero!");
|
throw new IllegalArgumentException("emptySleepTime should be more than zero!");
|
||||||
}
|
}
|
||||||
this.emptySleepTime = emptySleepTime;
|
this.emptySleepTime = emptySleepTime;
|
||||||
|
return this;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -82,12 +82,16 @@ public class HttpClientDownloader extends AbstractDownloader {
|
||||||
try {
|
try {
|
||||||
httpResponse = httpClient.execute(requestContext.getHttpUriRequest(), requestContext.getHttpClientContext());
|
httpResponse = httpClient.execute(requestContext.getHttpUriRequest(), requestContext.getHttpClientContext());
|
||||||
page = handleResponse(request, request.getCharset() != null ? request.getCharset() : task.getSite().getCharset(), httpResponse, task);
|
page = handleResponse(request, request.getCharset() != null ? request.getCharset() : task.getSite().getCharset(), httpResponse, task);
|
||||||
|
|
||||||
onSuccess(request, task);
|
onSuccess(request, task);
|
||||||
logger.info("downloading page success {}", request.getUrl());
|
logger.info("downloading page success {}", request.getUrl());
|
||||||
|
|
||||||
return page;
|
return page;
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
logger.warn("download page {} error", request.getUrl(), e);
|
|
||||||
onError(request, task, e);
|
onError(request, task, e);
|
||||||
|
logger.info("download page {} error", request.getUrl(), e);
|
||||||
|
|
||||||
return page;
|
return page;
|
||||||
} finally {
|
} finally {
|
||||||
if (httpResponse != null) {
|
if (httpResponse != null) {
|
||||||
|
|
|
@ -1,26 +1,51 @@
|
||||||
package us.codecraft.webmagic.scheduler;
|
package us.codecraft.webmagic.scheduler;
|
||||||
|
|
||||||
import us.codecraft.webmagic.Request;
|
|
||||||
import us.codecraft.webmagic.Task;
|
|
||||||
|
|
||||||
import java.util.concurrent.BlockingQueue;
|
import java.util.concurrent.BlockingQueue;
|
||||||
import java.util.concurrent.LinkedBlockingQueue;
|
import java.util.concurrent.LinkedBlockingQueue;
|
||||||
|
|
||||||
|
import us.codecraft.webmagic.Request;
|
||||||
|
import us.codecraft.webmagic.Site;
|
||||||
|
import us.codecraft.webmagic.Task;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Basic Scheduler implementation.<br>
|
* Basic Scheduler implementation.<br>
|
||||||
* Store urls to fetch in LinkedBlockingQueue and remove duplicate urls by HashMap.
|
* Store urls to fetch in LinkedBlockingQueue and remove duplicate urls by HashMap.
|
||||||
*
|
*
|
||||||
|
* Note: if you use this {@link QueueScheduler}
|
||||||
|
* with {@link Site#getCycleRetryTimes()} enabled, you may encounter a deadlock
|
||||||
|
* when the queue is full.
|
||||||
|
*
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
* @since 0.1.0
|
* @since 0.1.0
|
||||||
*/
|
*/
|
||||||
public class QueueScheduler extends DuplicateRemovedScheduler implements MonitorableScheduler {
|
public class QueueScheduler extends DuplicateRemovedScheduler implements MonitorableScheduler {
|
||||||
|
|
||||||
private BlockingQueue<Request> queue = new LinkedBlockingQueue<Request>();
|
private final BlockingQueue<Request> queue;
|
||||||
|
|
||||||
|
public QueueScheduler() {
|
||||||
|
this.queue = new LinkedBlockingQueue<>();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a {@code QueueScheduler} with the given (fixed) capacity.
|
||||||
|
*
|
||||||
|
* @param capacity the capacity of this queue,
|
||||||
|
* see {@link LinkedBlockingQueue#LinkedBlockingQueue(int)}
|
||||||
|
* @since 0.8.0
|
||||||
|
*/
|
||||||
|
public QueueScheduler(int capacity) {
|
||||||
|
this.queue = new LinkedBlockingQueue<>(capacity);
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void pushWhenNoDuplicate(Request request, Task task) {
|
public void pushWhenNoDuplicate(Request request, Task task) {
|
||||||
queue.add(request);
|
logger.trace("Remaining capacity: {}", this.queue.remainingCapacity());
|
||||||
|
|
||||||
|
try {
|
||||||
|
queue.put(request);
|
||||||
|
} catch (InterruptedException e) {
|
||||||
|
Thread.currentThread().interrupt();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -8,7 +8,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<artifactId>webmagic-parent</artifactId>
|
<artifactId>webmagic-parent</artifactId>
|
||||||
<version>0.7.6</version>
|
<version>0.8.0</version>
|
||||||
</parent>
|
</parent>
|
||||||
|
|
||||||
<artifactId>webmagic-coverage</artifactId>
|
<artifactId>webmagic-coverage</artifactId>
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<artifactId>webmagic-parent</artifactId>
|
<artifactId>webmagic-parent</artifactId>
|
||||||
<version>0.7.6</version>
|
<version>0.8.0</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<artifactId>webmagic-parent</artifactId>
|
<artifactId>webmagic-parent</artifactId>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<version>0.7.6</version>
|
<version>0.8.0</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<artifactId>webmagic-parent</artifactId>
|
<artifactId>webmagic-parent</artifactId>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<version>0.7.6</version>
|
<version>0.8.0</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<artifactId>webmagic-parent</artifactId>
|
<artifactId>webmagic-parent</artifactId>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<version>0.7.6</version>
|
<version>0.8.0</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<artifactId>webmagic-parent</artifactId>
|
<artifactId>webmagic-parent</artifactId>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<version>0.7.6</version>
|
<version>0.8.0</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue