Merge branch 'release/0.8.0'

master
Joe Zhou 2022-11-24 00:49:41 +08:00
commit 43ce1a0db9
11 changed files with 53 additions and 21 deletions

10
pom.xml
View File

@ -1,7 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?> <?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<groupId>us.codecraft</groupId> <groupId>us.codecraft</groupId>
<version>0.7.6</version> <version>0.8.0</version>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<packaging>pom</packaging> <packaging>pom</packaging>
<properties> <properties>
@ -14,14 +14,14 @@
<commons-collections4.version>4.4</commons-collections4.version> <commons-collections4.version>4.4</commons-collections4.version>
<commons-io.version>2.11.0</commons-io.version> <commons-io.version>2.11.0</commons-io.version>
<commons-lang3.version>3.12.0</commons-lang3.version> <commons-lang3.version>3.12.0</commons-lang3.version>
<fastjson.version>2.0.14.graal</fastjson.version> <fastjson.version>2.0.19.graal</fastjson.version>
<groovy-all.version>3.0.13</groovy-all.version> <groovy-all.version>3.0.13</groovy-all.version>
<guava.version>31.1-jre</guava.version> <guava.version>31.1-jre</guava.version>
<htmlcleaner.version>2.26</htmlcleaner.version> <htmlcleaner.version>2.26</htmlcleaner.version>
<httpclient.version>4.5.13</httpclient.version> <httpclient.version>4.5.13</httpclient.version>
<httpcore.version>4.4.15</httpcore.version> <httpcore.version>4.4.15</httpcore.version>
<jedis.version>3.7.1</jedis.version> <jedis.version>3.7.1</jedis.version>
<jruby.version>9.3.8.0</jruby.version> <jruby.version>9.3.9.0</jruby.version>
<json-path.version>2.7.0</json-path.version> <json-path.version>2.7.0</json-path.version>
<junit.version>4.13.2</junit.version> <junit.version>4.13.2</junit.version>
<jython.version>2.7.3</jython.version> <jython.version>2.7.3</jython.version>
@ -31,7 +31,7 @@
<phantomjsdriver.version>1.2.0</phantomjsdriver.version> <phantomjsdriver.version>1.2.0</phantomjsdriver.version>
<saxon-he.version>11.4</saxon-he.version> <saxon-he.version>11.4</saxon-he.version>
<selenium-java.version>3.141.59</selenium-java.version> <selenium-java.version>3.141.59</selenium-java.version>
<slf4j.version>2.0.3</slf4j.version> <slf4j.version>2.0.4</slf4j.version>
<spring-version>4.0.0.RELEASE</spring-version> <spring-version>4.0.0.RELEASE</spring-version>
<xsoup.version>0.3.5</xsoup.version> <xsoup.version>0.3.5</xsoup.version>
</properties> </properties>
@ -232,7 +232,7 @@
<configuration> <configuration>
<rules> <rules>
<requireMavenVersion> <requireMavenVersion>
<version>3.3.9</version> <version>3.5.0</version>
</requireMavenVersion> </requireMavenVersion>
</rules> </rules>
</configuration> </configuration>

View File

@ -3,7 +3,7 @@
<parent> <parent>
<groupId>us.codecraft</groupId> <groupId>us.codecraft</groupId>
<artifactId>webmagic-parent</artifactId> <artifactId>webmagic-parent</artifactId>
<version>0.7.6</version> <version>0.8.0</version>
</parent> </parent>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>

View File

@ -333,9 +333,10 @@ public class Spider implements Runnable, Task {
} }
} else { } else {
// wait until new url added // wait until new url added
if (waitNewUrl()) if (waitNewUrl()) {
//if interrupted //if interrupted
break; break;
}
continue; continue;
} }
} }
@ -805,11 +806,13 @@ public class Spider implements Runnable, Task {
* Set wait time when no url is polled.<br><br> * Set wait time when no url is polled.<br><br>
* *
* @param emptySleepTime In MILLISECONDS. * @param emptySleepTime In MILLISECONDS.
* @return this
*/ */
public void setEmptySleepTime(long emptySleepTime) { public Spider setEmptySleepTime(long emptySleepTime) {
if(emptySleepTime<=0){ if(emptySleepTime<=0){
throw new IllegalArgumentException("emptySleepTime should be more than zero!"); throw new IllegalArgumentException("emptySleepTime should be more than zero!");
} }
this.emptySleepTime = emptySleepTime; this.emptySleepTime = emptySleepTime;
return this;
} }
} }

View File

@ -82,12 +82,16 @@ public class HttpClientDownloader extends AbstractDownloader {
try { try {
httpResponse = httpClient.execute(requestContext.getHttpUriRequest(), requestContext.getHttpClientContext()); httpResponse = httpClient.execute(requestContext.getHttpUriRequest(), requestContext.getHttpClientContext());
page = handleResponse(request, request.getCharset() != null ? request.getCharset() : task.getSite().getCharset(), httpResponse, task); page = handleResponse(request, request.getCharset() != null ? request.getCharset() : task.getSite().getCharset(), httpResponse, task);
onSuccess(request, task); onSuccess(request, task);
logger.info("downloading page success {}", request.getUrl()); logger.info("downloading page success {}", request.getUrl());
return page; return page;
} catch (IOException e) { } catch (IOException e) {
logger.warn("download page {} error", request.getUrl(), e);
onError(request, task, e); onError(request, task, e);
logger.info("download page {} error", request.getUrl(), e);
return page; return page;
} finally { } finally {
if (httpResponse != null) { if (httpResponse != null) {

View File

@ -1,26 +1,51 @@
package us.codecraft.webmagic.scheduler; package us.codecraft.webmagic.scheduler;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
import java.util.concurrent.BlockingQueue; import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.LinkedBlockingQueue;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
/** /**
* Basic Scheduler implementation.<br> * Basic Scheduler implementation.<br>
* Store urls to fetch in LinkedBlockingQueue and remove duplicate urls by HashMap. * Store urls to fetch in LinkedBlockingQueue and remove duplicate urls by HashMap.
* *
* Note: if you use this {@link QueueScheduler}
* with {@link Site#getCycleRetryTimes()} enabled, you may encounter a deadlock
* when the queue is full.
*
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* @since 0.1.0 * @since 0.1.0
*/ */
public class QueueScheduler extends DuplicateRemovedScheduler implements MonitorableScheduler { public class QueueScheduler extends DuplicateRemovedScheduler implements MonitorableScheduler {
private BlockingQueue<Request> queue = new LinkedBlockingQueue<Request>(); private final BlockingQueue<Request> queue;
public QueueScheduler() {
this.queue = new LinkedBlockingQueue<>();
}
/**
* Creates a {@code QueueScheduler} with the given (fixed) capacity.
*
* @param capacity the capacity of this queue,
* see {@link LinkedBlockingQueue#LinkedBlockingQueue(int)}
* @since 0.8.0
*/
public QueueScheduler(int capacity) {
this.queue = new LinkedBlockingQueue<>(capacity);
}
@Override @Override
public void pushWhenNoDuplicate(Request request, Task task) { public void pushWhenNoDuplicate(Request request, Task task) {
queue.add(request); logger.trace("Remaining capacity: {}", this.queue.remainingCapacity());
try {
queue.put(request);
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
}
} }
@Override @Override

View File

@ -8,7 +8,7 @@
<parent> <parent>
<groupId>us.codecraft</groupId> <groupId>us.codecraft</groupId>
<artifactId>webmagic-parent</artifactId> <artifactId>webmagic-parent</artifactId>
<version>0.7.6</version> <version>0.8.0</version>
</parent> </parent>
<artifactId>webmagic-coverage</artifactId> <artifactId>webmagic-coverage</artifactId>

View File

@ -3,7 +3,7 @@
<parent> <parent>
<groupId>us.codecraft</groupId> <groupId>us.codecraft</groupId>
<artifactId>webmagic-parent</artifactId> <artifactId>webmagic-parent</artifactId>
<version>0.7.6</version> <version>0.8.0</version>
</parent> </parent>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>

View File

@ -3,7 +3,7 @@
<parent> <parent>
<artifactId>webmagic-parent</artifactId> <artifactId>webmagic-parent</artifactId>
<groupId>us.codecraft</groupId> <groupId>us.codecraft</groupId>
<version>0.7.6</version> <version>0.8.0</version>
</parent> </parent>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>

View File

@ -3,7 +3,7 @@
<parent> <parent>
<artifactId>webmagic-parent</artifactId> <artifactId>webmagic-parent</artifactId>
<groupId>us.codecraft</groupId> <groupId>us.codecraft</groupId>
<version>0.7.6</version> <version>0.8.0</version>
</parent> </parent>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>

View File

@ -3,7 +3,7 @@
<parent> <parent>
<artifactId>webmagic-parent</artifactId> <artifactId>webmagic-parent</artifactId>
<groupId>us.codecraft</groupId> <groupId>us.codecraft</groupId>
<version>0.7.6</version> <version>0.8.0</version>
</parent> </parent>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>

View File

@ -3,7 +3,7 @@
<parent> <parent>
<artifactId>webmagic-parent</artifactId> <artifactId>webmagic-parent</artifactId>
<groupId>us.codecraft</groupId> <groupId>us.codecraft</groupId>
<version>0.7.6</version> <version>0.8.0</version>
</parent> </parent>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>