Merge branch 'stable' of github.com:code4craft/webmagic
commit
feb604da87
|
@ -41,12 +41,12 @@ webmagic使用maven管理依赖,在项目中添加对应的依赖即可使用w
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<artifactId>webmagic-core</artifactId>
|
<artifactId>webmagic-core</artifactId>
|
||||||
<version>0.5.0</version>
|
<version>0.5.1</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<artifactId>webmagic-extension</artifactId>
|
<artifactId>webmagic-extension</artifactId>
|
||||||
<version>0.5.0</version>
|
<version>0.5.1</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
```
|
```
|
||||||
|
|
||||||
|
@ -192,6 +192,7 @@ webmagic遵循[Apache 2.0协议](http://opensource.org/licenses/Apache-2.0)
|
||||||
* [lidongyang](http://my.oschina.net/lidongyang)
|
* [lidongyang](http://my.oschina.net/lidongyang)
|
||||||
* [seveniu](https://github.com/seveniu)
|
* [seveniu](https://github.com/seveniu)
|
||||||
* [sebastian1118](https://github.com/sebastian1118)
|
* [sebastian1118](https://github.com/sebastian1118)
|
||||||
|
* [codev777](https://github.com/codev777)
|
||||||
|
|
||||||
### 邮件组:
|
### 邮件组:
|
||||||
|
|
||||||
|
|
|
@ -25,12 +25,12 @@ Add dependencies to your pom.xml:
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<artifactId>webmagic-core</artifactId>
|
<artifactId>webmagic-core</artifactId>
|
||||||
<version>0.5.0</version>
|
<version>0.5.1</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<artifactId>webmagic-extension</artifactId>
|
<artifactId>webmagic-extension</artifactId>
|
||||||
<version>0.5.0</version>
|
<version>0.5.1</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
```
|
```
|
||||||
|
|
||||||
|
@ -145,6 +145,7 @@ Thanks these people for commiting source code, reporting bugs or suggesting for
|
||||||
* [lidongyang](http://my.oschina.net/lidongyang)
|
* [lidongyang](http://my.oschina.net/lidongyang)
|
||||||
* [seveniu](https://github.com/seveniu)
|
* [seveniu](https://github.com/seveniu)
|
||||||
* [sebastian1118](https://github.com/sebastian1118)
|
* [sebastian1118](https://github.com/sebastian1118)
|
||||||
|
* [codev777](https://github.com/codev777)
|
||||||
|
|
||||||
|
|
||||||
### Thanks:
|
### Thanks:
|
||||||
|
|
4
pom.xml
4
pom.xml
|
@ -6,7 +6,7 @@
|
||||||
<version>7</version>
|
<version>7</version>
|
||||||
</parent>
|
</parent>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<version>0.5.0</version>
|
<version>0.5.2-SNAPSHOT</version>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
<packaging>pom</packaging>
|
<packaging>pom</packaging>
|
||||||
<properties>
|
<properties>
|
||||||
|
@ -54,7 +54,7 @@
|
||||||
<module>webmagic-selenium</module>
|
<module>webmagic-selenium</module>
|
||||||
<module>webmagic-saxon</module>
|
<module>webmagic-saxon</module>
|
||||||
<module>webmagic-samples</module>
|
<module>webmagic-samples</module>
|
||||||
<module>webmagic-avalon</module>
|
<!--<module>webmagic-avalon</module>-->
|
||||||
</modules>
|
</modules>
|
||||||
|
|
||||||
<dependencyManagement>
|
<dependencyManagement>
|
||||||
|
|
|
@ -7,7 +7,7 @@
|
||||||
</parent>
|
</parent>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<artifactId>forger</artifactId>
|
<artifactId>forger</artifactId>
|
||||||
<version>0.1.0</version>
|
<version>0.1.1-SNAPSHOT</version>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
<packaging>jar</packaging>
|
<packaging>jar</packaging>
|
||||||
<properties>
|
<properties>
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<artifactId>webmagic-parent</artifactId>
|
<artifactId>webmagic-parent</artifactId>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<version>0.5.0</version>
|
<version>0.5.1-SNAPSHOT</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
@ -39,12 +39,6 @@
|
||||||
<version>1.1.1</version>
|
<version>1.1.1</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
|
||||||
<groupId>us.codecraft</groupId>
|
|
||||||
<artifactId>forger</artifactId>
|
|
||||||
<version>0.1.1-SNAPSHOT</version>
|
|
||||||
</dependency>
|
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.freemarker</groupId>
|
<groupId>org.freemarker</groupId>
|
||||||
<artifactId>freemarker</artifactId>
|
<artifactId>freemarker</artifactId>
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<artifactId>webmagic-avalon</artifactId>
|
<artifactId>webmagic-avalon</artifactId>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<version>0.5.0</version>
|
<version>0.5.1-SNAPSHOT</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<artifactId>webmagic-avalon</artifactId>
|
<artifactId>webmagic-avalon</artifactId>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<version>0.5.0</version>
|
<version>0.5.1-SNAPSHOT</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
@ -26,7 +26,7 @@
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<artifactId>forger</artifactId>
|
<artifactId>forger</artifactId>
|
||||||
<version>0.1.0</version>
|
<version>0.1.1-SNAPSHOT</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
|
@ -150,18 +150,4 @@
|
||||||
</plugins>
|
</plugins>
|
||||||
</build>
|
</build>
|
||||||
|
|
||||||
<repositories>
|
|
||||||
<repository>
|
|
||||||
<id>sonatype-nexus-snapshots</id>
|
|
||||||
<name>Sonatype Nexus Snapshots</name>
|
|
||||||
<url>https://oss.sonatype.org/content/repositories/snapshots</url>
|
|
||||||
<releases>
|
|
||||||
<enabled>false</enabled>
|
|
||||||
</releases>
|
|
||||||
<snapshots>
|
|
||||||
<enabled>true</enabled>
|
|
||||||
</snapshots>
|
|
||||||
</repository>
|
|
||||||
</repositories>
|
|
||||||
|
|
||||||
</project>
|
</project>
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<artifactId>webmagic-avalon</artifactId>
|
<artifactId>webmagic-avalon</artifactId>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<version>0.5.0</version>
|
<version>0.5.1-SNAPSHOT</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<artifactId>webmagic-parent</artifactId>
|
<artifactId>webmagic-parent</artifactId>
|
||||||
<version>0.5.0</version>
|
<version>0.5.2-SNAPSHOT</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
|
|
@ -4,6 +4,8 @@ import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
import us.codecraft.webmagic.Request;
|
import us.codecraft.webmagic.Request;
|
||||||
import us.codecraft.webmagic.Task;
|
import us.codecraft.webmagic.Task;
|
||||||
|
import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
|
||||||
|
import us.codecraft.webmagic.scheduler.component.HashSetDuplicateRemover;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Remove duplicate urls and only push urls which are not duplicate.<br></br>
|
* Remove duplicate urls and only push urls which are not duplicate.<br></br>
|
||||||
|
@ -11,30 +13,30 @@ import us.codecraft.webmagic.Task;
|
||||||
* @author code4crafer@gmail.com
|
* @author code4crafer@gmail.com
|
||||||
* @since 0.5.0
|
* @since 0.5.0
|
||||||
*/
|
*/
|
||||||
public abstract class DuplicatedRemoveScheduler implements Scheduler {
|
public abstract class DuplicateRemovedScheduler implements Scheduler {
|
||||||
|
|
||||||
protected Logger logger = LoggerFactory.getLogger(getClass());
|
protected Logger logger = LoggerFactory.getLogger(getClass());
|
||||||
|
|
||||||
|
private DuplicateRemover duplicatedRemover = new HashSetDuplicateRemover();
|
||||||
|
|
||||||
|
public DuplicateRemover getDuplicateRemover() {
|
||||||
|
return duplicatedRemover;
|
||||||
|
}
|
||||||
|
|
||||||
|
public DuplicateRemovedScheduler setDuplicateRemover(DuplicateRemover duplicatedRemover) {
|
||||||
|
this.duplicatedRemover = duplicatedRemover;
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void push(Request request, Task task) {
|
public void push(Request request, Task task) {
|
||||||
logger.trace("get a candidate url {}", request.getUrl());
|
logger.trace("get a candidate url {}", request.getUrl());
|
||||||
if (isDuplicate(request, task) || shouldReserved(request)) {
|
if (!duplicatedRemover.isDuplicate(request, task) || shouldReserved(request)) {
|
||||||
logger.debug("push to queue {}", request.getUrl());
|
logger.debug("push to queue {}", request.getUrl());
|
||||||
pushWhenNoDuplicate(request, task);
|
pushWhenNoDuplicate(request, task);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Reset duplicate check.
|
|
||||||
*/
|
|
||||||
public abstract void resetDuplicateCheck(Task task);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @param request
|
|
||||||
* @return
|
|
||||||
*/
|
|
||||||
protected abstract boolean isDuplicate(Request request, Task task);
|
|
||||||
|
|
||||||
protected boolean shouldReserved(Request request) {
|
protected boolean shouldReserved(Request request) {
|
||||||
return request.getExtra(Request.CYCLE_TRIED_TIMES) != null;
|
return request.getExtra(Request.CYCLE_TRIED_TIMES) != null;
|
||||||
}
|
}
|
|
@ -17,7 +17,7 @@ import java.util.concurrent.PriorityBlockingQueue;
|
||||||
* @since 0.2.1
|
* @since 0.2.1
|
||||||
*/
|
*/
|
||||||
@ThreadSafe
|
@ThreadSafe
|
||||||
public class PriorityScheduler extends LocalDuplicatedRemoveScheduler {
|
public class PriorityScheduler extends DuplicateRemovedScheduler implements MonitorableScheduler {
|
||||||
|
|
||||||
public static final int INITIAL_CAPACITY = 5;
|
public static final int INITIAL_CAPACITY = 5;
|
||||||
|
|
||||||
|
@ -65,4 +65,9 @@ public class PriorityScheduler extends LocalDuplicatedRemoveScheduler {
|
||||||
public int getLeftRequestsCount(Task task) {
|
public int getLeftRequestsCount(Task task) {
|
||||||
return noPriorityQueue.size();
|
return noPriorityQueue.size();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int getTotalRequestsCount(Task task) {
|
||||||
|
return getDuplicateRemover().getTotalRequestsCount(task);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -16,7 +16,7 @@ import java.util.concurrent.LinkedBlockingQueue;
|
||||||
* @since 0.1.0
|
* @since 0.1.0
|
||||||
*/
|
*/
|
||||||
@ThreadSafe
|
@ThreadSafe
|
||||||
public class QueueScheduler extends LocalDuplicatedRemoveScheduler {
|
public class QueueScheduler extends DuplicateRemovedScheduler implements MonitorableScheduler {
|
||||||
|
|
||||||
private BlockingQueue<Request> queue = new LinkedBlockingQueue<Request>();
|
private BlockingQueue<Request> queue = new LinkedBlockingQueue<Request>();
|
||||||
|
|
||||||
|
@ -34,4 +34,9 @@ public class QueueScheduler extends LocalDuplicatedRemoveScheduler {
|
||||||
public int getLeftRequestsCount(Task task) {
|
public int getLeftRequestsCount(Task task) {
|
||||||
return queue.size();
|
return queue.size();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int getTotalRequestsCount(Task task) {
|
||||||
|
return getDuplicateRemover().getTotalRequestsCount(task);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,70 @@
|
||||||
|
package us.codecraft.webmagic.scheduler.component;
|
||||||
|
|
||||||
|
import com.google.common.hash.BloomFilter;
|
||||||
|
import com.google.common.hash.Funnels;
|
||||||
|
import us.codecraft.webmagic.Request;
|
||||||
|
import us.codecraft.webmagic.Task;
|
||||||
|
|
||||||
|
import java.nio.charset.Charset;
|
||||||
|
import java.util.concurrent.atomic.AtomicInteger;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* BloomFilterDuplicateRemover for huge number of urls.
|
||||||
|
*
|
||||||
|
* @author code4crafer@gmail.com
|
||||||
|
* @since 0.5.1
|
||||||
|
*/
|
||||||
|
public class BloomFilterDuplicateRemover implements DuplicateRemover {
|
||||||
|
|
||||||
|
private int expectedInsertions;
|
||||||
|
|
||||||
|
private double fpp;
|
||||||
|
|
||||||
|
private AtomicInteger counter;
|
||||||
|
|
||||||
|
public BloomFilterDuplicateRemover(int expectedInsertions) {
|
||||||
|
this(expectedInsertions, 0.01);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @param expectedInsertions the number of expected insertions to the constructed
|
||||||
|
* @param fpp the desired false positive probability (must be positive and less than 1.0)
|
||||||
|
*/
|
||||||
|
public BloomFilterDuplicateRemover(int expectedInsertions, double fpp) {
|
||||||
|
this.expectedInsertions = expectedInsertions;
|
||||||
|
this.fpp = fpp;
|
||||||
|
this.bloomFilter = rebuildBloomFilter();
|
||||||
|
}
|
||||||
|
|
||||||
|
protected BloomFilter<CharSequence> rebuildBloomFilter() {
|
||||||
|
counter = new AtomicInteger(0);
|
||||||
|
return BloomFilter.create(Funnels.stringFunnel(Charset.defaultCharset()), expectedInsertions, fpp);
|
||||||
|
}
|
||||||
|
|
||||||
|
private final BloomFilter<CharSequence> bloomFilter;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean isDuplicate(Request request, Task task) {
|
||||||
|
boolean isDuplicate = bloomFilter.mightContain(getUrl(request));
|
||||||
|
if (!isDuplicate) {
|
||||||
|
bloomFilter.put(getUrl(request));
|
||||||
|
counter.incrementAndGet();
|
||||||
|
}
|
||||||
|
return isDuplicate;
|
||||||
|
}
|
||||||
|
|
||||||
|
protected String getUrl(Request request) {
|
||||||
|
return request.getUrl();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void resetDuplicateCheck(Task task) {
|
||||||
|
rebuildBloomFilter();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int getTotalRequestsCount(Task task) {
|
||||||
|
return counter.get();
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,35 @@
|
||||||
|
package us.codecraft.webmagic.scheduler.component;
|
||||||
|
|
||||||
|
import us.codecraft.webmagic.Request;
|
||||||
|
import us.codecraft.webmagic.Task;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Remove duplicate requests.
|
||||||
|
* @author code4crafer@gmail.com
|
||||||
|
* @since 0.5.1
|
||||||
|
*/
|
||||||
|
public interface DuplicateRemover {
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* Check whether the request is duplicate.
|
||||||
|
*
|
||||||
|
* @param request
|
||||||
|
* @param task
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public boolean isDuplicate(Request request, Task task);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Reset duplicate check.
|
||||||
|
* @param task
|
||||||
|
*/
|
||||||
|
public void resetDuplicateCheck(Task task);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get TotalRequestsCount for monitor.
|
||||||
|
* @param task
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public int getTotalRequestsCount(Task task);
|
||||||
|
|
||||||
|
}
|
|
@ -1,4 +1,4 @@
|
||||||
package us.codecraft.webmagic.scheduler;
|
package us.codecraft.webmagic.scheduler.component;
|
||||||
|
|
||||||
import com.google.common.collect.Sets;
|
import com.google.common.collect.Sets;
|
||||||
import us.codecraft.webmagic.Request;
|
import us.codecraft.webmagic.Request;
|
||||||
|
@ -8,25 +8,26 @@ import java.util.Set;
|
||||||
import java.util.concurrent.ConcurrentHashMap;
|
import java.util.concurrent.ConcurrentHashMap;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Base Scheduler with duplicated urls removed by hash set.<br></br>
|
* @author code4crafer@gmail.com
|
||||||
*
|
|
||||||
* @author code4crafter@gmail.com
|
|
||||||
* @since 0.5.0
|
|
||||||
*/
|
*/
|
||||||
public abstract class LocalDuplicatedRemoveScheduler extends DuplicatedRemoveScheduler implements MonitorableScheduler {
|
public class HashSetDuplicateRemover implements DuplicateRemover {
|
||||||
|
|
||||||
private Set<String> urls = Sets.newSetFromMap(new ConcurrentHashMap<String, Boolean>());
|
private Set<String> urls = Sets.newSetFromMap(new ConcurrentHashMap<String, Boolean>());
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean isDuplicate(Request request, Task task) {
|
||||||
|
return !urls.add(getUrl(request));
|
||||||
|
}
|
||||||
|
|
||||||
|
protected String getUrl(Request request) {
|
||||||
|
return request.getUrl();
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void resetDuplicateCheck(Task task) {
|
public void resetDuplicateCheck(Task task) {
|
||||||
urls.clear();
|
urls.clear();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
|
||||||
protected boolean isDuplicate(Request request, Task task) {
|
|
||||||
return urls.add(request.getUrl());
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public int getTotalRequestsCount(Task task) {
|
public int getTotalRequestsCount(Task task) {
|
||||||
return urls.size();
|
return urls.size();
|
|
@ -0,0 +1,5 @@
|
||||||
|
<html>
|
||||||
|
<body>
|
||||||
|
Component of scheduler.
|
||||||
|
</body>
|
||||||
|
</html>
|
|
@ -22,10 +22,10 @@ public class FilePersistentBase {
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setPath(String path) {
|
public void setPath(String path) {
|
||||||
this.path = path;
|
|
||||||
if (!path.endsWith(PATH_SEPERATOR)) {
|
if (!path.endsWith(PATH_SEPERATOR)) {
|
||||||
path += PATH_SEPERATOR;
|
path += PATH_SEPERATOR;
|
||||||
}
|
}
|
||||||
|
this.path = path;
|
||||||
}
|
}
|
||||||
|
|
||||||
public File getFile(String fullName) {
|
public File getFile(String fullName) {
|
||||||
|
|
|
@ -0,0 +1,80 @@
|
||||||
|
package us.codecraft.webmagic.scheduler;
|
||||||
|
|
||||||
|
import org.junit.Ignore;
|
||||||
|
import org.junit.Test;
|
||||||
|
import us.codecraft.webmagic.Request;
|
||||||
|
import us.codecraft.webmagic.scheduler.component.BloomFilterDuplicateRemover;
|
||||||
|
import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
|
||||||
|
import us.codecraft.webmagic.scheduler.component.HashSetDuplicateRemover;
|
||||||
|
|
||||||
|
import static org.assertj.core.api.Assertions.assertThat;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author code4crafer@gmail.com
|
||||||
|
*/
|
||||||
|
public class BloomFilterDuplicateRemoverTest {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testRemove() throws Exception {
|
||||||
|
BloomFilterDuplicateRemover bloomFilterDuplicateRemover = new BloomFilterDuplicateRemover(10);
|
||||||
|
boolean isDuplicate = bloomFilterDuplicateRemover.isDuplicate(new Request("a"), null);
|
||||||
|
assertThat(isDuplicate).isFalse();
|
||||||
|
isDuplicate = bloomFilterDuplicateRemover.isDuplicate(new Request("a"), null);
|
||||||
|
assertThat(isDuplicate).isTrue();
|
||||||
|
isDuplicate = bloomFilterDuplicateRemover.isDuplicate(new Request("b"), null);
|
||||||
|
assertThat(isDuplicate).isFalse();
|
||||||
|
isDuplicate = bloomFilterDuplicateRemover.isDuplicate(new Request("b"), null);
|
||||||
|
assertThat(isDuplicate).isTrue();
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
@Ignore("long time")
|
||||||
|
@Test
|
||||||
|
public void testMemory() throws Exception {
|
||||||
|
int times = 5000000;
|
||||||
|
DuplicateRemover duplicateRemover = new BloomFilterDuplicateRemover(times,0.005);
|
||||||
|
long freeMemory = Runtime.getRuntime().freeMemory();
|
||||||
|
long time = System.currentTimeMillis();
|
||||||
|
for (int i = 0; i < times; i++) {
|
||||||
|
duplicateRemover.isDuplicate(new Request(String.valueOf(i)), null);
|
||||||
|
}
|
||||||
|
System.out.println("Time used by bloomfilter:" + (System.currentTimeMillis() - time));
|
||||||
|
System.out.println("Memory used by bloomfilter:" + (freeMemory - Runtime.getRuntime().freeMemory()));
|
||||||
|
|
||||||
|
duplicateRemover = new HashSetDuplicateRemover();
|
||||||
|
System.gc();
|
||||||
|
freeMemory = Runtime.getRuntime().freeMemory();
|
||||||
|
time = System.currentTimeMillis();
|
||||||
|
for (int i = 0; i < times; i++) {
|
||||||
|
duplicateRemover.isDuplicate(new Request(String.valueOf(i)), null);
|
||||||
|
}
|
||||||
|
System.out.println("Time used by hashset:" + (System.currentTimeMillis() - time));
|
||||||
|
System.out.println("Memory used by hashset:" + (freeMemory - Runtime.getRuntime().freeMemory()));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Ignore("long time")
|
||||||
|
@Test
|
||||||
|
public void testMissHit() throws Exception {
|
||||||
|
int times = 5000000;
|
||||||
|
DuplicateRemover duplicateRemover = new BloomFilterDuplicateRemover(times, 0.01);
|
||||||
|
int right = 0;
|
||||||
|
int wrong = 0;
|
||||||
|
int missCheck = 0;
|
||||||
|
for (int i = 0; i < times; i++) {
|
||||||
|
boolean duplicate = duplicateRemover.isDuplicate(new Request(String.valueOf(i)), null);
|
||||||
|
if (duplicate) {
|
||||||
|
wrong++;
|
||||||
|
} else {
|
||||||
|
right++;
|
||||||
|
}
|
||||||
|
duplicate = duplicateRemover.isDuplicate(new Request(String.valueOf(i)), null);
|
||||||
|
if (!duplicate) {
|
||||||
|
missCheck++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
System.out.println("Right count: " + right + " Wrong count: " + wrong + " Miss check: " + missCheck);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
}
|
|
@ -3,7 +3,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<artifactId>webmagic-parent</artifactId>
|
<artifactId>webmagic-parent</artifactId>
|
||||||
<version>0.5.0</version>
|
<version>0.5.2-SNAPSHOT</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
|
|
@ -2,8 +2,6 @@ package us.codecraft.webmagic.scheduler;
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.commons.lang3.math.NumberUtils;
|
import org.apache.commons.lang3.math.NumberUtils;
|
||||||
import org.slf4j.Logger;
|
|
||||||
import org.slf4j.LoggerFactory;
|
|
||||||
import us.codecraft.webmagic.Request;
|
import us.codecraft.webmagic.Request;
|
||||||
import us.codecraft.webmagic.Task;
|
import us.codecraft.webmagic.Task;
|
||||||
|
|
||||||
|
@ -23,9 +21,7 @@ import java.util.concurrent.atomic.AtomicInteger;
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
* @since 0.2.0
|
* @since 0.2.0
|
||||||
*/
|
*/
|
||||||
public class FileCacheQueueScheduler extends LocalDuplicatedRemoveScheduler {
|
public class FileCacheQueueScheduler extends DuplicateRemovedScheduler implements MonitorableScheduler {
|
||||||
|
|
||||||
private Logger logger = LoggerFactory.getLogger(getClass());
|
|
||||||
|
|
||||||
private String filePath = System.getProperty("java.io.tmpdir");
|
private String filePath = System.getProperty("java.io.tmpdir");
|
||||||
|
|
||||||
|
@ -166,4 +162,9 @@ public class FileCacheQueueScheduler extends LocalDuplicatedRemoveScheduler {
|
||||||
public int getLeftRequestsCount(Task task) {
|
public int getLeftRequestsCount(Task task) {
|
||||||
return queue.size();
|
return queue.size();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int getTotalRequestsCount(Task task) {
|
||||||
|
return getDuplicateRemover().getTotalRequestsCount(task);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -7,6 +7,7 @@ import redis.clients.jedis.JedisPool;
|
||||||
import redis.clients.jedis.JedisPoolConfig;
|
import redis.clients.jedis.JedisPoolConfig;
|
||||||
import us.codecraft.webmagic.Request;
|
import us.codecraft.webmagic.Request;
|
||||||
import us.codecraft.webmagic.Task;
|
import us.codecraft.webmagic.Task;
|
||||||
|
import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Use Redis as url scheduler for distributed crawlers.<br>
|
* Use Redis as url scheduler for distributed crawlers.<br>
|
||||||
|
@ -14,7 +15,7 @@ import us.codecraft.webmagic.Task;
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
* @since 0.2.0
|
* @since 0.2.0
|
||||||
*/
|
*/
|
||||||
public class RedisScheduler extends DuplicatedRemoveScheduler implements MonitorableScheduler {
|
public class RedisScheduler extends DuplicateRemovedScheduler implements MonitorableScheduler, DuplicateRemover {
|
||||||
|
|
||||||
private JedisPool pool;
|
private JedisPool pool;
|
||||||
|
|
||||||
|
@ -25,11 +26,12 @@ public class RedisScheduler extends DuplicatedRemoveScheduler implements Monitor
|
||||||
private static final String ITEM_PREFIX = "item_";
|
private static final String ITEM_PREFIX = "item_";
|
||||||
|
|
||||||
public RedisScheduler(String host) {
|
public RedisScheduler(String host) {
|
||||||
pool = new JedisPool(new JedisPoolConfig(), host);
|
this(new JedisPool(new JedisPoolConfig(), host));
|
||||||
}
|
}
|
||||||
|
|
||||||
public RedisScheduler(JedisPool pool) {
|
public RedisScheduler(JedisPool pool) {
|
||||||
this.pool = pool;
|
this.pool = pool;
|
||||||
|
setDuplicateRemover(this);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -43,10 +45,10 @@ public class RedisScheduler extends DuplicatedRemoveScheduler implements Monitor
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected boolean isDuplicate(Request request, Task task) {
|
public boolean isDuplicate(Request request, Task task) {
|
||||||
Jedis jedis = pool.getResource();
|
Jedis jedis = pool.getResource();
|
||||||
try {
|
try {
|
||||||
boolean isDuplicate = !jedis.sismember(getSetKey(task), request.getUrl());
|
boolean isDuplicate = jedis.sismember(getSetKey(task), request.getUrl());
|
||||||
if (!isDuplicate) {
|
if (!isDuplicate) {
|
||||||
jedis.sadd(getSetKey(task), request.getUrl());
|
jedis.sadd(getSetKey(task), request.getUrl());
|
||||||
}
|
}
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<artifactId>webmagic-parent</artifactId>
|
<artifactId>webmagic-parent</artifactId>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<version>0.5.0</version>
|
<version>0.5.2-SNAPSHOT</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,53 @@
|
||||||
|
package us.codecraft.webmagic.samples;
|
||||||
|
|
||||||
|
import us.codecraft.webmagic.Page;
|
||||||
|
import us.codecraft.webmagic.Site;
|
||||||
|
import us.codecraft.webmagic.Spider;
|
||||||
|
import us.codecraft.webmagic.processor.PageProcessor;
|
||||||
|
import us.codecraft.webmagic.selector.Html;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author code4crafer@gmail.com
|
||||||
|
*/
|
||||||
|
public class AmanzonPageProcessor implements PageProcessor{
|
||||||
|
public void process(Page page) {
|
||||||
|
|
||||||
|
Html html = page.getHtml();
|
||||||
|
List<String> questionList = html.xpath("//table[@class='tgCustomerCommunityCenterColumn']//div[@class='content']//table[@class='dataGrid']//tr").all();
|
||||||
|
|
||||||
|
if(questionList != null && questionList.size() > 1)
|
||||||
|
{
|
||||||
|
//i=0是列名称,所以i从1开始
|
||||||
|
for( int i = 1 ; i < questionList.size(); i++)
|
||||||
|
{
|
||||||
|
System.out.println(questionList.get(i));
|
||||||
|
Html tempHtml = Html.create("<table>"+questionList.get(i)+"</table>");
|
||||||
|
String comment = tempHtml.xpath("//td[@class='title']//a/text()").toString();
|
||||||
|
System.out.println(comment);
|
||||||
|
String answerNum = tempHtml.xpath("//td[@class='num']/text()").toString();
|
||||||
|
System.out.println(answerNum);
|
||||||
|
String createTime = tempHtml.xpath("//td[3]/text()").toString();
|
||||||
|
System.out.println(createTime);
|
||||||
|
|
||||||
|
/* Document doc = Jsoup.parse(questionList.get(i));
|
||||||
|
Html hmt = Html.create(questionList.get(i)) ;
|
||||||
|
String str = hmt.links().toString();
|
||||||
|
String content = doc.getElementsByTag("a").text();
|
||||||
|
String ss = doc.text();*/
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Site getSite() {
|
||||||
|
return Site.me();
|
||||||
|
}
|
||||||
|
|
||||||
|
public static void main(String[] args) {
|
||||||
|
Spider.create(new AmanzonPageProcessor()).test("http://www.amazon.de/forum/Fx27CUFD8S7LJ5D");
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,37 @@
|
||||||
|
package us.codecraft.webmagic.samples;
|
||||||
|
|
||||||
|
/**
 * Plain value holder for a GitHub repository: its name, its author and its readme text.
 * All properties are null until set.
 *
 * @author code4crafter@gmail.com
 */
public class GithubRepo {

    private String name;
    private String author;
    private String readme;

    /** @return the repository name, or null if not set */
    public String getName() {
        return this.name;
    }

    /** @param name the repository name */
    public void setName(String name) {
        this.name = name;
    }

    /** @return the repository author, or null if not set */
    public String getAuthor() {
        return this.author;
    }

    /** @param author the repository author */
    public void setAuthor(String author) {
        this.author = author;
    }

    /** @return the readme text, or null if not set */
    public String getReadme() {
        return this.readme;
    }

    /** @param readme the readme text */
    public void setReadme(String readme) {
        this.readme = readme;
    }
}
|
|
@ -0,0 +1,40 @@
|
||||||
|
package us.codecraft.webmagic.samples;
|
||||||
|
|
||||||
|
import us.codecraft.webmagic.Page;
|
||||||
|
import us.codecraft.webmagic.Site;
|
||||||
|
import us.codecraft.webmagic.Spider;
|
||||||
|
import us.codecraft.webmagic.processor.PageProcessor;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author code4crafter@gmail.com <br>
|
||||||
|
* @since 0.5.1
|
||||||
|
*/
|
||||||
|
public class GithubRepoPageProcessor implements PageProcessor {
|
||||||
|
|
||||||
|
private Site site = Site.me().setRetryTimes(3).setSleepTime(0);
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void process(Page page) {
|
||||||
|
page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all());
|
||||||
|
page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+)").all());
|
||||||
|
GithubRepo githubRepo = new GithubRepo();
|
||||||
|
githubRepo.setAuthor(page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString());
|
||||||
|
githubRepo.setName(page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString());
|
||||||
|
githubRepo.setReadme(page.getHtml().xpath("//div[@id='readme']/tidyText()").toString());
|
||||||
|
if (githubRepo.getName() == null) {
|
||||||
|
//skip this page
|
||||||
|
page.setSkip(true);
|
||||||
|
} else {
|
||||||
|
page.putField("repo", githubRepo);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Site getSite() {
|
||||||
|
return site;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static void main(String[] args) {
|
||||||
|
Spider.create(new GithubRepoPageProcessor()).addUrl("https://github.com/code4craft").thread(5).run();
|
||||||
|
}
|
||||||
|
}
|
|
@ -3,8 +3,12 @@ package us.codecraft.webmagic.samples;
|
||||||
import us.codecraft.webmagic.Page;
|
import us.codecraft.webmagic.Page;
|
||||||
import us.codecraft.webmagic.Site;
|
import us.codecraft.webmagic.Site;
|
||||||
import us.codecraft.webmagic.Spider;
|
import us.codecraft.webmagic.Spider;
|
||||||
|
import us.codecraft.webmagic.monitor.SpiderMonitor;
|
||||||
import us.codecraft.webmagic.processor.PageProcessor;
|
import us.codecraft.webmagic.processor.PageProcessor;
|
||||||
|
import us.codecraft.webmagic.scheduler.QueueScheduler;
|
||||||
|
import us.codecraft.webmagic.scheduler.component.BloomFilterDuplicateRemover;
|
||||||
|
|
||||||
|
import javax.management.JMException;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -29,7 +33,9 @@ public class OschinaBlogPageProcesser implements PageProcessor {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public static void main(String[] args) {
|
public static void main(String[] args) throws JMException {
|
||||||
Spider.create(new OschinaBlogPageProcesser()).run();
|
Spider spider = Spider.create(new OschinaBlogPageProcesser()).setScheduler(new QueueScheduler().setDuplicateRemover(new BloomFilterDuplicateRemover(2000)));
|
||||||
|
SpiderMonitor.instance().register(spider);
|
||||||
|
spider.run();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,7 @@
|
||||||
|
package us.codecraft.webmagic.samples.pipeline;
|
||||||
|
|
||||||
|
/**
 * Placeholder class in the samples pipeline package; it currently declares no members.
 *
 * @author code4crafter@gmail.com
 */
public class ReplacePipeline {
    // Intentionally empty.
}
|
|
@ -3,7 +3,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<artifactId>webmagic-parent</artifactId>
|
<artifactId>webmagic-parent</artifactId>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<version>0.5.0</version>
|
<version>0.5.2-SNAPSHOT</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<artifactId>webmagic-parent</artifactId>
|
<artifactId>webmagic-parent</artifactId>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<version>0.5.0</version>
|
<version>0.5.2-SNAPSHOT</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<artifactId>webmagic-parent</artifactId>
|
<artifactId>webmagic-parent</artifactId>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<version>0.5.0</version>
|
<version>0.5.2-SNAPSHOT</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
|
|
@ -41,12 +41,12 @@ webmagic使用maven管理依赖,在项目中添加对应的依赖即可使用w
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<artifactId>webmagic-core</artifactId>
|
<artifactId>webmagic-core</artifactId>
|
||||||
<version>0.5.0</version>
|
<version>0.5.1</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<artifactId>webmagic-extension</artifactId>
|
<artifactId>webmagic-extension</artifactId>
|
||||||
<version>0.5.0</version>
|
<version>0.5.1</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
```
|
```
|
||||||
|
|
||||||
|
@ -192,6 +192,7 @@ webmagic遵循[Apache 2.0协议](http://opensource.org/licenses/Apache-2.0)
|
||||||
* [lidongyang](http://my.oschina.net/lidongyang)
|
* [lidongyang](http://my.oschina.net/lidongyang)
|
||||||
* [seveniu](https://github.com/seveniu)
|
* [seveniu](https://github.com/seveniu)
|
||||||
* [sebastian1118](https://github.com/sebastian1118)
|
* [sebastian1118](https://github.com/sebastian1118)
|
||||||
|
* [codev777](https://github.com/codev777)
|
||||||
|
|
||||||
### 邮件组:
|
### 邮件组:
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue