Merge branch 'stable' of github.com:code4craft/webmagic
commit
feb604da87
|
@ -41,12 +41,12 @@ webmagic使用maven管理依赖,在项目中添加对应的依赖即可使用w
|
|||
<dependency>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>webmagic-core</artifactId>
|
||||
<version>0.5.0</version>
|
||||
<version>0.5.1</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>webmagic-extension</artifactId>
|
||||
<version>0.5.0</version>
|
||||
<version>0.5.1</version>
|
||||
</dependency>
|
||||
```
|
||||
|
||||
|
@ -192,6 +192,7 @@ webmagic遵循[Apache 2.0协议](http://opensource.org/licenses/Apache-2.0)
|
|||
* [lidongyang](http://my.oschina.net/lidongyang)
|
||||
* [seveniu](https://github.com/seveniu)
|
||||
* [sebastian1118](https://github.com/sebastian1118)
|
||||
* [codev777](https://github.com/codev777)
|
||||
|
||||
### 邮件组:
|
||||
|
||||
|
|
|
@ -25,12 +25,12 @@ Add dependencies to your pom.xml:
|
|||
<dependency>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>webmagic-core</artifactId>
|
||||
<version>0.5.0</version>
|
||||
<version>0.5.1</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>webmagic-extension</artifactId>
|
||||
<version>0.5.0</version>
|
||||
<version>0.5.1</version>
|
||||
</dependency>
|
||||
```
|
||||
|
||||
|
@ -145,6 +145,7 @@ Thanks these people for commiting source code, reporting bugs or suggesting for
|
|||
* [lidongyang](http://my.oschina.net/lidongyang)
|
||||
* [seveniu](https://github.com/seveniu)
|
||||
* [sebastian1118](https://github.com/sebastian1118)
|
||||
* [codev777](https://github.com/codev777)
|
||||
|
||||
|
||||
### Thanks:
|
||||
|
|
4
pom.xml
4
pom.xml
|
@ -6,7 +6,7 @@
|
|||
<version>7</version>
|
||||
</parent>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<version>0.5.0</version>
|
||||
<version>0.5.2-SNAPSHOT</version>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
<packaging>pom</packaging>
|
||||
<properties>
|
||||
|
@ -54,7 +54,7 @@
|
|||
<module>webmagic-selenium</module>
|
||||
<module>webmagic-saxon</module>
|
||||
<module>webmagic-samples</module>
|
||||
<module>webmagic-avalon</module>
|
||||
<!--<module>webmagic-avalon</module>-->
|
||||
</modules>
|
||||
|
||||
<dependencyManagement>
|
||||
|
|
|
@ -7,7 +7,7 @@
|
|||
</parent>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>forger</artifactId>
|
||||
<version>0.1.0</version>
|
||||
<version>0.1.1-SNAPSHOT</version>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
<packaging>jar</packaging>
|
||||
<properties>
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
<parent>
|
||||
<artifactId>webmagic-parent</artifactId>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<version>0.5.0</version>
|
||||
<version>0.5.1-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
@ -39,12 +39,6 @@
|
|||
<version>1.1.1</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>forger</artifactId>
|
||||
<version>0.1.1-SNAPSHOT</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.freemarker</groupId>
|
||||
<artifactId>freemarker</artifactId>
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
<parent>
|
||||
<artifactId>webmagic-avalon</artifactId>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<version>0.5.0</version>
|
||||
<version>0.5.1-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
<parent>
|
||||
<artifactId>webmagic-avalon</artifactId>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<version>0.5.0</version>
|
||||
<version>0.5.1-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
@ -26,7 +26,7 @@
|
|||
<dependency>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>forger</artifactId>
|
||||
<version>0.1.0</version>
|
||||
<version>0.1.1-SNAPSHOT</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
|
@ -150,18 +150,4 @@
|
|||
</plugins>
|
||||
</build>
|
||||
|
||||
<repositories>
|
||||
<repository>
|
||||
<id>sonatype-nexus-snapshots</id>
|
||||
<name>Sonatype Nexus Snapshots</name>
|
||||
<url>https://oss.sonatype.org/content/repositories/snapshots</url>
|
||||
<releases>
|
||||
<enabled>false</enabled>
|
||||
</releases>
|
||||
<snapshots>
|
||||
<enabled>true</enabled>
|
||||
</snapshots>
|
||||
</repository>
|
||||
</repositories>
|
||||
|
||||
</project>
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
<parent>
|
||||
<artifactId>webmagic-avalon</artifactId>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<version>0.5.0</version>
|
||||
<version>0.5.1-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
<parent>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>webmagic-parent</artifactId>
|
||||
<version>0.5.0</version>
|
||||
<version>0.5.2-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
|
|
@ -4,6 +4,8 @@ import org.slf4j.Logger;
|
|||
import org.slf4j.LoggerFactory;
|
||||
import us.codecraft.webmagic.Request;
|
||||
import us.codecraft.webmagic.Task;
|
||||
import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
|
||||
import us.codecraft.webmagic.scheduler.component.HashSetDuplicateRemover;
|
||||
|
||||
/**
|
||||
* Remove duplicate urls and only push urls which are not duplicate.<br></br>
|
||||
|
@ -11,30 +13,30 @@ import us.codecraft.webmagic.Task;
|
|||
* @author code4crafer@gmail.com
|
||||
* @since 0.5.0
|
||||
*/
|
||||
public abstract class DuplicatedRemoveScheduler implements Scheduler {
|
||||
public abstract class DuplicateRemovedScheduler implements Scheduler {
|
||||
|
||||
protected Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
private DuplicateRemover duplicatedRemover = new HashSetDuplicateRemover();
|
||||
|
||||
public DuplicateRemover getDuplicateRemover() {
|
||||
return duplicatedRemover;
|
||||
}
|
||||
|
||||
public DuplicateRemovedScheduler setDuplicateRemover(DuplicateRemover duplicatedRemover) {
|
||||
this.duplicatedRemover = duplicatedRemover;
|
||||
return this;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void push(Request request, Task task) {
|
||||
logger.trace("get a candidate url {}", request.getUrl());
|
||||
if (isDuplicate(request, task) || shouldReserved(request)) {
|
||||
if (!duplicatedRemover.isDuplicate(request, task) || shouldReserved(request)) {
|
||||
logger.debug("push to queue {}", request.getUrl());
|
||||
pushWhenNoDuplicate(request, task);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Reset duplicate check.
|
||||
*/
|
||||
public abstract void resetDuplicateCheck(Task task);
|
||||
|
||||
/**
|
||||
* @param request
|
||||
* @return
|
||||
*/
|
||||
protected abstract boolean isDuplicate(Request request, Task task);
|
||||
|
||||
protected boolean shouldReserved(Request request) {
|
||||
return request.getExtra(Request.CYCLE_TRIED_TIMES) != null;
|
||||
}
|
|
@ -17,7 +17,7 @@ import java.util.concurrent.PriorityBlockingQueue;
|
|||
* @since 0.2.1
|
||||
*/
|
||||
@ThreadSafe
|
||||
public class PriorityScheduler extends LocalDuplicatedRemoveScheduler {
|
||||
public class PriorityScheduler extends DuplicateRemovedScheduler implements MonitorableScheduler {
|
||||
|
||||
public static final int INITIAL_CAPACITY = 5;
|
||||
|
||||
|
@ -65,4 +65,9 @@ public class PriorityScheduler extends LocalDuplicatedRemoveScheduler {
|
|||
public int getLeftRequestsCount(Task task) {
|
||||
return noPriorityQueue.size();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getTotalRequestsCount(Task task) {
|
||||
return getDuplicateRemover().getTotalRequestsCount(task);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -16,7 +16,7 @@ import java.util.concurrent.LinkedBlockingQueue;
|
|||
* @since 0.1.0
|
||||
*/
|
||||
@ThreadSafe
|
||||
public class QueueScheduler extends LocalDuplicatedRemoveScheduler {
|
||||
public class QueueScheduler extends DuplicateRemovedScheduler implements MonitorableScheduler {
|
||||
|
||||
private BlockingQueue<Request> queue = new LinkedBlockingQueue<Request>();
|
||||
|
||||
|
@ -34,4 +34,9 @@ public class QueueScheduler extends LocalDuplicatedRemoveScheduler {
|
|||
public int getLeftRequestsCount(Task task) {
|
||||
return queue.size();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getTotalRequestsCount(Task task) {
|
||||
return getDuplicateRemover().getTotalRequestsCount(task);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,70 @@
|
|||
package us.codecraft.webmagic.scheduler.component;
|
||||
|
||||
import com.google.common.hash.BloomFilter;
|
||||
import com.google.common.hash.Funnels;
|
||||
import us.codecraft.webmagic.Request;
|
||||
import us.codecraft.webmagic.Task;
|
||||
|
||||
import java.nio.charset.Charset;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
|
||||
/**
|
||||
* BloomFilterDuplicateRemover for huge number of urls.
|
||||
*
|
||||
* @author code4crafer@gmail.com
|
||||
* @since 0.5.1
|
||||
*/
|
||||
public class BloomFilterDuplicateRemover implements DuplicateRemover {
|
||||
|
||||
private int expectedInsertions;
|
||||
|
||||
private double fpp;
|
||||
|
||||
private AtomicInteger counter;
|
||||
|
||||
public BloomFilterDuplicateRemover(int expectedInsertions) {
|
||||
this(expectedInsertions, 0.01);
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param expectedInsertions the number of expected insertions to the constructed
|
||||
* @param fpp the desired false positive probability (must be positive and less than 1.0)
|
||||
*/
|
||||
public BloomFilterDuplicateRemover(int expectedInsertions, double fpp) {
|
||||
this.expectedInsertions = expectedInsertions;
|
||||
this.fpp = fpp;
|
||||
this.bloomFilter = rebuildBloomFilter();
|
||||
}
|
||||
|
||||
protected BloomFilter<CharSequence> rebuildBloomFilter() {
|
||||
counter = new AtomicInteger(0);
|
||||
return BloomFilter.create(Funnels.stringFunnel(Charset.defaultCharset()), expectedInsertions, fpp);
|
||||
}
|
||||
|
||||
private final BloomFilter<CharSequence> bloomFilter;
|
||||
|
||||
@Override
|
||||
public boolean isDuplicate(Request request, Task task) {
|
||||
boolean isDuplicate = bloomFilter.mightContain(getUrl(request));
|
||||
if (!isDuplicate) {
|
||||
bloomFilter.put(getUrl(request));
|
||||
counter.incrementAndGet();
|
||||
}
|
||||
return isDuplicate;
|
||||
}
|
||||
|
||||
protected String getUrl(Request request) {
|
||||
return request.getUrl();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void resetDuplicateCheck(Task task) {
|
||||
rebuildBloomFilter();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getTotalRequestsCount(Task task) {
|
||||
return counter.get();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,35 @@
|
|||
package us.codecraft.webmagic.scheduler.component;
|
||||
|
||||
import us.codecraft.webmagic.Request;
|
||||
import us.codecraft.webmagic.Task;
|
||||
|
||||
/**
|
||||
* Remove duplicate requests.
|
||||
* @author code4crafer@gmail.com
|
||||
* @since 0.5.1
|
||||
*/
|
||||
public interface DuplicateRemover {
|
||||
/**
|
||||
*
|
||||
* Check whether the request is duplicate.
|
||||
*
|
||||
* @param request
|
||||
* @param task
|
||||
* @return
|
||||
*/
|
||||
public boolean isDuplicate(Request request, Task task);
|
||||
|
||||
/**
|
||||
* Reset duplicate check.
|
||||
* @param task
|
||||
*/
|
||||
public void resetDuplicateCheck(Task task);
|
||||
|
||||
/**
|
||||
* Get TotalRequestsCount for monitor.
|
||||
* @param task
|
||||
* @return
|
||||
*/
|
||||
public int getTotalRequestsCount(Task task);
|
||||
|
||||
}
|
|
@ -1,4 +1,4 @@
|
|||
package us.codecraft.webmagic.scheduler;
|
||||
package us.codecraft.webmagic.scheduler.component;
|
||||
|
||||
import com.google.common.collect.Sets;
|
||||
import us.codecraft.webmagic.Request;
|
||||
|
@ -8,25 +8,26 @@ import java.util.Set;
|
|||
import java.util.concurrent.ConcurrentHashMap;
|
||||
|
||||
/**
|
||||
* Base Scheduler with duplicated urls removed by hash set.<br></br>
|
||||
*
|
||||
* @author code4crafter@gmail.com
|
||||
* @since 0.5.0
|
||||
* @author code4crafer@gmail.com
|
||||
*/
|
||||
public abstract class LocalDuplicatedRemoveScheduler extends DuplicatedRemoveScheduler implements MonitorableScheduler {
|
||||
public class HashSetDuplicateRemover implements DuplicateRemover {
|
||||
|
||||
private Set<String> urls = Sets.newSetFromMap(new ConcurrentHashMap<String, Boolean>());
|
||||
|
||||
@Override
|
||||
public boolean isDuplicate(Request request, Task task) {
|
||||
return !urls.add(getUrl(request));
|
||||
}
|
||||
|
||||
protected String getUrl(Request request) {
|
||||
return request.getUrl();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void resetDuplicateCheck(Task task) {
|
||||
urls.clear();
|
||||
}
|
||||
|
||||
@Override
|
||||
protected boolean isDuplicate(Request request, Task task) {
|
||||
return urls.add(request.getUrl());
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getTotalRequestsCount(Task task) {
|
||||
return urls.size();
|
|
@ -0,0 +1,5 @@
|
|||
<html>
|
||||
<body>
|
||||
Component of scheduler.
|
||||
</body>
|
||||
</html>
|
|
@ -22,10 +22,10 @@ public class FilePersistentBase {
|
|||
}
|
||||
|
||||
public void setPath(String path) {
|
||||
this.path = path;
|
||||
if (!path.endsWith(PATH_SEPERATOR)) {
|
||||
path += PATH_SEPERATOR;
|
||||
}
|
||||
this.path = path;
|
||||
}
|
||||
|
||||
public File getFile(String fullName) {
|
||||
|
|
|
@ -0,0 +1,80 @@
|
|||
package us.codecraft.webmagic.scheduler;
|
||||
|
||||
import org.junit.Ignore;
|
||||
import org.junit.Test;
|
||||
import us.codecraft.webmagic.Request;
|
||||
import us.codecraft.webmagic.scheduler.component.BloomFilterDuplicateRemover;
|
||||
import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
|
||||
import us.codecraft.webmagic.scheduler.component.HashSetDuplicateRemover;
|
||||
|
||||
import static org.assertj.core.api.Assertions.assertThat;
|
||||
|
||||
/**
|
||||
* @author code4crafer@gmail.com
|
||||
*/
|
||||
public class BloomFilterDuplicateRemoverTest {
|
||||
|
||||
@Test
|
||||
public void testRemove() throws Exception {
|
||||
BloomFilterDuplicateRemover bloomFilterDuplicateRemover = new BloomFilterDuplicateRemover(10);
|
||||
boolean isDuplicate = bloomFilterDuplicateRemover.isDuplicate(new Request("a"), null);
|
||||
assertThat(isDuplicate).isFalse();
|
||||
isDuplicate = bloomFilterDuplicateRemover.isDuplicate(new Request("a"), null);
|
||||
assertThat(isDuplicate).isTrue();
|
||||
isDuplicate = bloomFilterDuplicateRemover.isDuplicate(new Request("b"), null);
|
||||
assertThat(isDuplicate).isFalse();
|
||||
isDuplicate = bloomFilterDuplicateRemover.isDuplicate(new Request("b"), null);
|
||||
assertThat(isDuplicate).isTrue();
|
||||
|
||||
}
|
||||
|
||||
@Ignore("long time")
|
||||
@Test
|
||||
public void testMemory() throws Exception {
|
||||
int times = 5000000;
|
||||
DuplicateRemover duplicateRemover = new BloomFilterDuplicateRemover(times,0.005);
|
||||
long freeMemory = Runtime.getRuntime().freeMemory();
|
||||
long time = System.currentTimeMillis();
|
||||
for (int i = 0; i < times; i++) {
|
||||
duplicateRemover.isDuplicate(new Request(String.valueOf(i)), null);
|
||||
}
|
||||
System.out.println("Time used by bloomfilter:" + (System.currentTimeMillis() - time));
|
||||
System.out.println("Memory used by bloomfilter:" + (freeMemory - Runtime.getRuntime().freeMemory()));
|
||||
|
||||
duplicateRemover = new HashSetDuplicateRemover();
|
||||
System.gc();
|
||||
freeMemory = Runtime.getRuntime().freeMemory();
|
||||
time = System.currentTimeMillis();
|
||||
for (int i = 0; i < times; i++) {
|
||||
duplicateRemover.isDuplicate(new Request(String.valueOf(i)), null);
|
||||
}
|
||||
System.out.println("Time used by hashset:" + (System.currentTimeMillis() - time));
|
||||
System.out.println("Memory used by hashset:" + (freeMemory - Runtime.getRuntime().freeMemory()));
|
||||
}
|
||||
|
||||
@Ignore("long time")
|
||||
@Test
|
||||
public void testMissHit() throws Exception {
|
||||
int times = 5000000;
|
||||
DuplicateRemover duplicateRemover = new BloomFilterDuplicateRemover(times, 0.01);
|
||||
int right = 0;
|
||||
int wrong = 0;
|
||||
int missCheck = 0;
|
||||
for (int i = 0; i < times; i++) {
|
||||
boolean duplicate = duplicateRemover.isDuplicate(new Request(String.valueOf(i)), null);
|
||||
if (duplicate) {
|
||||
wrong++;
|
||||
} else {
|
||||
right++;
|
||||
}
|
||||
duplicate = duplicateRemover.isDuplicate(new Request(String.valueOf(i)), null);
|
||||
if (!duplicate) {
|
||||
missCheck++;
|
||||
}
|
||||
}
|
||||
|
||||
System.out.println("Right count: " + right + " Wrong count: " + wrong + " Miss check: " + missCheck);
|
||||
}
|
||||
|
||||
|
||||
}
|
|
@ -3,7 +3,7 @@
|
|||
<parent>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>webmagic-parent</artifactId>
|
||||
<version>0.5.0</version>
|
||||
<version>0.5.2-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
|
|
@ -2,8 +2,6 @@ package us.codecraft.webmagic.scheduler;
|
|||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.math.NumberUtils;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import us.codecraft.webmagic.Request;
|
||||
import us.codecraft.webmagic.Task;
|
||||
|
||||
|
@ -23,9 +21,7 @@ import java.util.concurrent.atomic.AtomicInteger;
|
|||
* @author code4crafter@gmail.com <br>
|
||||
* @since 0.2.0
|
||||
*/
|
||||
public class FileCacheQueueScheduler extends LocalDuplicatedRemoveScheduler {
|
||||
|
||||
private Logger logger = LoggerFactory.getLogger(getClass());
|
||||
public class FileCacheQueueScheduler extends DuplicateRemovedScheduler implements MonitorableScheduler {
|
||||
|
||||
private String filePath = System.getProperty("java.io.tmpdir");
|
||||
|
||||
|
@ -166,4 +162,9 @@ public class FileCacheQueueScheduler extends LocalDuplicatedRemoveScheduler {
|
|||
public int getLeftRequestsCount(Task task) {
|
||||
return queue.size();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getTotalRequestsCount(Task task) {
|
||||
return getDuplicateRemover().getTotalRequestsCount(task);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -7,6 +7,7 @@ import redis.clients.jedis.JedisPool;
|
|||
import redis.clients.jedis.JedisPoolConfig;
|
||||
import us.codecraft.webmagic.Request;
|
||||
import us.codecraft.webmagic.Task;
|
||||
import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
|
||||
|
||||
/**
|
||||
* Use Redis as url scheduler for distributed crawlers.<br>
|
||||
|
@ -14,7 +15,7 @@ import us.codecraft.webmagic.Task;
|
|||
* @author code4crafter@gmail.com <br>
|
||||
* @since 0.2.0
|
||||
*/
|
||||
public class RedisScheduler extends DuplicatedRemoveScheduler implements MonitorableScheduler {
|
||||
public class RedisScheduler extends DuplicateRemovedScheduler implements MonitorableScheduler, DuplicateRemover {
|
||||
|
||||
private JedisPool pool;
|
||||
|
||||
|
@ -25,11 +26,12 @@ public class RedisScheduler extends DuplicatedRemoveScheduler implements Monitor
|
|||
private static final String ITEM_PREFIX = "item_";
|
||||
|
||||
public RedisScheduler(String host) {
|
||||
pool = new JedisPool(new JedisPoolConfig(), host);
|
||||
this(new JedisPool(new JedisPoolConfig(), host));
|
||||
}
|
||||
|
||||
public RedisScheduler(JedisPool pool) {
|
||||
this.pool = pool;
|
||||
setDuplicateRemover(this);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -43,10 +45,10 @@ public class RedisScheduler extends DuplicatedRemoveScheduler implements Monitor
|
|||
}
|
||||
|
||||
@Override
|
||||
protected boolean isDuplicate(Request request, Task task) {
|
||||
public boolean isDuplicate(Request request, Task task) {
|
||||
Jedis jedis = pool.getResource();
|
||||
try {
|
||||
boolean isDuplicate = !jedis.sismember(getSetKey(task), request.getUrl());
|
||||
boolean isDuplicate = jedis.sismember(getSetKey(task), request.getUrl());
|
||||
if (!isDuplicate) {
|
||||
jedis.sadd(getSetKey(task), request.getUrl());
|
||||
}
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
<parent>
|
||||
<artifactId>webmagic-parent</artifactId>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<version>0.5.0</version>
|
||||
<version>0.5.2-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
|
|
@ -0,0 +1,53 @@
|
|||
package us.codecraft.webmagic.samples;
|
||||
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.Spider;
|
||||
import us.codecraft.webmagic.processor.PageProcessor;
|
||||
import us.codecraft.webmagic.selector.Html;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* @author code4crafer@gmail.com
|
||||
*/
|
||||
public class AmanzonPageProcessor implements PageProcessor{
|
||||
public void process(Page page) {
|
||||
|
||||
Html html = page.getHtml();
|
||||
List<String> questionList = html.xpath("//table[@class='tgCustomerCommunityCenterColumn']//div[@class='content']//table[@class='dataGrid']//tr").all();
|
||||
|
||||
if(questionList != null && questionList.size() > 1)
|
||||
{
|
||||
//i=0是列名称,所以i从1开始
|
||||
for( int i = 1 ; i < questionList.size(); i++)
|
||||
{
|
||||
System.out.println(questionList.get(i));
|
||||
Html tempHtml = Html.create("<table>"+questionList.get(i)+"</table>");
|
||||
String comment = tempHtml.xpath("//td[@class='title']//a/text()").toString();
|
||||
System.out.println(comment);
|
||||
String answerNum = tempHtml.xpath("//td[@class='num']/text()").toString();
|
||||
System.out.println(answerNum);
|
||||
String createTime = tempHtml.xpath("//td[3]/text()").toString();
|
||||
System.out.println(createTime);
|
||||
|
||||
/* Document doc = Jsoup.parse(questionList.get(i));
|
||||
Html hmt = Html.create(questionList.get(i)) ;
|
||||
String str = hmt.links().toString();
|
||||
String content = doc.getElementsByTag("a").text();
|
||||
String ss = doc.text();*/
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public Site getSite() {
|
||||
return Site.me();
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
Spider.create(new AmanzonPageProcessor()).test("http://www.amazon.de/forum/Fx27CUFD8S7LJ5D");
|
||||
}
|
||||
}
|
|
@ -0,0 +1,37 @@
|
|||
package us.codecraft.webmagic.samples;
|
||||
|
||||
/**
 * Plain bean describing a GitHub repository: its name, its author and the text of its readme.
 *
 * @author code4crafer@gmail.com
 */
public class GithubRepo {

    private String name;

    private String author;

    private String readme;

    /**
     * @return the repository name, or {@code null} if not set
     */
    public String getName() {
        return name;
    }

    /**
     * @param name the repository name
     */
    public void setName(String name) {
        this.name = name;
    }

    /**
     * @return the repository author, or {@code null} if not set
     */
    public String getAuthor() {
        return author;
    }

    /**
     * @param author the repository author
     */
    public void setAuthor(String author) {
        this.author = author;
    }

    /**
     * @return the readme text, or {@code null} if not set
     */
    public String getReadme() {
        return readme;
    }

    /**
     * @param readme the readme text
     */
    public void setReadme(String readme) {
        this.readme = readme;
    }

    /**
     * Renders all fields, so that printing or logging the bean shows its content instead of
     * the default identity hash.
     *
     * @return a readable representation of this repository
     */
    @Override
    public String toString() {
        return "GithubRepo{"
                + "name='" + name + '\''
                + ", author='" + author + '\''
                + ", readme='" + readme + '\''
                + '}';
    }
}
|
|
@ -0,0 +1,40 @@
|
|||
package us.codecraft.webmagic.samples;
|
||||
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.Spider;
|
||||
import us.codecraft.webmagic.processor.PageProcessor;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* @since 0.5.1
|
||||
*/
|
||||
public class GithubRepoPageProcessor implements PageProcessor {
|
||||
|
||||
private Site site = Site.me().setRetryTimes(3).setSleepTime(0);
|
||||
|
||||
@Override
|
||||
public void process(Page page) {
|
||||
page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all());
|
||||
page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+)").all());
|
||||
GithubRepo githubRepo = new GithubRepo();
|
||||
githubRepo.setAuthor(page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString());
|
||||
githubRepo.setName(page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString());
|
||||
githubRepo.setReadme(page.getHtml().xpath("//div[@id='readme']/tidyText()").toString());
|
||||
if (githubRepo.getName() == null) {
|
||||
//skip this page
|
||||
page.setSkip(true);
|
||||
} else {
|
||||
page.putField("repo", githubRepo);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public Site getSite() {
|
||||
return site;
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
Spider.create(new GithubRepoPageProcessor()).addUrl("https://github.com/code4craft").thread(5).run();
|
||||
}
|
||||
}
|
|
@ -3,8 +3,12 @@ package us.codecraft.webmagic.samples;
|
|||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.Spider;
|
||||
import us.codecraft.webmagic.monitor.SpiderMonitor;
|
||||
import us.codecraft.webmagic.processor.PageProcessor;
|
||||
import us.codecraft.webmagic.scheduler.QueueScheduler;
|
||||
import us.codecraft.webmagic.scheduler.component.BloomFilterDuplicateRemover;
|
||||
|
||||
import javax.management.JMException;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
|
@ -29,7 +33,9 @@ public class OschinaBlogPageProcesser implements PageProcessor {
|
|||
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
Spider.create(new OschinaBlogPageProcesser()).run();
|
||||
public static void main(String[] args) throws JMException {
|
||||
Spider spider = Spider.create(new OschinaBlogPageProcesser()).setScheduler(new QueueScheduler().setDuplicateRemover(new BloomFilterDuplicateRemover(2000)));
|
||||
SpiderMonitor.instance().register(spider);
|
||||
spider.run();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,7 @@
|
|||
package us.codecraft.webmagic.samples.pipeline;
|
||||
|
||||
/**
 * Empty placeholder class; it declares no members and implements no behavior yet.
 *
 * @author code4crafer@gmail.com
 */
public class ReplacePipeline {
}
|
|
@ -3,7 +3,7 @@
|
|||
<parent>
|
||||
<artifactId>webmagic-parent</artifactId>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<version>0.5.0</version>
|
||||
<version>0.5.2-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
<parent>
|
||||
<artifactId>webmagic-parent</artifactId>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<version>0.5.0</version>
|
||||
<version>0.5.2-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
<parent>
|
||||
<artifactId>webmagic-parent</artifactId>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<version>0.5.0</version>
|
||||
<version>0.5.2-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
|
|
@ -41,12 +41,12 @@ webmagic使用maven管理依赖,在项目中添加对应的依赖即可使用w
|
|||
<dependency>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>webmagic-core</artifactId>
|
||||
<version>0.5.0</version>
|
||||
<version>0.5.1</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>webmagic-extension</artifactId>
|
||||
<version>0.5.0</version>
|
||||
<version>0.5.1</version>
|
||||
</dependency>
|
||||
```
|
||||
|
||||
|
@ -192,6 +192,7 @@ webmagic遵循[Apache 2.0协议](http://opensource.org/licenses/Apache-2.0)
|
|||
* [lidongyang](http://my.oschina.net/lidongyang)
|
||||
* [seveniu](https://github.com/seveniu)
|
||||
* [sebastian1118](https://github.com/sebastian1118)
|
||||
* [codev777](https://github.com/codev777)
|
||||
|
||||
### 邮件组:
|
||||
|
||||
|
|
Loading…
Reference in New Issue