some bug fix #118
parent
186b90512e
commit
4f22f1210e
|
@ -5,6 +5,7 @@ import org.slf4j.LoggerFactory;
|
||||||
import us.codecraft.webmagic.Request;
|
import us.codecraft.webmagic.Request;
|
||||||
import us.codecraft.webmagic.Task;
|
import us.codecraft.webmagic.Task;
|
||||||
import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
|
import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
|
||||||
|
import us.codecraft.webmagic.scheduler.component.HashSetDuplicateRemover;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Remove duplicate urls and only push urls which are not duplicate.<br></br>
|
* Remove duplicate urls and only push urls which are not duplicate.<br></br>
|
||||||
|
@ -16,7 +17,7 @@ public abstract class DuplicateRemovedScheduler implements Scheduler {
|
||||||
|
|
||||||
protected Logger logger = LoggerFactory.getLogger(getClass());
|
protected Logger logger = LoggerFactory.getLogger(getClass());
|
||||||
|
|
||||||
private DuplicateRemover duplicatedRemover;
|
private DuplicateRemover duplicatedRemover = new HashSetDuplicateRemover();
|
||||||
|
|
||||||
public DuplicateRemover getDuplicateRemover() {
|
public DuplicateRemover getDuplicateRemover() {
|
||||||
return duplicatedRemover;
|
return duplicatedRemover;
|
||||||
|
|
|
@ -43,7 +43,7 @@ public class BloomFilterDuplicateRemover implements DuplicateRemover {
|
||||||
public boolean isDuplicate(Request request, Task task) {
|
public boolean isDuplicate(Request request, Task task) {
|
||||||
boolean isDuplicate = bloomFilter.mightContain(request.getUrl());
|
boolean isDuplicate = bloomFilter.mightContain(request.getUrl());
|
||||||
if (!isDuplicate) {
|
if (!isDuplicate) {
|
||||||
bloomFilter.apply(request.getUrl());
|
bloomFilter.put(request.getUrl());
|
||||||
counter.incrementAndGet();
|
counter.incrementAndGet();
|
||||||
}
|
}
|
||||||
return isDuplicate;
|
return isDuplicate;
|
||||||
|
|
|
@ -17,11 +17,11 @@ public class BloomFilterDuplicateRemoverTest {
|
||||||
boolean isDuplicate = bloomFilterDuplicateRemover.isDuplicate(new Request("a"), null);
|
boolean isDuplicate = bloomFilterDuplicateRemover.isDuplicate(new Request("a"), null);
|
||||||
assertThat(isDuplicate).isFalse();
|
assertThat(isDuplicate).isFalse();
|
||||||
isDuplicate = bloomFilterDuplicateRemover.isDuplicate(new Request("a"), null);
|
isDuplicate = bloomFilterDuplicateRemover.isDuplicate(new Request("a"), null);
|
||||||
assertThat(isDuplicate);
|
assertThat(isDuplicate).isTrue();
|
||||||
isDuplicate = bloomFilterDuplicateRemover.isDuplicate(new Request("b"), null);
|
isDuplicate = bloomFilterDuplicateRemover.isDuplicate(new Request("b"), null);
|
||||||
assertThat(isDuplicate).isFalse();
|
assertThat(isDuplicate).isFalse();
|
||||||
isDuplicate = bloomFilterDuplicateRemover.isDuplicate(new Request("b"), null);
|
isDuplicate = bloomFilterDuplicateRemover.isDuplicate(new Request("b"), null);
|
||||||
assertThat(isDuplicate);
|
assertThat(isDuplicate).isTrue();
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -3,9 +3,12 @@ package us.codecraft.webmagic.samples;
|
||||||
import us.codecraft.webmagic.Page;
|
import us.codecraft.webmagic.Page;
|
||||||
import us.codecraft.webmagic.Site;
|
import us.codecraft.webmagic.Site;
|
||||||
import us.codecraft.webmagic.Spider;
|
import us.codecraft.webmagic.Spider;
|
||||||
|
import us.codecraft.webmagic.monitor.SpiderMonitor;
|
||||||
import us.codecraft.webmagic.processor.PageProcessor;
|
import us.codecraft.webmagic.processor.PageProcessor;
|
||||||
import us.codecraft.webmagic.scheduler.RedisScheduler;
|
import us.codecraft.webmagic.scheduler.QueueScheduler;
|
||||||
|
import us.codecraft.webmagic.scheduler.component.BloomFilterDuplicateRemover;
|
||||||
|
|
||||||
|
import javax.management.JMException;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -30,7 +33,9 @@ public class OschinaBlogPageProcesser implements PageProcessor {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public static void main(String[] args) {
|
public static void main(String[] args) throws JMException {
|
||||||
Spider.create(new OschinaBlogPageProcesser()).setScheduler(new RedisScheduler("localhost")).run();
|
Spider spider = Spider.create(new OschinaBlogPageProcesser()).setScheduler(new QueueScheduler().setDuplicateRemover(new BloomFilterDuplicateRemover(2000)));
|
||||||
|
SpiderMonitor.instance().register(spider);
|
||||||
|
spider.run();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue