update urls.contains to DuplicateRemover in FileCacheQueueScheduler #157
parent
689e89a9b2
commit
42a30074c9
|
@ -4,6 +4,7 @@ import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.commons.lang3.math.NumberUtils;
|
import org.apache.commons.lang3.math.NumberUtils;
|
||||||
import us.codecraft.webmagic.Request;
|
import us.codecraft.webmagic.Request;
|
||||||
import us.codecraft.webmagic.Task;
|
import us.codecraft.webmagic.Task;
|
||||||
|
import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
|
||||||
|
|
||||||
import java.io.*;
|
import java.io.*;
|
||||||
import java.util.LinkedHashSet;
|
import java.util.LinkedHashSet;
|
||||||
|
@ -68,6 +69,26 @@ public class FileCacheQueueScheduler extends DuplicateRemovedScheduler implement
|
||||||
logger.info("init cache scheduler success");
|
logger.info("init cache scheduler success");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private void initDuplicateRemover() {
|
||||||
|
setDuplicateRemover(
|
||||||
|
new DuplicateRemover() {
|
||||||
|
@Override
|
||||||
|
public boolean isDuplicate(Request request, Task task) {
|
||||||
|
return !urls.add(request.getUrl());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void resetDuplicateCheck(Task task) {
|
||||||
|
urls.clear();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int getTotalRequestsCount(Task task) {
|
||||||
|
return urls.size();
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
private void initFlushThread() {
|
private void initFlushThread() {
|
||||||
Executors.newScheduledThreadPool(1).scheduleAtFixedRate(new Runnable() {
|
Executors.newScheduledThreadPool(1).scheduleAtFixedRate(new Runnable() {
|
||||||
@Override
|
@Override
|
||||||
|
@ -92,6 +113,7 @@ public class FileCacheQueueScheduler extends DuplicateRemovedScheduler implement
|
||||||
urls = new LinkedHashSet<String>();
|
urls = new LinkedHashSet<String>();
|
||||||
readCursorFile();
|
readCursorFile();
|
||||||
readUrlFile();
|
readUrlFile();
|
||||||
|
initDuplicateRemover();
|
||||||
} catch (FileNotFoundException e) {
|
} catch (FileNotFoundException e) {
|
||||||
//init
|
//init
|
||||||
logger.info("init cache file " + getFileName(fileUrlAllName));
|
logger.info("init cache file " + getFileName(fileUrlAllName));
|
||||||
|
@ -145,8 +167,6 @@ public class FileCacheQueueScheduler extends DuplicateRemovedScheduler implement
|
||||||
if (!inited.get()) {
|
if (!inited.get()) {
|
||||||
init(task);
|
init(task);
|
||||||
}
|
}
|
||||||
if(urls.contains(request.getUrl())) //已存在此URL 表示已抓取过 跳过
|
|
||||||
return;
|
|
||||||
queue.add(request);
|
queue.add(request);
|
||||||
fileUrlWriter.println(request.getUrl());
|
fileUrlWriter.println(request.getUrl());
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue