update redisscheduler
parent
787b952932
commit
0f2c5b5723
|
@ -148,4 +148,15 @@ public class Page {
|
|||
public ResultItems getResultItems() {
|
||||
return resultItems;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "Page{" +
|
||||
"request=" + request +
|
||||
", resultItems=" + resultItems +
|
||||
", html=" + html +
|
||||
", url=" + url +
|
||||
", targetRequests=" + targetRequests +
|
||||
'}';
|
||||
}
|
||||
}
|
||||
|
|
|
@ -113,4 +113,13 @@ public class Request implements Serializable {
|
|||
public void setUrl(String url) {
|
||||
this.url = url;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "Request{" +
|
||||
"url='" + url + '\'' +
|
||||
", extras=" + extras +
|
||||
", priority=" + priority +
|
||||
'}';
|
||||
}
|
||||
}
|
||||
|
|
|
@ -40,33 +40,33 @@ import java.util.concurrent.atomic.AtomicInteger;
|
|||
*/
|
||||
public class Spider implements Runnable, Task {
|
||||
|
||||
private Downloader downloader;
|
||||
protected Downloader downloader;
|
||||
|
||||
private List<Pipeline> pipelines = new ArrayList<Pipeline>();
|
||||
protected List<Pipeline> pipelines = new ArrayList<Pipeline>();
|
||||
|
||||
private PageProcessor pageProcessor;
|
||||
protected PageProcessor pageProcessor;
|
||||
|
||||
private List<String> startUrls;
|
||||
protected List<String> startUrls;
|
||||
|
||||
private Site site;
|
||||
protected Site site;
|
||||
|
||||
private String uuid;
|
||||
protected String uuid;
|
||||
|
||||
private Scheduler scheduler = new QueueScheduler();
|
||||
protected Scheduler scheduler = new QueueScheduler();
|
||||
|
||||
private Logger logger = Logger.getLogger(getClass());
|
||||
protected Logger logger = Logger.getLogger(getClass());
|
||||
|
||||
private ExecutorService executorService;
|
||||
protected ExecutorService executorService;
|
||||
|
||||
private int threadNum = 1;
|
||||
protected int threadNum = 1;
|
||||
|
||||
private AtomicInteger stat = new AtomicInteger(STAT_INIT);
|
||||
protected AtomicInteger stat = new AtomicInteger(STAT_INIT);
|
||||
|
||||
private final static int STAT_INIT = 0;
|
||||
protected final static int STAT_INIT = 0;
|
||||
|
||||
private final static int STAT_RUNNING = 1;
|
||||
protected final static int STAT_RUNNING = 1;
|
||||
|
||||
private final static int STAT_STOPPED = 2;
|
||||
protected final static int STAT_STOPPED = 2;
|
||||
|
||||
/**
|
||||
* 使用已定义的抽取规则新建一个Spider。
|
||||
|
@ -206,7 +206,7 @@ public class Spider implements Runnable, Task {
|
|||
destroy();
|
||||
}
|
||||
|
||||
private void destroy() {
|
||||
protected void destroy() {
|
||||
destroyEach(downloader);
|
||||
destroyEach(pageProcessor);
|
||||
for (Pipeline pipeline : pipelines) {
|
||||
|
@ -233,7 +233,7 @@ public class Spider implements Runnable, Task {
|
|||
}
|
||||
}
|
||||
|
||||
private void processRequest(Request request) {
|
||||
protected void processRequest(Request request) {
|
||||
Page page = downloader.download(request, this);
|
||||
if (page == null) {
|
||||
sleep(site.getSleepTime());
|
||||
|
@ -249,7 +249,7 @@ public class Spider implements Runnable, Task {
|
|||
sleep(site.getSleepTime());
|
||||
}
|
||||
|
||||
private void sleep(int time) {
|
||||
protected void sleep(int time) {
|
||||
try {
|
||||
Thread.sleep(time);
|
||||
} catch (InterruptedException e) {
|
||||
|
@ -257,7 +257,7 @@ public class Spider implements Runnable, Task {
|
|||
}
|
||||
}
|
||||
|
||||
private void addRequest(Page page) {
|
||||
protected void addRequest(Page page) {
|
||||
if (CollectionUtils.isNotEmpty(page.getTargetRequests())) {
|
||||
for (Request request : page.getTargetRequests()) {
|
||||
scheduler.push(request, this);
|
||||
|
@ -265,7 +265,7 @@ public class Spider implements Runnable, Task {
|
|||
}
|
||||
}
|
||||
|
||||
private void checkIfNotRunning() {
|
||||
protected void checkIfNotRunning() {
|
||||
if (!stat.compareAndSet(STAT_INIT, STAT_INIT)) {
|
||||
throw new IllegalStateException("Spider is already running!");
|
||||
}
|
||||
|
|
|
@ -66,13 +66,7 @@ public class HttpClientDownloader implements Downloader {
|
|||
}
|
||||
//
|
||||
handleGzip(httpResponse);
|
||||
String content = IOUtils.toString(httpResponse.getEntity().getContent(),
|
||||
charset);
|
||||
Page page = new Page();
|
||||
page.setHtml(new Html(UrlUtils.fixAllRelativeHrefs(content, request.getUrl())));
|
||||
page.setUrl(new PlainText(request.getUrl()));
|
||||
page.setRequest(request);
|
||||
return page;
|
||||
return handleResponse(request, charset, httpResponse,task);
|
||||
} else {
|
||||
logger.warn("code error " + statusCode + "\t" + request.getUrl());
|
||||
}
|
||||
|
@ -82,6 +76,16 @@ public class HttpClientDownloader implements Downloader {
|
|||
return null;
|
||||
}
|
||||
|
||||
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse,Task task) throws IOException {
|
||||
String content = IOUtils.toString(httpResponse.getEntity().getContent(),
|
||||
charset);
|
||||
Page page = new Page();
|
||||
page.setHtml(new Html(UrlUtils.fixAllRelativeHrefs(content, request.getUrl())));
|
||||
page.setUrl(new PlainText(request.getUrl()));
|
||||
page.setRequest(request);
|
||||
return page;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setThread(int thread) {
|
||||
poolSize=thread;
|
||||
|
|
|
@ -4,8 +4,8 @@ import org.apache.commons.codec.digest.DigestUtils;
|
|||
import org.apache.log4j.Logger;
|
||||
import us.codecraft.webmagic.ResultItems;
|
||||
import us.codecraft.webmagic.Task;
|
||||
import us.codecraft.webmagic.utils.FilePersistentBase;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileWriter;
|
||||
import java.io.IOException;
|
||||
import java.io.PrintWriter;
|
||||
|
@ -18,9 +18,7 @@ import java.util.Map;
|
|||
* Date: 13-4-21
|
||||
* Time: 下午6:28
|
||||
*/
|
||||
public class FilePipeline implements Pipeline {
|
||||
|
||||
private String path = "/data/webmagic/";
|
||||
public class FilePipeline extends FilePersistentBase implements Pipeline {
|
||||
|
||||
private Logger logger = Logger.getLogger(getClass());
|
||||
|
||||
|
@ -28,7 +26,7 @@ public class FilePipeline implements Pipeline {
|
|||
* 新建一个FilePipeline,使用默认保存路径"/data/webmagic/"
|
||||
*/
|
||||
public FilePipeline() {
|
||||
|
||||
setPath("/data/webmagic/");
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -37,21 +35,14 @@ public class FilePipeline implements Pipeline {
|
|||
* @param path 文件保存路径
|
||||
*/
|
||||
public FilePipeline(String path) {
|
||||
if (!path.endsWith("/")&&!path.endsWith("\\")){
|
||||
path+="/";
|
||||
}
|
||||
this.path = path;
|
||||
setPath(path);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void process(ResultItems resultItems, Task task) {
|
||||
String path = this.path + "/" + task.getUUID() + "/";
|
||||
File file = new File(path);
|
||||
if (!file.exists()) {
|
||||
file.mkdirs();
|
||||
}
|
||||
String path = this.path + PATH_SEPERATOR + task.getUUID() + PATH_SEPERATOR;
|
||||
try {
|
||||
PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".html"));
|
||||
PrintWriter printWriter = new PrintWriter(new FileWriter(getFile(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".html")));
|
||||
printWriter.println("url:\t" + resultItems.getRequest().getUrl());
|
||||
for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) {
|
||||
if (entry.getValue() instanceof Iterable) {
|
||||
|
|
|
@ -0,0 +1,51 @@
|
|||
package us.codecraft.webmagic.utils;
|
||||
|
||||
import java.io.File;
|
||||
|
||||
/**
|
||||
* 文件持久化的基础类。<br>
|
||||
*
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-8-11 <br>
|
||||
* Time: 下午4:21 <br>
|
||||
*/
|
||||
public class FilePersistentBase {
|
||||
|
||||
protected String path;
|
||||
|
||||
public static String PATH_SEPERATOR = "/";
|
||||
|
||||
static {
|
||||
String property = System.getProperties().getProperty("file.separator");
|
||||
if (property != null) {
|
||||
PATH_SEPERATOR = property;
|
||||
}
|
||||
}
|
||||
|
||||
public void setPath(String path) {
|
||||
this.path = path;
|
||||
if (!path.endsWith(PATH_SEPERATOR)) {
|
||||
path += PATH_SEPERATOR;
|
||||
}
|
||||
}
|
||||
|
||||
public File getFile(String fullName) {
|
||||
checkAndMakeParentDirecotry(fullName);
|
||||
return new File(fullName);
|
||||
}
|
||||
|
||||
public void checkAndMakeParentDirecotry(String fullName) {
|
||||
int index = fullName.lastIndexOf(PATH_SEPERATOR);
|
||||
if (index > 0) {
|
||||
String path = fullName.substring(0, index);
|
||||
File file = new File(path);
|
||||
if (!file.exists()) {
|
||||
file.mkdirs();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public String getPath() {
|
||||
return path;
|
||||
}
|
||||
}
|
|
@ -2,6 +2,7 @@ package us.codecraft.webmagic.model;
|
|||
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.Spider;
|
||||
import us.codecraft.webmagic.processor.PageProcessor;
|
||||
|
||||
/**
|
||||
* 基于Model的Spider,封装后的入口类。<br>
|
||||
|
@ -20,6 +21,10 @@ public class OOSpider extends Spider {
|
|||
this.modelPageProcessor = modelPageProcessor;
|
||||
}
|
||||
|
||||
public OOSpider(PageProcessor pageProcessor) {
|
||||
super(pageProcessor);
|
||||
}
|
||||
|
||||
/**
|
||||
* 创建一个爬虫。<br>
|
||||
* @param site
|
||||
|
|
|
@ -7,8 +7,8 @@ import org.apache.log4j.Logger;
|
|||
import us.codecraft.webmagic.Task;
|
||||
import us.codecraft.webmagic.model.HasKey;
|
||||
import us.codecraft.webmagic.model.PageModelPipeline;
|
||||
import us.codecraft.webmagic.utils.FilePersistentBase;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileWriter;
|
||||
import java.io.IOException;
|
||||
import java.io.PrintWriter;
|
||||
|
@ -21,38 +21,29 @@ import java.io.PrintWriter;
|
|||
* Date: 13-4-21
|
||||
* Time: 下午6:28
|
||||
*/
|
||||
public class JsonFilePageModelPipeline implements PageModelPipeline {
|
||||
|
||||
private String path = "/data/webmagic/";
|
||||
public class JsonFilePageModelPipeline extends FilePersistentBase implements PageModelPipeline {
|
||||
|
||||
private Logger logger = Logger.getLogger(getClass());
|
||||
|
||||
/**
|
||||
* 新建一个FilePipeline,使用默认保存路径"/data/webmagic/"
|
||||
* 新建一个JsonFilePageModelPipeline,使用默认保存路径"/data/webmagic/"
|
||||
*/
|
||||
public JsonFilePageModelPipeline() {
|
||||
|
||||
setPath("/data/webmagic/");
|
||||
}
|
||||
|
||||
/**
|
||||
* 新建一个FilePipeline
|
||||
* 新建一个JsonFilePageModelPipeline
|
||||
*
|
||||
* @param path 文件保存路径
|
||||
*/
|
||||
public JsonFilePageModelPipeline(String path) {
|
||||
if (!path.endsWith("/") && !path.endsWith("\\")) {
|
||||
path += "/";
|
||||
}
|
||||
this.path = path;
|
||||
setPath(path);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void process(Object o, Task task) {
|
||||
String path = this.path + "/" + task.getUUID() + "/";
|
||||
File file = new File(path);
|
||||
if (!file.exists()) {
|
||||
file.mkdirs();
|
||||
}
|
||||
try {
|
||||
String filename;
|
||||
if (o instanceof HasKey) {
|
||||
|
@ -60,7 +51,7 @@ public class JsonFilePageModelPipeline implements PageModelPipeline {
|
|||
} else {
|
||||
filename = path + DigestUtils.md5Hex(ToStringBuilder.reflectionToString(o)) + ".json";
|
||||
}
|
||||
PrintWriter printWriter = new PrintWriter(new FileWriter(filename));
|
||||
PrintWriter printWriter = new PrintWriter(new FileWriter(getFile(filename)));
|
||||
printWriter.write(JSON.toJSONString(o));
|
||||
printWriter.close();
|
||||
} catch (IOException e) {
|
||||
|
|
|
@ -5,6 +5,7 @@ import org.apache.commons.codec.digest.DigestUtils;
|
|||
import org.apache.log4j.Logger;
|
||||
import us.codecraft.webmagic.ResultItems;
|
||||
import us.codecraft.webmagic.Task;
|
||||
import us.codecraft.webmagic.utils.FilePersistentBase;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileWriter;
|
||||
|
@ -18,40 +19,31 @@ import java.io.PrintWriter;
|
|||
* Date: 13-4-21
|
||||
* Time: 下午6:28
|
||||
*/
|
||||
public class JsonFilePipeline implements Pipeline {
|
||||
|
||||
private String path = "/data/webmagic/";
|
||||
public class JsonFilePipeline extends FilePersistentBase implements Pipeline {
|
||||
|
||||
private Logger logger = Logger.getLogger(getClass());
|
||||
|
||||
/**
|
||||
* 新建一个FilePipeline,使用默认保存路径"/data/webmagic/"
|
||||
* 新建一个JsonFilePipeline,使用默认保存路径"/data/webmagic/"
|
||||
*/
|
||||
public JsonFilePipeline() {
|
||||
|
||||
setPath("/data/webmagic");
|
||||
}
|
||||
|
||||
/**
|
||||
* 新建一个FilePipeline
|
||||
* 新建一个JsonFilePipeline
|
||||
*
|
||||
* @param path 文件保存路径
|
||||
*/
|
||||
public JsonFilePipeline(String path) {
|
||||
if (!path.endsWith("/")&&!path.endsWith("\\")){
|
||||
path+="/";
|
||||
}
|
||||
this.path = path;
|
||||
setPath(path);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void process(ResultItems resultItems, Task task) {
|
||||
String path = this.path + "/" + task.getUUID() + "/";
|
||||
File file = new File(path);
|
||||
if (!file.exists()) {
|
||||
file.mkdirs();
|
||||
}
|
||||
try {
|
||||
PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".json"));
|
||||
PrintWriter printWriter = new PrintWriter(new FileWriter(new File(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".json")));
|
||||
printWriter.write(JSON.toJSONString(resultItems.getAll()));
|
||||
printWriter.close();
|
||||
} catch (IOException e) {
|
||||
|
|
|
@ -12,8 +12,8 @@ import us.codecraft.webmagic.Task;
|
|||
* 使用redis管理url,构建一个分布式的爬虫。<br>
|
||||
*
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-7-25 <br>
|
||||
* Time: 上午7:07 <br>
|
||||
* Date: 13-7-25 <br>
|
||||
* Time: 上午7:07 <br>
|
||||
*/
|
||||
public class RedisScheduler implements Scheduler {
|
||||
|
||||
|
@ -32,34 +32,42 @@ public class RedisScheduler implements Scheduler {
|
|||
@Override
|
||||
public synchronized void push(Request request, Task task) {
|
||||
Jedis jedis = pool.getResource();
|
||||
//使用SortedSet进行url去重
|
||||
if (jedis.zrank(SET_PREFIX + task.getUUID(), request.getUrl()) == null) {
|
||||
//使用List保存队列
|
||||
jedis.rpush(QUEUE_PREFIX + task.getUUID(), request.getUrl());
|
||||
jedis.zadd(SET_PREFIX + task.getUUID(), request.getPriority(), request.getUrl());
|
||||
if (request.getExtras() != null) {
|
||||
String key = ITEM_PREFIX + DigestUtils.shaHex(request.getUrl());
|
||||
byte[] bytes = JSON.toJSONString(request).getBytes();
|
||||
jedis.set(key.getBytes(), bytes);
|
||||
try {
|
||||
//使用Set进行url去重
|
||||
if (!jedis.sismember(SET_PREFIX + task.getUUID(), request.getUrl())) {
|
||||
//使用List保存队列
|
||||
jedis.rpush(QUEUE_PREFIX + task.getUUID(), request.getUrl());
|
||||
jedis.sadd(SET_PREFIX + task.getUUID(), request.getUrl());
|
||||
if (request.getExtras() != null) {
|
||||
String field = DigestUtils.shaHex(request.getUrl());
|
||||
String value = JSON.toJSONString(request);
|
||||
jedis.hset((ITEM_PREFIX + task.getUUID()), field, value);
|
||||
}
|
||||
}
|
||||
} finally {
|
||||
pool.returnResource(jedis);
|
||||
}
|
||||
pool.returnResource(jedis);
|
||||
}
|
||||
|
||||
@Override
|
||||
public synchronized Request poll(Task task) {
|
||||
Jedis jedis = pool.getResource();
|
||||
String url = jedis.lpop(QUEUE_PREFIX + task.getUUID());
|
||||
if (url == null) {
|
||||
return null;
|
||||
try {
|
||||
String url = jedis.lpop(QUEUE_PREFIX + task.getUUID());
|
||||
if (url == null) {
|
||||
return null;
|
||||
}
|
||||
String key = ITEM_PREFIX + task.getUUID();
|
||||
String field = DigestUtils.shaHex(url);
|
||||
byte[] bytes = jedis.hget(key.getBytes(),field.getBytes());
|
||||
if (bytes != null) {
|
||||
Request o = JSON.parseObject(new String(bytes), Request.class);
|
||||
return o;
|
||||
}
|
||||
Request request = new Request(url);
|
||||
return request;
|
||||
} finally {
|
||||
pool.returnResource(jedis);
|
||||
}
|
||||
String key = ITEM_PREFIX + DigestUtils.shaHex(url);
|
||||
byte[] bytes = jedis.get(key.getBytes());
|
||||
if (bytes != null) {
|
||||
Request o = JSON.parseObject(new String(bytes),Request.class);
|
||||
return o;
|
||||
}
|
||||
pool.returnResource(jedis);
|
||||
return new Request(url);
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue