update javadoc
parent
b1f023ead5
commit
e1e25cb5e7
|
@ -3,6 +3,7 @@ package us.codecraft.webmagic;
|
|||
/**
|
||||
* Request对象封装了待抓取的url信息。<br/>
|
||||
* 在PageProcessor中,Request对象可以通过{@link us.codecraft.webmagic.Page#getRequest()} 获取。<br/>
|
||||
* <br/>
|
||||
* Request对象包含一个extra属性,可以写入一些必须的上下文,这个特性在某些场合会有用。<br/>
|
||||
* <pre>
|
||||
* Example:
|
||||
|
|
|
@ -85,7 +85,7 @@ public class Site {
|
|||
/**
|
||||
* 获取已设置的domain
|
||||
*
|
||||
* @return
|
||||
* @return 已设置的domain
|
||||
*/
|
||||
public String getDomain() {
|
||||
return domain;
|
||||
|
|
|
@ -15,9 +15,19 @@ import java.util.List;
|
|||
|
||||
/**
|
||||
* <pre>
|
||||
* webmagic爬虫的入口类。
|
||||
* 示例:
|
||||
*webmagic爬虫的入口类。
|
||||
*
|
||||
*示例:
|
||||
*定义一个最简单的爬虫:
|
||||
* Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*")).run();
|
||||
*
|
||||
*使用FilePipeline保存结果到文件:
|
||||
* Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*"))
|
||||
* .pipeline(new FilePipeline("/data/temp/webmagic/")).run();
|
||||
*
|
||||
*使用FileCacheQueueScheduler缓存URL,关闭爬虫后下次自动从停止的页面继续抓取:
|
||||
* Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*"))
|
||||
* .scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).run();
|
||||
* </pre>
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-4-21
|
||||
|
@ -41,36 +51,60 @@ public class Spider implements Runnable, Task {
|
|||
|
||||
private Logger logger = Logger.getLogger(getClass());
|
||||
|
||||
public Spider(PageProcessor pageProcessor){
|
||||
/**
 * Creates a Spider driven by an already-defined set of extraction rules.
 * <p>
 * The site configuration and the initial start URLs are both read from
 * {@code pageProcessor.getSite()}.
 *
 * @param pageProcessor the page processor holding the extraction rules
 */
public Spider(PageProcessor pageProcessor) {
    // Keep the processor itself; it is the source of both values below.
    this.pageProcessor = pageProcessor;
    // Site configuration comes straight from the processor.
    this.site = pageProcessor.getSite();
    // Seed URLs default to whatever the processor's Site declares.
    this.startUrls = pageProcessor.getSite().getStartUrls();
}
|
||||
|
||||
/**
|
||||
* 使用已定义的抽取规则新建一个Spider。
|
||||
* @param pageProcessor 已定义的抽取规则
|
||||
* @return 新建的Spider
|
||||
*/
|
||||
public static Spider create(PageProcessor pageProcessor) {
|
||||
return new Spider(pageProcessor);
|
||||
}
|
||||
|
||||
/**
|
||||
* 重新设置startUrls,会覆盖Site本身的startUrls。
|
||||
* @param startUrls
|
||||
* @return this
|
||||
*/
|
||||
public Spider startUrls(List<String> startUrls) {
|
||||
this.startUrls = startUrls;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Spider startUrl(String startUrl) {
|
||||
startUrls = new ArrayList<String>();
|
||||
startUrls.add(startUrl);
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* 为爬虫设置一个唯一ID,用于标志任务,默认情况下使用domain作为uuid,对于单domain多任务的情况,请为重复任务设置不同的ID。
|
||||
* @param uuid 唯一ID
|
||||
* @return this
|
||||
*/
|
||||
public Spider setUUID(String uuid) {
|
||||
this.uuid = uuid;
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* 设置调度器。调度器用于保存待抓取URL,并可以进行去重、同步、持久化等工作。默认情况下使用内存中的阻塞队列进行调度。
|
||||
* @param scheduler 调度器
|
||||
* @return this
|
||||
*/
|
||||
public Spider scheduler(Scheduler scheduler) {
|
||||
this.scheduler = scheduler;
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* 设置处理管道。处理管道用于最终抽取结果的后处理,例如:保存到文件、保存到数据库等。默认情况下会输出到控制台。
|
||||
* @param pipeline 处理管道
|
||||
* @return this
|
||||
*/
|
||||
public Spider pipeline(Pipeline pipeline) {
|
||||
this.pipelines.add(pipeline);
|
||||
return this;
|
||||
|
@ -79,8 +113,10 @@ public class Spider implements Runnable, Task {
|
|||
|
||||
@Override
|
||||
public void run() {
|
||||
for (String startUrl : startUrls) {
|
||||
scheduler.push(new Request(startUrl), this);
|
||||
if (startUrls != null) {
|
||||
for (String startUrl : startUrls) {
|
||||
scheduler.push(new Request(startUrl), this);
|
||||
}
|
||||
}
|
||||
Request request = scheduler.poll(this);
|
||||
if (pipelines.isEmpty()) {
|
||||
|
|
|
@ -1,12 +1,17 @@
|
|||
package us.codecraft.webmagic;
|
||||
|
||||
/**
|
||||
* 抓取任务的抽象接口。<br>
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-6-18
|
||||
* Time: 下午2:57
|
||||
*/
|
||||
public interface Task {
|
||||
|
||||
/**
|
||||
* 返回唯一标志该任务的字符串,以区分不同任务。
|
||||
* @return uuid
|
||||
*/
|
||||
public String getUUID();
|
||||
|
||||
}
|
||||
|
|
|
@ -33,6 +33,6 @@ public class DianpingProcessor implements PageProcessor {
|
|||
|
||||
public static void main(String[] args) {
|
||||
DianpingProcessor dianpingProcessor = new DianpingProcessor();
|
||||
Spider.create(dianpingProcessor).startUrl("http://www.dianping.com/shanghai/food").run();
|
||||
Spider.create(dianpingProcessor).run();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -30,6 +30,7 @@ public class DiandianProcessorTest {
|
|||
//ConsolePipeline输出结果到控制台
|
||||
//FileCacheQueueSchedular保存url,支持断点续传,临时文件输出到/data/temp/webmagic/cache目录
|
||||
//Spider.run()执行
|
||||
|
||||
Spider.create(diaoyuwengProcessor).pipeline(new ConsolePipeline()).pipeline(pipeline).scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).
|
||||
run();
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue