update javadoc

master
yihua.huang 2013-06-20 08:21:48 +08:00
parent b1f023ead5
commit e1e25cb5e7
6 changed files with 56 additions and 13 deletions

View File

@ -3,6 +3,7 @@ package us.codecraft.webmagic;
/** /**
* Requesturl<br/> * Requesturl<br/>
* PageProcessorRequest{@link us.codecraft.webmagic.Page#getRequest()} <br/> * PageProcessorRequest{@link us.codecraft.webmagic.Page#getRequest()} <br/>
* <br/>
* Requestextra<br/> * Requestextra<br/>
* <pre> * <pre>
* Example: * Example:

View File

@ -85,7 +85,7 @@ public class Site {
/** /**
* domain * domain
* *
* @return * @return domain
*/ */
public String getDomain() { public String getDomain() {
return domain; return domain;

View File

@ -15,9 +15,19 @@ import java.util.List;
/** /**
* <pre> * <pre>
* webmagic *webmagic
* *
*
*
* Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*")).run(); * Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*")).run();
*
* Use FilePipeline to save results to files:
* Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*"))
* .pipeline(new FilePipeline("/data/temp/webmagic/")).run();
*
* Use FileCacheQueueScheduler to cache URLs:
* Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*"))
* .scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).run();
* </pre> * </pre>
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* Date: 13-4-21 * Date: 13-4-21
@ -41,36 +51,60 @@ public class Spider implements Runnable, Task {
private Logger logger = Logger.getLogger(getClass()); private Logger logger = Logger.getLogger(getClass());
public Spider(PageProcessor pageProcessor){ /**
* Creates a Spider from the given PageProcessor.
* @param pageProcessor
*/
public Spider(PageProcessor pageProcessor) {
this.pageProcessor = pageProcessor; this.pageProcessor = pageProcessor;
this.site = pageProcessor.getSite(); this.site = pageProcessor.getSite();
this.startUrls = pageProcessor.getSite().getStartUrls();
} }
/**
* Creates a Spider from the given PageProcessor (static factory).
* @param pageProcessor
* @return Spider
*/
public static Spider create(PageProcessor pageProcessor) { public static Spider create(PageProcessor pageProcessor) {
return new Spider(pageProcessor); return new Spider(pageProcessor);
} }
/**
* Sets startUrls, overriding the startUrls taken from the Site.
* @param startUrls
* @return this
*/
public Spider startUrls(List<String> startUrls) { public Spider startUrls(List<String> startUrls) {
this.startUrls = startUrls; this.startUrls = startUrls;
return this; return this;
} }
public Spider startUrl(String startUrl) { /**
startUrls = new ArrayList<String>(); * ID使domainuuiddomainID
startUrls.add(startUrl); * @param uuid ID
return this; * @return this
} */
public Spider setUUID(String uuid) { public Spider setUUID(String uuid) {
this.uuid = uuid; this.uuid = uuid;
return this; return this;
} }
/**
* URL使
* @param scheduler
* @return this
*/
public Spider scheduler(Scheduler scheduler) { public Spider scheduler(Scheduler scheduler) {
this.scheduler = scheduler; this.scheduler = scheduler;
return this; return this;
} }
/**
*
* @param pipeline
* @return this
*/
public Spider pipeline(Pipeline pipeline) { public Spider pipeline(Pipeline pipeline) {
this.pipelines.add(pipeline); this.pipelines.add(pipeline);
return this; return this;
@ -79,9 +113,11 @@ public class Spider implements Runnable, Task {
@Override @Override
public void run() { public void run() {
if (startUrls != null) {
for (String startUrl : startUrls) { for (String startUrl : startUrls) {
scheduler.push(new Request(startUrl), this); scheduler.push(new Request(startUrl), this);
} }
}
Request request = scheduler.poll(this); Request request = scheduler.poll(this);
if (pipelines.isEmpty()) { if (pipelines.isEmpty()) {
pipelines.add(new ConsolePipeline()); pipelines.add(new ConsolePipeline());

View File

@ -1,12 +1,17 @@
package us.codecraft.webmagic; package us.codecraft.webmagic;
/** /**
* <br>
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* Date: 13-6-18 * Date: 13-6-18
* Time: 2:57 * Time: 2:57
*/ */
public interface Task { public interface Task {
/**
*
* @return uuid
*/
public String getUUID(); public String getUUID();
} }

View File

@ -33,6 +33,6 @@ public class DianpingProcessor implements PageProcessor {
public static void main(String[] args) { public static void main(String[] args) {
DianpingProcessor dianpingProcessor = new DianpingProcessor(); DianpingProcessor dianpingProcessor = new DianpingProcessor();
Spider.create(dianpingProcessor).startUrl("http://www.dianping.com/shanghai/food").run(); Spider.create(dianpingProcessor).run();
} }
} }

View File

@ -30,6 +30,7 @@ public class DiandianProcessorTest {
//ConsolePipeline输出结果到控制台 //ConsolePipeline输出结果到控制台
//FileCacheQueueScheduler保存url支持断点续传临时文件输出到/data/temp/webmagic/cache目录 //FileCacheQueueScheduler保存url支持断点续传临时文件输出到/data/temp/webmagic/cache目录
//Spider.run()执行 //Spider.run()执行
Spider.create(diaoyuwengProcessor).pipeline(new ConsolePipeline()).pipeline(pipeline).scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")). Spider.create(diaoyuwengProcessor).pipeline(new ConsolePipeline()).pipeline(pipeline).scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).
run(); run();
} }