update javadoc

master
yihua.huang 2013-06-20 08:21:48 +08:00
parent b1f023ead5
commit e1e25cb5e7
6 changed files with 56 additions and 13 deletions

View File

@ -3,6 +3,7 @@ package us.codecraft.webmagic;
/**
* A Request wraps the url to be crawled.<br/>
* Inside a PageProcessor the Request can be obtained via {@link us.codecraft.webmagic.Page#getRequest()} <br/>
* <br/>
* A Request can also carry extra data.<br/>
* <pre>
* Example:

View File

@ -85,7 +85,7 @@ public class Site {
/**
* domain
*
* @return
* @return domain
*/
public String getDomain() {
return domain;

View File

@ -15,9 +15,19 @@ import java.util.List;
/**
* <pre>
* webmagic
*
*Entry point of the webmagic crawler.
*
*
*
* Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*")).run();
*
*Using a FilePipeline (output directory /data/temp/webmagic/):
* Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*"))
* .pipeline(new FilePipeline("/data/temp/webmagic/")).run();
*
*Using a FileCacheQueueScheduler to cache URLs (cache directory /data/temp/webmagic/cache/):
* Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*"))
* .scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).run();
* </pre>
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
@ -41,36 +51,60 @@ public class Spider implements Runnable, Task {
private Logger logger = Logger.getLogger(getClass());
public Spider(PageProcessor pageProcessor){
/**
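* Creates a Spider from a PageProcessor; the site and the start urls are taken from pageProcessor.getSite().
* @param pageProcessor the processor that supplies the Site configuration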
*/
public Spider(PageProcessor pageProcessor) {
this.pageProcessor = pageProcessor;
this.site = pageProcessor.getSite();
this.startUrls = pageProcessor.getSite().getStartUrls();
}
/**
* Static factory: creates a Spider for the given PageProcessor.
* Equivalent to calling the constructor directly.
* @param pageProcessor the processor passed straight to the Spider constructor
* @return a new Spider instance
*/
public static Spider create(PageProcessor pageProcessor) {
return new Spider(pageProcessor);
}
/**
* Replaces the start urls of this spider. The constructor initialises them
* from the Site's start urls; calling this method overrides that list.
* The given list is stored by reference, not copied.
* @param startUrls the urls that run() pushes to the scheduler first
* @return this, for call chaining
*/
public Spider startUrls(List<String> startUrls) {
this.startUrls = startUrls;
return this;
}
/**
* Replaces the start urls with a new list containing only the given url.
* Any previously configured start urls are discarded.
* @param startUrl the single url to start from
* @return this, for call chaining
*/
public Spider startUrl(String startUrl) {
startUrls = new ArrayList<String>();
startUrls.add(startUrl);
return this;
}
/**
* Sets the identifier of this spider.
* NOTE(review): the original comment suggests the domain serves as the uuid
* when none is set, and that tasks sharing one domain should be given
* distinct IDs - confirm against getUUID().
* @param uuid the identifier to assign
* @return this, for call chaining
*/
public Spider setUUID(String uuid) {
this.uuid = uuid;
return this;
}
/**
* Sets the scheduler used by this spider. run() pushes the start urls to
* the scheduler as Requests and polls Requests back from it.
* @param scheduler the scheduler to use
* @return this, for call chaining
*/
public Spider scheduler(Scheduler scheduler) {
this.scheduler = scheduler;
return this;
}
/**
* Adds a pipeline to this spider; may be called repeatedly to register several pipelines.
* @param pipeline
* @return this
*/
public Spider pipeline(Pipeline pipeline) {
this.pipelines.add(pipeline);
return this;
@ -79,8 +113,10 @@ public class Spider implements Runnable, Task {
@Override
public void run() {
for (String startUrl : startUrls) {
scheduler.push(new Request(startUrl), this);
if (startUrls != null) {
for (String startUrl : startUrls) {
scheduler.push(new Request(startUrl), this);
}
}
Request request = scheduler.poll(this);
if (pipelines.isEmpty()) {

View File

@ -1,12 +1,17 @@
package us.codecraft.webmagic;
/**
* A task identified by a UUID. Spider implements this interface.<br>
* @author code4crafter@gmail.com <br>
* Date: 13-6-18
* Time: 2:57
*/
public interface Task {
/**
* Returns the identifier of this task.
* @return uuid
*/
public String getUUID();
}

View File

@ -33,6 +33,6 @@ public class DianpingProcessor implements PageProcessor {
public static void main(String[] args) {
DianpingProcessor dianpingProcessor = new DianpingProcessor();
Spider.create(dianpingProcessor).startUrl("http://www.dianping.com/shanghai/food").run();
Spider.create(dianpingProcessor).run();
}
}

View File

@ -30,6 +30,7 @@ public class DiandianProcessorTest {
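// ConsolePipeline prints the results to the console
// FileCacheQueueScheduler saves urls so that an interrupted crawl can be resumed; its temporary files go to the /data/temp/webmagic/cache directory
// Spider.run() starts the crawl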
Spider.create(diaoyuwengProcessor).pipeline(new ConsolePipeline()).pipeline(pipeline).scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).
run();
}