update Spider api

master
yihua.huang 2013-06-20 07:53:48 +08:00
parent 69ff524d86
commit 7bed01c9f2
13 changed files with 161 additions and 64 deletions

View File

@ -3,7 +3,9 @@ package us.codecraft.webmagic;
import java.util.*;
/**
* Site
* Site<br>
* getter<br>
*
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* Time: 12:13
@ -30,73 +32,157 @@ public class Site {
DEFAULT_STATUS_CODE_SET.add(200);
}
/**
 * Creates a new {@code Site}; equivalent to calling {@code new Site()}.
 *
 * @return a newly constructed {@code Site}
 */
public static Site me() {
    final Site created = new Site();
    return created;
}
public Site setCookie(String name, String value) {
/**
 * Adds a cookie to this site's cookie map. A value already stored under the
 * same name is replaced.
 * <p>
 * NOTE(review): the original comment linked {@link #getDomain()}, so the
 * cookie is presumably sent for that domain -- confirm against the downloader.
 *
 * @param name  cookie name
 * @param value cookie value
 * @return this, to allow call chaining
 */
public Site addCookie(String name, String value) {
    this.cookies.put(name, value);
    return this;
}
/**
 * Sets the user-agent string configured for this site.
 *
 * @param userAgent the user-agent value to store
 * @return this, to allow call chaining
 */
public Site setUserAgent(String userAgent) {
    this.userAgent = userAgent;
    return this;
}
/**
 * Returns the cookies configured for this site.
 *
 * @return the cookie map itself (name to value), not a copy
 */
public Map<String, String> getCookies() {
    return this.cookies;
}
/**
 * Returns the user-agent string configured for this site.
 *
 * @return the stored user-agent value
 */
public String getUserAgent() {
    return this.userAgent;
}
/**
 * Returns the domain configured for this site.
 *
 * @return the stored domain
 */
public String getDomain() {
    return this.domain;
}
/**
 * Sets the domain of this site.
 * <p>
 * NOTE(review): the original comment mentioned Spider here; how Spider uses
 * the domain is not visible in this section -- confirm before relying on it.
 *
 * @param domain the domain to store
 * @return this, to allow call chaining
 */
public Site setDomain(String domain) {
    this.domain = domain;
    return this;
}
public String getEncoding() {
return encoding;
}
/**
 * Sets the character encoding configured for this site.
 * <p>
 * NOTE(review): the original comment referred to the HTML meta information,
 * presumably as the fallback when no encoding is set -- confirm against the
 * downloader.
 *
 * @param encoding encoding name, e.g. "utf-8" or "gbk"
 * @return this, to allow call chaining
 */
public Site setEncoding(String encoding) {
    this.encoding = encoding;
    return this;
}
public Set<Integer> getAcceptStatCode() {
return acceptStatCode;
/**
 * Returns the character encoding configured for this site.
 *
 * @return the stored encoding
 */
public String getEncoding() {
    return this.encoding;
}
/**
 * Sets the HTTP status codes configured as acceptable for this site. The
 * given set is stored by reference, not copied.
 * <p>
 * NOTE(review): the original comment mentioned 200, presumably as the
 * default accepted code -- confirm against the field initialiser.
 *
 * @param acceptStatCode the status codes to store
 * @return this, to allow call chaining
 */
public Site setAcceptStatCode(Set<Integer> acceptStatCode) {
    this.acceptStatCode = acceptStatCode;
    return this;
}
/**
 * Returns the HTTP status codes configured as acceptable for this site.
 *
 * @return the stored status-code set itself, not a copy
 */
public Set<Integer> getAcceptStatCode() {
    return this.acceptStatCode;
}
/**
 * Returns the start URLs registered on this site.
 *
 * @return the stored start-URL list itself, not a copy
 */
public List<String> getStartUrls() {
    return this.startUrls;
}
/**
 * Appends a URL to this site's list of start URLs.
 *
 * @param startUrl the URL to add
 * @return this, to allow call chaining
 */
public Site addStartUrl(String startUrl) {
    startUrls.add(startUrl);
    return this;
}
public int getSleepTime() {
return sleepTime;
}
/**
 * Sets the pause used while crawling this site. Spider.run() passes the
 * value to {@code Thread.sleep}, so it is expressed in milliseconds.
 *
 * @param sleepTime pause length in milliseconds
 * @return this, to allow call chaining
 */
public Site setSleepTime(int sleepTime) {
    this.sleepTime = sleepTime;
    return this;
}
/**
 * Returns the pause configured for this site.
 *
 * @return the stored sleep time
 */
public int getSleepTime() {
    return this.sleepTime;
}
@Override
public boolean equals(Object o) {
if (this == o) return true;

View File

@ -7,13 +7,18 @@ import us.codecraft.webmagic.downloader.HttpClientDownloader;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.pipeline.Pipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.schedular.QueueSchedular;
import us.codecraft.webmagic.schedular.Schedular;
import us.codecraft.webmagic.schedular.QueueScheduler;
import us.codecraft.webmagic.schedular.Scheduler;
import java.util.ArrayList;
import java.util.List;
/**
* <pre>
* Typical webmagic usage:
*
* Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*")).run();
* </pre>
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* Time: 6:53
@ -32,18 +37,17 @@ public class Spider implements Runnable, Task {
private String uuid;
private Schedular schedular = new QueueSchedular();
private Scheduler scheduler = new QueueScheduler();
private Logger logger = Logger.getLogger(getClass());
public static Spider me() {
return new Spider();
}
public Spider processor(PageProcessor pageProcessor) {
public Spider(PageProcessor pageProcessor){
this.pageProcessor = pageProcessor;
this.site = pageProcessor.getSite();
return this;
}
/**
 * Creates a spider for the given page processor by delegating to the
 * {@code Spider(PageProcessor)} constructor.
 *
 * @param pageProcessor the processor handed to the new spider
 * @return the newly constructed spider
 */
public static Spider create(PageProcessor pageProcessor) {
    final Spider spider = new Spider(pageProcessor);
    return spider;
}
public Spider startUrls(List<String> startUrls) {
@ -57,8 +61,13 @@ public class Spider implements Runnable, Task {
return this;
}
public Spider schedular(Schedular schedular) {
this.schedular = schedular;
/**
 * Sets the UUID stored on this spider.
 *
 * @param uuid the identifier to store
 * @return this, to allow call chaining
 */
public Spider setUUID(String uuid) {
    this.uuid = uuid;
    return this;
}
/**
 * Sets the scheduler this spider pushes requests to and polls requests from.
 * This is the correctly spelled form of {@link #schedular(Scheduler)},
 * matching the renamed {@code Scheduler} type.
 *
 * @param scheduler the scheduler to use
 * @return this, to allow call chaining
 */
public Spider scheduler(Scheduler scheduler) {
    this.scheduler = scheduler;
    return this;
}

/**
 * Legacy spelling of {@link #scheduler(Scheduler)}, kept so existing callers
 * keep compiling. Prefer {@code scheduler(Scheduler)} in new code.
 *
 * @param scheduler the scheduler to use
 * @return this, to allow call chaining
 */
public Spider schedular(Scheduler scheduler) {
    // Method and field names live in separate namespaces, so this resolves to the method above.
    return scheduler(scheduler);
}
@ -71,9 +80,9 @@ public class Spider implements Runnable, Task {
@Override
public void run() {
for (String startUrl : startUrls) {
schedular.push(new Request(startUrl), this);
scheduler.push(new Request(startUrl), this);
}
Request request = schedular.poll(this);
Request request = scheduler.poll(this);
if (pipelines.isEmpty()) {
pipelines.add(new ConsolePipeline());
}
@ -89,16 +98,10 @@ public class Spider implements Runnable, Task {
pipeline.process(page, this);
}
sleep(site.getSleepTime());
request = schedular.poll(this);
request = scheduler.poll(this);
}
}
public Spider setUUID(String uuid) {
this.uuid = uuid;
return this;
}
private void sleep(int time) {
try {
Thread.sleep(time);
@ -110,7 +113,7 @@ public class Spider implements Runnable, Task {
private void addRequest(Page page) {
if (CollectionUtils.isNotEmpty(page.getTargetRequests())) {
for (Request request : page.getTargetRequests()) {
schedular.push(request, this);
scheduler.push(request, this);
}
}
}

View File

@ -20,7 +20,7 @@ import java.util.concurrent.atomic.AtomicInteger;
* Date: 13-4-21
* Time: 1:13
*/
public class FileCacheQueueSchedular implements Schedular {
public class FileCacheQueueScheduler implements Scheduler {
private Logger logger = Logger.getLogger(getClass());
@ -44,7 +44,7 @@ public class FileCacheQueueSchedular implements Schedular {
private Set<String> urls;
public FileCacheQueueSchedular(String filePath) {
public FileCacheQueueScheduler(String filePath) {
this.filePath = filePath;
}

View File

@ -14,7 +14,7 @@ import java.util.concurrent.LinkedBlockingQueue;
* Date: 13-4-21
* Time: 1:13
*/
public class QueueSchedular implements Schedular {
public class QueueScheduler implements Scheduler {
private Logger logger = Logger.getLogger(getClass());

View File

@ -8,7 +8,7 @@ import us.codecraft.webmagic.Task;
* Date: 13-4-21
* Time: 1:12
*/
public interface Schedular {
public interface Scheduler {
public void push(Request request,Task task);

View File

@ -18,6 +18,10 @@ public class Html extends PlainText {
super(text);
}
/**
 * Creates an {@code Html} from the given text; equivalent to
 * {@code new Html(text)}.
 *
 * @param text the text passed to the constructor
 * @return the newly constructed {@code Html}
 */
public static Html create(String text) {
    final Html html = new Html(text);
    return html;
}
@Override
protected Selectable select(Selector selector, List<String> strings) {
List<String> results = new ArrayList<String>();

View File

@ -24,6 +24,10 @@ public class PlainText implements Selectable {
this.strings = results;
}
/**
 * Creates a {@code PlainText} from the given text; equivalent to
 * {@code new PlainText(text)}.
 *
 * @param text the text passed to the constructor
 * @return the newly constructed {@code PlainText}
 */
public static PlainText create(String text) {
    final PlainText plainText = new PlainText(text);
    return plainText;
}
@Override
public Selectable xpath(String xpath) {
throw new UnsupportedOperationException();

View File

@ -15,7 +15,7 @@ public class HttpClientDownloaderTest {
@Test
public void testCookie() {
Site site = Site.me().setDomain("www.diandian.com").setCookie("t", "yct7q7e6v319wpg4cpxqduu5m77lcgix");
Site site = Site.me().setDomain("www.diandian.com").addCookie("t", "yct7q7e6v319wpg4cpxqduu5m77lcgix");
HttpClientDownloader httpClientDownloader = new HttpClientDownloader();
Page download = httpClientDownloader.download(new Request("http://www.diandian.com"), site);
Assert.assertTrue(download.getHtml().toString().contains("flashsword30"));

View File

@ -33,6 +33,6 @@ public class DianpingProcessor implements PageProcessor {
public static void main(String[] args) {
DianpingProcessor dianpingProcessor = new DianpingProcessor();
Spider.me().processor(dianpingProcessor).startUrl("http://www.dianping.com/shanghai/food").run();
Spider.create(dianpingProcessor).startUrl("http://www.dianping.com/shanghai/food").run();
}
}

View File

@ -5,7 +5,7 @@ import org.junit.Test;
import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.processor.SimplePageProcessor;
import us.codecraft.webmagic.samples.HuxiuProcessor;
import us.codecraft.webmagic.schedular.FileCacheQueueSchedular;
import us.codecraft.webmagic.schedular.FileCacheQueueScheduler;
/**
* @author code4crafter@gmail.com <br>
@ -18,7 +18,7 @@ public class SpiderTest {
@Ignore
@Test
public void testSpider() throws InterruptedException {
Spider me = Spider.me().pipeline(new FilePipeline()).processor(new HuxiuProcessor());
Spider me = Spider.create(new HuxiuProcessor()).pipeline(new FilePipeline());
me.run();
}
@ -26,13 +26,13 @@ public class SpiderTest {
@Test
public void testGlobalSpider(){
// PageProcessor pageProcessor = new MeicanProcessor();
// Spider.me().pipeline(new FilePipeline()).schedular(new FileCacheQueueSchedular(pageProcessor.getSite(),"/data/temp/webmagic/cache/")).
// Spider.me().pipeline(new FilePipeline()).schedular(new FileCacheQueueScheduler(pageProcessor.getSite(),"/data/temp/webmagic/cache/")).
// processor(pageProcessor).run();
SimplePageProcessor pageProcessor2 = new SimplePageProcessor("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space", "http://www.diaoyuweng.com/thread-*-1-1.html");
System.out.println(pageProcessor2.getSite().getEncoding());
pageProcessor2.getSite().setSleepTime(500);
Spider.me().pipeline(new FilePipeline()).schedular(new FileCacheQueueSchedular("/data/temp/webmagic/cache/")).
processor(pageProcessor2).run();
Spider.create(pageProcessor2).pipeline(new FilePipeline()).schedular(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).
run();
}

View File

@ -6,7 +6,7 @@ import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.pipeline.FreemarkerPipeline;
import us.codecraft.webmagic.samples.DiandianBlogProcessor;
import us.codecraft.webmagic.schedular.FileCacheQueueSchedular;
import us.codecraft.webmagic.schedular.FileCacheQueueScheduler;
import java.io.IOException;
@ -30,7 +30,7 @@ public class DiandianProcessorTest {
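// ConsolePipeline prints the results to the console.
// FileCacheQueueScheduler stores the URLs so an interrupted crawl can be resumed; its temporary files are written to the /data/temp/webmagic/cache directory.
// Spider.run() starts the crawl.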
Spider.me().pipeline(new ConsolePipeline()).pipeline(pipeline).schedular(new FileCacheQueueSchedular("/data/temp/webmagic/cache/")).
processor(diaoyuwengProcessor).run();
Spider.create(diaoyuwengProcessor).pipeline(new ConsolePipeline()).pipeline(pipeline).schedular(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).
run();
}
}

View File

@ -6,7 +6,7 @@ import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.pipeline.FreemarkerPipeline;
import us.codecraft.webmagic.samples.DiaoyuwengProcessor;
import us.codecraft.webmagic.schedular.FileCacheQueueSchedular;
import us.codecraft.webmagic.schedular.FileCacheQueueScheduler;
import java.io.IOException;
@ -22,7 +22,7 @@ public class DiaoyuwengProcessorTest {
public void test() throws IOException {
DiaoyuwengProcessor diaoyuwengProcessor = new DiaoyuwengProcessor();
FreemarkerPipeline pipeline = new FreemarkerPipeline("wordpress.ftl");
Spider.me().pipeline(new FilePipeline()).pipeline(pipeline).schedular(new FileCacheQueueSchedular("/data/temp/webmagic/cache/")).
processor(diaoyuwengProcessor).run();
Spider.create(diaoyuwengProcessor).pipeline(new FilePipeline()).pipeline(pipeline).schedular(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).
run();
}
}

View File

@ -6,7 +6,7 @@ import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.pipeline.FreemarkerPipeline;
import us.codecraft.webmagic.samples.SinaBlogProcesser;
import us.codecraft.webmagic.schedular.FileCacheQueueSchedular;
import us.codecraft.webmagic.schedular.FileCacheQueueScheduler;
import java.io.IOException;
@ -30,7 +30,7 @@ public class SinablogProcessorTest {
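// ConsolePipeline prints the results to the console (this test itself uses FilePipeline).
// FileCacheQueueScheduler stores the URLs so an interrupted crawl can be resumed; its temporary files are written to the /data/temp/webmagic/cache directory.
// Spider.run() starts the crawl.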
Spider.me().pipeline(new FilePipeline()).pipeline(pipeline).schedular(new FileCacheQueueSchedular("/data/temp/webmagic/cache/")).
processor(sinaBlogProcesser).run();
Spider.create(sinaBlogProcesser).pipeline(new FilePipeline()).pipeline(pipeline).schedular(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).
run();
}
}