update Spider api
parent 69ff524d86
commit 7bed01c9f2

@@ -3,10 +3,12 @@ package us.codecraft.webmagic;
 import java.util.*;
 
 /**
- * Site定义一个待抓取的站点的各种信息。
+ * Site定义一个待抓取的站点的各种信息。<br>
+ * 这个类的所有getter方法,一般都只会被爬虫框架内部进行调用。<br>
+ *
  * @author code4crafter@gmail.com <br>
  * Date: 13-4-21
  * Time: 下午12:13
  */
 public class Site {
 
@@ -30,73 +32,157 @@ public class Site {
         DEFAULT_STATUS_CODE_SET.add(200);
     }
 
+    /**
+     * 创建一个Site对象,等价于new Site()
+     *
+     * @return 新建的对象
+     */
     public static Site me() {
         return new Site();
     }
 
-    public Site setCookie(String name, String value) {
+    /**
+     * 为这个站点添加一个cookie,可用于抓取某些需要登录访问的站点。这个cookie的域名与{@link #getDomain()}是一致的
+     *
+     * @param name cookie的名称
+     * @param value cookie的值
+     * @return this
+     */
+    public Site addCookie(String name, String value) {
         cookies.put(name, value);
         return this;
     }
 
+    /**
+     * 为这个站点设置user-agent,很多网站都对user-agent进行了限制,不设置此选项可能会得到期望之外的结果。
+     *
+     * @param userAgent userAgent
+     * @return this
+     */
     public Site setUserAgent(String userAgent) {
         this.userAgent = userAgent;
         return this;
     }
 
+    /**
+     * 获取已经设置的所有cookie
+     *
+     * @return 已经设置的所有cookie
+     */
     public Map<String, String> getCookies() {
         return cookies;
     }
 
+    /**
+     * 获取已设置的user-agent
+     *
+     * @return 已设置的user-agent
+     */
     public String getUserAgent() {
         return userAgent;
     }
 
+    /**
+     * 获取已设置的domain
+     *
+     * @return
+     */
     public String getDomain() {
         return domain;
     }
 
+    /**
+     * 设置这个站点所在域名,必须项。<br>
+     * 目前不支持多个域名的抓取。抓取多个域名请新建一个Spider。
+     *
+     * @param domain 爬虫会抓取的域名
+     * @return this
+     */
    public Site setDomain(String domain) {
         this.domain = domain;
         return this;
     }
 
-    public String getEncoding() {
-        return encoding;
-    }
+    /**
+     * 设置页面编码,若不设置则自动根据Html meta信息获取。<br>
+     * 一般无需设置encoding,如果发现下载的结果是乱码,则可以设置此项。<br>
+     *
+     * @param encoding 编码格式,主要是"utf-8"、"gbk"两种
+     * @return this
+     */
     public Site setEncoding(String encoding) {
         this.encoding = encoding;
         return this;
     }
 
-    public Set<Integer> getAcceptStatCode() {
-        return acceptStatCode;
+    /**
+     * 获取已设置的编码
+     *
+     * @return 已设置的domain
+     */
+    public String getEncoding() {
+        return encoding;
     }
 
+    /**
+     * 设置可接受的http状态码,仅当状态码在这个集合中时,才会读取页面内容。<br>
+     * 默认为200,正常情况下,无须设置此项。<br>
+     * 某些站点会错误的返回状态码,此时可以对这个选项进行设置。<br>
+     *
+     * @param acceptStatCode 可接受的状态码
+     * @return this
+     */
     public Site setAcceptStatCode(Set<Integer> acceptStatCode) {
         this.acceptStatCode = acceptStatCode;
         return this;
     }
 
+    /**
+     * 获取可接受的状态码
+     *
+     * @return 可接受的状态码
+     */
+    public Set<Integer> getAcceptStatCode() {
+        return acceptStatCode;
+    }
 
+    /**
+     * 获取初始页面的地址列表
+     * @return 初始页面的地址列表
+     */
     public List<String> getStartUrls() {
         return startUrls;
     }
 
+    /**
+     * 增加初始页面的地址,可反复调用此方法增加多个初始地址。
+     * @param startUrl 初始页面的地址
+     * @return this
+     */
     public Site addStartUrl(String startUrl) {
         this.startUrls.add(startUrl);
         return this;
     }
 
-    public int getSleepTime() {
-        return sleepTime;
-    }
+    /**
+     * 设置两次抓取之间的间隔,避免对目标站点压力过大(或者避免被防火墙屏蔽...)。
+     *
+     * @param sleepTime 单位毫秒
+     * @return this
+     */
     public Site setSleepTime(int sleepTime) {
         this.sleepTime = sleepTime;
         return this;
     }
 
+    /**
+     * 获取两次抓取之间的间隔
+     * @return 两次抓取之间的间隔,单位毫秒
+     */
+    public int getSleepTime() {
+        return sleepTime;
+    }
 
     @Override
     public boolean equals(Object o) {
         if (this == o) return true;
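
The Site changes above add Javadoc, rename setCookie to addCookie, and move the getters behind their setters; the fluent builder itself is unchanged. A minimal sketch of how a site is configured against the new API, using only methods visible in this diff (the domain, cookie and timing values are placeholders, not taken from the commit):

    import us.codecraft.webmagic.Site;

    public class SiteConfigExample {
        public static void main(String[] args) {
            // Chained configuration with the methods touched by this commit.
            Site site = Site.me()                        // static factory, equivalent to new Site()
                    .setDomain("my.oschina.net")         // required; one domain per Spider
                    .addCookie("uid", "placeholder")     // was setCookie(...) before this commit
                    .setUserAgent("Mozilla/5.0")         // optional; some sites reject the default
                    .setEncoding("utf-8")                // only needed when auto-detection fails
                    .setSleepTime(500)                   // pause between requests, in milliseconds
                    .addStartUrl("http://my.oschina.net/");
            System.out.println(site.getDomain() + " -> " + site.getCookies());
        }
    }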
@@ -7,13 +7,18 @@ import us.codecraft.webmagic.downloader.HttpClientDownloader;
 import us.codecraft.webmagic.pipeline.ConsolePipeline;
 import us.codecraft.webmagic.pipeline.Pipeline;
 import us.codecraft.webmagic.processor.PageProcessor;
-import us.codecraft.webmagic.schedular.QueueSchedular;
-import us.codecraft.webmagic.schedular.Schedular;
+import us.codecraft.webmagic.schedular.QueueScheduler;
+import us.codecraft.webmagic.schedular.Scheduler;
 
 import java.util.ArrayList;
 import java.util.List;
 
 /**
+ * <pre>
+ * webmagic爬虫的入口类。
+ * 示例:
+ * Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*")).run();
+ * </pre>
  * @author code4crafter@gmail.com <br>
  * Date: 13-4-21
  * Time: 上午6:53
@@ -32,18 +37,17 @@ public class Spider implements Runnable, Task {
 
     private String uuid;
 
-    private Schedular schedular = new QueueSchedular();
+    private Scheduler scheduler = new QueueScheduler();
 
     private Logger logger = Logger.getLogger(getClass());
 
-    public static Spider me() {
-        return new Spider();
-    }
-
-    public Spider processor(PageProcessor pageProcessor) {
+    public Spider(PageProcessor pageProcessor){
         this.pageProcessor = pageProcessor;
         this.site = pageProcessor.getSite();
-        return this;
+    }
+
+    public static Spider create(PageProcessor pageProcessor) {
+        return new Spider(pageProcessor);
     }
 
     public Spider startUrls(List<String> startUrls) {
@@ -57,8 +61,13 @@ public class Spider implements Runnable, Task {
         return this;
     }
 
-    public Spider schedular(Schedular schedular) {
-        this.schedular = schedular;
+    public Spider setUUID(String uuid) {
+        this.uuid = uuid;
+        return this;
+    }
+
+    public Spider schedular(Scheduler scheduler) {
+        this.scheduler = scheduler;
         return this;
     }
 
@@ -71,9 +80,9 @@ public class Spider implements Runnable, Task {
     @Override
     public void run() {
         for (String startUrl : startUrls) {
-            schedular.push(new Request(startUrl), this);
+            scheduler.push(new Request(startUrl), this);
         }
-        Request request = schedular.poll(this);
+        Request request = scheduler.poll(this);
         if (pipelines.isEmpty()) {
             pipelines.add(new ConsolePipeline());
         }
@@ -89,16 +98,10 @@ public class Spider implements Runnable, Task {
                 pipeline.process(page, this);
             }
             sleep(site.getSleepTime());
-            request = schedular.poll(this);
+            request = scheduler.poll(this);
         }
     }
 
-    public Spider setUUID(String uuid) {
-        this.uuid = uuid;
-        return this;
-    }
-
-
     private void sleep(int time) {
         try {
             Thread.sleep(time);
@@ -110,7 +113,7 @@ public class Spider implements Runnable, Task {
     private void addRequest(Page page) {
         if (CollectionUtils.isNotEmpty(page.getTargetRequests())) {
             for (Request request : page.getTargetRequests()) {
-                schedular.push(request, this);
+                scheduler.push(request, this);
             }
         }
     }
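
With the hunks above, the Spider entry point moves from Spider.me().processor(x) to a constructor plus the Spider.create(x) factory, setUUID relocates within the class, and schedular(...) now takes the renamed Scheduler type. A sketch of the new call chain, modelled on the example in the added class Javadoc (pipeline and scheduler are optional; run() falls back to a ConsolePipeline when no pipeline is set):

    import us.codecraft.webmagic.Spider;
    import us.codecraft.webmagic.pipeline.ConsolePipeline;
    import us.codecraft.webmagic.processor.SimplePageProcessor;
    import us.codecraft.webmagic.schedular.QueueScheduler;

    public class SpiderApiExample {
        public static void main(String[] args) {
            // Old style (removed): Spider.me().processor(pageProcessor)...
            // New style: the PageProcessor is mandatory and handed to create().
            Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*"))
                    .schedular(new QueueScheduler())    // note: the method keeps the old "schedular" spelling
                    .pipeline(new ConsolePipeline())
                    .run();
        }
    }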
@@ -20,7 +20,7 @@ import java.util.concurrent.atomic.AtomicInteger;
  * Date: 13-4-21
  * Time: 下午1:13
  */
-public class FileCacheQueueSchedular implements Schedular {
+public class FileCacheQueueScheduler implements Scheduler {
 
     private Logger logger = Logger.getLogger(getClass());
 
@@ -44,7 +44,7 @@ public class FileCacheQueueSchedular implements Schedular {
 
     private Set<String> urls;
 
-    public FileCacheQueueSchedular(String filePath) {
+    public FileCacheQueueScheduler(String filePath) {
         this.filePath = filePath;
     }
 
@@ -14,7 +14,7 @@ import java.util.concurrent.LinkedBlockingQueue;
  * Date: 13-4-21
  * Time: 下午1:13
  */
-public class QueueSchedular implements Schedular {
+public class QueueScheduler implements Scheduler {
 
     private Logger logger = Logger.getLogger(getClass());
 
@@ -8,7 +8,7 @@ import us.codecraft.webmagic.Task;
  * Date: 13-4-21
  * Time: 下午1:12
  */
-public interface Schedular {
+public interface Scheduler {
 
     public void push(Request request,Task task);
 
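
The interface rename is the heart of the Schedular-to-Scheduler cleanup. Only push(Request, Task) is visible in this hunk; a poll(Task) returning Request is implied by its use in Spider.run(), so the toy implementation below assumes exactly those two methods (and assumes Request lives in us.codecraft.webmagic alongside Task). It mirrors what QueueScheduler does with an in-memory queue:

    import java.util.Queue;
    import java.util.concurrent.LinkedBlockingQueue;

    import us.codecraft.webmagic.Request;
    import us.codecraft.webmagic.Task;
    import us.codecraft.webmagic.schedular.Scheduler;

    public class InMemoryScheduler implements Scheduler {

        private final Queue<Request> queue = new LinkedBlockingQueue<Request>();

        @Override
        public void push(Request request, Task task) {
            // The Task argument lets a scheduler keep per-spider state; this toy version ignores it.
            queue.add(request);
        }

        @Override
        public Request poll(Task task) {
            // Queue.poll() returns null when the queue is empty.
            return queue.poll();
        }
    }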
@@ -5,8 +5,8 @@ import java.util.List;
 
 /**
  * @author code4crafter@gmail.com <br>
  * Date: 13-4-21
  * Time: 上午7:54
  */
 public class Html extends PlainText {
 
@@ -18,12 +18,16 @@ public class Html extends PlainText {
         super(text);
     }
 
+    public static Html create(String text) {
+        return new Html(text);
+    }
+
     @Override
     protected Selectable select(Selector selector, List<String> strings) {
         List<String> results = new ArrayList<String>();
         for (String string : strings) {
             String result = selector.select(string);
-            if (result!=null){
+            if (result != null) {
                 results.add(result);
             }
         }
@@ -43,13 +47,13 @@ public class Html extends PlainText {
     @Override
     public Selectable smartContent() {
         SmartContentSelector smartContentSelector = SelectorFactory.getInstatnce().newSmartContentSelector();
-        return select(smartContentSelector,strings);
+        return select(smartContentSelector, strings);
     }
 
     @Override
     public Selectable links() {
         XpathSelector xpathSelector = SelectorFactory.getInstatnce().newXpathSelector("//a/@href");
-        return selectList(xpathSelector,strings);
+        return selectList(xpathSelector, strings);
     }
 
     @Override
@@ -24,6 +24,10 @@ public class PlainText implements Selectable {
         this.strings = results;
     }
 
+    public static PlainText create(String text) {
+        return new PlainText(text);
+    }
+
     @Override
     public Selectable xpath(String xpath) {
         throw new UnsupportedOperationException();
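
Html and PlainText both gain a static create(String) factory, matching Site.me() and Spider.create(). A small sketch of using it with the selector methods that appear in this diff; the package of Html and Selectable is not shown here, so us.codecraft.webmagic.selector is an assumption, and the HTML string is a placeholder:

    import us.codecraft.webmagic.selector.Html;        // package assumed, not shown in the diff
    import us.codecraft.webmagic.selector.Selectable;  // package assumed, not shown in the diff

    public class HtmlCreateExample {
        public static void main(String[] args) {
            String rawHtml = "<html><body><a href=\"http://my.oschina.net/\">blog</a></body></html>";
            // New factory added by this commit, equivalent to new Html(rawHtml).
            Selectable links = Html.create(rawHtml).links();   // links() selects //a/@href
            System.out.println(links.toString());
        }
    }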
@@ -15,7 +15,7 @@ public class HttpClientDownloaderTest {
 
     @Test
     public void testCookie() {
-        Site site = Site.me().setDomain("www.diandian.com").setCookie("t", "yct7q7e6v319wpg4cpxqduu5m77lcgix");
+        Site site = Site.me().setDomain("www.diandian.com").addCookie("t", "yct7q7e6v319wpg4cpxqduu5m77lcgix");
         HttpClientDownloader httpClientDownloader = new HttpClientDownloader();
         Page download = httpClientDownloader.download(new Request("http://www.diandian.com"), site);
         Assert.assertTrue(download.getHtml().toString().contains("flashsword30"));
@@ -33,6 +33,6 @@ public class DianpingProcessor implements PageProcessor {
 
     public static void main(String[] args) {
         DianpingProcessor dianpingProcessor = new DianpingProcessor();
-        Spider.me().processor(dianpingProcessor).startUrl("http://www.dianping.com/shanghai/food").run();
+        Spider.create(dianpingProcessor).startUrl("http://www.dianping.com/shanghai/food").run();
     }
 }
@@ -5,7 +5,7 @@ import org.junit.Test;
 import us.codecraft.webmagic.pipeline.FilePipeline;
 import us.codecraft.webmagic.processor.SimplePageProcessor;
 import us.codecraft.webmagic.samples.HuxiuProcessor;
-import us.codecraft.webmagic.schedular.FileCacheQueueSchedular;
+import us.codecraft.webmagic.schedular.FileCacheQueueScheduler;
 
 /**
  * @author code4crafter@gmail.com <br>
@@ -18,7 +18,7 @@ public class SpiderTest {
     @Ignore
     @Test
     public void testSpider() throws InterruptedException {
-        Spider me = Spider.me().pipeline(new FilePipeline()).processor(new HuxiuProcessor());
+        Spider me = Spider.create(new HuxiuProcessor()).pipeline(new FilePipeline());
         me.run();
     }
 
@@ -26,13 +26,13 @@ public class SpiderTest {
     @Test
     public void testGlobalSpider(){
         // PageProcessor pageProcessor = new MeicanProcessor();
-        // Spider.me().pipeline(new FilePipeline()).schedular(new FileCacheQueueSchedular(pageProcessor.getSite(),"/data/temp/webmagic/cache/")).
+        // Spider.me().pipeline(new FilePipeline()).schedular(new FileCacheQueueScheduler(pageProcessor.getSite(),"/data/temp/webmagic/cache/")).
         // processor(pageProcessor).run();
         SimplePageProcessor pageProcessor2 = new SimplePageProcessor("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space", "http://www.diaoyuweng.com/thread-*-1-1.html");
         System.out.println(pageProcessor2.getSite().getEncoding());
         pageProcessor2.getSite().setSleepTime(500);
-        Spider.me().pipeline(new FilePipeline()).schedular(new FileCacheQueueSchedular("/data/temp/webmagic/cache/")).
-                processor(pageProcessor2).run();
+        Spider.create(pageProcessor2).pipeline(new FilePipeline()).schedular(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).
+                run();
 
 
     }
@@ -6,7 +6,7 @@ import us.codecraft.webmagic.Spider;
 import us.codecraft.webmagic.pipeline.ConsolePipeline;
 import us.codecraft.webmagic.pipeline.FreemarkerPipeline;
 import us.codecraft.webmagic.samples.DiandianBlogProcessor;
-import us.codecraft.webmagic.schedular.FileCacheQueueSchedular;
+import us.codecraft.webmagic.schedular.FileCacheQueueScheduler;
 
 import java.io.IOException;
 
@@ -30,7 +30,7 @@ public class DiandianProcessorTest {
         //ConsolePipeline输出结果到控制台
         //FileCacheQueueSchedular保存url,支持断点续传,临时文件输出到/data/temp/webmagic/cache目录
         //Spider.run()执行
-        Spider.me().pipeline(new ConsolePipeline()).pipeline(pipeline).schedular(new FileCacheQueueSchedular("/data/temp/webmagic/cache/")).
-                processor(diaoyuwengProcessor).run();
+        Spider.create(diaoyuwengProcessor).pipeline(new ConsolePipeline()).pipeline(pipeline).schedular(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).
+                run();
     }
 }
@@ -6,14 +6,14 @@ import us.codecraft.webmagic.Spider;
 import us.codecraft.webmagic.pipeline.FilePipeline;
 import us.codecraft.webmagic.pipeline.FreemarkerPipeline;
 import us.codecraft.webmagic.samples.DiaoyuwengProcessor;
-import us.codecraft.webmagic.schedular.FileCacheQueueSchedular;
+import us.codecraft.webmagic.schedular.FileCacheQueueScheduler;
 
 import java.io.IOException;
 
 /**
  * @author code4crafter@gmail.com <br>
  * Date: 13-6-9
  * Time: 上午8:02
  */
 public class DiaoyuwengProcessorTest {
 
@@ -22,7 +22,7 @@ public class DiaoyuwengProcessorTest {
     public void test() throws IOException {
         DiaoyuwengProcessor diaoyuwengProcessor = new DiaoyuwengProcessor();
         FreemarkerPipeline pipeline = new FreemarkerPipeline("wordpress.ftl");
-        Spider.me().pipeline(new FilePipeline()).pipeline(pipeline).schedular(new FileCacheQueueSchedular("/data/temp/webmagic/cache/")).
-                processor(diaoyuwengProcessor).run();
+        Spider.create(diaoyuwengProcessor).pipeline(new FilePipeline()).pipeline(pipeline).schedular(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).
+                run();
     }
 }
@@ -6,14 +6,14 @@ import us.codecraft.webmagic.Spider;
 import us.codecraft.webmagic.pipeline.FilePipeline;
 import us.codecraft.webmagic.pipeline.FreemarkerPipeline;
 import us.codecraft.webmagic.samples.SinaBlogProcesser;
-import us.codecraft.webmagic.schedular.FileCacheQueueSchedular;
+import us.codecraft.webmagic.schedular.FileCacheQueueScheduler;
 
 import java.io.IOException;
 
 /**
  * @author code4crafter@gmail.com <br>
  * Date: 13-6-9
  * Time: 上午8:02
  */
 public class SinablogProcessorTest {
 
@@ -30,7 +30,7 @@ public class SinablogProcessorTest {
         //ConsolePipeline输出结果到控制台
         //FileCacheQueueSchedular保存url,支持断点续传,临时文件输出到/data/temp/webmagic/cache目录
         //Spider.run()执行
-        Spider.me().pipeline(new FilePipeline()).pipeline(pipeline).schedular(new FileCacheQueueSchedular("/data/temp/webmagic/cache/")).
-                processor(sinaBlogProcesser).run();
+        Spider.create(sinaBlogProcesser).pipeline(new FilePipeline()).pipeline(pipeline).schedular(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).
+                run();
     }
 }
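
The test updates above all follow the same mechanical migration, which is what existing callers of the old entry point need to apply. A self-contained sketch built only from names and the cache path that appear in this commit:

    import us.codecraft.webmagic.Spider;
    import us.codecraft.webmagic.pipeline.FilePipeline;
    import us.codecraft.webmagic.samples.HuxiuProcessor;
    import us.codecraft.webmagic.schedular.FileCacheQueueScheduler;

    public class MigrationExample {
        public static void main(String[] args) {
            // Before this commit:
            //   Spider.me().pipeline(new FilePipeline()).processor(new HuxiuProcessor()).run();
            // After this commit, as in the updated tests:
            Spider.create(new HuxiuProcessor())
                    .pipeline(new FilePipeline())
                    .schedular(new FileCacheQueueScheduler("/data/temp/webmagic/cache/"))
                    .run();
        }
    }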