add meican
parent
1d870f3c91
commit
413b97c904
|
@ -2,11 +2,10 @@ package us.codecraft.spider;
|
||||||
|
|
||||||
import org.junit.Ignore;
|
import org.junit.Ignore;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
import us.codecraft.spider.pipeline.ConsolePipeline;
|
|
||||||
import us.codecraft.spider.pipeline.FilePipeline;
|
import us.codecraft.spider.pipeline.FilePipeline;
|
||||||
import us.codecraft.spider.processor.SimplePageProcessor;
|
import us.codecraft.spider.processor.PageProcessor;
|
||||||
import us.codecraft.spider.samples.DianpingBlogProcessor;
|
|
||||||
import us.codecraft.spider.samples.HuxiuProcessor;
|
import us.codecraft.spider.samples.HuxiuProcessor;
|
||||||
|
import us.codecraft.spider.samples.MeicanProcessor;
|
||||||
import us.codecraft.spider.schedular.FileCacheQueueSchedular;
|
import us.codecraft.spider.schedular.FileCacheQueueSchedular;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -25,8 +24,7 @@ public class SpiderTest {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testGlobalSpider(){
|
public void testGlobalSpider(){
|
||||||
SimplePageProcessor pageProcessor = new SimplePageProcessor("http://blog.163.com/", "http://blog.163.com/*/blog/static/*");
|
PageProcessor pageProcessor = new MeicanProcessor();
|
||||||
pageProcessor.getSite().setEncoding("gbk");
|
|
||||||
Spider.me().pipeline(new FilePipeline()).schedular(new FileCacheQueueSchedular(pageProcessor.getSite(),"/data/temp/spider/cache/")).
|
Spider.me().pipeline(new FilePipeline()).schedular(new FileCacheQueueSchedular(pageProcessor.getSite(),"/data/temp/spider/cache/")).
|
||||||
processor(pageProcessor).run();
|
processor(pageProcessor).run();
|
||||||
// SimplePageProcessor pageProcessor2 = new SimplePageProcessor("http://lol.duowan.com/", "http://lol.duowan.com/*.html");
|
// SimplePageProcessor pageProcessor2 = new SimplePageProcessor("http://lol.duowan.com/", "http://lol.duowan.com/*.html");
|
||||||
|
|
|
@ -0,0 +1,27 @@
|
||||||
|
package us.codecraft.spider.samples;
|
||||||
|
|
||||||
|
import us.codecraft.spider.Page;
|
||||||
|
import us.codecraft.spider.Site;
|
||||||
|
import us.codecraft.spider.processor.PageProcessor;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* User: cairne
|
||||||
|
* Date: 13-5-20
|
||||||
|
* Time: 下午5:31
|
||||||
|
*/
|
||||||
|
public class KaichibaProcessor implements PageProcessor {
|
||||||
|
@Override
|
||||||
|
public void process(Page page) {
|
||||||
|
//http://progressdaily.diandian.com/post/2013-01-24/40046867275
|
||||||
|
int i = Integer.valueOf(page.getUrl().r("shop/(\\d+)").toString()) + 1;
|
||||||
|
page.addTargetRequests("http://kaichiba.com/shop/"+i);
|
||||||
|
page.putField("title",page.getHtml().x("//Title"));
|
||||||
|
page.putField("items", page.getHtml().xs("//li[@class=\"foodTitle\"]").rp("^\\s+", "").rp("\\s+$", "").rp("<span>.*?</span>", ""));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Site getSite() {
|
||||||
|
return Site.me().setDomain("kaichiba.com").setStartUrl("http://kaichiba.com/shop/41725781").setEncoding("utf-8").
|
||||||
|
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,33 @@
|
||||||
|
package us.codecraft.spider.samples;
|
||||||
|
|
||||||
|
import us.codecraft.spider.Page;
|
||||||
|
import us.codecraft.spider.Site;
|
||||||
|
import us.codecraft.spider.processor.PageProcessor;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* User: cairne
|
||||||
|
* Date: 13-5-20
|
||||||
|
* Time: 下午5:31
|
||||||
|
*/
|
||||||
|
public class MeicanProcessor implements PageProcessor {
|
||||||
|
@Override
|
||||||
|
public void process(Page page) {
|
||||||
|
//http://progressdaily.diandian.com/post/2013-01-24/40046867275
|
||||||
|
List<String> requests = page.getHtml().xs("//a[@class=\"area_link flat_btn\"]/@href").toStrings();
|
||||||
|
if (requests.size() > 2) {
|
||||||
|
requests = requests.subList(0, 2);
|
||||||
|
}
|
||||||
|
page.addTargetRequests(requests);
|
||||||
|
page.addTargetRequests(page.getHtml().as().rs("(.*/restaurant/[^#]+)").toStrings());
|
||||||
|
page.putField("items", page.getHtml().xs("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"name\"]"));
|
||||||
|
page.putField("prices", page.getHtml().xs("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"price_outer\"]/span[@class=\"price\"]"));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Site getSite() {
|
||||||
|
return Site.me().setDomain("meican.com").setStartUrl("http://www.meican.com/shanghai/districts").setEncoding("utf-8").
|
||||||
|
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue