diff --git a/src/test/java/us/codecraft/spider/SpiderTest.java b/src/test/java/us/codecraft/spider/SpiderTest.java
index cbc84a2..83e8e8f 100644
--- a/src/test/java/us/codecraft/spider/SpiderTest.java
+++ b/src/test/java/us/codecraft/spider/SpiderTest.java
@@ -2,11 +2,10 @@ package us.codecraft.spider;
 
 import org.junit.Ignore;
 import org.junit.Test;
-import us.codecraft.spider.pipeline.ConsolePipeline;
 import us.codecraft.spider.pipeline.FilePipeline;
-import us.codecraft.spider.processor.SimplePageProcessor;
-import us.codecraft.spider.samples.DianpingBlogProcessor;
+import us.codecraft.spider.processor.PageProcessor;
 import us.codecraft.spider.samples.HuxiuProcessor;
+import us.codecraft.spider.samples.MeicanProcessor;
 import us.codecraft.spider.schedular.FileCacheQueueSchedular;
 
 /**
@@ -25,8 +24,7 @@ public class SpiderTest {
 
     @Test
     public void testGlobalSpider(){
-        SimplePageProcessor pageProcessor = new SimplePageProcessor("http://blog.163.com/", "http://blog.163.com/*/blog/static/*");
-        pageProcessor.getSite().setEncoding("gbk");
+        PageProcessor pageProcessor = new MeicanProcessor();
         Spider.me().pipeline(new FilePipeline()).schedular(new FileCacheQueueSchedular(pageProcessor.getSite(),"/data/temp/spider/cache/")).
                 processor(pageProcessor).run();
 //        SimplePageProcessor pageProcessor2 = new SimplePageProcessor("http://lol.duowan.com/", "http://lol.duowan.com/*.html");
diff --git a/src/test/java/us/codecraft/spider/samples/KaichibaProcessor.java b/src/test/java/us/codecraft/spider/samples/KaichibaProcessor.java
new file mode 100644
index 0000000..5985803
--- /dev/null
+++ b/src/test/java/us/codecraft/spider/samples/KaichibaProcessor.java
@@ -0,0 +1,49 @@
+package us.codecraft.spider.samples;
+
+import us.codecraft.spider.Page;
+import us.codecraft.spider.Site;
+import us.codecraft.spider.processor.PageProcessor;
+
+/**
+ * Sample processor for kaichiba.com shop pages: stores the page title and the
+ * "foodTitle" list entries, then queues the shop page with the next numeric id.
+ *
+ * User: cairne
+ * Date: 13-5-20
+ * Time: 5:31 PM
+ */
+public class KaichibaProcessor implements PageProcessor {
+    /**
+     * Extracts the "title" and "items" fields from a shop page and queues the
+     * shop whose id is one higher than the id found in the current URL.
+     *
+     * @param page the downloaded shop page
+     */
+    @Override
+    public void process(Page page) {
+        // NOTE(review): URL kept from the original author; its relation to this site is unclear.
+        //http://progressdaily.diandian.com/post/2013-01-24/40046867275
+        String shopId = page.getUrl().r("shop/(\\d+)").toString();
+        // Skip the follow-up request instead of failing the whole page when the URL
+        // carries no shop id (assumes a failed match yields null or "" - TODO confirm).
+        if (shopId != null && !shopId.isEmpty()) {
+            int nextShopId = Integer.parseInt(shopId) + 1;
+            page.addTargetRequests("http://kaichiba.com/shop/" + nextShopId);
+        }
+        page.putField("title", page.getHtml().x("//Title"));
+        // Trim surrounding whitespace, then drop markup tags. The former last pattern
+        // ".*?" can only match the empty string as a regex, so it replaced nothing.
+        page.putField("items", page.getHtml().xs("//li[@class=\"foodTitle\"]").rp("^\\s+", "").rp("\\s+$", "").rp("<.*?>", ""));
+    }
+
+    /**
+     * Describes the crawl target: domain, start URL, encoding and user agent.
+     *
+     * @return the site settings for kaichiba.com
+     */
+    @Override
+    public Site getSite() {
+        return Site.me().setDomain("kaichiba.com").setStartUrl("http://kaichiba.com/shop/41725781").setEncoding("utf-8").
+                setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
+    }
+}
diff --git a/src/test/java/us/codecraft/spider/samples/MeicanProcessor.java b/src/test/java/us/codecraft/spider/samples/MeicanProcessor.java
new file mode 100644
index 0000000..3d15cd2
--- /dev/null
+++ b/src/test/java/us/codecraft/spider/samples/MeicanProcessor.java
@@ -0,0 +1,50 @@
+package us.codecraft.spider.samples;
+
+import us.codecraft.spider.Page;
+import us.codecraft.spider.Site;
+import us.codecraft.spider.processor.PageProcessor;
+
+import java.util.List;
+
+/**
+ * Sample processor for meican.com: queues area and restaurant links and
+ * stores the dish names and prices selected from each page.
+ *
+ * User: cairne
+ * Date: 13-5-20
+ * Time: 5:31 PM
+ */
+public class MeicanProcessor implements PageProcessor {
+    /**
+     * Queues follow-up links and extracts the "items" and "prices" fields.
+     *
+     * @param page the downloaded page
+     */
+    @Override
+    public void process(Page page) {
+        // NOTE(review): URL kept from the original author; its relation to this site is unclear.
+        //http://progressdaily.diandian.com/post/2013-01-24/40046867275
+        List<String> areaLinks = page.getHtml().xs("//a[@class=\"area_link flat_btn\"]/@href").toStrings();
+        // Follow at most the first two area links.
+        if (areaLinks.size() > 2) {
+            areaLinks = areaLinks.subList(0, 2);
+        }
+        page.addTargetRequests(areaLinks);
+        // Queue links matching the restaurant pattern, cut before any '#'
+        // (presumably as() selects the page's links - TODO confirm).
+        page.addTargetRequests(page.getHtml().as().rs("(.*/restaurant/[^#]+)").toStrings());
+        page.putField("items", page.getHtml().xs("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"name\"]"));
+        page.putField("prices", page.getHtml().xs("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"price_outer\"]/span[@class=\"price\"]"));
+    }
+
+    /**
+     * Describes the crawl target: domain, start URL, encoding and user agent.
+     *
+     * @return the site settings for meican.com
+     */
+    @Override
+    public Site getSite() {
+        return Site.me().setDomain("meican.com").setStartUrl("http://www.meican.com/shanghai/districts").setEncoding("utf-8").
+                setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
+    }
+}