diff --git a/src/test/java/us/codecraft/spider/SpiderTest.java b/src/test/java/us/codecraft/spider/SpiderTest.java
index cbc84a2..83e8e8f 100644
--- a/src/test/java/us/codecraft/spider/SpiderTest.java
+++ b/src/test/java/us/codecraft/spider/SpiderTest.java
@@ -2,11 +2,10 @@ package us.codecraft.spider;
import org.junit.Ignore;
import org.junit.Test;
-import us.codecraft.spider.pipeline.ConsolePipeline;
import us.codecraft.spider.pipeline.FilePipeline;
-import us.codecraft.spider.processor.SimplePageProcessor;
-import us.codecraft.spider.samples.DianpingBlogProcessor;
+import us.codecraft.spider.processor.PageProcessor;
import us.codecraft.spider.samples.HuxiuProcessor;
+import us.codecraft.spider.samples.MeicanProcessor;
import us.codecraft.spider.schedular.FileCacheQueueSchedular;
/**
@@ -25,8 +24,7 @@ public class SpiderTest {
@Test
public void testGlobalSpider(){
- SimplePageProcessor pageProcessor = new SimplePageProcessor("http://blog.163.com/", "http://blog.163.com/*/blog/static/*");
- pageProcessor.getSite().setEncoding("gbk");
+ PageProcessor pageProcessor = new MeicanProcessor();
Spider.me().pipeline(new FilePipeline()).schedular(new FileCacheQueueSchedular(pageProcessor.getSite(),"/data/temp/spider/cache/")).
processor(pageProcessor).run();
// SimplePageProcessor pageProcessor2 = new SimplePageProcessor("http://lol.duowan.com/", "http://lol.duowan.com/*.html");
diff --git a/src/test/java/us/codecraft/spider/samples/KaichibaProcessor.java b/src/test/java/us/codecraft/spider/samples/KaichibaProcessor.java
new file mode 100644
index 0000000..5985803
--- /dev/null
+++ b/src/test/java/us/codecraft/spider/samples/KaichibaProcessor.java
@@ -0,0 +1,51 @@
+package us.codecraft.spider.samples;
+
+import us.codecraft.spider.Page;
+import us.codecraft.spider.Site;
+import us.codecraft.spider.processor.PageProcessor;
+
+/**
+ * Sample {@link PageProcessor} for kaichiba.com shop pages.
+ *
+ * <p>When the current URL carries a numeric shop id, a target request is added
+ * for the shop whose id is one higher. The page title and the {@code foodTitle}
+ * list items are then stored as result fields.
+ *
+ * User: cairne
+ * Date: 13-5-20
+ * Time: 5:31 PM
+ */
+public class KaichibaProcessor implements PageProcessor {
+    /**
+     * Requests the next shop page and extracts the {@code title} and {@code items} fields.
+     *
+     * @param page the page being processed
+     */
+    @Override
+    public void process(Page page) {
+        // NOTE(review): assumes r(...).toString() yields the first capture group,
+        // as the previous unguarded parse relied on - confirm against the selector API.
+        String shopId = page.getUrl().r("shop/(\\d+)").toString();
+        // Only follow the id sequence when the URL really carried a shop id; the
+        // 18-digit cap keeps both parseLong and the increment within long range.
+        if (shopId != null && shopId.matches("\\d{1,18}")) {
+            long nextShopId = Long.parseLong(shopId) + 1;
+            page.addTargetRequests("http://kaichiba.com/shop/" + nextShopId);
+        }
+        page.putField("title", page.getHtml().x("//Title"));
+        // NOTE(review): the final rp(".*?", "") looks like a no-op if rp is a regex
+        // replace (a lazy ".*?" can only match empty strings) - confirm the intent.
+        page.putField("items", page.getHtml().xs("//li[@class=\"foodTitle\"]").rp("^\\s+", "").rp("\\s+$", "").rp(".*?", ""));
+    }
+
+    /**
+     * Returns the crawl configuration for this sample.
+     *
+     * @return a site set up with the kaichiba.com domain, its start URL, utf-8 encoding and a fixed user agent
+     */
+    @Override
+    public Site getSite() {
+        return Site.me().setDomain("kaichiba.com").setStartUrl("http://kaichiba.com/shop/41725781").setEncoding("utf-8").
+                setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
+    }
+}
diff --git a/src/test/java/us/codecraft/spider/samples/MeicanProcessor.java b/src/test/java/us/codecraft/spider/samples/MeicanProcessor.java
new file mode 100644
index 0000000..3d15cd2
--- /dev/null
+++ b/src/test/java/us/codecraft/spider/samples/MeicanProcessor.java
@@ -0,0 +1,52 @@
+package us.codecraft.spider.samples;
+
+import us.codecraft.spider.Page;
+import us.codecraft.spider.Site;
+import us.codecraft.spider.processor.PageProcessor;
+
+import java.util.List;
+
+/**
+ * Sample {@link PageProcessor} for meican.com restaurant listings.
+ *
+ * <p>Follows a limited number of area links plus every link matching the
+ * restaurant pattern, and stores dish names and prices as result fields.
+ *
+ * User: cairne
+ * Date: 13-5-20
+ * Time: 5:31 PM
+ */
+public class MeicanProcessor implements PageProcessor {
+    /** Upper bound on the area links followed from a single page. */
+    private static final int MAX_AREA_LINKS = 2;
+
+    /**
+     * Adds follow-up requests and extracts the {@code items} and {@code prices} fields.
+     *
+     * @param page the page being processed
+     */
+    @Override
+    public void process(Page page) {
+        // Parameterized instead of a raw List so the element type is compiler-checked.
+        List<String> areaLinks = page.getHtml().xs("//a[@class=\"area_link flat_btn\"]/@href").toStrings();
+        if (areaLinks.size() > MAX_AREA_LINKS) {
+            areaLinks = areaLinks.subList(0, MAX_AREA_LINKS);
+        }
+        page.addTargetRequests(areaLinks);
+        // The [^#]+ tail stops each captured restaurant link before any '#' fragment.
+        page.addTargetRequests(page.getHtml().as().rs("(.*/restaurant/[^#]+)").toStrings());
+        page.putField("items", page.getHtml().xs("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"name\"]"));
+        page.putField("prices", page.getHtml().xs("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"price_outer\"]/span[@class=\"price\"]"));
+    }
+
+    /**
+     * Returns the crawl configuration for this sample.
+     *
+     * @return a site set up with the meican.com domain, its start URL, utf-8 encoding and a fixed user agent
+     */
+    @Override
+    public Site getSite() {
+        return Site.me().setDomain("meican.com").setStartUrl("http://www.meican.com/shanghai/districts").setEncoding("utf-8").
+                setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
+    }
+}