diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java b/webmagic-core/src/test/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java deleted file mode 100644 index efd1ff7..0000000 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java +++ /dev/null @@ -1,29 +0,0 @@ -package us.codecraft.webmagic.samples; - -import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.Page; -import us.codecraft.webmagic.processor.PageProcessor; - -import java.util.List; - -/** - * User: cairne - * Date: 13-4-21 - * Time: 下午8:08 - */ -public class DiandianBlogProcessor implements PageProcessor { - @Override - public void process(Page page) { - //http://progressdaily.diandian.com/post/2013-01-24/40046867275 - List requests = page.getHtml().rs("]*href=[\"']{1}(http://17dujingdian\\.com/post/[^#]*?)[\"']{1}").toStrings(); - page.addTargetRequests(requests); - page.putField("title",page.getHtml().x("//div[@id='content']//h2/a")); - page.putField("content",page.getHtml().sc()); - } - - @Override - public Site getSite() { - return Site.me().setDomain("www.diandian.com").setStartUrl("http://17dujingdian.com/"). - setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); - } -} diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/samples/DianpingBlogProcessor.java b/webmagic-core/src/test/java/us/codecraft/webmagic/samples/DianpingBlogProcessor.java deleted file mode 100644 index dd601ad..0000000 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/samples/DianpingBlogProcessor.java +++ /dev/null @@ -1,33 +0,0 @@ -package us.codecraft.webmagic.samples; - -import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.Page; -import us.codecraft.webmagic.processor.PageProcessor; - -import java.util.List; - -/** - * User: cairne - * Date: 13-4-21 - * Time: 下午8:08 - */ -public class DianpingBlogProcessor implements PageProcessor { - @Override - public void process(Page page) { - //http://progressdaily.diandian.com/post/2013-01-24/40046867275 - List requests = page.getHtml().rs("]*href=[\"']{1}(/shop/.*?)[\"']{1}").toStrings(); - page.addTargetRequests(requests); - requests = page.getHtml().rs("]*href=[\"']{1}(/search/category/.*?)[\"']{1}").toStrings(); - page.addTargetRequests(requests); - if (page.getUrl().toString().contains("shop")){ - page.putField("title", page.getHtml().x("//h1[@class='shop-title']")); - page.putField("content", page.getHtml().sc()); - } - } - - @Override - public Site getSite() { - return Site.me().setDomain("www.dianping.com").setStartUrl("http://www.dianping.com/"). - setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); - } -} diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java b/webmagic-core/src/test/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java deleted file mode 100644 index 05b68b6..0000000 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java +++ /dev/null @@ -1,33 +0,0 @@ -package us.codecraft.webmagic.samples; - -import us.codecraft.webmagic.Page; -import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.processor.PageProcessor; - -import java.util.List; - -/** - * User: cairne - * Date: 13-4-21 - * Time: 下午8:08 - */ -public class DiaoyuwengProcessor implements PageProcessor { - @Override - public void process(Page page) { - //http://progressdaily.diandian.com/post/2013-01-24/40046867275 - List requests = page.getHtml().rs("]*href=[\"']{1}(/shop/.*?)[\"']{1}").toStrings(); - page.addTargetRequests(requests); - requests = page.getHtml().rs("]*href=[\"']{1}(/search/category/.*?)[\"']{1}").toStrings(); - page.addTargetRequests(requests); - if (page.getUrl().toString().contains("shop")){ - page.putField("title", page.getHtml().x("//h1[@class='shop-title']")); - page.putField("content", page.getHtml().sc()); - } - } - - @Override - public Site getSite() { - return Site.me().setDomain("www.dianping.com").setStartUrl("http://www.dianping.com/"). - setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); - } -} diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/samples/F58PageProcesser.java b/webmagic-core/src/test/java/us/codecraft/webmagic/samples/F58PageProcesser.java deleted file mode 100644 index 78211c4..0000000 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/samples/F58PageProcesser.java +++ /dev/null @@ -1,28 +0,0 @@ -package us.codecraft.webmagic.samples; - -import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.Page; -import us.codecraft.webmagic.processor.PageProcessor; - -import java.util.List; - -/** - * User: cairne - * Date: 13-4-21 - * Time: 下午1:48 - */ -public class F58PageProcesser implements PageProcessor { - - @Override - public void process(Page page) { - List strings = page.getHtml().rs("]*href=[\"']{1}(/yewu/.*?)[\"']{1}").toStrings(); - page.addTargetRequests(strings); - page.putField("title",page.getHtml().r("(.*)")); - page.putField("body",page.getHtml().x("//dd[@class='w133']")); - } - - @Override - public Site getSite() { - return Site.me().setDomain("sh.58.com").setStartUrl("http://sh.58.com/"); //To change body of implemented methods use File | Settings | File Templates. - } -} diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/samples/HuxiuProcessor.java b/webmagic-core/src/test/java/us/codecraft/webmagic/samples/HuxiuProcessor.java deleted file mode 100644 index 82552f9..0000000 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/samples/HuxiuProcessor.java +++ /dev/null @@ -1,29 +0,0 @@ -package us.codecraft.webmagic.samples; - -import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.Page; -import us.codecraft.webmagic.processor.PageProcessor; - -import java.util.List; - -/** - * User: cairne - * Date: 13-4-21 - * Time: 下午8:08 - */ -public class HuxiuProcessor implements PageProcessor { - @Override - public void process(Page page) { - //http://progressdaily.diandian.com/post/2013-01-24/40046867275 - List requests = page.getHtml().rs("\"']*href=[\"']{1}([/]{0,1}article[^<>#\"']*?)[\"']{1}").toStrings(); - page.addTargetRequests(requests); - page.putField("title",page.getHtml().x("//div[@class='neirong']//h1[@class='ph xs5']")); - page.putField("content",page.getHtml().sc()); - } - - @Override - public Site getSite() { - return Site.me().setDomain("www.huxiu.com").setStartUrl("http://www.huxiu.com/"). - setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); - } -} diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/samples/KaichibaProcessor.java b/webmagic-core/src/test/java/us/codecraft/webmagic/samples/KaichibaProcessor.java deleted file mode 100644 index 58a2cb8..0000000 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/samples/KaichibaProcessor.java +++ /dev/null @@ -1,27 +0,0 @@ -package us.codecraft.webmagic.samples; - -import us.codecraft.webmagic.Page; -import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.processor.PageProcessor; - -/** - * User: cairne - * Date: 13-5-20 - * Time: 下午5:31 - */ -public class KaichibaProcessor implements PageProcessor { - @Override - public void process(Page page) { - //http://progressdaily.diandian.com/post/2013-01-24/40046867275 - int i = Integer.valueOf(page.getUrl().r("shop/(\\d+)").toString()) + 1; - page.addTargetRequests("http://kaichiba.com/shop/"+i); - page.putField("title",page.getHtml().x("//Title")); - page.putField("items", page.getHtml().xs("//li[@class=\"foodTitle\"]").rp("^\\s+", "").rp("\\s+$", "").rp(".*?", "")); - } - - @Override - public Site getSite() { - return Site.me().setDomain("kaichiba.com").setStartUrl("http://kaichiba.com/shop/41725781").setEncoding("utf-8"). - setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); - } -} diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/samples/MeicanProcessor.java b/webmagic-core/src/test/java/us/codecraft/webmagic/samples/MeicanProcessor.java deleted file mode 100644 index 637aec1..0000000 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/samples/MeicanProcessor.java +++ /dev/null @@ -1,33 +0,0 @@ -package us.codecraft.webmagic.samples; - -import us.codecraft.webmagic.Page; -import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.processor.PageProcessor; - -import java.util.List; - -/** - * User: cairne - * Date: 13-5-20 - * Time: 下午5:31 - */ -public class MeicanProcessor implements PageProcessor { - @Override - public void process(Page page) { - //http://progressdaily.diandian.com/post/2013-01-24/40046867275 - List requests = page.getHtml().xs("//a[@class=\"area_link flat_btn\"]/@href").toStrings(); - if (requests.size() > 2) { - requests = requests.subList(0, 2); - } - page.addTargetRequests(requests); - page.addTargetRequests(page.getHtml().as().rs("(.*/restaurant/[^#]+)").toStrings()); - page.putField("items", page.getHtml().xs("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"name\"]")); - page.putField("prices", page.getHtml().xs("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"price_outer\"]/span[@class=\"price\"]")); - } - - @Override - public Site getSite() { - return Site.me().setDomain("meican.com").setStartUrl("http://www.meican.com/shanghai/districts").setEncoding("utf-8"). - setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); - } -} diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java b/webmagic-core/src/test/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java deleted file mode 100644 index ca46de6..0000000 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java +++ /dev/null @@ -1,28 +0,0 @@ -package us.codecraft.webmagic.samples; - -import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.Page; -import us.codecraft.webmagic.processor.PageProcessor; - -import java.util.List; - -/** - * User: cairne - * Date: 13-4-21 - * Time: 下午8:08 - */ -public class NjuBBSProcessor implements PageProcessor { - @Override - public void process(Page page) { - List requests = page.getHtml().rs("]*href=(bbstcon\\?board=Pictures&file=[^>]*)").toStrings(); - page.addTargetRequests(requests); - page.putField("title",page.getHtml().x("//div[@id='content']//h2/a")); - page.putField("content",page.getHtml().sc()); - } - - @Override - public Site getSite() { - return Site.me().setDomain("bbs.nju.edu.cn").setStartUrl("http://bbs.nju.edu.cn/board?board=Pictures"). - setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); - } -} diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java b/webmagic-core/src/test/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java deleted file mode 100644 index 2166d9b..0000000 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java +++ /dev/null @@ -1,30 +0,0 @@ -package us.codecraft.webmagic.samples; - -import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.Page; -import us.codecraft.webmagic.processor.PageProcessor; - -import java.util.List; - -/** - * User: cairne - * Date: 13-4-21 - * Time: 下午1:48 - */ -public class OschinaBlogPageProcesser implements PageProcessor { - - @Override - public void process(Page page) { - List strings = page.getHtml().as().r("(http://my\\.oschina\\.net)").toStrings(); - page.addTargetRequests(strings); - page.putField("title", page.getHtml().xs("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1")); - page.putField("content", page.getHtml().sc()); - page.putField("author", page.getUrl().r("my\\.oschina\\.net/(\\w+)/blog/\\d+")); - } - - @Override - public Site getSite() { - return Site.me().setDomain("my.oschina.net").setStartUrl("http://www.oschina.net/"). - setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); - } -} diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/samples/OschinaPageProcesser.java b/webmagic-core/src/test/java/us/codecraft/webmagic/samples/OschinaPageProcesser.java deleted file mode 100644 index cdfbc1e..0000000 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/samples/OschinaPageProcesser.java +++ /dev/null @@ -1,29 +0,0 @@ -package us.codecraft.webmagic.samples; - -import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.Page; -import us.codecraft.webmagic.processor.PageProcessor; - -import java.util.List; - -/** - * User: cairne - * Date: 13-4-21 - * Time: 下午1:48 - */ -public class OschinaPageProcesser implements PageProcessor { - - @Override - public void process(Page page) { - List strings = page.getHtml().rs("]*href=[\"']{1}(http://www\\.oschina\\.net/question/[\\w]+)[\"']{1}").toStrings(); - page.addTargetRequests(strings); - page.putField("title", page.getHtml().x("//div[@class='QTitle']/h1/a")); - page.putField("content", page.getHtml().xs("//div[@class='Question']//div[@class='Content']/div[@class='detail']")); - } - - @Override - public Site getSite() { - return Site.me().setDomain("www.oschina.net").setStartUrl("http://www.oschina.net/"). - setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); - } -} diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/samples/QzoneBlogProcessor.java b/webmagic-core/src/test/java/us/codecraft/webmagic/samples/QzoneBlogProcessor.java deleted file mode 100644 index 67ef671..0000000 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/samples/QzoneBlogProcessor.java +++ /dev/null @@ -1,32 +0,0 @@ -package us.codecraft.webmagic.samples; - -import us.codecraft.webmagic.Page; -import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.processor.PageProcessor; - -import java.util.List; - -/** - * User: cairne - * Date: 13-4-21 - * Time: 下午8:08 - */ -public class QzoneBlogProcessor implements PageProcessor { - @Override - public void process(Page page) { - //http://progressdaily.diandian.com/post/2013-01-24/40046867275 - - //http://b1.cnc.qzone.qq.com/cgi-bin/blognew/get_abs?hostUin=233017404&uin=233017404&blogType=0&statYear=2013&source=0&statYear=2013&g_tk=291639571&g_tk=291639571&reqInfo=7&pos=0&num=15&source=0&rand=0.46480297949165106 - // &cateName=&cateHex=&statYear=2013&reqInfo=7&pos=0&num=15&sortType=0&source=0&rand=0.46480297949165106&g_tk=291639571&verbose=1&ref=qzone - List requests = page.getHtml().rs("]*href=[\"']{1}(http://17dujingdian\\.com/post/[^#]*?)[\"']{1}").toStrings(); - page.addTargetRequests(requests); - page.putField("title",page.getHtml().x("//div[@id='content']//h2/a")); - page.putField("content",page.getHtml().sc()); - } - - @Override - public Site getSite() { - return Site.me().setDomain("www.diandian.com").setStartUrl("http://17dujingdian.com/"). - setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); - } -} diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java b/webmagic-core/src/test/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java deleted file mode 100644 index b86fff8..0000000 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java +++ /dev/null @@ -1,29 +0,0 @@ -package us.codecraft.webmagic.samples; - -import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.Page; -import us.codecraft.webmagic.processor.PageProcessor; - -/** - * User: cairne - * Date: 13-4-21 - * Time: 下午1:48 - */ -public class SinaBlogProcesser implements PageProcessor { - - @Override - public void process(Page page) { - page.addTargetRequests(page.getHtml().rs("]*href=[\"']{1}(http://blog\\.sina\\.com\\.cn/s/blog_.*?)[\"']{1}").toStrings()); - page.putField("title", page.getHtml().x("//div[@class='articalTitle']/h2")); - page.putField("body",page.getHtml().sc()); - //x("//dd[@class='w133']") - page.putField("date",page.getHtml().x("//div[@id='articlebody']//span[@class='time SG_txtc']").r("\\((.*)\\)")); - page.putField("tags",page.getHtml().xs("//td[@class='blog_tag']/h3/a")); - } - - @Override - public Site getSite() { - return Site.me().setDomain("blog.sina.com.cn").setStartUrl("http://blog.sina.com.cn/"). - setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); - } -} diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/samples/TianyaPageProcesser.java b/webmagic-core/src/test/java/us/codecraft/webmagic/samples/TianyaPageProcesser.java deleted file mode 100644 index 7a8920b..0000000 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/samples/TianyaPageProcesser.java +++ /dev/null @@ -1,28 +0,0 @@ -package us.codecraft.webmagic.samples; - -import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.Page; -import us.codecraft.webmagic.processor.PageProcessor; - -import java.util.List; - -/** - * User: cairne - * Date: 13-4-21 - * Time: 下午1:48 - */ -public class TianyaPageProcesser implements PageProcessor { - - @Override - public void process(Page page) { - List strings = page.getHtml().rs("]*href=[\"']{1}(/post-free.*?\\.shtml)[\"']{1}").toStrings(); - page.addTargetRequests(strings); - page.putField("title", page.getHtml().x("//div[@id='post_head']//span[@class='s_title']//b")); - page.putField("body",page.getHtml().sc()); - } - - @Override - public Site getSite() { - return Site.me().setDomain("http://bbs.tianya.cn/").setStartUrl("http://bbs.tianya.cn/"); //To change body of implemented methods use File | Settings | File Templates. - } -} diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java index 5cb9848..838c76b 100644 --- a/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java +++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java @@ -27,8 +27,8 @@ public class SpiderTest { // Spider.me().pipeline(new FilePipeline()).schedular(new FileCacheQueueSchedular(pageProcessor.getSite(),"/data/temp/webmagic/cache/")). // processor(pageProcessor).run(); SimplePageProcessor pageProcessor2 = new SimplePageProcessor("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space", "http://www.diaoyuweng.com/thread-*-1-1.html"); - pageProcessor2.getSite().setEncoding("GBK"); System.out.println(pageProcessor2.getSite().getEncoding()); + pageProcessor2.getSite().setSleepTime(500); Spider.me().pipeline(new FilePipeline()).schedular(new FileCacheQueueSchedular(pageProcessor2.getSite(),"/data/temp/webmagic/cache/")). processor(pageProcessor2).run();