From 85b7cf1563337ae07e448d3de0f5c5939fa676b6 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Mon, 2 Sep 2013 13:52:41 +0800 Subject: [PATCH] complete test --- .../main/java/us/codecraft/webmagic/selector/Html.java | 2 +- .../us/codecraft/webmagic/samples/HuxiuProcessor.java | 5 ++--- .../webmagic/samples/InfoQMiniBookProcessor.java | 4 ---- .../codecraft/webmagic/samples/IteyeBlogProcessor.java | 3 +-- .../us/codecraft/webmagic/samples/KaichibaProcessor.java | 5 +++++ .../us/codecraft/webmagic/samples/MeicanProcessor.java | 9 +++++++-- .../webmagic/samples/OschinaBlogPageProcesser.java | 9 ++++----- 7 files changed, 20 insertions(+), 17 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java index 493c762..a4ea0d3 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java @@ -53,7 +53,7 @@ public class Html extends PlainText { @Override public Selectable links() { - XpathSelector xpathSelector = Selectors.xpath("//a/@href"); + XsoupSelector xpathSelector = new XsoupSelector("//a/@href"); return selectList(xpathSelector, strings); } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java index 4ac9310..136eeb8 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java @@ -15,10 +15,9 @@ import java.util.List; public class HuxiuProcessor implements PageProcessor { @Override public void process(Page page) { - //http://progressdaily.diandian.com/post/2013-01-24/40046867275 - List requests = page.getHtml().regex("\"']*href=[\"']{1}([/]{0,1}article[^<>#\"']*?)[\"']{1}").all(); + List requests = page.getHtml().links().regex(".*article.*").all(); page.addTargetRequests(requests); - page.putField("title",page.getHtml().xpath("//div[@class='neirong']//h1[@class='ph xs5']")); + page.putField("title",page.getHtml().xpath("//div[@class='clearfix neirong']//h1/text()")); page.putField("content",page.getHtml().smartContent()); } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java index b43c3c5..38de3bc 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java @@ -4,9 +4,7 @@ import org.apache.commons.collections.CollectionUtils; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; -import us.codecraft.webmagic.pipeline.FilePipeline; import us.codecraft.webmagic.processor.PageProcessor; -import us.codecraft.webmagic.scheduler.RedisScheduler; import java.util.List; @@ -41,8 +39,6 @@ public class InfoQMiniBookProcessor implements PageProcessor { public static void main(String[] args) { Spider.create(new InfoQMiniBookProcessor()) - .scheduler(new RedisScheduler("localhost")) - .pipeline(new FilePipeline("/data/temp/webmagic/")) .thread(5) .run(); } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java index c0b3f73..f80f895 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java @@ -3,7 +3,6 @@ package us.codecraft.webmagic.samples; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; -import us.codecraft.webmagic.pipeline.FilePipeline; import us.codecraft.webmagic.processor.PageProcessor; /** @@ -32,6 +31,6 @@ public class IteyeBlogProcessor implements PageProcessor { } public static void main(String[] args) { - Spider.create(new IteyeBlogProcessor()).thread(5).pipeline(new FilePipeline("/data/webmagic/")).run(); + Spider.create(new IteyeBlogProcessor()).thread(5).run(); } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java index aff18a6..0ab6c64 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java @@ -2,6 +2,7 @@ package us.codecraft.webmagic.samples; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; /** @@ -24,4 +25,8 @@ public class KaichibaProcessor implements PageProcessor { return Site.me().setDomain("kaichiba.com").addStartUrl("http://kaichiba.com/shop/41725781").setCharset("utf-8"). setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); } + + public static void main(String[] args) { + Spider.create(new KaichibaProcessor()).run(); + } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java index a4e6e43..bfa347d 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java @@ -2,6 +2,7 @@ package us.codecraft.webmagic.samples; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; import java.util.List; @@ -21,8 +22,8 @@ public class MeicanProcessor implements PageProcessor { } page.addTargetRequests(requests); page.addTargetRequests(page.getHtml().links().regex("(.*/restaurant/[^#]+)").all()); - page.putField("items", page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"name\"]")); - page.putField("prices", page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"price_outer\"]/span[@class=\"price\"]")); + page.putField("items", page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"name\"]/text()")); + page.putField("prices", page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"price_outer\"]/span[@class=\"price\"]/text()")); } @Override @@ -30,4 +31,8 @@ public class MeicanProcessor implements PageProcessor { return Site.me().setDomain("meican.com").addStartUrl("http://www.meican.com/shanghai/districts").setCharset("utf-8"). setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); } + + public static void main(String[] args) { + Spider.create(new MeicanProcessor()).run(); + } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java index 8ba7063..e447003 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java @@ -1,9 +1,8 @@ package us.codecraft.webmagic.samples; -import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; -import us.codecraft.webmagic.pipeline.ConsolePipeline; import us.codecraft.webmagic.processor.PageProcessor; import java.util.List; @@ -21,8 +20,8 @@ public class OschinaBlogPageProcesser implements PageProcessor { public void process(Page page) { List links = page.getHtml().links().regex("http://my\\.oschina\\.net/flashsword/blog/\\d+").all(); page.addTargetRequests(links); - page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1").toString()); - page.putField("content", page.getHtml().$("div.content").toString()); + page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1/text()").toString()); + page.putField("content", page.getHtml().xpath("//div[@class='BlogContent']/tidyText()").toString()); page.putField("tags",page.getHtml().xpath("//div[@class='BlogTags']/a/text()").all()); } @@ -33,6 +32,6 @@ public class OschinaBlogPageProcesser implements PageProcessor { } public static void main(String[] args) { - Spider.create(new OschinaBlogPageProcesser()).pipeline(new ConsolePipeline()).run(); + Spider.create(new OschinaBlogPageProcesser()).run(); } }