Rename the sample page processor SinaBlogProcesser to SinaBlogProcessor.

Summary of the change carried by this patch:
  * SinaBlogProcesser.java is deleted and SinaBlogProcessor.java is added.
  * The new class declares URL_LIST / URL_POST regex constants; process()
    queues post and list links when the page URL matches URL_LIST and
    extracts title/content/id/date fields otherwise.
  * The Site is built in the field initializer instead of lazily inside
    getSite(), and the start URL is passed via Spider.addUrl(...) in main()
    instead of Site.addStartUrl(...).
  * SinablogProcessorTest is updated to use the new class name.

NOTE(review): this patch was recovered from a copy whose line breaks and
indentation had been collapsed. Line boundaries follow the hunk headers
(37 deleted, 47 added, 7/7 per test hunk); leading indentation was
reconstructed assuming 4-space Java style -- confirm against the tree
(or apply with whitespace-tolerant options) before relying on it.

diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java
deleted file mode 100644
index dcb6eff..0000000
--- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java
+++ /dev/null
@@ -1,37 +0,0 @@
-package us.codecraft.webmagic.samples;
-
-import us.codecraft.webmagic.Page;
-import us.codecraft.webmagic.Site;
-import us.codecraft.webmagic.Spider;
-import us.codecraft.webmagic.processor.PageProcessor;
-
-/**
- * @author code4crafter@gmail.com
- */
-public class SinaBlogProcesser implements PageProcessor {
-
-    private Site site;
-
-    @Override
-    public void process(Page page) {
-        page.addTargetRequests(page.getHtml().xpath("//div[@class='articalfrontback SG_j_linedot1 clearfix']").links().all());
-        page.putField("title", page.getHtml().xpath("//div[@class='articalTitle']/h2"));
-        page.putField("content",page.getHtml().xpath("//div[@id='articlebody']//div[@class='articalContent']"));
-        page.putField("id",page.getUrl().regex("http://blog\\.sina\\.com\\.cn/s/blog_(\\w+)"));
-        page.putField("date",page.getHtml().xpath("//div[@id='articlebody']//span[@class='time SG_txtc']").regex("\\((.*)\\)"));
-//        page.putField("tags",page.getHtml().xpath("//td[@class='blog_tag']/h3/a"));
-    }
-
-    @Override
-    public Site getSite() {
-        if (site==null){
-            site = Site.me().setDomain("blog.sina.com.cn").addStartUrl("http://blog.sina.com.cn/s/blog_4701280b0102egl0.html").setSleepTime(3000).
-                    setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
-        }
-        return site;
-    }
-
-    public static void main(String[] args) {
-        Spider.create(new SinaBlogProcesser()).run();
-    }
-}
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcessor.java
new file mode 100644
index 0000000..01094aa
--- /dev/null
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcessor.java
@@ -0,0 +1,47 @@
+package us.codecraft.webmagic.samples;
+
+import us.codecraft.webmagic.Page;
+import us.codecraft.webmagic.Site;
+import us.codecraft.webmagic.Spider;
+import us.codecraft.webmagic.processor.PageProcessor;
+
+/**
+ * @author code4crafter@gmail.com
+ */
+public class SinaBlogProcessor implements PageProcessor {
+
+    public static final String URL_LIST = "http://blog\\.sina\\.com\\.cn/s/articlelist_1487828712_0_\\d+\\.html";
+
+    public static final String URL_POST = "http://blog\\.sina\\.com\\.cn/s/blog_\\w+\\.html";
+
+    private Site site = Site
+            .me()
+            .setDomain("blog.sina.com.cn")
+            .setSleepTime(3000)
+            .setUserAgent(
+                    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
+
+    @Override
+    public void process(Page page) {
+        if (page.getUrl().regex(URL_LIST).match()) {
+            page.addTargetRequests(page.getHtml().xpath("//div[@class=\"articleList\"]").links().regex(URL_POST).all());
+            page.addTargetRequests(page.getHtml().links().regex(URL_LIST).all());
+        } else {
+            page.putField("title", page.getHtml().xpath("//div[@class='articalTitle']/h2"));
+            page.putField("content", page.getHtml().xpath("//div[@id='articlebody']//div[@class='articalContent']"));
+            page.putField("id", page.getUrl().regex("http://blog\\.sina\\.com\\.cn/s/blog_(\\w+)"));
+            page.putField("date",
+                    page.getHtml().xpath("//div[@id='articlebody']//span[@class='time SG_txtc']").regex("\\((.*)\\)"));
+        }
+    }
+
+    @Override
+    public Site getSite() {
+        return site;
+    }
+
+    public static void main(String[] args) {
+        Spider.create(new SinaBlogProcessor()).addUrl("http://blog.sina.com.cn/s/articlelist_1487828712_0_1.html")
+                .run();
+    }
+}
diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/SinablogProcessorTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/SinablogProcessorTest.java
index 026f8d5..d7cd5d5 100644
--- a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/SinablogProcessorTest.java
+++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/SinablogProcessorTest.java
@@ -5,7 +5,7 @@ import org.junit.Test;
 import us.codecraft.webmagic.Spider;
 import us.codecraft.webmagic.pipeline.FilePipeline;
 import us.codecraft.webmagic.pipeline.JsonFilePipeline;
-import us.codecraft.webmagic.samples.SinaBlogProcesser;
+import us.codecraft.webmagic.samples.SinaBlogProcessor;
 import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler;
 
 import java.io.IOException;
@@ -20,7 +20,7 @@ public class SinablogProcessorTest {
     @Ignore
     @Test
     public void test() throws IOException {
-        SinaBlogProcesser sinaBlogProcesser = new SinaBlogProcesser();
+        SinaBlogProcessor sinaBlogProcessor = new SinaBlogProcessor();
         //pipeline是抓取结束后的处理
         //默认放到/data/webmagic/ftl/[domain]目录下
         JsonFilePipeline pipeline = new JsonFilePipeline("/data/webmagic/");
@@ -29,7 +29,7 @@ public class SinablogProcessorTest {
         //ConsolePipeline输出结果到控制台
         //FileCacheQueueSchedular保存url,支持断点续传,临时文件输出到/data/temp/webmagic/cache目录
         //Spider.run()执行
-        Spider.create(sinaBlogProcesser).pipeline(new FilePipeline()).pipeline(pipeline).scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).
+        Spider.create(sinaBlogProcessor).pipeline(new FilePipeline()).pipeline(pipeline).scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).
                 run();
     }
 }