From 3d4ad02b2943e6102f2ba6ccf3f61ef10a686e29 Mon Sep 17 00:00:00 2001
From: "yihua.huang"
Date: Mon, 10 Jun 2013 12:15:29 +0800
Subject: [PATCH] fix freemarker dir error

FreemarkerPipeline: make sure the per-domain output directory exists
before the rendered page is written into it. mkdirs() is used rather
than mkdir() so that missing parent directories are created as well;
mkdir() only creates the last path component and returns false
otherwise.

SinaBlogProcesser: switch link discovery to as().rs(...), extract the
"content" and "id" fields instead of "body", comment out "tags", start
from http://blog.sina.com.cn/flashsword20 with setSleepTime(3000), and
build the Site once and reuse it on later getSite() calls.

SinablogProcessorTest: new test that runs the spider with a FilePipeline
and a FreemarkerPipeline("wordpress.ftl").

---
Reviewer notes (not part of the commit message):
- NOTE(review): leading whitespace of context/removed lines was
  reconstructed; if a hunk is rejected, retry with
  `git apply --ignore-whitespace` and confirm against the pre-image.
- NOTE(review): the removed regex line in SinaBlogProcesser presumably
  began with "<a[^<>]*href=" -- verify against blob b86fff8.
- The post-image blob ids on the `index` lines predate the mkdirs() and
  comment-translation edits below.

 .../webmagic/pipeline/FreemarkerPipeline.java |  4 +++
 .../webmagic/samples/SinaBlogProcesser.java   | 17 ++++++----
 .../processor/SinablogProcessorTest.java      | 34 +++++++++++++++++++
 3 files changed, 49 insertions(+), 6 deletions(-)
 create mode 100644 webmagic-samples/src/test/java/us/codecraft/webmagic/processor/SinablogProcessorTest.java

diff --git a/webmagic-plugin/src/main/java/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java b/webmagic-plugin/src/main/java/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java
index 9bdbed8..ef71837 100644
--- a/webmagic-plugin/src/main/java/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java
+++ b/webmagic-plugin/src/main/java/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java
@@ -44,6 +44,10 @@ public class FreemarkerPipeline implements Pipeline {
         String domain = site.getDomain();
         domain = UrlUtils.getDomain(domain);
         String path = this.path + "" + domain + "/";
+        File file = new File(path);
+        if (!file.exists()) {
+            file.mkdirs();
+        }
         try {
             PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(page.getUrl().toString()) + ".html"));
             template.process(page.getFields(), printWriter);
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java
index b86fff8..7906526 100644
--- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java
@@ -11,19 +11,24 @@ import us.codecraft.webmagic.processor.PageProcessor;
  */
 public class SinaBlogProcesser implements PageProcessor {
 
+    private Site site;
+
     @Override
     public void process(Page page) {
-        page.addTargetRequests(page.getHtml().rs("<a[^<>]*href=[\"']{1}(http://blog\\.sina\\.com\\.cn/s/blog_.*?)[\"']{1}").toStrings());
+        page.addTargetRequests(page.getHtml().as().rs("(http://blog\\.sina\\.com\\.cn/s/blog_.*)").toStrings());
         page.putField("title", page.getHtml().x("//div[@class='articalTitle']/h2"));
-        page.putField("body",page.getHtml().sc());
-        //x("//dd[@class='w133']")
+        page.putField("content",page.getHtml().x("//div[@id='articlebody']//div[@class='articalContent']"));
+        page.putField("id",page.getUrl().r("http://blog\\.sina\\.com\\.cn/s/blog_(\\w+)"));
         page.putField("date",page.getHtml().x("//div[@id='articlebody']//span[@class='time SG_txtc']").r("\\((.*)\\)"));
-        page.putField("tags",page.getHtml().xs("//td[@class='blog_tag']/h3/a"));
+//        page.putField("tags",page.getHtml().xs("//td[@class='blog_tag']/h3/a"));
     }
 
     @Override
     public Site getSite() {
-        return Site.me().setDomain("blog.sina.com.cn").setStartUrl("http://blog.sina.com.cn/").
-                setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
+        if (site==null){
+            site = Site.me().setDomain("blog.sina.com.cn").setStartUrl("http://blog.sina.com.cn/flashsword20").setSleepTime(3000).
+                    setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
+        }
+        return site;
     }
 }
diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/SinablogProcessorTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/SinablogProcessorTest.java
new file mode 100644
index 0000000..0a5cc1b
--- /dev/null
+++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/SinablogProcessorTest.java
@@ -0,0 +1,34 @@
+package us.codecraft.webmagic.processor;
+
+import org.junit.Test;
+import us.codecraft.webmagic.Spider;
+import us.codecraft.webmagic.pipeline.FilePipeline;
+import us.codecraft.webmagic.pipeline.FreemarkerPipeline;
+import us.codecraft.webmagic.samples.SinaBlogProcesser;
+import us.codecraft.webmagic.schedular.FileCacheQueueSchedular;
+
+import java.io.IOException;
+
+/**
+ * User: cairne
+ * Date: 13-6-9
+ * Time: 8:02 AM
+ */
+public class SinablogProcessorTest {
+
+    @Test
+    public void test() throws IOException {
+        SinaBlogProcesser sinaBlogProcesser = new SinaBlogProcesser();
+        //a pipeline handles the results after a page has been crawled
+        //put the ftl file under the classpath:ftl/ folder
+        //by default output goes to the /data/temp/webmagic/ftl/[domain] directory
+        FreemarkerPipeline pipeline = new FreemarkerPipeline("wordpress.ftl");
+        //Spider.me() is shorthand; it simply creates a new instance
+        //Spider.pipeline() sets a pipeline and supports chained calls
+        //ConsolePipeline prints results to the console
+        //FileCacheQueueSchedular persists urls so a crawl can resume after interruption; temp files go to the /data/temp/webmagic/cache directory
+        //Spider.run() starts the crawl
+        Spider.me().pipeline(new FilePipeline()).pipeline(pipeline).schedular(new FileCacheQueueSchedular(sinaBlogProcesser.getSite(), "/data/temp/webmagic/cache/")).
+                processor(sinaBlogProcesser).run();
+    }
+}