fix freemarker dir error
parent
412abeb8df
commit
3d4ad02b29
|
@ -44,6 +44,10 @@ public class FreemarkerPipeline implements Pipeline {
|
|||
String domain = site.getDomain();
|
||||
domain = UrlUtils.getDomain(domain);
|
||||
String path = this.path + "" + domain + "/";
|
||||
File file = new File(path);
|
||||
if (!file.exists()) {
|
||||
file.mkdir();
|
||||
}
|
||||
try {
|
||||
PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(page.getUrl().toString()) + ".html"));
|
||||
template.process(page.getFields(), printWriter);
|
||||
|
|
|
@ -11,19 +11,24 @@ import us.codecraft.webmagic.processor.PageProcessor;
|
|||
*/
|
||||
public class SinaBlogProcesser implements PageProcessor {
|
||||
|
||||
private Site site;
|
||||
|
||||
@Override
|
||||
public void process(Page page) {
|
||||
page.addTargetRequests(page.getHtml().rs("<a[^<>]*href=[\"']{1}(http://blog\\.sina\\.com\\.cn/s/blog_.*?)[\"']{1}").toStrings());
|
||||
page.addTargetRequests(page.getHtml().as().rs("(http://blog\\.sina\\.com\\.cn/s/blog_.*)").toStrings());
|
||||
page.putField("title", page.getHtml().x("//div[@class='articalTitle']/h2"));
|
||||
page.putField("body",page.getHtml().sc());
|
||||
//x("//dd[@class='w133']")
|
||||
page.putField("content",page.getHtml().x("//div[@id='articlebody']//div[@class='articalContent']"));
|
||||
page.putField("id",page.getUrl().r("http://blog\\.sina\\.com\\.cn/s/blog_(\\w+)"));
|
||||
page.putField("date",page.getHtml().x("//div[@id='articlebody']//span[@class='time SG_txtc']").r("\\((.*)\\)"));
|
||||
page.putField("tags",page.getHtml().xs("//td[@class='blog_tag']/h3/a"));
|
||||
// page.putField("tags",page.getHtml().xs("//td[@class='blog_tag']/h3/a"));
|
||||
}
|
||||
|
||||
@Override
|
||||
public Site getSite() {
|
||||
return Site.me().setDomain("blog.sina.com.cn").setStartUrl("http://blog.sina.com.cn/").
|
||||
if (site==null){
|
||||
site = Site.me().setDomain("blog.sina.com.cn").setStartUrl("http://blog.sina.com.cn/flashsword20").setSleepTime(3000).
|
||||
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
|
||||
}
|
||||
return site;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,34 @@
|
|||
package us.codecraft.webmagic.processor;
|
||||
|
||||
import org.junit.Test;
|
||||
import us.codecraft.webmagic.Spider;
|
||||
import us.codecraft.webmagic.pipeline.FilePipeline;
|
||||
import us.codecraft.webmagic.pipeline.FreemarkerPipeline;
|
||||
import us.codecraft.webmagic.samples.SinaBlogProcesser;
|
||||
import us.codecraft.webmagic.schedular.FileCacheQueueSchedular;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/**
* Runs a Spider with SinaBlogProcesser, a FilePipeline, a FreemarkerPipeline built from "wordpress.ftl", and a FileCacheQueueSchedular.
|
||||
* User: cairne
|
||||
* Date: 13-6-9
|
||||
* Time: 8:02 AM
|
||||
*/
|
||||
public class SinablogProcessorTest {
|
||||
|
||||
@Test
|
||||
public void test() throws IOException {
|
||||
SinaBlogProcesser sinaBlogProcesser = new SinaBlogProcesser();
|
||||
// A pipeline handles the results after a page has been crawled.
|
||||
// The .ftl template file goes in the classpath:ftl/ folder.
|
||||
|
||||
// By default the output is written under the /data/temp/webmagic/ftl/[domain] directory.
|
||||
FreemarkerPipeline pipeline = new FreemarkerPipeline("wordpress.ftl");
|
||||
// Spider.me() is shorthand; it simply creates a new Spider.
|
||||
|
||||
// Spider.pipeline() sets a pipeline and supports chained calls.
|
||||
|
||||
// ConsolePipeline prints results to the console. NOTE(review): this test registers FilePipeline, not ConsolePipeline - confirm this remark is still wanted.
|
||||
// FileCacheQueueSchedular stores the URLs so an interrupted crawl can resume; its temporary files go to the /data/temp/webmagic/cache directory.
|
||||
|
||||
// Spider.run() starts the crawl.
|
||||
Spider.me().pipeline(new FilePipeline()).pipeline(pipeline).schedular(new FileCacheQueueSchedular(sinaBlogProcesser.getSite(), "/data/temp/webmagic/cache/")).
|
||||
processor(sinaBlogProcesser).run();
|
||||
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue