add comments
parent
c2e691a55f
commit
412abeb8df
|
@ -49,9 +49,7 @@ public class FreemarkerPipeline implements Pipeline {
|
|||
template.process(page.getFields(), printWriter);
|
||||
printWriter.close();
|
||||
} catch (TemplateException e) {
|
||||
e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates.
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates.
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
package us.codecraft.webmagic.samples;
|
||||
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.processor.PageProcessor;
|
||||
|
||||
import java.util.List;
|
||||
|
@ -12,18 +12,34 @@ import java.util.List;
|
|||
* Time: 下午8:08
|
||||
*/
|
||||
// Sample PageProcessor for the progressdaily.diandian.com blog: it queues links that
// contain /post/ and records "title", "content", "date" and "id" fields for each page.
// NOTE(review): this listing looks like an unmarked diff rendering -- `requests` is
// declared twice, "title"/"content" are put twice, and getSite() has a `return` ahead of
// the `if (site == null)` block. Presumably the first of each pair is the pre-commit
// line; confirm against the repository before compiling.
public class DiandianBlogProcessor implements PageProcessor {
|
||||

||||
private Site site;
|
||||

||||
@Override
|
||||
// Queues the matching links found on the page and stores the extracted fields on it.
public void process(Page page) {
|
||||
//http://progressdaily.diandian.com/post/2013-01-24/40046867275
|
||||
List<String> requests = page.getHtml().rs("<a[^<>]*href=[\"']{1}(http://17dujingdian\\.com/post/[^#]*?)[\"']{1}").toStrings();
|
||||
//a() extracts a link; as() extracts all links
|
||||
//getHtml() returns an Html object that supports chained calls
|
||||
//r() extracts one piece of content with a regular expression; rs() extracts multiple pieces
|
||||
//toString() takes a single result; toStrings() takes multiple results
|
||||
List<String> requests = page.getHtml().as().rs("(.*/post/.*)").toStrings();
|
||||
//use page.addTargetRequests() to add the links to be crawled to the queue
|
||||
page.addTargetRequests(requests);
|
||||
page.putField("title",page.getHtml().x("//div[@id='content']//h2/a"));
|
||||
page.putField("content",page.getHtml().sc());
|
||||
//page.putField(key, value) adds the extracted content to the result Map
|
||||
//x() and xs() extract using XPath
|
||||
page.putField("title", page.getHtml().x("//title").r("(.*?)\\|"));
|
||||
//sc() extracts the main body text directly using a readability technique; it has fairly good extraction accuracy for well-structured text
|
||||
page.putField("content", page.getHtml().sc());
|
||||
page.putField("date", page.getUrl().r("post/(\\d+-\\d+-\\d+)/"));
|
||||
page.putField("id", page.getUrl().r("post/\\d+-\\d+-\\d+/(\\d+)"));
|
||||
}
|
||||

||||
@Override
|
||||
// Returns the Site configuration, creating it and storing it in `site` when it is still null.
public Site getSite() {
|
||||
return Site.me().setDomain("www.diandian.com").setStartUrl("http://17dujingdian.com/").
|
||||
//site defines the extraction configuration, the start url, and so on
|
||||
if (site == null) {
|
||||
site = Site.me().setDomain("progressdaily.diandian.com").setStartUrl("http://progressdaily.diandian.com/").
|
||||
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
|
||||
}
|
||||
return site;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -13,6 +13,9 @@ import java.util.List;
|
|||
* Time: 下午8:08
|
||||
*/
|
||||
public class DiaoyuwengProcessor implements PageProcessor {
|
||||
|
||||
private Site site;
|
||||
|
||||
@Override
|
||||
public void process(Page page) {
|
||||
List<String> requests = page.getHtml().as().rs("(http://www\\.diaoyuweng\\.com/home\\.php\\?mod=space&uid=88304&do=thread&view=me&type=thread&order=dateline&from=space&page=\\d+)").toStrings();
|
||||
|
@ -29,7 +32,10 @@ public class DiaoyuwengProcessor implements PageProcessor {
|
|||
|
||||
@Override
|
||||
// Returns the Site configuration for www.diaoyuweng.com. When `site` is still null it is
// built with the domain, start URL, user agent, "GBK" encoding and setSleepTime(500), then
// stored in the field and returned.
// NOTE(review): the leading `return Site.me()...` line ends in a dangling '.' and sits
// ahead of the `if`; presumably it is the pre-commit line of a diff and should not be
// present in the compiled file -- confirm against the repository.
public Site getSite() {
|
||||
return Site.me().setDomain("www.diaoyuweng.com").setStartUrl("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space").
|
||||
if (site==null){
|
||||
site= Site.me().setDomain("www.diaoyuweng.com").setStartUrl("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space").
|
||||
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31").setEncoding("GBK").setSleepTime(500);
|
||||
}
|
||||
return site;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -3,6 +3,6 @@ touch wordpress.xml
|
|||
cat wp-head.xml >> wordpress.xml
|
||||
for f in `ls`;
|
||||
do
|
||||
cat ${f} >> wordpress.xml
|
||||
cat ${f} >> ../wordpress.xml
|
||||
done;
|
||||
cat wp-bottom.xml >> wordpress.xml
|
|
@ -0,0 +1,34 @@
|
|||
package us.codecraft.webmagic.processor;
|
||||
|
||||
import org.junit.Test;
|
||||
import us.codecraft.webmagic.Spider;
|
||||
import us.codecraft.webmagic.pipeline.ConsolePipeline;
|
||||
import us.codecraft.webmagic.pipeline.FreemarkerPipeline;
|
||||
import us.codecraft.webmagic.samples.DiandianBlogProcessor;
|
||||
import us.codecraft.webmagic.schedular.FileCacheQueueSchedular;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* Runs a Spider over DiandianBlogProcessor with a ConsolePipeline and a FreemarkerPipeline
* ("wordpress.ftl"), using a FileCacheQueueSchedular rooted at /data/temp/webmagic/cache/.
* User: cairne
|
||||
* Date: 13-6-9
|
||||
* Time: 8:02 AM
|
||||
*/
|
||||
public class DiandianProcessorTest {
|
||||

||||
@Test
|
||||
public void test() throws IOException {
|
||||
// NOTE(review): the local is named diaoyuwengProcessor but holds a DiandianBlogProcessor.
DiandianBlogProcessor diaoyuwengProcessor = new DiandianBlogProcessor();
|
||||
//a pipeline is the processing applied after crawling finishes
|
||||
//ftl files go under the classpath:ftl/ folder
|
||||
//placed in the /data/temp/webmagic/ftl/[domain] directory by default
|
||||
FreemarkerPipeline pipeline = new FreemarkerPipeline("wordpress.ftl");
|
||||
//Spider.me() is shorthand; it really just creates a new instance
|
||||
//Spider.pipeline() sets a pipeline and supports chained calls
|
||||
//ConsolePipeline prints the results to the console
|
||||
//FileCacheQueueSchedular saves urls and supports resuming an interrupted crawl; temporary files are written to the /data/temp/webmagic/cache directory
|
||||
//Spider.run() starts execution
|
||||
Spider.me().pipeline(new ConsolePipeline()).pipeline(pipeline).schedular(new FileCacheQueueSchedular(diaoyuwengProcessor.getSite(), "/data/temp/webmagic/cache/")).
|
||||
processor(diaoyuwengProcessor).run();
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue