From e6d415cd69a80eaafa18345ab467c3e5010612d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=BB=84=E4=BA=BF=E5=8D=8E?= Date: Sun, 9 Jun 2013 22:55:49 +0900 Subject: [PATCH] update samples --- README.md | 24 ++---------------------- 1 file changed, 2 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index 723adf4..e4adfd2 100644 --- a/README.md +++ b/README.md @@ -61,27 +61,7 @@ webmagic定制的核心是PageProcessor接口。一个最简单的webmagic爬虫 } } ---- - -TODO - - - public class OschinaBlogPageProcesser implements PageProcessor { - - @Override - public void process(Page page) { - List strings = page.getHtml().rs("]*href=[\"']{1}(http://my\\.oschina\\.net/\\w+/blog/\\d+)[\"']{1}").toStrings(); - page.addTargetRequests(strings); - page.putField("title", page.getHtml().xs("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1")); - page.putField("content", page.getHtml().sc()); - page.putField("author", page.getUrl().r("my\\.oschina\\.net/(\\w+)/blog/\\d+")); - } - - @Override - public Site getSite() { - return Site.me().setDomain("my.oschina.net").setStartUrl("http://www.oschina.net/"). - setUserAgent("Mozilla/5.0 (Macintosh; Chrome/26.0.1410.65 Safari/537.31"); - } - } +### 示例 +可参考作者博客[使用webmagic抓取页面并保存为wordpress文件](http://my.oschina.net/flashsword/blog/136846)