diff --git a/README.md b/README.md index a9f856e..1600a78 100644 --- a/README.md +++ b/README.md @@ -22,27 +22,27 @@ Add dependencies to your pom.xml: ```xml - - us.codecraft - webmagic-core - 0.4.3 - - - us.codecraft - webmagic-extension - 0.4.3 - + + us.codecraft + webmagic-core + 0.4.3 + + + us.codecraft + webmagic-extension + 0.4.3 + ``` WebMagic uses slf4j with the slf4j-log4j12 implementation. If you have customized your slf4j implementation, please exclude slf4j-log4j12. ```xml - - - org.slf4j - slf4j-log4j12 - - + + + org.slf4j + slf4j-log4j12 + + ``` @@ -53,30 +53,30 @@ WebMagic use slf4j with slf4j-log4j12 implementation. If you customized your slf Write a class that implements PageProcessor: ```java - public class OschinaBlogPageProcesser implements PageProcessor { +public class OschinaBlogPageProcesser implements PageProcessor { - private Site site = Site.me().setDomain("my.oschina.net"); + private Site site = Site.me().setDomain("my.oschina.net"); - @Override - public void process(Page page) { - List links = page.getHtml().links().regex("http://my\\.oschina\\.net/flashsword/blog/\\d+").all(); - page.addTargetRequests(links); - page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1").toString()); - page.putField("content", page.getHtml().$("div.content").toString()); - page.putField("tags",page.getHtml().xpath("//div[@class='BlogTags']/a/text()").all()); - } - - @Override - public Site getSite() { - return site; - - } - - public static void main(String[] args) { - Spider.create(new OschinaBlogPageProcesser()).addUrl("http://my.oschina.net/flashsword/blog") - .addPipeline(new ConsolePipeline()).run(); - } + @Override + public void process(Page page) { + List links = page.getHtml().links().regex("http://my\\.oschina\\.net/flashsword/blog/\\d+").all(); + page.addTargetRequests(links); + page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1").toString()); + page.putField("content", 
page.getHtml().$("div.content").toString()); + page.putField("tags",page.getHtml().xpath("//div[@class='BlogTags']/a/text()").all()); } + + @Override + public Site getSite() { + return site; + + } + + public static void main(String[] args) { + Spider.create(new OschinaBlogPageProcesser()).addUrl("http://my.oschina.net/flashsword/blog") + .addPipeline(new ConsolePipeline()).run(); + } +} ``` * `page.addTargetRequests(links)` @@ -86,24 +86,24 @@ Write a class implements PageProcessor: You can also use the annotation approach: ```java - @TargetUrl("http://my.oschina.net/flashsword/blog/\\d+") - public class OschinaBlog { +@TargetUrl("http://my.oschina.net/flashsword/blog/\\d+") +public class OschinaBlog { - @ExtractBy("//title") - private String title; + @ExtractBy("//title") + private String title; - @ExtractBy(value = "div.BlogContent",type = ExtractBy.Type.Css) - private String content; + @ExtractBy(value = "div.BlogContent",type = ExtractBy.Type.Css) + private String content; - @ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true) - private List tags; + @ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true) + private List tags; - public static void main(String[] args) { - OOSpider.create( - Site.me(), - new ConsolePageModelPipeline(), OschinaBlog.class).addUrl("http://my.oschina.net/flashsword/blog").run(); - } - } + public static void main(String[] args) { + OOSpider.create( + Site.me(), + new ConsolePageModelPipeline(), OschinaBlog.class).addUrl("http://my.oschina.net/flashsword/blog").run(); + } +} ``` ### Docs and samples: