From bc1d14fed4c58e5f4f5b7702935a7199c7101940 Mon Sep 17 00:00:00 2001
From: "yihua.huang"
Date: Fri, 2 May 2014 17:54:21 +0800
Subject: [PATCH] sample

---
 .../webmagic/samples/GithubRepo.java 7 ++++
 .../samples/GithubRepoPageProcessor.java 37 +++++++++++++++++++
 2 files changed, 44 insertions(+)
 create mode 100644 webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GithubRepo.java
 create mode 100644 webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GithubRepoPageProcessor.java

diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GithubRepo.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GithubRepo.java
new file mode 100644
index 0000000..1a2d889
--- /dev/null
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GithubRepo.java
@@ -0,0 +1,7 @@
+package us.codecraft.webmagic.samples;
+
+/**
+ * @author code4crafter@gmail.com
+ */
+public class GithubRepo {
+}
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GithubRepoPageProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GithubRepoPageProcessor.java
new file mode 100644
index 0000000..db498a8
--- /dev/null
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GithubRepoPageProcessor.java
@@ -0,0 +1,37 @@
+package us.codecraft.webmagic.samples;
+
+import us.codecraft.webmagic.Page;
+import us.codecraft.webmagic.Site;
+import us.codecraft.webmagic.Spider;
+import us.codecraft.webmagic.processor.PageProcessor;
+
+/**
+ * @author code4crafter@gmail.com
+ * @since 0.3.2
+ */
+public class GithubRepoPageProcessor implements PageProcessor {
+
+    /** Link pattern with two path segments after the GitHub host. */
+    private static final String REPO_LINK_REGEX = "(https://github\\.com/\\w+/\\w+)";
+
+    /** Link pattern with a single path segment after the GitHub host. */
+    private static final String OWNER_LINK_REGEX = "(https://github\\.com/\\w+)";
+
+    // NOTE(review): \w does not match '-', so links whose path segments contain a
+    // hyphen may be cut short or missed by the two patterns above. Confirm how
+    // regex() applies the pattern before widening the character class.
+
+    /** Crawl settings: retry times set to 3 and sleep time set to 0. */
+    private final Site site = Site.me().setRetryTimes(3).setSleepTime(0);
+
+    /**
+     * Queues the GitHub links found on the page as further targets and stores the
+     * {@code author}, {@code name} and {@code readme} fields on the page.
+     * <p>
+     * {@code author} is the first path segment captured from the page URL and
+     * {@code name} is the text of the title link selected by XPath. When no
+     * {@code name} is found the page is marked as skipped and the method returns
+     * immediately, so the readme is not extracted for it.
+     *
+     * @param page the page to process
+     */
+    @Override
+    public void process(Page page) {
+        page.addTargetRequests(page.getHtml().links().regex(REPO_LINK_REGEX).all());
+        page.addTargetRequests(page.getHtml().links().regex(OWNER_LINK_REGEX).all());
+        page.putField("author", page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString());
+        page.putField("name", page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString());
+        if (page.getResultItems().get("name") == null) {
+            // No title link on this page: skip it and avoid the readme extraction below.
+            page.setSkip(true);
+            return;
+        }
+        page.putField("readme", page.getHtml().xpath("//div[@id='readme']/tidyText()"));
+    }
+
+    /**
+     * Returns the crawl settings held by this processor.
+     *
+     * @return the {@link Site} instance created when this processor was constructed
+     */
+    @Override
+    public Site getSite() {
+        return site;
+    }
+
+    /**
+     * Creates a spider for this processor, seeds it with
+     * {@code https://github.com/code4craft}, requests 5 threads and runs it.
+     *
+     * @param args command-line arguments; not used
+     */
+    public static void main(String[] args) {
+        Spider.create(new GithubRepoPageProcessor()).addUrl("https://github.com/code4craft").thread(5).run();
+    }
+}