From ca072c5575d7e6a4caa6324b128c20cf50efc364 Mon Sep 17 00:00:00 2001
From: "yihua.huang"
Date: Sun, 8 May 2016 12:09:45 +0800
Subject: [PATCH] fix URL regex in GithubRepoPageProcessor #305

---
Review notes (text in this area is ignored when the patch is applied):
  * Purpose: let the link patterns accept '-' in GitHub owner and repository
    names by replacing \w+ with the character class [\w\-].
  * Correction: the added lines originally omitted the '+' quantifier after the
    last character class, so the patterns could match only ONE character of
    the final path segment. The quantifier is restored on both added lines.
  * The "index" line below is kept as recorded; its post-image blob id was
    computed before this correction.
  * NOTE(review): the "author" pattern in the context lines still uses (\w+)
    and looks like it will not match hyphenated owner names - confirm and
    handle in a follow-up change.

 .../webmagic/processor/example/GithubRepoPageProcessor.java | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/GithubRepoPageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/GithubRepoPageProcessor.java
index f4ae058..e475148 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/GithubRepoPageProcessor.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/GithubRepoPageProcessor.java
@@ -15,8 +15,8 @@ public class GithubRepoPageProcessor implements PageProcessor {
 
     @Override
     public void process(Page page) {
-        page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all());
-        page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+)").all());
+        page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-]+/[\\w\\-]+)").all());
+        page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-]+)").all());
         page.putField("author", page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString());
         page.putField("name", page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString());
         if (page.getResultItems().get("name")==null){