From 8b15f3c63d867e22cc58a3aaab55a967756e6194 Mon Sep 17 00:00:00 2001
From: "yihua.huang"
Date: Sat, 10 Aug 2013 20:33:47 +0800
Subject: [PATCH] add test

---
Reviewer notes (text between "---" and the diffstat is not part of the commit):
- Spider.java: adds a public test(String url) that calls checkComponent() and
  then processRequest(new Request(url)) for that single URL.
- GithubRepo.java: switches the sample to JsonFilePageModelPipeline, implements
  HasKey (key() returns author + ":" + name), and adds the author, readme,
  language and url fields with getters.
- Line breaks were restored; this copy had each hunk collapsed onto one line.
- GithubRepo.language and getLanguage() are declared as List<String> instead of
  the raw List. Both are one-for-one line replacements, so the hunk header
  (-1,23 +1,67) and the diffstat below remain correct.
- NOTE(review): the From: header carries no e-mail address, and the blank and
  indented context lines were rebuilt to satisfy the hunk counts -- verify them
  against the target tree before applying.

 .../java/us/codecraft/webmagic/Spider.java    |  5 ++
 .../webmagic/model/samples/GithubRepo.java    | 54 +++++++++++++++++--
 2 files changed, 54 insertions(+), 5 deletions(-)

diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
index cf62796..0126881 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
@@ -220,6 +220,11 @@ public class Spider implements Runnable, Task {
         }
     }
 
+    public void test(String url){
+        checkComponent();
+        processRequest(new Request(url));
+    }
+
     private void processRequest(Request request) {
         Page page = downloader.download(request, this);
         if (page == null) {
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/GithubRepo.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/GithubRepo.java
index f752829..edef1c0 100644
--- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/GithubRepo.java
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/GithubRepo.java
@@ -1,23 +1,67 @@
 package us.codecraft.webmagic.model.samples;
 
 import us.codecraft.webmagic.Site;
-import us.codecraft.webmagic.model.ConsolePageModelPipeline;
+import us.codecraft.webmagic.model.HasKey;
 import us.codecraft.webmagic.model.OOSpider;
 import us.codecraft.webmagic.model.annotation.ExtractBy;
+import us.codecraft.webmagic.model.annotation.ExtractByUrl;
+import us.codecraft.webmagic.model.annotation.HelpUrl;
 import us.codecraft.webmagic.model.annotation.TargetUrl;
+import us.codecraft.webmagic.pipeline.JsonFilePageModelPipeline;
+
+import java.util.List;
 
 /**
  * @author code4crafter@gmail.com
  * Date: 13-8-10
  * Time: 下午6:37
  */
-@TargetUrl("https://github.com/code4craft/*")
-public class GithubRepo {
+@TargetUrl("https://github.com/\\w+/\\w+")
+@HelpUrl({"https://github.com/\\w+\\?tab=repositories","https://github.com/\\w+","https://github.com/explore/*"})
+public class GithubRepo implements HasKey {
 
-    @ExtractBy("//h1[@class='entry-title']/strong/a/text()")
+    @ExtractBy(value = "//h1[@class='entry-title public']/strong/a/text()", notNull = true)
     private String name;
 
+    @ExtractByUrl("https://github\\.com/(\\w+)/.*")
+    private String author;
+
+    @ExtractBy("//div[@id='readme']")
+    private String readme;
+
+    @ExtractBy(value = "//div[@class='repository-lang-stats']//li//span[@class='lang']",multi = true)
+    private List<String> language;
+
+    @ExtractByUrl
+    private String url;
+
     public static void main(String[] args) {
-        OOSpider.create(Site.me().addStartUrl("https://github.com/code4craft/"), new ConsolePageModelPipeline(), GithubRepo.class).run();
+        OOSpider.create(Site.me().addStartUrl("https://github.com/explore").setSleepTime(0),
+                new JsonFilePageModelPipeline(), GithubRepo.class).thread(15).run();
+    }
+
+    @Override
+    public String key() {
+        return author+":"+name;
+    }
+
+    public String getName() {
+        return name;
+    }
+
+    public String getReadme() {
+        return readme;
+    }
+
+    public String getAuthor() {
+        return author;
+    }
+
+    public List<String> getLanguage() {
+        return language;
+    }
+
+    public String getUrl() {
+        return url;
     }
 }