diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
index cf62796..0126881 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
@@ -220,6 +220,21 @@ public class Spider implements Runnable, Task {
}
}
+ /**
+ * Processes a single URL directly: runs {@code checkComponent()}, then passes a new
+ * {@code Request} built from {@code url} to {@code processRequest}.
+ *
+ * @param url address to download and process; must not be null
+ * @throws IllegalArgumentException if {@code url} is null
+ */
+ public void test(String url){
+ // Fail fast here instead of handing a Request with a null URL to the downloader.
+ if (url == null) {
+ throw new IllegalArgumentException("url must not be null");
+ }
+ checkComponent();
+ processRequest(new Request(url));
+ }
+
private void processRequest(Request request) {
Page page = downloader.download(request, this);
if (page == null) {
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/GithubRepo.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/GithubRepo.java
index f752829..edef1c0 100644
--- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/GithubRepo.java
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/GithubRepo.java
@@ -1,23 +1,88 @@
package us.codecraft.webmagic.model.samples;
import us.codecraft.webmagic.Site;
-import us.codecraft.webmagic.model.ConsolePageModelPipeline;
+import us.codecraft.webmagic.model.HasKey;
import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.model.annotation.ExtractBy;
+import us.codecraft.webmagic.model.annotation.ExtractByUrl;
+import us.codecraft.webmagic.model.annotation.HelpUrl;
import us.codecraft.webmagic.model.annotation.TargetUrl;
+import us.codecraft.webmagic.pipeline.JsonFilePageModelPipeline;
+
+import java.util.List;
/**
+ * Sample page model describing a GitHub repository page.
+ * <p>
+ * {@code @TargetUrl} holds the URL pattern of repository pages ({@code owner/repo});
+ * {@code @HelpUrl} lists further patterns (user pages, their repositories tab, explore pages) that
+ * are presumably crawled only to discover target links - confirm against the annotation docs.
+ * Each field's annotation declares the rule its value is extracted with.
+ *
* @author code4crafter@gmail.com
* Date: 13-8-10
- * Time: 下午6:37
+ * Time: 6:37 PM
*/
-@TargetUrl("https://github.com/code4craft/*")
-public class GithubRepo {
+@TargetUrl("https://github.com/\\w+/\\w+")
+@HelpUrl({"https://github.com/\\w+\\?tab=repositories","https://github.com/\\w+","https://github.com/explore/*"})
+public class GithubRepo implements HasKey {
- @ExtractBy("//h1[@class='entry-title']/strong/a/text()")
+ /** Repository name, read from the title link; the extraction is declared {@code notNull}. */
+ @ExtractBy(value = "//h1[@class='entry-title public']/strong/a/text()", notNull = true)
private String name;
+ /** Repository owner; the URL regex captures the first path segment after the host. */
+ @ExtractByUrl("https://github\\.com/(\\w+)/.*")
+ private String author;
+
+ /** Whatever the XPath {@code //div[@id='readme']} selects on the page. */
+ @ExtractBy("//div[@id='readme']")
+ private String readme;
+
+ /** Language labels from the language-statistics list; {@code multi = true} collects every match. */
+ @ExtractBy(value = "//div[@class='repository-lang-stats']//li//span[@class='lang']",multi = true)
+ private List<String> language;
+
+ /** Filled by a bare {@code @ExtractByUrl} (no pattern given) - presumably the full page URL; confirm. */
+ @ExtractByUrl
+ private String url;
+
+ /**
+ * Starts a crawl at {@code https://github.com/explore} with sleep time 0 and 15 threads,
+ * handing extracted models to a {@link JsonFilePageModelPipeline}.
+ *
+ * @param args unused
+ */
public static void main(String[] args) {
- OOSpider.create(Site.me().addStartUrl("https://github.com/code4craft/"), new ConsolePageModelPipeline(), GithubRepo.class).run();
+ OOSpider.create(Site.me().addStartUrl("https://github.com/explore").setSleepTime(0),
+ new JsonFilePageModelPipeline(), GithubRepo.class).thread(15).run();
+ }
+
+ /**
+ * @return {@code author + ":" + name}, the key required by {@link HasKey}
+ */
+ @Override
+ public String key() {
+ return author+":"+name;
+ }
+
+ public String getName() {
+ return name;
+ }
+
+ public String getReadme() {
+ return readme;
+ }
+
+ public String getAuthor() {
+ return author;
+ }
+
+ public List<String> getLanguage() {
+ return language;
+ }
+
+ public String getUrl() {
+ return url;
}
}