diff --git a/release-note.md b/release-note.md
index a778a3e..9582302 100755
--- a/release-note.md
+++ b/release-note.md
@@ -28,13 +28,15 @@ Release Notes
 
     }
 
+增加一个Spider.test(url)方法,用于开发爬虫时进行调试。
+
 增加基于redis的分布式支持。
 
 增加XPath2.0语法支持(webmagic-saxon模块)。
 
 增加基于Selenium的浏览器渲染支持,用于抓取动态加载内容(webmagic-selenium模块)。
 
-修复一些已有bug。
+修复了不支持https的bug。
 
 补充了文档:[webmagic-0.2.0用户手册](http://code4craft.github.io/webmagic/)。
 
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
index 0126881..32653d6 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
@@ -220,9 +220,16 @@ public class Spider implements Runnable, Task {
         }
     }
 
-    public void test(String url){
+    /**
+     * Tests the spider against specific URLs by processing each one directly, in the order given.
+     *
+     * @param urls the URLs to fetch; passing none makes this a no-op after the component check
+     */
+    public void test(String... urls){
         checkComponent();
-        processRequest(new Request(url));
+        for (String url : urls) {
+            processRequest(new Request(url));
+        }
     }
 
     private void processRequest(Request request) {
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/GithubRepo.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/GithubRepo.java
index edef1c0..79a20ff 100644
--- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/GithubRepo.java
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/GithubRepo.java
@@ -8,6 +8,7 @@ import us.codecraft.webmagic.model.annotation.ExtractByUrl;
 import us.codecraft.webmagic.model.annotation.HelpUrl;
 import us.codecraft.webmagic.model.annotation.TargetUrl;
 import us.codecraft.webmagic.pipeline.JsonFilePageModelPipeline;
+import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler;
 
 import java.util.List;
 
@@ -32,12 +33,20 @@ public class GithubRepo implements HasKey {
     @ExtractBy(value = "//div[@class='repository-lang-stats']//li//span[@class='lang']",multi = true)
     private List<String> language;
 
+    @ExtractBy("//a[@class='social-count js-social-count']/text()")
+    private String star;
+
+    // NOTE(review): identical XPath to 'star' above - presumably both fields receive the same text; TODO confirm the selector intended for fork.
+    @ExtractBy("//a[@class='social-count js-social-count']/text()")
+    private String fork;
+
     @ExtractByUrl
     private String url;
 
     public static void main(String[] args) {
-        OOSpider.create(Site.me().addStartUrl("https://github.com/explore").setSleepTime(0),
-                new JsonFilePageModelPipeline(), GithubRepo.class).thread(15).run();
+        OOSpider.create(Site.me().addStartUrl("https://github.com/explore").setSleepTime(0).setRetryTimes(3),
+                new JsonFilePageModelPipeline(), GithubRepo.class)
+                .scheduler(new FileCacheQueueScheduler("/data/webmagic/cache/")).thread(15).run();
     }
 
     @Override
@@ -64,4 +73,14 @@ public class GithubRepo implements HasKey {
     public String getUrl() {
         return url;
     }
+
+    /** @return the raw text extracted into the {@code star} field */
+    public String getStar() {
+        return star;
+    }
+
+    /** @return the raw text extracted into the {@code fork} field */
+    public String getFork() {
+        return fork;
+    }
 }