release notes and docs
parent
1f86ce7720
commit
787b952932
|
@ -28,13 +28,15 @@ Release Notes
|
|||
|
||||
}
|
||||
|
||||
增加一个Spider.test(url)方法,用于开发爬虫时进行调试。
|
||||
|
||||
增加基于redis的分布式支持。
|
||||
|
||||
增加XPath2.0语法支持(webmagic-saxon模块)。
|
||||
|
||||
增加基于Selenium的浏览器渲染支持,用于抓取动态加载内容(webmagic-selenium模块)。
|
||||
|
||||
修复一些已有bug。
|
||||
修复了不支持https的bug。
|
||||
|
||||
补充了文档:[webmagic-0.2.0用户手册](http://code4craft.github.io/webmagic/)。
|
||||
|
||||
|
|
|
@ -220,9 +220,17 @@ public class Spider implements Runnable, Task {
|
|||
}
|
||||
}
|
||||
|
||||
public void test(String url){
|
||||
/**
|
||||
* 用某些特定URL进行爬虫测试
|
||||
* @param urls 要抓取的url
|
||||
*/
|
||||
public void test(String... urls){
|
||||
checkComponent();
|
||||
processRequest(new Request(url));
|
||||
if (urls.length>0){
|
||||
for (String url : urls) {
|
||||
processRequest(new Request(url));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void processRequest(Request request) {
|
||||
|
|
|
@ -8,6 +8,7 @@ import us.codecraft.webmagic.model.annotation.ExtractByUrl;
|
|||
import us.codecraft.webmagic.model.annotation.HelpUrl;
|
||||
import us.codecraft.webmagic.model.annotation.TargetUrl;
|
||||
import us.codecraft.webmagic.pipeline.JsonFilePageModelPipeline;
|
||||
import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
|
@ -32,12 +33,19 @@ public class GithubRepo implements HasKey {
|
|||
@ExtractBy(value = "//div[@class='repository-lang-stats']//li//span[@class='lang']",multi = true)
|
||||
private List<String> language;
|
||||
|
||||
@ExtractBy("//a[@class='social-count js-social-count']/text()")
|
||||
private String star;
|
||||
|
||||
// NOTE(review): this XPath is identical to the one on the `star` field above, so
// both fields will presumably extract the same text. Confirm against the page
// markup and narrow the selector so that it targets the fork count only.
@ExtractBy("//a[@class='social-count js-social-count']/text()")
|
||||
private String fork;
|
||||
|
||||
@ExtractByUrl
|
||||
private String url;
|
||||
|
||||
public static void main(String[] args) {
|
||||
OOSpider.create(Site.me().addStartUrl("https://github.com/explore").setSleepTime(0),
|
||||
new JsonFilePageModelPipeline(), GithubRepo.class).thread(15).run();
|
||||
OOSpider.create(Site.me().addStartUrl("https://github.com/explore").setSleepTime(0).setRetryTimes(3),
|
||||
new JsonFilePageModelPipeline(), GithubRepo.class)
|
||||
.scheduler(new FileCacheQueueScheduler("/data/webmagic/cache/")).thread(15).run();
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -64,4 +72,12 @@ public class GithubRepo implements HasKey {
|
|||
public String getUrl() {
|
||||
return url;
|
||||
}
|
||||
|
||||
public String getStar() {
|
||||
return star;
|
||||
}
|
||||
|
||||
public String getFork() {
|
||||
return fork;
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue