release notes and docs
parent
1f86ce7720
commit
787b952932
|
@ -28,13 +28,15 @@ Release Notes
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
增加一个Spider.test(url)方法,用于开发爬虫时进行调试。
|
||||||
|
|
||||||
增加基于redis的分布式支持。
|
增加基于redis的分布式支持。
|
||||||
|
|
||||||
增加XPath2.0语法支持(webmagic-saxon模块)。
|
增加XPath2.0语法支持(webmagic-saxon模块)。
|
||||||
|
|
||||||
增加基于Selenium的浏览器渲染支持,用于抓取动态加载内容(webmagic-selenium模块)。
|
增加基于Selenium的浏览器渲染支持,用于抓取动态加载内容(webmagic-selenium模块)。
|
||||||
|
|
||||||
修复一些已有bug。
|
修复了不支持https的bug。
|
||||||
|
|
||||||
补充了文档:[webmagic-0.2.0用户手册](http://code4craft.github.io/webmagic/)。
|
补充了文档:[webmagic-0.2.0用户手册](http://code4craft.github.io/webmagic/)。
|
||||||
|
|
||||||
|
|
|
@ -220,9 +220,17 @@ public class Spider implements Runnable, Task {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public void test(String url){
|
/**
|
||||||
|
* 用某些特定URL进行爬虫测试
|
||||||
|
* @param urls 要抓取的url
|
||||||
|
*/
|
||||||
|
public void test(String... urls){
|
||||||
checkComponent();
|
checkComponent();
|
||||||
processRequest(new Request(url));
|
if (urls.length>0){
|
||||||
|
for (String url : urls) {
|
||||||
|
processRequest(new Request(url));
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private void processRequest(Request request) {
|
private void processRequest(Request request) {
|
||||||
|
|
|
@ -8,6 +8,7 @@ import us.codecraft.webmagic.model.annotation.ExtractByUrl;
|
||||||
import us.codecraft.webmagic.model.annotation.HelpUrl;
|
import us.codecraft.webmagic.model.annotation.HelpUrl;
|
||||||
import us.codecraft.webmagic.model.annotation.TargetUrl;
|
import us.codecraft.webmagic.model.annotation.TargetUrl;
|
||||||
import us.codecraft.webmagic.pipeline.JsonFilePageModelPipeline;
|
import us.codecraft.webmagic.pipeline.JsonFilePageModelPipeline;
|
||||||
|
import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler;
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
|
@ -32,12 +33,19 @@ public class GithubRepo implements HasKey {
|
||||||
@ExtractBy(value = "//div[@class='repository-lang-stats']//li//span[@class='lang']",multi = true)
|
@ExtractBy(value = "//div[@class='repository-lang-stats']//li//span[@class='lang']",multi = true)
|
||||||
private List<String> language;
|
private List<String> language;
|
||||||
|
|
||||||
|
@ExtractBy("//a[@class='social-count js-social-count']/text()")
|
||||||
|
private String star;
|
||||||
|
|
||||||
|
@ExtractBy("//a[@class='social-count js-social-count']/text()")
|
||||||
|
private String fork;
|
||||||
|
|
||||||
@ExtractByUrl
|
@ExtractByUrl
|
||||||
private String url;
|
private String url;
|
||||||
|
|
||||||
public static void main(String[] args) {
|
public static void main(String[] args) {
|
||||||
OOSpider.create(Site.me().addStartUrl("https://github.com/explore").setSleepTime(0),
|
OOSpider.create(Site.me().addStartUrl("https://github.com/explore").setSleepTime(0).setRetryTimes(3),
|
||||||
new JsonFilePageModelPipeline(), GithubRepo.class).thread(15).run();
|
new JsonFilePageModelPipeline(), GithubRepo.class)
|
||||||
|
.scheduler(new FileCacheQueueScheduler("/data/webmagic/cache/")).thread(15).run();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -64,4 +72,12 @@ public class GithubRepo implements HasKey {
|
||||||
public String getUrl() {
|
public String getUrl() {
|
||||||
return url;
|
return url;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public String getStar() {
|
||||||
|
return star;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getFork() {
|
||||||
|
return fork;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue