samples
parent
bc1d14fed4
commit
64293cba20
|
@ -4,4 +4,34 @@ package us.codecraft.webmagic.samples;
|
||||||
* @author code4crafer@gmail.com
|
* @author code4crafer@gmail.com
|
||||||
*/
|
*/
|
||||||
public class GithubRepo {
|
public class GithubRepo {
|
||||||
|
|
||||||
|
private String name;
|
||||||
|
|
||||||
|
private String author;
|
||||||
|
|
||||||
|
private String readme;
|
||||||
|
|
||||||
|
public String getName() {
|
||||||
|
return name;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setName(String name) {
|
||||||
|
this.name = name;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getAuthor() {
|
||||||
|
return author;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setAuthor(String author) {
|
||||||
|
this.author = author;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getReadme() {
|
||||||
|
return readme;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setReadme(String readme) {
|
||||||
|
this.readme = readme;
|
||||||
|
}
|
||||||
}
|
}
|
|
@ -7,7 +7,7 @@ import us.codecraft.webmagic.processor.PageProcessor;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
* @since 0.3.2
|
* @since 0.5.1
|
||||||
*/
|
*/
|
||||||
public class GithubRepoPageProcessor implements PageProcessor {
|
public class GithubRepoPageProcessor implements PageProcessor {
|
||||||
|
|
||||||
|
@ -17,13 +17,16 @@ public class GithubRepoPageProcessor implements PageProcessor {
|
||||||
public void process(Page page) {
|
public void process(Page page) {
|
||||||
page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all());
|
page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all());
|
||||||
page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+)").all());
|
page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+)").all());
|
||||||
page.putField("author", page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString());
|
GithubRepo githubRepo = new GithubRepo();
|
||||||
page.putField("name", page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString());
|
githubRepo.setAuthor(page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString());
|
||||||
if (page.getResultItems().get("name")==null){
|
githubRepo.setName(page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString());
|
||||||
|
githubRepo.setReadme(page.getHtml().xpath("//div[@id='readme']/tidyText()").toString());
|
||||||
|
if (githubRepo.getName() == null) {
|
||||||
//skip this page
|
//skip this page
|
||||||
page.setSkip(true);
|
page.setSkip(true);
|
||||||
|
} else {
|
||||||
|
page.putField("repo", githubRepo);
|
||||||
}
|
}
|
||||||
page.putField("readme", page.getHtml().xpath("//div[@id='readme']/tidyText()"));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -0,0 +1,7 @@
|
||||||
|
package us.codecraft.webmagic.samples.pipeline;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author code4crafer@gmail.com
|
||||||
|
*/
|
||||||
|
public class ReplacePipeline {
|
||||||
|
}
|
Loading…
Reference in New Issue