/*
 * Recovered from a flattened git diff that adds webmagic-scripts/src/main/kotlin/Github.kt.
 *
 * The same change also touched webmagic-scripts/pom.xml. Its XML markup did not survive, but the
 * remaining values suggest the following (NOTE(review): confirm against the real pom.xml):
 *   - a Kotlin version value of 1.1.2-2, referenced elsewhere as ${kotlin.version};
 *   - a dependency on org.jetbrains.kotlin:kotlin-stdlib at ${kotlin.version};
 *   - a source-directory value of
 *     ${project.basedir}/src/main/kotlin,${project.basedir}/src/main/groovy,${project.basedir}/src/main/java
 */

import us.codecraft.webmagic.Page
import us.codecraft.webmagic.Site
import us.codecraft.webmagic.Spider
import us.codecraft.webmagic.processor.PageProcessor
// Aliased on purpose: this import has the same simple name as the class declared below, and an
// explicit import is expected to take precedence over a same-file declaration, which would make
// `main` launch the imported class instead of this one. NOTE(review): the import looks unused and
// can probably be dropped entirely.
import us.codecraft.webmagic.processor.example.GithubRepoPageProcessor as ExampleGithubRepoPageProcessor

/**
 * [PageProcessor] that follows GitHub account/repository links and records the `author`, `name`
 * and `readme` fields for each visited page.
 *
 * Pages for which no `name` can be extracted are marked as skipped.
 *
 * @author code4crafter@gmail.com
 * Date: 2017/5/31
 * Time: 11:33 PM
 */
class GithubRepoPageProcessor : PageProcessor {

    // Crawl settings handed back unchanged from getSite().
    private val site = Site.me().setRetryTimes(3).setSleepTime(1000).setTimeOut(10000)

    /**
     * Queues further GitHub links found on [page] and stores the extracted fields on it.
     *
     * @param page the downloaded page to inspect; it is mutated (target requests, fields, skip flag).
     */
    override fun process(page: Page) {
        val links = page.html.links()
        page.addTargetRequests(links.regex(REPO_LINK_PATTERN).all())
        page.addTargetRequests(links.regex(ACCOUNT_LINK_PATTERN).all())

        page.putField("author", page.url.regex(AUTHOR_PATTERN).toString())
        page.putField("name", page.html.xpath("//h1[@class='public']/strong/a/text()").toString())
        // The explicit type argument is required: nothing else lets the compiler infer the
        // generic return type of ResultItems.get.
        if (page.resultItems.get<Any?>("name") == null) {
            // No repository name on this page, so flag it as skipped.
            page.setSkip(true)
        }
        page.putField("readme", page.html.xpath("//div[@id='readme']/tidyText()"))
    }

    /** Returns the [Site] settings built when this processor was created. */
    override fun getSite(): Site {
        return site
    }

    companion object {
        /** Links of the form `https://github.com/<account>/<repository>`. */
        private const val REPO_LINK_PATTERN = """(https://github\.com/[\w\-]+/[\w\-]+)"""

        /**
         * Links of the form `https://github.com/<account>`.
         * The `+` quantifier matters: without it the capture group keeps only the first
         * character after the host.
         */
        private const val ACCOUNT_LINK_PATTERN = """(https://github\.com/[\w\-]+)"""

        /** Captures the account segment of the page URL; accepts hyphens like the link patterns. */
        private const val AUTHOR_PATTERN = """https://github\.com/([\w\-]+)/.*"""

        /**
         * Starts a crawl at `https://github.com/code4craft` using five threads.
         *
         * @param args command-line arguments (unused).
         */
        @JvmStatic
        fun main(args: Array<String>) {
            Spider.create(GithubRepoPageProcessor()).addUrl("https://github.com/code4craft").thread(5).run()
        }
    }
}