invite kotlin experimental
parent
3c653d941a
commit
818a2b2408
|
@ -9,6 +9,9 @@
|
||||||
|
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<artifactId>webmagic-scripts</artifactId>
|
<artifactId>webmagic-scripts</artifactId>
|
||||||
|
<properties>
|
||||||
|
<kotlin.version>1.1.2-2</kotlin.version>
|
||||||
|
</properties>
|
||||||
|
|
||||||
<dependencies>
|
<dependencies>
|
||||||
<dependency>
|
<dependency>
|
||||||
|
@ -16,6 +19,12 @@
|
||||||
<artifactId>jruby</artifactId>
|
<artifactId>jruby</artifactId>
|
||||||
<version>1.7.6</version>
|
<version>1.7.6</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.jetbrains.kotlin</groupId>
|
||||||
|
<artifactId>kotlin-stdlib</artifactId>
|
||||||
|
<version>${kotlin.version}</version>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.codehaus.groovy</groupId>
|
<groupId>org.codehaus.groovy</groupId>
|
||||||
<artifactId>groovy-all</artifactId>
|
<artifactId>groovy-all</artifactId>
|
||||||
|
@ -48,6 +57,7 @@
|
||||||
</dependencies>
|
</dependencies>
|
||||||
|
|
||||||
<build>
|
<build>
|
||||||
|
<sourceDirectory>${project.basedir}/src/main/kotlin,${project.basedir}/src/main/groovy,${project.basedir}/src/main/java</sourceDirectory>
|
||||||
<plugins>
|
<plugins>
|
||||||
<plugin>
|
<plugin>
|
||||||
<artifactId>maven-compiler-plugin</artifactId>
|
<artifactId>maven-compiler-plugin</artifactId>
|
||||||
|
|
|
@ -0,0 +1,40 @@
|
||||||
|
|
||||||
|
import us.codecraft.webmagic.Page
|
||||||
|
import us.codecraft.webmagic.Site
|
||||||
|
import us.codecraft.webmagic.Spider
|
||||||
|
import us.codecraft.webmagic.processor.PageProcessor
|
||||||
|
import us.codecraft.webmagic.processor.example.GithubRepoPageProcessor
|
||||||
|
|
||||||
|
/**
 * Page processor that crawls GitHub pages and extracts basic repository data.
 *
 * For every processed page it queues further GitHub links (repository pages
 * and user pages) and stores the `author`, `name` and `readme` result fields.
 * Pages for which no `name` could be extracted are marked as skipped.
 *
 * @author code4crafter@gmail.com
 * Date: 2017/5/31
 * Time: 11:33 PM
 */
class GithubRepoPageProcessor : PageProcessor {

    // Crawl settings: 3 retries, sleep time 1000, timeout 10000
    // (presumably milliseconds -- confirm against the Site API).
    private val site = Site.me().setRetryTimes(3).setSleepTime(1000).setTimeOut(10000)

    /**
     * Queues follow-up links found on [page] and records the extracted fields.
     *
     * @param page the downloaded page to extract links and fields from
     */
    override fun process(page: Page) {
        val links = page.html.links()
        page.addTargetRequests(links.regex(REPO_URL_PATTERN).all())
        page.addTargetRequests(links.regex(USER_URL_PATTERN).all())

        page.putField("author", page.url.regex("https://github\\.com/(\\w+)/.*").toString())
        page.putField("name", page.html.xpath("//h1[@class='public']/strong/a/text()").toString())
        if (page.resultItems.get<Any>("name") == null) {
            // No repository name was extracted from this page: skip it.
            page.setSkip(true)
        }
        page.putField("readme", page.html.xpath("//div[@id='readme']/tidyText()"))
    }

    /** Returns the crawl settings used by this processor. */
    override fun getSite(): Site = site

    companion object {
        /** Captures `https://github.com/<owner>/<repo>` links. */
        private const val REPO_URL_PATTERN = "(https://github\\.com/[\\w\\-]+/[\\w\\-]+)"

        /**
         * Captures `https://github.com/<owner>` links.
         *
         * The `+` quantifier is required: without it the captured URL is cut
         * off after the first character of the owner name.
         */
        private const val USER_URL_PATTERN = "(https://github\\.com/[\\w\\-]+)"

        /**
         * Starts a crawl from `https://github.com/code4craft` using 5 threads.
         *
         * NOTE(review): this file also imports
         * `us.codecraft.webmagic.processor.example.GithubRepoPageProcessor`,
         * which has the same simple name as this class -- confirm that the
         * constructor call below resolves to this Kotlin class and not to the
         * imported one.
         */
        @JvmStatic fun main(args: Array<String>) {
            Spider.create(GithubRepoPageProcessor())
                .addUrl("https://github.com/code4craft")
                .thread(5)
                .run()
        }
    }
}
|
Loading…
Reference in New Issue