Merge branch 'osc'

master
yihua.huang 2016-01-21 19:33:30 +08:00
commit 0fd4623f0a
30 changed files with 347 additions and 0 deletions

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

BIN
lib/guava-15.0.jar 100644

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

BIN
lib/jdom2-2.0.4.jar 100644

Binary file not shown.

BIN
lib/jedis-2.0.0.jar 100644

Binary file not shown.

Binary file not shown.

Binary file not shown.

BIN
lib/jsoup-1.7.2.jar 100644

Binary file not shown.

BIN
lib/junit-4.7.jar 100644

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

BIN
lib/xsoup-0.1.0.jar 100644

Binary file not shown.

3
make.sh 100644
View File

@ -0,0 +1,3 @@
#!/bin/sh
# Builds the project with Maven, then mirrors ./webmagic-samples/target/lib/ into ./lib/.
#
# Abort on the first failing command. Without this, a failed `mvn clean package`
# (which has already wiped target/) would still be followed by `rsync --delete`,
# which could sync a missing or partial target/lib over the checked-in ./lib/.
set -e

mvn clean package

# --delete removes files from ./lib/ that no longer exist in the source directory.
rsync -avz --delete ./webmagic-samples/target/lib/ ./lib/

View File

@ -0,0 +1,156 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Ant build fragment for the webmagic-core module: declares the module's classpaths and
source paths and provides compile and clean targets.
NOTE(review): the ${project.jdk.*} and ${compiler.*} properties, the "library.maven:..."
path ids, the "ignored.files" and "compiler.resources" patternsets and the
"register.custom.compilers" target are referenced but not defined in this file;
presumably a parent build file that imports this one supplies them. Confirm.
-->
<project name="module_webmagic-core" default="compile.module.webmagic-core">
<!-- Base directory of the module, derived from the location of this build file. -->
<dirname property="module.webmagic-core.basedir" file="${ant.file.module_webmagic-core}"/>
<!-- Per-module JDK settings and compiler arguments; each one defaults to the project-wide value. -->
<property name="module.jdk.home.webmagic-core" value="${project.jdk.home}"/>
<property name="module.jdk.bin.webmagic-core" value="${project.jdk.bin}"/>
<property name="module.jdk.classpath.webmagic-core" value="${project.jdk.classpath}"/>
<property name="compiler.args.webmagic-core" value="${compiler.args}"/>
<!-- Output directories for compiled production classes and compiled test classes. -->
<property name="webmagic-core.output.dir" value="${module.webmagic-core.basedir}/target/classes"/>
<property name="webmagic-core.testoutput.dir" value="${module.webmagic-core.basedir}/target/test-classes"/>
<!-- Bootclasspath handed to both javac2 invocations below; it currently has no entries. -->
<path id="webmagic-core.module.bootclasspath">
<!-- Paths to be included in compilation bootclasspath -->
</path>
<!--
Classpath used to compile the production sources (see compile.module.webmagic-core.production):
the module JDK classpath plus the library path references listed below.
It contains neither junit nor the module's own output directory.
-->
<path id="webmagic-core.module.production.classpath">
<path refid="${module.jdk.classpath.webmagic-core}"/>
<path refid="library.maven:_org.apache.httpcomponents:httpclient:4.2.4.classpath"/>
<path refid="library.maven:_org.apache.httpcomponents:httpcore:4.2.4.classpath"/>
<path refid="library.maven:_commons-logging:commons-logging:1.1.1.classpath"/>
<path refid="library.maven:_commons-codec:commons-codec:1.6.classpath"/>
<path refid="library.maven:_com.google.guava:guava:13.0.1.classpath"/>
<path refid="library.maven:_org.apache.commons:commons-lang3:3.1.classpath"/>
<path refid="library.maven:_log4j:log4j:1.2.17.classpath"/>
<path refid="library.maven:_commons-collections:commons-collections:3.2.1.classpath"/>
<path refid="library.maven:_net.sourceforge.htmlcleaner:htmlcleaner:2.4.classpath"/>
<path refid="library.maven:_org.jdom:jdom2:2.0.4.classpath"/>
<path refid="library.maven:_commons-io:commons-io:1.3.2.classpath"/>
</path>
<!--
Runtime classpath for the production classes: the module's compiled output directory
followed by the same library references as the production compile classpath.
The JDK classpath reference is not part of this path, and no target in this file uses it.
-->
<path id="webmagic-core.runtime.production.module.classpath">
<pathelement location="${webmagic-core.output.dir}"/>
<path refid="library.maven:_org.apache.httpcomponents:httpclient:4.2.4.classpath"/>
<path refid="library.maven:_org.apache.httpcomponents:httpcore:4.2.4.classpath"/>
<path refid="library.maven:_commons-logging:commons-logging:1.1.1.classpath"/>
<path refid="library.maven:_commons-codec:commons-codec:1.6.classpath"/>
<path refid="library.maven:_com.google.guava:guava:13.0.1.classpath"/>
<path refid="library.maven:_org.apache.commons:commons-lang3:3.1.classpath"/>
<path refid="library.maven:_log4j:log4j:1.2.17.classpath"/>
<path refid="library.maven:_commons-collections:commons-collections:3.2.1.classpath"/>
<path refid="library.maven:_net.sourceforge.htmlcleaner:htmlcleaner:2.4.classpath"/>
<path refid="library.maven:_org.jdom:jdom2:2.0.4.classpath"/>
<path refid="library.maven:_commons-io:commons-io:1.3.2.classpath"/>
</path>
<!--
Classpath used to compile the test sources (see compile.module.webmagic-core.tests):
the module JDK classpath, the compiled production classes, and the library references
below. Compared with the production compile classpath it adds the output directory and junit.
-->
<path id="webmagic-core.module.classpath">
<path refid="${module.jdk.classpath.webmagic-core}"/>
<pathelement location="${webmagic-core.output.dir}"/>
<path refid="library.maven:_org.apache.httpcomponents:httpclient:4.2.4.classpath"/>
<path refid="library.maven:_org.apache.httpcomponents:httpcore:4.2.4.classpath"/>
<path refid="library.maven:_commons-logging:commons-logging:1.1.1.classpath"/>
<path refid="library.maven:_commons-codec:commons-codec:1.6.classpath"/>
<path refid="library.maven:_junit:junit:4.7.classpath"/>
<path refid="library.maven:_com.google.guava:guava:13.0.1.classpath"/>
<path refid="library.maven:_org.apache.commons:commons-lang3:3.1.classpath"/>
<path refid="library.maven:_log4j:log4j:1.2.17.classpath"/>
<path refid="library.maven:_commons-collections:commons-collections:3.2.1.classpath"/>
<path refid="library.maven:_net.sourceforge.htmlcleaner:htmlcleaner:2.4.classpath"/>
<path refid="library.maven:_org.jdom:jdom2:2.0.4.classpath"/>
<path refid="library.maven:_commons-io:commons-io:1.3.2.classpath"/>
</path>
<!--
Runtime classpath including tests: the compiled test classes first, then the compiled
production classes, then the library references (junit included).
The JDK classpath reference is not part of this path, and no target in this file uses it.
-->
<path id="webmagic-core.runtime.module.classpath">
<pathelement location="${webmagic-core.testoutput.dir}"/>
<pathelement location="${webmagic-core.output.dir}"/>
<path refid="library.maven:_org.apache.httpcomponents:httpclient:4.2.4.classpath"/>
<path refid="library.maven:_org.apache.httpcomponents:httpcore:4.2.4.classpath"/>
<path refid="library.maven:_commons-logging:commons-logging:1.1.1.classpath"/>
<path refid="library.maven:_commons-codec:commons-codec:1.6.classpath"/>
<path refid="library.maven:_junit:junit:4.7.classpath"/>
<path refid="library.maven:_com.google.guava:guava:13.0.1.classpath"/>
<path refid="library.maven:_org.apache.commons:commons-lang3:3.1.classpath"/>
<path refid="library.maven:_log4j:log4j:1.2.17.classpath"/>
<path refid="library.maven:_commons-collections:commons-collections:3.2.1.classpath"/>
<path refid="library.maven:_net.sourceforge.htmlcleaner:htmlcleaner:2.4.classpath"/>
<path refid="library.maven:_org.jdom:jdom2:2.0.4.classpath"/>
<path refid="library.maven:_commons-io:commons-io:1.3.2.classpath"/>
</path>
<!-- Files excluded from the module: whatever the "ignored.files" patternset (defined outside this file) matches. -->
<patternset id="excluded.from.module.webmagic-core">
<patternset refid="ignored.files"/>
</patternset>
<!-- Files excluded from compilation: currently the same set as the module-level exclusions. -->
<patternset id="excluded.from.compilation.webmagic-core">
<patternset refid="excluded.from.module.webmagic-core"/>
</patternset>
<!-- Production source roots (Java sources and resources), relative to the module base directory. -->
<path id="webmagic-core.module.sourcepath">
<dirset dir="${module.webmagic-core.basedir}">
<include name="src/main/java"/>
<include name="src/main/resources"/>
</dirset>
</path>
<!-- Test source roots (Java sources and resources), relative to the module base directory. -->
<path id="webmagic-core.module.test.sourcepath">
<dirset dir="${module.webmagic-core.basedir}">
<include name="src/test/java"/>
<include name="src/test/resources"/>
</dirset>
</path>
<!-- Default target: has no tasks of its own, it only depends on the production and test compile targets. -->
<target name="compile.module.webmagic-core" depends="compile.module.webmagic-core.production,compile.module.webmagic-core.tests" description="Compile module webmagic-core"/>
<!--
Compiles the production sources into ${webmagic-core.output.dir} and then copies resources there.
NOTE(review): javac2 is not a core Ant task; presumably the register.custom.compilers
target (defined outside this file) registers it. Confirm.
-->
<target name="compile.module.webmagic-core.production" depends="register.custom.compilers" description="Compile module webmagic-core; production classes">
<mkdir dir="${webmagic-core.output.dir}"/>
<!-- Forked compile using the javac executable from the module JDK bin directory. -->
<javac2 destdir="${webmagic-core.output.dir}" debug="${compiler.debug}" nowarn="${compiler.generate.no.warnings}" memorymaximumsize="${compiler.max.memory}" fork="true" executable="${module.jdk.bin.webmagic-core}/javac">
<compilerarg line="${compiler.args.webmagic-core}"/>
<bootclasspath refid="webmagic-core.module.bootclasspath"/>
<classpath refid="webmagic-core.module.production.classpath"/>
<src refid="webmagic-core.module.sourcepath"/>
<patternset refid="excluded.from.compilation.webmagic-core"/>
</javac2>
<!-- Copy files matching the "compiler.resources" patternset from both production source roots; the type selector limits the copy to regular files. -->
<copy todir="${webmagic-core.output.dir}">
<fileset dir="${module.webmagic-core.basedir}/src/main/java">
<patternset refid="compiler.resources"/>
<type type="file"/>
</fileset>
<fileset dir="${module.webmagic-core.basedir}/src/main/resources">
<patternset refid="compiler.resources"/>
<type type="file"/>
</fileset>
</copy>
</target>
<!--
Compiles the test sources into ${webmagic-core.testoutput.dir} and then copies test resources there.
Depends on the production compile target; the target body is skipped when the
skip.tests property is set (unless attribute).
-->
<target name="compile.module.webmagic-core.tests" depends="register.custom.compilers,compile.module.webmagic-core.production" description="compile module webmagic-core; test classes" unless="skip.tests">
<mkdir dir="${webmagic-core.testoutput.dir}"/>
<!-- Same compiler settings as the production compile, but with the test classpath and test source roots. -->
<javac2 destdir="${webmagic-core.testoutput.dir}" debug="${compiler.debug}" nowarn="${compiler.generate.no.warnings}" memorymaximumsize="${compiler.max.memory}" fork="true" executable="${module.jdk.bin.webmagic-core}/javac">
<compilerarg line="${compiler.args.webmagic-core}"/>
<bootclasspath refid="webmagic-core.module.bootclasspath"/>
<classpath refid="webmagic-core.module.classpath"/>
<src refid="webmagic-core.module.test.sourcepath"/>
<patternset refid="excluded.from.compilation.webmagic-core"/>
</javac2>
<!-- Copy files matching the "compiler.resources" patternset from both test source roots; the type selector limits the copy to regular files. -->
<copy todir="${webmagic-core.testoutput.dir}">
<fileset dir="${module.webmagic-core.basedir}/src/test/java">
<patternset refid="compiler.resources"/>
<type type="file"/>
</fileset>
<fileset dir="${module.webmagic-core.basedir}/src/test/resources">
<patternset refid="compiler.resources"/>
<type type="file"/>
</fileset>
</copy>
</target>
<!-- Deletes the production and test output directories; nothing else under target/ is removed by this target. -->
<target name="clean.module.webmagic-core" description="cleanup module">
<delete dir="${webmagic-core.output.dir}"/>
<delete dir="${webmagic-core.testoutput.dir}"/>
</target>
</project>

View File

@ -0,0 +1,37 @@
package us.codecraft.webmagic.model.samples;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.model.AfterExtractor;
import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.model.annotation.TargetUrl;
import java.util.List;
/**
 * Sample annotation-driven model that collects every {@code DP.data({...});} snippet
 * found on a page and reports the page URL on stderr when the snippets look unusual
 * (more than one snippet, or a first snippet longer than 100 characters).
 *
 * @author yihua.huang@dianping.com <br>
 * Date: 13-8-13 <br>
 * Time: 10:13 <br>
 */
@TargetUrl("http://*.alpha.dp/*")
public class DianpingFtlDataScanner implements AfterExtractor {

    /** All regex matches of the {@code DP.data({...});} call on the current page. */
    @ExtractBy(value = "(DP\\.data\\(\\{.*\\}\\));", type = ExtractBy.Type.Regex, notNull = true, multi = true)
    private List<String> data;

    /**
     * Prints the page URL to stderr once if more than one snippet was extracted, and
     * once more if the first snippet is longer than 100 characters. Both conditions
     * can hold for the same page, in which case the URL is printed twice.
     *
     * @param page the page the snippets were extracted from
     */
    @Override
    public void afterProcess(Page page) {
        final int snippetCount = data.size();
        if (snippetCount > 1) {
            System.err.println(page.getUrl());
        }
        if (snippetCount == 0) {
            return;
        }
        final String firstSnippet = data.get(0);
        if (firstSnippet.length() > 100) {
            System.err.println(page.getUrl());
        }
    }

    /**
     * Starts the scan from {@code http://w.alpha.dp/} with no sleep time and 5 threads.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        Site site = Site.me().addStartUrl("http://w.alpha.dp/").setSleepTime(0);
        OOSpider.create(site, DianpingFtlDataScanner.class)
                .thread(5)
                .run();
    }
}

View File

@ -0,0 +1,46 @@
package us.codecraft.webmagic.samples;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.PlainText;
import java.util.List;
/**
 * Sample processor for www.diaoyuweng.com: follows the paginated thread list of one
 * user (uid 88304) and extracts title, content, date and id from each thread page.
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-4-21
 * Time: 8:08
 */
public class DiaoyuwengProcessor implements PageProcessor {

    /**
     * Matches a thread detail URL; group 1 is the numeric thread id.
     * The JDK regex types are referenced by fully-qualified name so that the file's
     * import list does not have to change.
     */
    private static final java.util.regex.Pattern THREAD_URL =
            java.util.regex.Pattern.compile("http://www\\.diaoyuweng\\.com/thread-(\\d+)-1-1\\.html");

    /** Lazily created site configuration, see {@link #getSite()}. */
    private Site site;

    /**
     * Queues the list pages and thread pages linked from {@code page}; if {@code page}
     * itself is a thread page, also stores its title, content, date and id.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        List<String> listPageLinks = page.getHtml().links().regex("(http://www\\.diaoyuweng\\.com/home\\.php\\?mod=space&uid=88304&do=thread&view=me&type=thread&order=dateline&from=space&page=\\d+)").all();
        page.addTargetRequests(listPageLinks);
        // The dot before "html" is escaped so that it only matches a literal '.'.
        List<String> threadLinks = page.getHtml().links().regex("(http://www\\.diaoyuweng\\.com/thread-\\d+-1-1\\.html)").all();
        page.addTargetRequests(threadLinks);

        // A plain contains("thread") check is also true for the list pages
        // ("...do=thread&view=me&type=thread..."), so match the thread URL shape instead.
        java.util.regex.Matcher threadUrl = THREAD_URL.matcher(page.getUrl().toString());
        if (!threadUrl.find()) {
            return;
        }
        page.putField("title", page.getHtml().xpath("//a[@id='thread_subject']"));
        page.putField("content", page.getHtml().xpath("//div[@class='pcb']//tbody/tidyText()"));
        page.putField("date", page.getHtml().regex("发表于 (\\d{4}-\\d+-\\d+ \\d+:\\d+:\\d+)"));
        // The stored id is the thread id prefixed with "1000".
        page.putField("id", new PlainText("1000" + threadUrl.group(1)));
    }

    /**
     * Returns the site configuration, creating it on first use.
     *
     * @return the site settings (domain, start URL, user agent, GBK charset, 500 sleep time)
     */
    @Override
    public Site getSite() {
        if (site == null) {
            site = Site.me()
                    .setDomain("www.diaoyuweng.com")
                    .addStartUrl("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space")
                    .setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31")
                    .setCharset("GBK")
                    .setSleepTime(500);
        }
        return site;
    }

    /**
     * Runs a spider driven by this processor.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        Spider.create(new DiaoyuwengProcessor()).run();
    }
}

View File

@ -0,0 +1,34 @@
package us.codecraft.webmagic.samples;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.RedisScheduler;
import java.util.List;
/**
 * Sample processor that follows every link containing {@code /yewu/} and stores the
 * page title and the {@code <dd>} elements of each visited page.
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-4-21
 * Time: 1:48
 */
public class F58PageProcesser implements PageProcessor {

    /**
     * Runs a spider driven by this processor, using a {@link RedisScheduler} created
     * with {@code "localhost"}.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        F58PageProcesser processor = new F58PageProcesser();
        Spider.create(processor)
                .setScheduler(new RedisScheduler("localhost"))
                .run();
    }

    /**
     * Builds a new site configuration on every call.
     *
     * @return site settings with domain, start URL and a cycle retry count of 2
     */
    @Override
    public Site getSite() {
        return Site.me()
                .setDomain("sh.58.com")
                .addStartUrl("http://sh1.51a8.com/")
                .setCycleRetryTimes(2);
    }

    /**
     * Queues all {@code /yewu/} links found on the page, then records its title and body.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        List<String> followUpLinks = page.getHtml().links().regex(".*/yewu/.*").all();
        page.addTargetRequests(followUpLinks);
        page.putField("title", page.getHtml().regex("<title>(.*)</title>"));
        page.putField("body", page.getHtml().xpath("//dd"));
    }
}

View File

@ -27,4 +27,5 @@ public class HuxiuProcessor implements PageProcessor {
/**
 * Runs a spider driven by a new {@code HuxiuProcessor}.
 *
 * @param args ignored
 */
public static void main(String[] args) {
    HuxiuProcessor processor = new HuxiuProcessor();
    Spider.create(processor).run();
}
}

View File

@ -0,0 +1,32 @@
package us.codecraft.webmagic.samples;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
/**
 * Sample processor for kaichiba.com shop pages: walks shop ids upwards one at a time
 * (each page queues the next id) and stores the page title and food item names.
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-5-20
 * Time: 5:31
 */
public class KaichibaProcessor implements PageProcessor {

    /**
     * Queues the shop page with the next higher id, then records the title and the
     * food items of the current page.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        String shopId = page.getUrl().regex("shop/(\\d+)").toString();
        // NOTE(review): presumably toString() yields null when the URL has no shop id
        // (e.g. after a redirect); confirm against the selector implementation. Skipping
        // the follow-up request in that case avoids a NumberFormatException that would
        // abort extraction for the page.
        if (shopId != null) {
            // parseLong avoids the needless boxing of Integer.valueOf, and long
            // arithmetic keeps "+ 1" from wrapping at Integer.MAX_VALUE.
            long nextShopId = Long.parseLong(shopId) + 1;
            page.addTargetRequest("http://kaichiba.com/shop/" + nextShopId);
        }
        page.putField("title", page.getHtml().xpath("//Title"));
        // Strip leading/trailing whitespace and any <span>...</span> from the item markup.
        page.putField("items", page.getHtml().xpath("//li[@class=\"foodTitle\"]").replace("^\\s+", "").replace("\\s+$", "").replace("<span>.*?</span>", ""));
    }

    /**
     * Builds a new site configuration on every call.
     *
     * @return site settings with domain, start URL, utf-8 charset and a browser user agent
     */
    @Override
    public Site getSite() {
        return Site.me()
                .setDomain("kaichiba.com")
                .addStartUrl("http://kaichiba.com/shop/41725781")
                .setCharset("utf-8")
                .setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
    }

    /**
     * Runs a spider driven by this processor.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        Spider.create(new KaichibaProcessor()).run();
    }
}

View File

@ -0,0 +1,38 @@
package us.codecraft.webmagic.samples;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import java.util.List;
/**
 * Sample processor for meican.com: follows at most two area links per page plus every
 * restaurant link, and stores dish names and prices from each visited page.
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-5-20
 * Time: 5:31
 */
public class MeicanProcessor implements PageProcessor {

    /**
     * Runs a spider driven by this processor.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        MeicanProcessor processor = new MeicanProcessor();
        Spider.create(processor).run();
    }

    /**
     * Builds a new site configuration on every call.
     *
     * @return site settings with domain, start URL, utf-8 charset and a browser user agent
     */
    @Override
    public Site getSite() {
        return Site.me()
                .setDomain("meican.com")
                .addStartUrl("http://www.meican.com/shanghai/districts")
                .setCharset("utf-8")
                .setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
    }

    /**
     * Queues follow-up requests and records the dishes listed on the page.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        // Only the first two area links are followed.
        List<String> areaLinks = page.getHtml().xpath("//a[@class=\"area_link flat_btn\"]/@href").all();
        List<String> limitedAreaLinks = areaLinks.size() > 2 ? areaLinks.subList(0, 2) : areaLinks;
        page.addTargetRequests(limitedAreaLinks);

        List<String> restaurantLinks = page.getHtml().links().regex("(.*/restaurant/[^#]+)").all();
        page.addTargetRequests(restaurantLinks);

        page.putField("items", page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"name\"]/text()"));
        page.putField("prices", page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"price_outer\"]/span[@class=\"price\"]/text()"));
    }
}