Merge branch 'osc'
commit
0fd4623f0a
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -0,0 +1,3 @@
|
|||
#!/bin/sh
# Runs the Maven build and then mirrors the jars found in
# ./webmagic-samples/target/lib/ into ./lib/.
#
# Abort on the first failing command: without this, a failed build would still
# fall through to the destructive `rsync --delete` step below.
set -e

mvn clean package
rsync -avz --delete ./webmagic-samples/target/lib/ ./lib/
|
|
@ -0,0 +1,156 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project name="module_webmagic-core" default="compile.module.webmagic-core">
|
||||
<dirname property="module.webmagic-core.basedir" file="${ant.file.module_webmagic-core}"/>
|
||||
|
||||
<property name="module.jdk.home.webmagic-core" value="${project.jdk.home}"/>
|
||||
<property name="module.jdk.bin.webmagic-core" value="${project.jdk.bin}"/>
|
||||
<property name="module.jdk.classpath.webmagic-core" value="${project.jdk.classpath}"/>
|
||||
|
||||
<property name="compiler.args.webmagic-core" value="${compiler.args}"/>
|
||||
|
||||
<property name="webmagic-core.output.dir" value="${module.webmagic-core.basedir}/target/classes"/>
|
||||
<property name="webmagic-core.testoutput.dir" value="${module.webmagic-core.basedir}/target/test-classes"/>
|
||||
|
||||
<path id="webmagic-core.module.bootclasspath">
|
||||
<!-- Paths to be included in compilation bootclasspath -->
|
||||
</path>
|
||||
|
||||
<path id="webmagic-core.module.production.classpath">
|
||||
<path refid="${module.jdk.classpath.webmagic-core}"/>
|
||||
<path refid="library.maven:_org.apache.httpcomponents:httpclient:4.2.4.classpath"/>
|
||||
<path refid="library.maven:_org.apache.httpcomponents:httpcore:4.2.4.classpath"/>
|
||||
<path refid="library.maven:_commons-logging:commons-logging:1.1.1.classpath"/>
|
||||
<path refid="library.maven:_commons-codec:commons-codec:1.6.classpath"/>
|
||||
<path refid="library.maven:_com.google.guava:guava:13.0.1.classpath"/>
|
||||
<path refid="library.maven:_org.apache.commons:commons-lang3:3.1.classpath"/>
|
||||
<path refid="library.maven:_log4j:log4j:1.2.17.classpath"/>
|
||||
<path refid="library.maven:_commons-collections:commons-collections:3.2.1.classpath"/>
|
||||
<path refid="library.maven:_net.sourceforge.htmlcleaner:htmlcleaner:2.4.classpath"/>
|
||||
<path refid="library.maven:_org.jdom:jdom2:2.0.4.classpath"/>
|
||||
<path refid="library.maven:_commons-io:commons-io:1.3.2.classpath"/>
|
||||
</path>
|
||||
|
||||
<path id="webmagic-core.runtime.production.module.classpath">
|
||||
<pathelement location="${webmagic-core.output.dir}"/>
|
||||
<path refid="library.maven:_org.apache.httpcomponents:httpclient:4.2.4.classpath"/>
|
||||
<path refid="library.maven:_org.apache.httpcomponents:httpcore:4.2.4.classpath"/>
|
||||
<path refid="library.maven:_commons-logging:commons-logging:1.1.1.classpath"/>
|
||||
<path refid="library.maven:_commons-codec:commons-codec:1.6.classpath"/>
|
||||
<path refid="library.maven:_com.google.guava:guava:13.0.1.classpath"/>
|
||||
<path refid="library.maven:_org.apache.commons:commons-lang3:3.1.classpath"/>
|
||||
<path refid="library.maven:_log4j:log4j:1.2.17.classpath"/>
|
||||
<path refid="library.maven:_commons-collections:commons-collections:3.2.1.classpath"/>
|
||||
<path refid="library.maven:_net.sourceforge.htmlcleaner:htmlcleaner:2.4.classpath"/>
|
||||
<path refid="library.maven:_org.jdom:jdom2:2.0.4.classpath"/>
|
||||
<path refid="library.maven:_commons-io:commons-io:1.3.2.classpath"/>
|
||||
</path>
|
||||
|
||||
<path id="webmagic-core.module.classpath">
|
||||
<path refid="${module.jdk.classpath.webmagic-core}"/>
|
||||
<pathelement location="${webmagic-core.output.dir}"/>
|
||||
<path refid="library.maven:_org.apache.httpcomponents:httpclient:4.2.4.classpath"/>
|
||||
<path refid="library.maven:_org.apache.httpcomponents:httpcore:4.2.4.classpath"/>
|
||||
<path refid="library.maven:_commons-logging:commons-logging:1.1.1.classpath"/>
|
||||
<path refid="library.maven:_commons-codec:commons-codec:1.6.classpath"/>
|
||||
<path refid="library.maven:_junit:junit:4.7.classpath"/>
|
||||
<path refid="library.maven:_com.google.guava:guava:13.0.1.classpath"/>
|
||||
<path refid="library.maven:_org.apache.commons:commons-lang3:3.1.classpath"/>
|
||||
<path refid="library.maven:_log4j:log4j:1.2.17.classpath"/>
|
||||
<path refid="library.maven:_commons-collections:commons-collections:3.2.1.classpath"/>
|
||||
<path refid="library.maven:_net.sourceforge.htmlcleaner:htmlcleaner:2.4.classpath"/>
|
||||
<path refid="library.maven:_org.jdom:jdom2:2.0.4.classpath"/>
|
||||
<path refid="library.maven:_commons-io:commons-io:1.3.2.classpath"/>
|
||||
</path>
|
||||
|
||||
<path id="webmagic-core.runtime.module.classpath">
|
||||
<pathelement location="${webmagic-core.testoutput.dir}"/>
|
||||
<pathelement location="${webmagic-core.output.dir}"/>
|
||||
<path refid="library.maven:_org.apache.httpcomponents:httpclient:4.2.4.classpath"/>
|
||||
<path refid="library.maven:_org.apache.httpcomponents:httpcore:4.2.4.classpath"/>
|
||||
<path refid="library.maven:_commons-logging:commons-logging:1.1.1.classpath"/>
|
||||
<path refid="library.maven:_commons-codec:commons-codec:1.6.classpath"/>
|
||||
<path refid="library.maven:_junit:junit:4.7.classpath"/>
|
||||
<path refid="library.maven:_com.google.guava:guava:13.0.1.classpath"/>
|
||||
<path refid="library.maven:_org.apache.commons:commons-lang3:3.1.classpath"/>
|
||||
<path refid="library.maven:_log4j:log4j:1.2.17.classpath"/>
|
||||
<path refid="library.maven:_commons-collections:commons-collections:3.2.1.classpath"/>
|
||||
<path refid="library.maven:_net.sourceforge.htmlcleaner:htmlcleaner:2.4.classpath"/>
|
||||
<path refid="library.maven:_org.jdom:jdom2:2.0.4.classpath"/>
|
||||
<path refid="library.maven:_commons-io:commons-io:1.3.2.classpath"/>
|
||||
</path>
|
||||
|
||||
|
||||
<patternset id="excluded.from.module.webmagic-core">
|
||||
<patternset refid="ignored.files"/>
|
||||
</patternset>
|
||||
|
||||
<patternset id="excluded.from.compilation.webmagic-core">
|
||||
<patternset refid="excluded.from.module.webmagic-core"/>
|
||||
</patternset>
|
||||
|
||||
<path id="webmagic-core.module.sourcepath">
|
||||
<dirset dir="${module.webmagic-core.basedir}">
|
||||
<include name="src/main/java"/>
|
||||
<include name="src/main/resources"/>
|
||||
</dirset>
|
||||
</path>
|
||||
|
||||
<path id="webmagic-core.module.test.sourcepath">
|
||||
<dirset dir="${module.webmagic-core.basedir}">
|
||||
<include name="src/test/java"/>
|
||||
<include name="src/test/resources"/>
|
||||
</dirset>
|
||||
</path>
|
||||
|
||||
|
||||
<target name="compile.module.webmagic-core" depends="compile.module.webmagic-core.production,compile.module.webmagic-core.tests" description="Compile module webmagic-core"/>
|
||||
|
||||
<target name="compile.module.webmagic-core.production" depends="register.custom.compilers" description="Compile module webmagic-core; production classes">
|
||||
<mkdir dir="${webmagic-core.output.dir}"/>
|
||||
<javac2 destdir="${webmagic-core.output.dir}" debug="${compiler.debug}" nowarn="${compiler.generate.no.warnings}" memorymaximumsize="${compiler.max.memory}" fork="true" executable="${module.jdk.bin.webmagic-core}/javac">
|
||||
<compilerarg line="${compiler.args.webmagic-core}"/>
|
||||
<bootclasspath refid="webmagic-core.module.bootclasspath"/>
|
||||
<classpath refid="webmagic-core.module.production.classpath"/>
|
||||
<src refid="webmagic-core.module.sourcepath"/>
|
||||
<patternset refid="excluded.from.compilation.webmagic-core"/>
|
||||
</javac2>
|
||||
|
||||
<copy todir="${webmagic-core.output.dir}">
|
||||
<fileset dir="${module.webmagic-core.basedir}/src/main/java">
|
||||
<patternset refid="compiler.resources"/>
|
||||
<type type="file"/>
|
||||
</fileset>
|
||||
<fileset dir="${module.webmagic-core.basedir}/src/main/resources">
|
||||
<patternset refid="compiler.resources"/>
|
||||
<type type="file"/>
|
||||
</fileset>
|
||||
</copy>
|
||||
</target>
|
||||
|
||||
<target name="compile.module.webmagic-core.tests" depends="register.custom.compilers,compile.module.webmagic-core.production" description="compile module webmagic-core; test classes" unless="skip.tests">
|
||||
<mkdir dir="${webmagic-core.testoutput.dir}"/>
|
||||
<javac2 destdir="${webmagic-core.testoutput.dir}" debug="${compiler.debug}" nowarn="${compiler.generate.no.warnings}" memorymaximumsize="${compiler.max.memory}" fork="true" executable="${module.jdk.bin.webmagic-core}/javac">
|
||||
<compilerarg line="${compiler.args.webmagic-core}"/>
|
||||
<bootclasspath refid="webmagic-core.module.bootclasspath"/>
|
||||
<classpath refid="webmagic-core.module.classpath"/>
|
||||
<src refid="webmagic-core.module.test.sourcepath"/>
|
||||
<patternset refid="excluded.from.compilation.webmagic-core"/>
|
||||
</javac2>
|
||||
|
||||
<copy todir="${webmagic-core.testoutput.dir}">
|
||||
<fileset dir="${module.webmagic-core.basedir}/src/test/java">
|
||||
<patternset refid="compiler.resources"/>
|
||||
<type type="file"/>
|
||||
</fileset>
|
||||
<fileset dir="${module.webmagic-core.basedir}/src/test/resources">
|
||||
<patternset refid="compiler.resources"/>
|
||||
<type type="file"/>
|
||||
</fileset>
|
||||
</copy>
|
||||
</target>
|
||||
|
||||
<target name="clean.module.webmagic-core" description="cleanup module">
|
||||
<delete dir="${webmagic-core.output.dir}"/>
|
||||
<delete dir="${webmagic-core.testoutput.dir}"/>
|
||||
</target>
|
||||
</project>
|
|
@ -0,0 +1,37 @@
|
|||
package us.codecraft.webmagic.model.samples;
|
||||
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.model.AfterExtractor;
|
||||
import us.codecraft.webmagic.model.OOSpider;
|
||||
import us.codecraft.webmagic.model.annotation.ExtractBy;
|
||||
import us.codecraft.webmagic.model.annotation.TargetUrl;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* @author yihua.huang@dianping.com <br>
|
||||
* Date: 13-8-13 <br>
|
||||
* Time: 上午10:13 <br>
|
||||
*/
|
||||
@TargetUrl("http://*.alpha.dp/*")
|
||||
public class DianpingFtlDataScanner implements AfterExtractor {
|
||||
|
||||
@ExtractBy(value = "(DP\\.data\\(\\{.*\\}\\));", type = ExtractBy.Type.Regex, notNull = true, multi = true)
|
||||
private List<String> data;
|
||||
|
||||
public static void main(String[] args) {
|
||||
OOSpider.create(Site.me().addStartUrl("http://w.alpha.dp/").setSleepTime(0), DianpingFtlDataScanner.class)
|
||||
.thread(5).run();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void afterProcess(Page page) {
|
||||
if (data.size() > 1) {
|
||||
System.err.println(page.getUrl());
|
||||
}
|
||||
if (data.size() > 0 && data.get(0).length() > 100) {
|
||||
System.err.println(page.getUrl());
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,46 @@
|
|||
package us.codecraft.webmagic.samples;
|
||||
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.Spider;
|
||||
import us.codecraft.webmagic.processor.PageProcessor;
|
||||
import us.codecraft.webmagic.selector.PlainText;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-4-21
|
||||
* Time: 下午8:08
|
||||
*/
|
||||
public class DiaoyuwengProcessor implements PageProcessor {
|
||||
|
||||
private Site site;
|
||||
|
||||
@Override
|
||||
public void process(Page page) {
|
||||
List<String> requests = page.getHtml().links().regex("(http://www\\.diaoyuweng\\.com/home\\.php\\?mod=space&uid=88304&do=thread&view=me&type=thread&order=dateline&from=space&page=\\d+)").all();
|
||||
page.addTargetRequests(requests);
|
||||
requests = page.getHtml().links().regex("(http://www\\.diaoyuweng\\.com/thread-\\d+-1-1.html)").all();
|
||||
page.addTargetRequests(requests);
|
||||
if (page.getUrl().toString().contains("thread")){
|
||||
page.putField("title", page.getHtml().xpath("//a[@id='thread_subject']"));
|
||||
page.putField("content", page.getHtml().xpath("//div[@class='pcb']//tbody/tidyText()"));
|
||||
page.putField("date",page.getHtml().regex("发表于 (\\d{4}-\\d+-\\d+ \\d+:\\d+:\\d+)"));
|
||||
page.putField("id",new PlainText("1000"+page.getUrl().regex("http://www\\.diaoyuweng\\.com/thread-(\\d+)-1-1.html").toString()));
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public Site getSite() {
|
||||
if (site==null){
|
||||
site= Site.me().setDomain("www.diaoyuweng.com").addStartUrl("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space").
|
||||
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31").setCharset("GBK").setSleepTime(500);
|
||||
}
|
||||
return site;
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
Spider.create(new DiaoyuwengProcessor()).run();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,34 @@
|
|||
package us.codecraft.webmagic.samples;
|
||||
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.Spider;
|
||||
import us.codecraft.webmagic.processor.PageProcessor;
|
||||
import us.codecraft.webmagic.scheduler.RedisScheduler;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-4-21
|
||||
* Time: 下午1:48
|
||||
*/
|
||||
public class F58PageProcesser implements PageProcessor {
|
||||
|
||||
@Override
|
||||
public void process(Page page) {
|
||||
List<String> strings = page.getHtml().links().regex(".*/yewu/.*").all();
|
||||
page.addTargetRequests(strings);
|
||||
page.putField("title",page.getHtml().regex("<title>(.*)</title>"));
|
||||
page.putField("body",page.getHtml().xpath("//dd"));
|
||||
}
|
||||
|
||||
@Override
|
||||
public Site getSite() {
|
||||
return Site.me().setDomain("sh.58.com").addStartUrl("http://sh1.51a8.com/").setCycleRetryTimes(2); //To change body of implemented methods use File | Settings | File Templates.
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
Spider.create(new F58PageProcesser()).setScheduler(new RedisScheduler("localhost")).run();
|
||||
}
|
||||
}
|
|
@ -27,4 +27,5 @@ public class HuxiuProcessor implements PageProcessor {
|
|||
public static void main(String[] args) {
|
||||
Spider.create(new HuxiuProcessor()).run();
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -0,0 +1,32 @@
|
|||
package us.codecraft.webmagic.samples;
|
||||
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.Spider;
|
||||
import us.codecraft.webmagic.processor.PageProcessor;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-5-20
|
||||
* Time: 下午5:31
|
||||
*/
|
||||
public class KaichibaProcessor implements PageProcessor {
|
||||
@Override
|
||||
public void process(Page page) {
|
||||
//http://progressdaily.diandian.com/post/2013-01-24/40046867275
|
||||
int i = Integer.valueOf(page.getUrl().regex("shop/(\\d+)").toString()) + 1;
|
||||
page.addTargetRequest("http://kaichiba.com/shop/" + i);
|
||||
page.putField("title",page.getHtml().xpath("//Title"));
|
||||
page.putField("items", page.getHtml().xpath("//li[@class=\"foodTitle\"]").replace("^\\s+", "").replace("\\s+$", "").replace("<span>.*?</span>", ""));
|
||||
}
|
||||
|
||||
@Override
|
||||
public Site getSite() {
|
||||
return Site.me().setDomain("kaichiba.com").addStartUrl("http://kaichiba.com/shop/41725781").setCharset("utf-8").
|
||||
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
Spider.create(new KaichibaProcessor()).run();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,38 @@
|
|||
package us.codecraft.webmagic.samples;
|
||||
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.Spider;
|
||||
import us.codecraft.webmagic.processor.PageProcessor;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-5-20
|
||||
* Time: 下午5:31
|
||||
*/
|
||||
public class MeicanProcessor implements PageProcessor {
|
||||
@Override
|
||||
public void process(Page page) {
|
||||
//http://progressdaily.diandian.com/post/2013-01-24/40046867275
|
||||
List<String> requests = page.getHtml().xpath("//a[@class=\"area_link flat_btn\"]/@href").all();
|
||||
if (requests.size() > 2) {
|
||||
requests = requests.subList(0, 2);
|
||||
}
|
||||
page.addTargetRequests(requests);
|
||||
page.addTargetRequests(page.getHtml().links().regex("(.*/restaurant/[^#]+)").all());
|
||||
page.putField("items", page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"name\"]/text()"));
|
||||
page.putField("prices", page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"price_outer\"]/span[@class=\"price\"]/text()"));
|
||||
}
|
||||
|
||||
@Override
|
||||
public Site getSite() {
|
||||
return Site.me().setDomain("meican.com").addStartUrl("http://www.meican.com/shanghai/districts").setCharset("utf-8").
|
||||
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
Spider.create(new MeicanProcessor()).run();
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue