complete test
parent
d7cd9e5747
commit
85b7cf1563
|
@ -53,7 +53,7 @@ public class Html extends PlainText {
|
|||
|
||||
@Override
|
||||
public Selectable links() {
|
||||
XpathSelector xpathSelector = Selectors.xpath("//a/@href");
|
||||
XsoupSelector xpathSelector = new XsoupSelector("//a/@href");
|
||||
return selectList(xpathSelector, strings);
|
||||
}
|
||||
|
||||
|
|
|
@ -15,10 +15,9 @@ import java.util.List;
|
|||
public class HuxiuProcessor implements PageProcessor {
|
||||
@Override
|
||||
public void process(Page page) {
|
||||
//http://progressdaily.diandian.com/post/2013-01-24/40046867275
|
||||
List<String> requests = page.getHtml().regex("<a[^<>\"']*href=[\"']{1}([/]{0,1}article[^<>#\"']*?)[\"']{1}").all();
|
||||
List<String> requests = page.getHtml().links().regex(".*article.*").all();
|
||||
page.addTargetRequests(requests);
|
||||
page.putField("title",page.getHtml().xpath("//div[@class='neirong']//h1[@class='ph xs5']"));
|
||||
page.putField("title",page.getHtml().xpath("//div[@class='clearfix neirong']//h1/text()"));
|
||||
page.putField("content",page.getHtml().smartContent());
|
||||
}
|
||||
|
||||
|
|
|
@ -4,9 +4,7 @@ import org.apache.commons.collections.CollectionUtils;
|
|||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.Spider;
|
||||
import us.codecraft.webmagic.pipeline.FilePipeline;
|
||||
import us.codecraft.webmagic.processor.PageProcessor;
|
||||
import us.codecraft.webmagic.scheduler.RedisScheduler;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
|
@ -41,8 +39,6 @@ public class InfoQMiniBookProcessor implements PageProcessor {
|
|||
|
||||
public static void main(String[] args) {
|
||||
Spider.create(new InfoQMiniBookProcessor())
|
||||
.scheduler(new RedisScheduler("localhost"))
|
||||
.pipeline(new FilePipeline("/data/temp/webmagic/"))
|
||||
.thread(5)
|
||||
.run();
|
||||
}
|
||||
|
|
|
@ -3,7 +3,6 @@ package us.codecraft.webmagic.samples;
|
|||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.Spider;
|
||||
import us.codecraft.webmagic.pipeline.FilePipeline;
|
||||
import us.codecraft.webmagic.processor.PageProcessor;
|
||||
|
||||
/**
|
||||
|
@ -32,6 +31,6 @@ public class IteyeBlogProcessor implements PageProcessor {
|
|||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
Spider.create(new IteyeBlogProcessor()).thread(5).pipeline(new FilePipeline("/data/webmagic/")).run();
|
||||
Spider.create(new IteyeBlogProcessor()).thread(5).run();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -2,6 +2,7 @@ package us.codecraft.webmagic.samples;
|
|||
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.Spider;
|
||||
import us.codecraft.webmagic.processor.PageProcessor;
|
||||
|
||||
/**
|
||||
|
@ -24,4 +25,8 @@ public class KaichibaProcessor implements PageProcessor {
|
|||
return Site.me().setDomain("kaichiba.com").addStartUrl("http://kaichiba.com/shop/41725781").setCharset("utf-8").
|
||||
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
Spider.create(new KaichibaProcessor()).run();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -2,6 +2,7 @@ package us.codecraft.webmagic.samples;
|
|||
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.Spider;
|
||||
import us.codecraft.webmagic.processor.PageProcessor;
|
||||
|
||||
import java.util.List;
|
||||
|
@ -21,8 +22,8 @@ public class MeicanProcessor implements PageProcessor {
|
|||
}
|
||||
page.addTargetRequests(requests);
|
||||
page.addTargetRequests(page.getHtml().links().regex("(.*/restaurant/[^#]+)").all());
|
||||
page.putField("items", page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"name\"]"));
|
||||
page.putField("prices", page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"price_outer\"]/span[@class=\"price\"]"));
|
||||
page.putField("items", page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"name\"]/text()"));
|
||||
page.putField("prices", page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"price_outer\"]/span[@class=\"price\"]/text()"));
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -30,4 +31,8 @@ public class MeicanProcessor implements PageProcessor {
|
|||
return Site.me().setDomain("meican.com").addStartUrl("http://www.meican.com/shanghai/districts").setCharset("utf-8").
|
||||
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
Spider.create(new MeicanProcessor()).run();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,9 +1,8 @@
|
|||
package us.codecraft.webmagic.samples;
|
||||
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.Spider;
|
||||
import us.codecraft.webmagic.pipeline.ConsolePipeline;
|
||||
import us.codecraft.webmagic.processor.PageProcessor;
|
||||
|
||||
import java.util.List;
|
||||
|
@ -21,8 +20,8 @@ public class OschinaBlogPageProcesser implements PageProcessor {
|
|||
public void process(Page page) {
|
||||
List<String> links = page.getHtml().links().regex("http://my\\.oschina\\.net/flashsword/blog/\\d+").all();
|
||||
page.addTargetRequests(links);
|
||||
page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1").toString());
|
||||
page.putField("content", page.getHtml().$("div.content").toString());
|
||||
page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1/text()").toString());
|
||||
page.putField("content", page.getHtml().xpath("//div[@class='BlogContent']/tidyText()").toString());
|
||||
page.putField("tags",page.getHtml().xpath("//div[@class='BlogTags']/a/text()").all());
|
||||
}
|
||||
|
||||
|
@ -33,6 +32,6 @@ public class OschinaBlogPageProcesser implements PageProcessor {
|
|||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
Spider.create(new OschinaBlogPageProcesser()).pipeline(new ConsolePipeline()).run();
|
||||
Spider.create(new OschinaBlogPageProcesser()).run();
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue