add diaoyuwen
parent
413b97c904
commit
d1fc1cf305
|
@ -3,9 +3,8 @@ package us.codecraft.spider;
|
|||
import org.junit.Ignore;
|
||||
import org.junit.Test;
|
||||
import us.codecraft.spider.pipeline.FilePipeline;
|
||||
import us.codecraft.spider.processor.PageProcessor;
|
||||
import us.codecraft.spider.processor.SimplePageProcessor;
|
||||
import us.codecraft.spider.samples.HuxiuProcessor;
|
||||
import us.codecraft.spider.samples.MeicanProcessor;
|
||||
import us.codecraft.spider.schedular.FileCacheQueueSchedular;
|
||||
|
||||
/**
|
||||
|
@ -24,12 +23,14 @@ public class SpiderTest {
|
|||
|
||||
// Manual crawl driver: each Spider below is configured with a FilePipeline and a
// FileCacheQueueSchedular rooted at "/data/temp/spider/cache/", then started with run().
// NOTE(review): this span looks like a rendered diff hunk whose +/- markers were lost, so
// removed and added lines may be interleaved here — confirm against the real file which
// statements are actually live before relying on this method.
@Test
|
||||
public void testGlobalSpider(){
|
||||
// First crawl: driven by MeicanProcessor, using the Site that processor reports.
PageProcessor pageProcessor = new MeicanProcessor();
|
||||
Spider.me().pipeline(new FilePipeline()).schedular(new FileCacheQueueSchedular(pageProcessor.getSite(),"/data/temp/spider/cache/")).
|
||||
processor(pageProcessor).run();
|
||||
// Disabled variant: SimplePageProcessor crawl against lol.duowan.com.
// SimplePageProcessor pageProcessor2 = new SimplePageProcessor("http://lol.duowan.com/", "http://lol.duowan.com/*.html");
|
||||
// Spider.me().pipeline(new FilePipeline()).schedular(new FileCacheQueueSchedular(pageProcessor2.getSite(),"/data/temp/spider/cache/")).
|
||||
// processor(pageProcessor2).run();
|
||||
// Disabled variant: the same MeicanProcessor crawl as the live statements above.
// PageProcessor pageProcessor = new MeicanProcessor();
|
||||
// Spider.me().pipeline(new FilePipeline()).schedular(new FileCacheQueueSchedular(pageProcessor.getSite(),"/data/temp/spider/cache/")).
|
||||
// processor(pageProcessor).run();
|
||||
// Second crawl: SimplePageProcessor built from two URL strings for www.diaoyuweng.com
// (presumably a start URL and a "*" wildcard pattern for thread pages — confirm against
// the SimplePageProcessor constructor).
SimplePageProcessor pageProcessor2 = new SimplePageProcessor("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space", "http://www.diaoyuweng.com/thread-*-1-1.html");
|
||||
// The site encoding is set to GBK explicitly before the crawl starts.
pageProcessor2.getSite().setEncoding("GBK");
|
||||
// Prints the encoding read back from the Site; the method makes no assertions.
System.out.println(pageProcessor2.getSite().getEncoding());
|
||||
Spider.me().pipeline(new FilePipeline()).schedular(new FileCacheQueueSchedular(pageProcessor2.getSite(),"/data/temp/spider/cache/")).
|
||||
processor(pageProcessor2).run();
|
||||
|
||||
|
||||
}
|
||||
|
|
|
@ -0,0 +1,33 @@
|
|||
package us.codecraft.spider.samples;
|
||||
|
||||
import us.codecraft.spider.Page;
|
||||
import us.codecraft.spider.Site;
|
||||
import us.codecraft.spider.processor.PageProcessor;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* User: cairne
|
||||
* Date: 13-4-21
|
||||
* Time: 下午8:08
|
||||
*/
|
||||
public class DiaoyuwengProcessor implements PageProcessor {
|
||||
@Override
|
||||
public void process(Page page) {
|
||||
//http://progressdaily.diandian.com/post/2013-01-24/40046867275
|
||||
List<String> requests = page.getHtml().rs("<a[^<>]*href=[\"']{1}(/shop/.*?)[\"']{1}").toStrings();
|
||||
page.addTargetRequests(requests);
|
||||
requests = page.getHtml().rs("<a[^<>]*href=[\"']{1}(/search/category/.*?)[\"']{1}").toStrings();
|
||||
page.addTargetRequests(requests);
|
||||
if (page.getUrl().toString().contains("shop")){
|
||||
page.putField("title", page.getHtml().x("//h1[@class='shop-title']"));
|
||||
page.putField("content", page.getHtml().sc());
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public Site getSite() {
|
||||
return Site.me().setDomain("www.dianping.com").setStartUrl("http://www.dianping.com/").
|
||||
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue