add starter
parent
7555ea0afc
commit
afec9d31b8
|
@ -0,0 +1,57 @@
|
||||||
|
package us.codecraft.webmagic.main;
|
||||||
|
|
||||||
|
import us.codecraft.webmagic.Site;
|
||||||
|
import us.codecraft.webmagic.model.OOSpider;
|
||||||
|
import us.codecraft.webmagic.model.samples.IteyeBlog;
|
||||||
|
import us.codecraft.webmagic.model.samples.News163;
|
||||||
|
import us.codecraft.webmagic.model.samples.OschinaBlog;
|
||||||
|
import us.codecraft.webmagic.pipeline.ConsolePipeline;
|
||||||
|
|
||||||
|
import java.util.LinkedHashMap;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.Scanner;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author code4crafter@gmail.com <br>
|
||||||
|
* @date: 13-8-7 <br>
|
||||||
|
* Time: 下午9:24 <br>
|
||||||
|
*/
|
||||||
|
public class QuickStarter {
|
||||||
|
|
||||||
|
public static void main(String[] args) {
|
||||||
|
Map<String, Class> clazzMap = new LinkedHashMap<String, Class>();
|
||||||
|
clazzMap.put("1", OschinaBlog.class);
|
||||||
|
clazzMap.put("2", IteyeBlog.class);
|
||||||
|
clazzMap.put("3", News163.class);
|
||||||
|
Map<String, String> urlMap = new LinkedHashMap<String, String>();
|
||||||
|
urlMap.put("1", "http://my.oschina.net/flashsword/blog");
|
||||||
|
urlMap.put("2", "http://flashsword20.iteye.com/");
|
||||||
|
urlMap.put("3", "http://news.163.com/");
|
||||||
|
Scanner stdin = new Scanner(System.in);
|
||||||
|
String key = null;
|
||||||
|
System.out.println("Choose a Spider demo:");
|
||||||
|
for (Map.Entry<String, Class> classEntry : clazzMap.entrySet()) {
|
||||||
|
System.out.println(classEntry.getKey()+"\t" + classEntry.getValue() + "\t" + urlMap.get(classEntry.getKey()));
|
||||||
|
}
|
||||||
|
while (key == null) {
|
||||||
|
key = new String(stdin.nextLine());
|
||||||
|
if (clazzMap.get(key) == null) {
|
||||||
|
System.out.println("Invalid choice!");
|
||||||
|
key = null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
System.out.println("The demo started and will last 60 seconds...");
|
||||||
|
|
||||||
|
//Start spider
|
||||||
|
OOSpider.create(Site.me().addStartUrl(urlMap.get(key)), clazzMap.get(key)).pipeline(new ConsolePipeline()).runAsync();
|
||||||
|
|
||||||
|
|
||||||
|
try {
|
||||||
|
Thread.sleep(60000);
|
||||||
|
} catch (InterruptedException e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
}
|
||||||
|
System.out.println("The demo stopped!");
|
||||||
|
System.exit(0);
|
||||||
|
}
|
||||||
|
}
|
|
@ -28,7 +28,7 @@ public class News163 implements PagedModel {
|
||||||
@ExtractByUrl(value = "http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html", notNull = false)
|
@ExtractByUrl(value = "http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html", notNull = false)
|
||||||
private String page;
|
private String page;
|
||||||
|
|
||||||
@ExtractBy(value = "//div[@class=\"ep-pages\"]//a/@href", multi = true)
|
@ExtractBy(value = "//div[@class=\"ep-pages\"]//a/@href", multi = true,notNull = false)
|
||||||
@ExtractBy2(value = "http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html", type = ExtractBy2.Type.Regex)
|
@ExtractBy2(value = "http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html", type = ExtractBy2.Type.Regex)
|
||||||
private List<String> otherPage;
|
private List<String> otherPage;
|
||||||
|
|
||||||
|
|
|
@ -1,49 +0,0 @@
|
||||||
package us.codecraft.webmagic.samples;
|
|
||||||
|
|
||||||
import us.codecraft.webmagic.Page;
|
|
||||||
import us.codecraft.webmagic.Site;
|
|
||||||
import us.codecraft.webmagic.Spider;
|
|
||||||
import us.codecraft.webmagic.pipeline.FilePipeline;
|
|
||||||
import us.codecraft.webmagic.processor.PageProcessor;
|
|
||||||
import us.codecraft.webmagic.scheduler.RedisScheduler;
|
|
||||||
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Author code4crafter@gmail.com
|
|
||||||
* Date: 13-6-24
|
|
||||||
* Time: 下午2:12
|
|
||||||
*/
|
|
||||||
public class GlobalProcessor implements PageProcessor {
|
|
||||||
|
|
||||||
private Site site;
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void process(Page page) {
|
|
||||||
final List<String> requests = page.getHtml().links().all();
|
|
||||||
page.addTargetRequests(requests);
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public Site getSite() {
|
|
||||||
if (site == null) {
|
|
||||||
site = Site.me().setDomain("www.2345.com").setSleepTime(0)
|
|
||||||
.addStartUrl("http://www.2345.com/").addStartUrl("http://hao.360.cn/")
|
|
||||||
.addStartUrl("http://www.baidu.com/s?wd=%E7%BD%91%E7%AB%99%E5%AF%BC%E8%88%AA&rsv_spt=1&issp=1&rsv_bp=0&ie=utf-8&tn=80039098_oem_dg&rsv_n=2&rsv_sug3=6&rsv_sug4=698&rsv_sug=0&rsv_sug1=3")
|
|
||||||
.setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
|
|
||||||
}
|
|
||||||
return site;
|
|
||||||
}
|
|
||||||
|
|
||||||
public static void main(String[] args) {
|
|
||||||
Spider.create(new GlobalProcessor()).thread(10)
|
|
||||||
.scheduler(new RedisScheduler("localhost"))
|
|
||||||
.pipeline(new FilePipeline("/data/webmagic/test/"))
|
|
||||||
.runAsync();
|
|
||||||
Spider.create(new GlobalProcessor()).thread(10)
|
|
||||||
.scheduler(new RedisScheduler("localhost"))
|
|
||||||
.pipeline(new FilePipeline("/data/webmagic/test/"))
|
|
||||||
.run();
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,20 +0,0 @@
|
||||||
package us.codecraft.webmagic.samples;
|
|
||||||
|
|
||||||
import us.codecraft.webmagic.Spider;
|
|
||||||
import us.codecraft.webmagic.pipeline.FilePipeline;
|
|
||||||
import us.codecraft.webmagic.processor.SimplePageProcessor;
|
|
||||||
import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @author code4crafter@gmail.com <br>
|
|
||||||
* @date: 13-7-14 <br>
|
|
||||||
* Time: 上午8:33 <br>
|
|
||||||
*/
|
|
||||||
public class GuoxueProcessor {
|
|
||||||
|
|
||||||
public static void main(String[] args) {
|
|
||||||
SimplePageProcessor simplePageProcessor = new SimplePageProcessor("http://www.guoxue123.cn/", "http://www.guoxue123.cn/*");
|
|
||||||
simplePageProcessor.getSite().setCharset("GBK").setSleepTime(500);
|
|
||||||
Spider.create(simplePageProcessor).pipeline(new FilePipeline("/data/webmagic/")).scheduler(new FileCacheQueueScheduler("/data/webmagic/")).run();
|
|
||||||
}
|
|
||||||
}
|
|
Loading…
Reference in New Issue