diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/main/QuickStarter.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/main/QuickStarter.java
new file mode 100644
index 0000000..2046fcb
--- /dev/null
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/main/QuickStarter.java
@@ -0,0 +1,71 @@
+package us.codecraft.webmagic.main;
+
+import us.codecraft.webmagic.Site;
+import us.codecraft.webmagic.model.OOSpider;
+import us.codecraft.webmagic.model.samples.IteyeBlog;
+import us.codecraft.webmagic.model.samples.News163;
+import us.codecraft.webmagic.model.samples.OschinaBlog;
+import us.codecraft.webmagic.pipeline.ConsolePipeline;
+
+import java.util.LinkedHashMap;
+import java.util.Map;
+import java.util.Scanner;
+
+/**
+ * Console launcher for the sample spiders: prints a menu of demos, reads the choice from
+ * standard input, runs the chosen spider with console output for 60 seconds, then exits.
+ *
+ * @author code4crafter@gmail.com
+ * @date: 13-8-7
+ * Time: 9:24 PM
+ */
+public class QuickStarter {
+
+    /**
+     * Shows the demo menu, waits for a valid choice and runs that demo.
+     *
+     * @param args ignored
+     */
+    public static void main(String[] args) {
+        // Both maps share the menu key; LinkedHashMap keeps the menu in registration order.
+        Map<String, Class<?>> clazzMap = new LinkedHashMap<String, Class<?>>();
+        clazzMap.put("1", OschinaBlog.class);
+        clazzMap.put("2", IteyeBlog.class);
+        clazzMap.put("3", News163.class);
+        Map<String, String> urlMap = new LinkedHashMap<String, String>();
+        urlMap.put("1", "http://my.oschina.net/flashsword/blog");
+        urlMap.put("2", "http://flashsword20.iteye.com/");
+        urlMap.put("3", "http://news.163.com/");
+        Scanner stdin = new Scanner(System.in);
+        System.out.println("Choose a Spider demo:");
+        for (Map.Entry<String, Class<?>> classEntry : clazzMap.entrySet()) {
+            System.out.println(classEntry.getKey() + "\t" + classEntry.getValue() + "\t" + urlMap.get(classEntry.getKey()));
+        }
+        String key = null;
+        while (key == null) {
+            if (!stdin.hasNextLine()) {
+                // Input ended before a valid choice was made; nextLine() would throw here.
+                System.out.println("No choice given, exiting.");
+                return;
+            }
+            key = stdin.nextLine().trim();
+            if (!clazzMap.containsKey(key)) {
+                System.out.println("Invalid choice!");
+                key = null;
+            }
+        }
+        System.out.println("The demo started and will last 60 seconds...");
+
+        // Start spider; this thread then only times the demo and shuts the JVM down.
+        OOSpider.create(Site.me().addStartUrl(urlMap.get(key)), clazzMap.get(key)).pipeline(new ConsolePipeline()).runAsync();
+
+        try {
+            Thread.sleep(60000);
+        } catch (InterruptedException e) {
+            // Interrupted early: keep the interrupt status and fall through to the shutdown.
+            Thread.currentThread().interrupt();
+        }
+        System.out.println("The demo stopped!");
+        System.exit(0);
+    }
+}
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java
index 848800d..6baa8ae 100644
--- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java
@@ -28,7 +28,7 @@ public class News163 implements PagedModel {
     @ExtractByUrl(value = "http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html", notNull = false)
     private String page;
 
-    @ExtractBy(value = "//div[@class=\"ep-pages\"]//a/@href", multi = true)
+    @ExtractBy(value = "//div[@class=\"ep-pages\"]//a/@href", multi = true, notNull = false)
     @ExtractBy2(value = "http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html", type = ExtractBy2.Type.Regex)
     private List otherPage;
 
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GlobalProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GlobalProcessor.java
deleted file mode 100644
index 0448683..0000000
--- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GlobalProcessor.java
+++ /dev/null
@@ -1,49 +0,0 @@
-package us.codecraft.webmagic.samples;
-
-import us.codecraft.webmagic.Page;
-import us.codecraft.webmagic.Site;
-import us.codecraft.webmagic.Spider;
-import us.codecraft.webmagic.pipeline.FilePipeline;
-import us.codecraft.webmagic.processor.PageProcessor;
-import us.codecraft.webmagic.scheduler.RedisScheduler;
-
-import java.util.List;
-
-/**
- * Author code4crafter@gmail.com
- * Date: 13-6-24
- * Time: 下午2:12
- */
-public class GlobalProcessor implements PageProcessor {
-
-    private Site site;
-
-    @Override
-    public void process(Page page) {
-        final List requests = page.getHtml().links().all();
-        page.addTargetRequests(requests);
-
-    }
-
-    @Override
-    public Site getSite() {
-        if (site == null) {
-            site = Site.me().setDomain("www.2345.com").setSleepTime(0)
-                    .addStartUrl("http://www.2345.com/").addStartUrl("http://hao.360.cn/")
-                    .addStartUrl("http://www.baidu.com/s?wd=%E7%BD%91%E7%AB%99%E5%AF%BC%E8%88%AA&rsv_spt=1&issp=1&rsv_bp=0&ie=utf-8&tn=80039098_oem_dg&rsv_n=2&rsv_sug3=6&rsv_sug4=698&rsv_sug=0&rsv_sug1=3")
-                    .setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
-        }
-        return site;
-    }
-
-    public static void main(String[] args) {
-        Spider.create(new GlobalProcessor()).thread(10)
-                .scheduler(new RedisScheduler("localhost"))
-                .pipeline(new FilePipeline("/data/webmagic/test/"))
-                .runAsync();
-        Spider.create(new GlobalProcessor()).thread(10)
-                .scheduler(new RedisScheduler("localhost"))
-                .pipeline(new FilePipeline("/data/webmagic/test/"))
-                .run();
-    }
-}
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GuoxueProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GuoxueProcessor.java
deleted file mode 100644
index 5d7d355..0000000
--- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GuoxueProcessor.java
+++ /dev/null
@@ -1,20 +0,0 @@
-package us.codecraft.webmagic.samples;
-
-import us.codecraft.webmagic.Spider;
-import us.codecraft.webmagic.pipeline.FilePipeline;
-import us.codecraft.webmagic.processor.SimplePageProcessor;
-import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler;
-
-/**
- * @author code4crafter@gmail.com
- * @date: 13-7-14
- * Time: 上午8:33
- */
-public class GuoxueProcessor {
-
-    public static void main(String[] args) {
-        SimplePageProcessor simplePageProcessor = new SimplePageProcessor("http://www.guoxue123.cn/", "http://www.guoxue123.cn/*");
-        simplePageProcessor.getSite().setCharset("GBK").setSleepTime(500);
-        Spider.create(simplePageProcessor).pipeline(new FilePipeline("/data/webmagic/")).scheduler(new FileCacheQueueScheduler("/data/webmagic/")).run();
-    }
-}