From 3b00190f9981bf005dae504496989269cc906cb2 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Thu, 10 Oct 2013 00:40:44 +0800 Subject: [PATCH] api without implementation for #28: add specific url crawl --- .../webmagic/example/BaiduBaike.java | 28 ++++++++++++++ .../us/codecraft/webmagic/model/OOSpider.java | 27 ++++++++++++-- .../model/annotation/UrlTemplate.java | 37 +++++++++++++++++++ .../webmagic/model/direct/Param.java | 15 ++++++++ .../webmagic/model/samples/Kr36NewsModel.java | 20 ++++++++-- 5 files changed, 120 insertions(+), 7 deletions(-) create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/example/BaiduBaike.java create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/UrlTemplate.java create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/model/direct/Param.java diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/BaiduBaike.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/BaiduBaike.java new file mode 100644 index 0000000..becc311 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/BaiduBaike.java @@ -0,0 +1,28 @@ +package us.codecraft.webmagic.example; + +import us.codecraft.webmagic.model.OOSpider; +import us.codecraft.webmagic.model.annotation.ExtractBy; +import us.codecraft.webmagic.model.annotation.UrlTemplate; +import us.codecraft.webmagic.model.direct.Param; + +import java.util.ArrayList; +import java.util.List; + +/** Example page model for the direct url crawl API (declared in this patch without an implementation): each Param is meant to supply the ${word} placeholder of the UrlTemplate. + * @author code4crafter@gmail.com + */ +@UrlTemplate("http://baike.baidu.com/search/word?word=${word}&enc=utf8") +public class BaiduBaike { + + private String word; + + @ExtractBy("//div[@id='lemmaContent-0']//div[@class='para']/allText()") + private String description; + + public static void main(String[] args) { + List<Param> words = new ArrayList<Param>(); + words.add(new Param().put("word","红烧肉")); + OOSpider.direct(words, BaiduBaike.class).thread(10).run(); + } + +} diff --git 
a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/OOSpider.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/OOSpider.java index 3cee9ad..efa5faf 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/OOSpider.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/OOSpider.java @@ -2,8 +2,11 @@ package us.codecraft.webmagic.model; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.model.direct.Param; import us.codecraft.webmagic.processor.PageProcessor; +import java.util.Collection; + /** * The spider for page model extractor.
* In webmagic, we call a POJO containing extract result as "page model".
@@ -22,13 +25,14 @@ import us.codecraft.webmagic.processor.PageProcessor; * {@literal @}ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true) * private List tags; * } - + * * And start the spider by: *
  *   OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog")
  *        ,new JsonFilePageModelPipeline(), OschinaBlog.class).run();
  * }
- 
+ * + * * @author code4crafter@gmail.com
* @since 0.2.0 */ @@ -49,6 +53,7 @@ public class OOSpider extends Spider { /** * create a spider + * * @param site * @param pageModelPipeline * @param pageModels @@ -57,7 +62,7 @@ public class OOSpider extends Spider { this(ModelPageProcessor.create(site, pageModels)); this.modelPipeline = new ModelPipeline(); super.addPipeline(modelPipeline); - if (pageModelPipeline!=null){ + if (pageModelPipeline != null) { for (Class pageModel : pageModels) { this.modelPipeline.put(pageModel, pageModelPipeline); } @@ -72,6 +77,22 @@ public class OOSpider extends Spider { return new OOSpider(site, pageModelPipeline, pageModels); } + /** Direct-crawl entry point; for now it only forwards its arguments to the constructor. */ public static OOSpider direct(Site site, PageModelPipeline pageModelPipeline, Class... pageModels) { + return new OOSpider(site, pageModelPipeline, pageModels); + } + + /** Direct-crawl entry point that passes a null Site to the constructor. NOTE(review): confirm the constructor tolerates a null Site. */ public static OOSpider direct(PageModelPipeline pageModelPipeline, Class... pageModels) { + return new OOSpider(null, pageModelPipeline, pageModels); + } + + /** Direct-crawl entry point with a null Site and a null pipeline; the constructor skips pipeline registration when the pipeline is null. */ public static OOSpider direct(Class... pageModels) { + return new OOSpider(null, null, pageModels); + } + + /** Accepts the Params meant to fill the url template but does not use them yet (API placeholder); currently builds the same spider as direct(pageModels). */ public static OOSpider direct(Collection<Param> params,Class... pageModels) { + return new OOSpider(null, null, pageModels); + } + public OOSpider addPageModel(PageModelPipeline pageModelPipeline, Class... pageModels) { for (Class pageModel : pageModels) { modelPageProcessor.addPageModel(pageModel); diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/UrlTemplate.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/UrlTemplate.java new file mode 100644 index 0000000..a940a64 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/UrlTemplate.java @@ -0,0 +1,37 @@ +package us.codecraft.webmagic.model.annotation; + +import java.lang.annotation.ElementType; +import java.lang.annotation.Retention; +import java.lang.annotation.Target; + +/** + * Define the url template for class.
+ * Urls are meant to be built by filling the ${...} placeholders of the template (see the BaiduBaike example), then crawled and extracted into new objects.
+ * + * @author code4crafter@gmail.com
+ * @since 0.3.3 + */ +@Retention(java.lang.annotation.RetentionPolicy.RUNTIME) +@Target({ElementType.TYPE}) +public @interface UrlTemplate { + + /** + * The url patterns for class.
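+ * The value is a complete url in which ${name} marks a placeholder,
+ * for example "http://baike.baidu.com/search/word?word=${word}&enc=utf8".
+ * NOTE(review): the regex rules for "." and "*" ([^"'#]*) that were listed here look copied from a url-pattern annotation and do not fit a template; confirm before relying on either description.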
+ * + * @return the url patterns for class + */ + String value(); + + /** + * Define the region for url extracting.
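+ * NOTE(review): this paragraph (region, XPath, sourceRegion) looks copied from another annotation and does not describe encoding().
+ * Presumably encoding() names the charset used for the url or its parameters ("utf8" by default); confirm and rewrite.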
+ * + * @return the encoding name, "utf8" by default + */ + String encoding() default "utf8"; + +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/direct/Param.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/direct/Param.java new file mode 100644 index 0000000..c66e854 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/direct/Param.java @@ -0,0 +1,15 @@ +package us.codecraft.webmagic.model.direct; + +import java.util.LinkedHashMap; + +/** Insertion-ordered name/value parameters whose put returns this Param so that calls can be chained. + * @author code4crafter@gmail.com + */ +public class Param extends LinkedHashMap<String, Object> { + + /** Stores the entry and returns this Param instead of the previous value, e.g. new Param().put("word", "x"). */ @Override + public Param put(String key, Object value) { + super.put(key, value); + return this; + } +} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Kr36NewsModel.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Kr36NewsModel.java index de3fdf5..b381c96 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Kr36NewsModel.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Kr36NewsModel.java @@ -1,12 +1,12 @@ package us.codecraft.webmagic.model.samples; import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.model.ConsolePageModelPipeline; import us.codecraft.webmagic.model.OOSpider; import us.codecraft.webmagic.model.annotation.ExtractBy; import us.codecraft.webmagic.model.annotation.ExtractByUrl; import us.codecraft.webmagic.model.annotation.HelpUrl; import us.codecraft.webmagic.model.annotation.TargetUrl; +import us.codecraft.webmagic.pipeline.JsonFilePageModelPipeline; /** * @author code4crafter@gmail.com
@@ -18,14 +18,26 @@ public class Kr36NewsModel { @ExtractBy("//h1[@class='entry-title sep10']") private String title; - @ExtractBy("//div[@class='mainContent sep-10']") + @ExtractBy("//div[@class='mainContent sep-10']/tidyText()") private String content; @ExtractByUrl private String url; public static void main(String[] args) { - OOSpider.create(Site.me().addStartUrl("http://www.36kr.com/"), new ConsolePageModelPipeline(), - Kr36NewsModel.class).run(); + OOSpider.create(Site.me().addStartUrl("http://www.36kr.com/").setSleepTime(0),new JsonFilePageModelPipeline(), + Kr36NewsModel.class).thread(20).run(); + } + + /** Returns the ExtractBy-annotated title field. */ public String getTitle() { + return title; + } + + /** Returns the ExtractBy-annotated content field. */ public String getContent() { + return content; + } + + /** Returns the ExtractByUrl-annotated url field. */ public String getUrl() { + return url; } }