diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java index 6a35178..4c7b992 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java @@ -96,11 +96,6 @@ public class Site { * @return get domain */ public String getDomain() { - if (domain == null) { - if (startUrls.size() > 0) { - domain = UrlUtils.getDomain(startUrls.get(0)); - } - } return domain; } @@ -176,6 +171,11 @@ public class Site { */ public Site addStartUrl(String startUrl) { this.startUrls.add(startUrl); + if (domain == null) { + if (startUrls.size() > 0) { + domain = UrlUtils.getDomain(startUrls.get(0)); + } + } return this; } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/OschinaBlog.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/OschinaBlog.java new file mode 100644 index 0000000..703d6a4 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/OschinaBlog.java @@ -0,0 +1,59 @@ +package us.codecraft.webmagic.example; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.model.AfterExtractor; +import us.codecraft.webmagic.model.OOSpider; +import us.codecraft.webmagic.model.annotation.ExtractBy; +import us.codecraft.webmagic.model.annotation.Formatter; +import us.codecraft.webmagic.model.annotation.TargetUrl; +import us.codecraft.webmagic.pipeline.JsonFilePageModelPipeline; + +import java.util.List; + +/** + * @author code4crafter@gmail.com
+ */ +@TargetUrl("http://my.oschina.net/flashsword/blog/\\d+") +public class OschinaBlog implements AfterExtractor{ + + @ExtractBy("//title/text()") + private String title; + + @ExtractBy(value = "div.BlogContent",type = ExtractBy.Type.Css) + private String content; + + @ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true) + private List<String> tags; + + @Formatter("yyyy-MM-dd HH:mm") + @ExtractBy("//div[@class='BlogStat']/regex('\\d+-\\d+-\\d+\\s+\\d+:\\d+')") + private String date; + + public static void main(String[] args) { + OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog") + ,new JsonFilePageModelPipeline("/data/webmagic/"), OschinaBlog.class).run(); + } + + public String getTitle() { + return title; + } + + public String getContent() { + return content; + } + + public List<String> getTags() { + return tags; + } + +// public Date getDate() { +// return date; +// } + + @Override + public void afterProcess(Page page) { + System.out.println(date); + System.out.println(title); + } +} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java index 8e6602c..a7f51ad 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java @@ -6,7 +6,6 @@ import us.codecraft.webmagic.model.annotation.ExtractBy; import us.codecraft.webmagic.model.annotation.TargetUrl; import us.codecraft.webmagic.pipeline.JsonFilePageModelPipeline; -import java.util.Date; import java.util.List; /** @@ -24,9 +23,6 @@ public class OschinaBlog{ @ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true) private List tags; - @ExtractBy("//div[class='BlogStat']/regex('\\d{4}-\\d{1,2}-\\d{1,2} \\d{1,2}:\\d{1,2}')") - private Date date; - public static void main(String[] args) { 
OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog") ,new JsonFilePageModelPipeline(), OschinaBlog.class).run();