diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index a25fd02..2717b66 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -2,6 +2,7 @@ package us.codecraft.webmagic; import org.apache.commons.collections.CollectionUtils; import org.apache.log4j.Logger; +import us.codecraft.webmagic.annotation.ObjectPageProcessor; import us.codecraft.webmagic.downloader.Destroyable; import us.codecraft.webmagic.downloader.Downloader; import us.codecraft.webmagic.downloader.HttpClientDownloader; @@ -89,6 +90,10 @@ public class Spider implements Runnable, Task { return new Spider(pageProcessor); } + public static Spider create(Site site,Class... pageModels) { + return new Spider(ObjectPageProcessor.create(site,pageModels)); + } + /** * 重新设置startUrls,会覆盖Site本身的startUrls。 * diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ExtractBy.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ExtractBy.java index 7c749b3..4c791fd 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ExtractBy.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ExtractBy.java @@ -18,4 +18,6 @@ public @interface ExtractBy { public enum Type {XPath, Regex, Css}; Type type() default Type.XPath; + + boolean notNull() default true; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ExtractByUrl.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ExtractByUrl.java index 3ecb451..57747f5 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ExtractByUrl.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ExtractByUrl.java @@ -11,8 +11,10 @@ import java.lang.annotation.Target; */ @Retention(java.lang.annotation.RetentionPolicy.RUNTIME) @Target({ElementType.FIELD}) -public @interface ExtractByUrl { +public @interface ExtractByUrl{ String value() default ""; + boolean notNull() default true; + } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/FieldExtractor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/FieldExtractor.java index d241c8d..f415cb8 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/FieldExtractor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/FieldExtractor.java @@ -20,18 +20,15 @@ class FieldExtractor { private Method setterMethod; + private final boolean notNull; + static enum Source {Html, Url} - public FieldExtractor(Field field, Selector selector) { - this.field = field; - this.selector = selector; - this.source = Source.Html; - } - - public FieldExtractor(Field field, Selector selector, Source source) { + public FieldExtractor(Field field, Selector selector, Source source, boolean notNull) { this.field = field; this.selector = selector; this.source = source; + this.notNull = notNull; } Field getField() { @@ -53,4 +50,8 @@ class FieldExtractor { Method getSetterMethod() { return setterMethod; } + + boolean isNotNull() { + return notNull; + } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/HelpUrl.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/HelpUrl.java new file mode 100644 index 0000000..3020817 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/HelpUrl.java @@ -0,0 +1,17 @@ +package us.codecraft.webmagic.annotation; + +import java.lang.annotation.ElementType; +import java.lang.annotation.Retention; +import java.lang.annotation.Target; + +/** + * @author yihua.huang@dianping.com
+ * @date: 13-8-1
+ * Time: 下午8:40
+ */ +@Retention(java.lang.annotation.RetentionPolicy.RUNTIME) +@Target({ElementType.TYPE}) +public @interface HelpUrl { + + String[] value(); +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ObjectPageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ObjectPageProcessor.java index ae3131e..ad8297e 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ObjectPageProcessor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ObjectPageProcessor.java @@ -40,6 +40,7 @@ public class ObjectPageProcessor implements PageProcessor { targetUrlPatterns = new HashSet(); for (PageModelExtractor pageModelExtractor : pageModelExtractorList) { targetUrlPatterns.addAll(pageModelExtractor.getTargetUrlPatterns()); + targetUrlPatterns.addAll(pageModelExtractor.getHelpUrlPatterns()); } } @@ -47,6 +48,9 @@ public class ObjectPageProcessor implements PageProcessor { public void process(Page page) { for (PageModelExtractor pageModelExtractor : pageModelExtractorList) { Object process = pageModelExtractor.process(page); + if (process==null){ + page.getResultItems().setSkip(true); + } postProcessPageModel(pageModelExtractor.getClazz(), process); page.putField(pageModelExtractor.getClazz().getCanonicalName(), process); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModelExtractor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModelExtractor.java index 7d0d4f2..41f635c 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModelExtractor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModelExtractor.java @@ -24,6 +24,8 @@ class PageModelExtractor { private List targetUrlPatterns; + private List helpUrlPatterns; + private Class clazz; private List fieldExtractors; @@ -57,7 +59,7 @@ class PageModelExtractor { default: selector = new XpathSelector(value); } - FieldExtractor fieldExtractor = new FieldExtractor(field, selector); + FieldExtractor fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.Html, extractBy.notNull()); Method setterMethod = getSetterMethod(clazz, field); if (setterMethod != null) { fieldExtractor.setSetterMethod(setterMethod); @@ -70,7 +72,7 @@ class PageModelExtractor { if (regexPattern.trim().equals("")) { regexPattern = ".*"; } - FieldExtractor fieldExtractor = new FieldExtractor(field, new RegexSelector(regexPattern), FieldExtractor.Source.Url); + FieldExtractor fieldExtractor = new FieldExtractor(field, new RegexSelector(regexPattern), FieldExtractor.Source.Url, extractByUrl.notNull()); Method setterMethod = getSetterMethod(clazz, field); if (setterMethod != null) { fieldExtractor.setSetterMethod(setterMethod); @@ -102,6 +104,14 @@ class PageModelExtractor { targetUrlPatterns.add(Pattern.compile(s.replace(".", "\\.").replace("*", "[^\"'#]*"))); } } + helpUrlPatterns = new ArrayList(); + annotation = clazz.getAnnotation(HelpUrl.class); + if (annotation != null) { + String[] value = ((HelpUrl) annotation).value(); + for (String s : value) { + helpUrlPatterns.add(Pattern.compile(s.replace(".", "\\.").replace("*", "[^\"'#]*"))); + } + } } public Object process(Page page) { @@ -129,7 +139,10 @@ class PageModelExtractor { default: value = fieldExtractor.getSelector().select(page.getHtml().toString()); } - setField(o,fieldExtractor,value); + if (value==null&&fieldExtractor.isNotNull()){ + page.getResultItems().setSkip(true); + } + setField(o, fieldExtractor, value); } } catch (InstantiationException e) { e.printStackTrace(); @@ -142,8 +155,8 @@ class PageModelExtractor { } private void setField(Object o, FieldExtractor fieldExtractor, String value) throws IllegalAccessException, InvocationTargetException { - if (fieldExtractor.getSetterMethod()!=null){ - fieldExtractor.getSetterMethod().invoke(o,value); + if (fieldExtractor.getSetterMethod() != null) { + fieldExtractor.getSetterMethod().invoke(o, value); } fieldExtractor.getField().set(o, value); } @@ -155,4 +168,8 @@ class PageModelExtractor { List getTargetUrlPatterns() { return targetUrlPatterns; } + + List getHelpUrlPatterns() { + return helpUrlPatterns; + } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/annotation/samples/IteyeBlog.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/annotation/samples/IteyeBlog.java new file mode 100644 index 0000000..8fbf089 --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/annotation/samples/IteyeBlog.java @@ -0,0 +1,34 @@ +package us.codecraft.webmagic.annotation.samples; + +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.annotation.ExtractBy; +import us.codecraft.webmagic.annotation.TargetUrl; + +/** + * @author yihua.huang@dianping.com
+ * @date: 13-8-2
+ * Time: 上午7:52
+ */ +@TargetUrl("http://dengminhui.iteye.com/blog/*") +public class IteyeBlog { + + @ExtractBy("//title") + private String title; + + @ExtractBy(value = "div#blog_content",type = ExtractBy.Type.Css) + private String content; + + @Override + public String toString() { + return "IteyeBlog{" + + "title='" + title + '\'' + + ", content='" + content + '\'' + + '}'; + } + + public static void main(String[] args) { + Spider.create(Site.me().addStartUrl("http://dengminhui.iteye.com/blog"),IteyeBlog.class).run(); + } + +}