diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
index a25fd02..2717b66 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
@@ -2,6 +2,7 @@ package us.codecraft.webmagic;
import org.apache.commons.collections.CollectionUtils;
import org.apache.log4j.Logger;
+import us.codecraft.webmagic.annotation.ObjectPageProcessor;
import us.codecraft.webmagic.downloader.Destroyable;
import us.codecraft.webmagic.downloader.Downloader;
import us.codecraft.webmagic.downloader.HttpClientDownloader;
@@ -89,6 +90,10 @@ public class Spider implements Runnable, Task {
return new Spider(pageProcessor);
}
+ public static Spider create(Site site,Class... pageModels) { // Convenience factory: builds a Spider from a Site plus one or more page-model classes.
+ return new Spider(ObjectPageProcessor.create(site,pageModels)); // the page processor is obtained from ObjectPageProcessor.create and wrapped in a new Spider
+ }
+
/**
* 重新设置startUrls,会覆盖Site本身的startUrls。
*
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ExtractBy.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ExtractBy.java
index 7c749b3..4c791fd 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ExtractBy.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ExtractBy.java
@@ -18,4 +18,6 @@ public @interface ExtractBy {
public enum Type {XPath, Regex, Css};
Type type() default Type.XPath;
+
+ boolean notNull() default true; // Passed to FieldExtractor by PageModelExtractor; when true (the default), a null extraction result makes the extractor call setSkip(true) on the page's ResultItems.
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ExtractByUrl.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ExtractByUrl.java
index 3ecb451..57747f5 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ExtractByUrl.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ExtractByUrl.java
@@ -11,8 +11,10 @@ import java.lang.annotation.Target;
*/
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
@Target({ElementType.FIELD})
-public @interface ExtractByUrl {
+public @interface ExtractByUrl{
String value() default "";
+ boolean notNull() default true; // Passed to FieldExtractor for URL-sourced fields; when true (the default), a null match makes the extractor call setSkip(true) on the page's ResultItems.
+
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/FieldExtractor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/FieldExtractor.java
index d241c8d..f415cb8 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/FieldExtractor.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/FieldExtractor.java
@@ -20,18 +20,15 @@ class FieldExtractor {
private Method setterMethod;
+ private final boolean notNull; // set once in the constructor; read through isNotNull()
+
static enum Source {Html, Url}
- public FieldExtractor(Field field, Selector selector) {
- this.field = field;
- this.selector = selector;
- this.source = Source.Html;
- }
-
- public FieldExtractor(Field field, Selector selector, Source source) {
+ public FieldExtractor(Field field, Selector selector, Source source, boolean notNull) { // replaces the two removed overloads: callers must now state both the Source and the notNull flag explicitly
this.field = field;
this.selector = selector;
this.source = source;
+ this.notNull = notNull; // final field, never changed after construction
}
Field getField() {
@@ -53,4 +50,8 @@ class FieldExtractor {
Method getSetterMethod() {
return setterMethod;
}
+
+ boolean isNotNull() { // Package-private getter for the notNull flag supplied at construction.
+ return notNull;
+ }
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/HelpUrl.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/HelpUrl.java
new file mode 100644
index 0000000..3020817
--- /dev/null
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/HelpUrl.java
@@ -0,0 +1,17 @@
+package us.codecraft.webmagic.annotation;
+
+import java.lang.annotation.ElementType;
+import java.lang.annotation.Retention;
+import java.lang.annotation.Target;
+
+/** Type-level annotation carrying URL patterns that PageModelExtractor compiles into its help-URL pattern list.
+ * @author yihua.huang@dianping.com
+ * @date: 13-8-1
+ * Time: 8:40 PM
+ */
+@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
+@Target({ElementType.TYPE})
+public @interface HelpUrl { // RUNTIME retention is required because the annotation is read reflectively via Class.getAnnotation
+
+ String[] value(); // one or more URL patterns; the extractor escapes "." and rewrites "*" to [^"'#]* before compiling each as a regex
+}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ObjectPageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ObjectPageProcessor.java
index ae3131e..ad8297e 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ObjectPageProcessor.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ObjectPageProcessor.java
@@ -40,6 +40,7 @@ public class ObjectPageProcessor implements PageProcessor {
targetUrlPatterns = new HashSet();
for (PageModelExtractor pageModelExtractor : pageModelExtractorList) {
targetUrlPatterns.addAll(pageModelExtractor.getTargetUrlPatterns());
+ targetUrlPatterns.addAll(pageModelExtractor.getHelpUrlPatterns()); // help-URL patterns are merged into the same set as the target-URL patterns, so this processor does not distinguish them afterwards
}
}
@@ -47,6 +48,9 @@ public class ObjectPageProcessor implements PageProcessor {
public void process(Page page) {
for (PageModelExtractor pageModelExtractor : pageModelExtractorList) {
Object process = pageModelExtractor.process(page);
+ if (process==null){ // the extractor returned no model object for this page
+ page.getResultItems().setSkip(true); // NOTE(review): nothing stops the iteration here, so postProcessPageModel and page.putField below still run with a null model; with several page models, one null result also flags the shared ResultItems as skipped - confirm this is intended
+ }
postProcessPageModel(pageModelExtractor.getClazz(), process);
page.putField(pageModelExtractor.getClazz().getCanonicalName(), process);
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModelExtractor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModelExtractor.java
index 7d0d4f2..41f635c 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModelExtractor.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModelExtractor.java
@@ -24,6 +24,8 @@ class PageModelExtractor {
private List targetUrlPatterns;
+ private List helpUrlPatterns; // compiled Pattern objects built from the class's @HelpUrl values; an empty list when the annotation is absent
+
private Class clazz;
private List fieldExtractors;
@@ -57,7 +59,7 @@ class PageModelExtractor {
default:
selector = new XpathSelector(value);
}
- FieldExtractor fieldExtractor = new FieldExtractor(field, selector);
+ FieldExtractor fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.Html, extractBy.notNull());
Method setterMethod = getSetterMethod(clazz, field);
if (setterMethod != null) {
fieldExtractor.setSetterMethod(setterMethod);
@@ -70,7 +72,7 @@ class PageModelExtractor {
if (regexPattern.trim().equals("")) {
regexPattern = ".*";
}
- FieldExtractor fieldExtractor = new FieldExtractor(field, new RegexSelector(regexPattern), FieldExtractor.Source.Url);
+ FieldExtractor fieldExtractor = new FieldExtractor(field, new RegexSelector(regexPattern), FieldExtractor.Source.Url, extractByUrl.notNull());
Method setterMethod = getSetterMethod(clazz, field);
if (setterMethod != null) {
fieldExtractor.setSetterMethod(setterMethod);
@@ -102,6 +104,14 @@ class PageModelExtractor {
targetUrlPatterns.add(Pattern.compile(s.replace(".", "\\.").replace("*", "[^\"'#]*")));
}
}
+ helpUrlPatterns = new ArrayList(); // assigned unconditionally, so the list is non-null even when the class has no @HelpUrl
+ annotation = clazz.getAnnotation(HelpUrl.class); // reuses the existing 'annotation' variable (declared outside this hunk)
+ if (annotation != null) {
+ String[] value = ((HelpUrl) annotation).value();
+ for (String s : value) {
+ helpUrlPatterns.add(Pattern.compile(s.replace(".", "\\.").replace("*", "[^\"'#]*"))); // same wildcard-to-regex translation as the target-URL line above: "." is escaped, "*" matches any run of characters other than quotes and '#'
+ }
+ }
}
public Object process(Page page) {
@@ -129,7 +139,10 @@ class PageModelExtractor {
default:
value = fieldExtractor.getSelector().select(page.getHtml().toString());
}
- setField(o,fieldExtractor,value);
+ if (value==null&&fieldExtractor.isNotNull()){ // a field declared notNull produced no value
+ page.getResultItems().setSkip(true); // NOTE(review): this only flags the page; setField below is still called with null and the remaining fields are still processed - confirm this is intended
+ }
+ setField(o, fieldExtractor, value);
}
} catch (InstantiationException e) {
e.printStackTrace();
@@ -142,8 +155,8 @@ class PageModelExtractor {
}
private void setField(Object o, FieldExtractor fieldExtractor, String value) throws IllegalAccessException, InvocationTargetException {
- if (fieldExtractor.getSetterMethod()!=null){
- fieldExtractor.getSetterMethod().invoke(o,value);
+ if (fieldExtractor.getSetterMethod() != null) { // whitespace-only reformat of the two removed lines
+ fieldExtractor.getSetterMethod().invoke(o, value); // NOTE(review): the field is also written directly right after this if, so anything a setter stored in that field is overwritten with the raw value - confirm this is intended
}
fieldExtractor.getField().set(o, value);
}
@@ -155,4 +168,8 @@ class PageModelExtractor {
List getTargetUrlPatterns() {
return targetUrlPatterns;
}
+
+ List getHelpUrlPatterns() { // Package-private accessor; returns the internal list itself, not a copy.
+ return helpUrlPatterns;
+ }
}
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/annotation/samples/IteyeBlog.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/annotation/samples/IteyeBlog.java
new file mode 100644
index 0000000..8fbf089
--- /dev/null
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/annotation/samples/IteyeBlog.java
@@ -0,0 +1,34 @@
+package us.codecraft.webmagic.annotation.samples;
+
+import us.codecraft.webmagic.Site;
+import us.codecraft.webmagic.Spider;
+import us.codecraft.webmagic.annotation.ExtractBy;
+import us.codecraft.webmagic.annotation.TargetUrl;
+
+/** Sample page model: an annotated class describing what to extract from pages matching the TargetUrl pattern, plus a main method that runs a Spider over it.
+ * @author yihua.huang@dianping.com
+ * @date: 13-8-2
+ * Time: 7:52 AM
+ */
+@TargetUrl("http://dengminhui.iteye.com/blog/*")
+public class IteyeBlog {
+
+ @ExtractBy("//title")
+ private String title; // no type given, so ExtractBy's default (XPath) applies
+
+ @ExtractBy(value = "div#blog_content",type = ExtractBy.Type.Css)
+ private String content; // selected with a CSS selector instead of XPath
+
+ @Override
+ public String toString() {
+ return "IteyeBlog{" +
+ "title='" + title + '\'' +
+ ", content='" + content + '\'' +
+ '}';
+ }
+
+ public static void main(String[] args) {
+ Spider.create(Site.me().addStartUrl("http://dengminhui.iteye.com/blog"),IteyeBlog.class).run(); // uses the Spider.create(Site, Class...) factory; run() is called directly on the calling thread
+ }
+
+}