invite notnull
parent
06a39af0f3
commit
7a4dbb1f15
|
@ -2,6 +2,7 @@ package us.codecraft.webmagic;
|
|||
|
||||
import org.apache.commons.collections.CollectionUtils;
|
||||
import org.apache.log4j.Logger;
|
||||
import us.codecraft.webmagic.annotation.ObjectPageProcessor;
|
||||
import us.codecraft.webmagic.downloader.Destroyable;
|
||||
import us.codecraft.webmagic.downloader.Downloader;
|
||||
import us.codecraft.webmagic.downloader.HttpClientDownloader;
|
||||
|
@ -89,6 +90,10 @@ public class Spider implements Runnable, Task {
|
|||
return new Spider(pageProcessor);
|
||||
}
|
||||
|
||||
public static Spider create(Site site,Class... pageModels) {
|
||||
return new Spider(ObjectPageProcessor.create(site,pageModels));
|
||||
}
|
||||
|
||||
/**
|
||||
* 重新设置startUrls,会覆盖Site本身的startUrls。
|
||||
*
|
||||
|
|
|
@ -18,4 +18,6 @@ public @interface ExtractBy {
|
|||
public enum Type {XPath, Regex, Css};
|
||||
|
||||
Type type() default Type.XPath;
|
||||
|
||||
boolean notNull() default true;
|
||||
}
|
||||
|
|
|
@ -11,8 +11,10 @@ import java.lang.annotation.Target;
|
|||
*/
|
||||
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
@Target(ElementType.FIELD)
public @interface ExtractByUrl {

    /**
     * Pattern used to extract the field value from the page URL.
     * NOTE(review): PageModelExtractor appears to treat a blank value as
     * {@code ".*"} and wrap it in a RegexSelector - confirm against the caller.
     */
    String value() default "";

    /**
     * Whether a value is required for this field. The flag is handed to the
     * FieldExtractor; when it is {@code true} and the extracted value is
     * {@code null}, the page's result items are marked as skipped.
     */
    boolean notNull() default true;

}
|
||||
|
|
|
@ -20,18 +20,15 @@ class FieldExtractor {
|
|||
|
||||
private Method setterMethod;
|
||||
|
||||
private final boolean notNull;
|
||||
|
||||
static enum Source {Html, Url}
|
||||
|
||||
/**
 * Creates an extractor that reads from the page HTML and does not require
 * a value.
 *
 * <p>The final {@code notNull} flag must be assigned by every constructor;
 * it is set to {@code false} here so that callers of this two-argument form
 * keep the behaviour they had before the flag existed (a missing value never
 * causes the page to be skipped).
 *
 * @param field    the model field to populate
 * @param selector the selector used to extract the field's value
 */
public FieldExtractor(Field field, Selector selector) {
    this.field = field;
    this.selector = selector;
    this.source = Source.Html;
    this.notNull = false;
}
|
||||
|
||||
/**
 * Creates an extractor with an explicit source and null-handling flag.
 *
 * @param field    the model field to populate
 * @param selector the selector used to extract the field's value
 * @param source   where the selector is applied ({@code Html} or {@code Url})
 * @param notNull  whether a value is required for this field
 */
public FieldExtractor(Field field, Selector selector, Source source, boolean notNull) {
    this.notNull = notNull;
    this.source = source;
    this.selector = selector;
    this.field = field;
}
|
||||
|
||||
Field getField() {
|
||||
|
@ -53,4 +50,8 @@ class FieldExtractor {
|
|||
Method getSetterMethod() {
|
||||
return setterMethod;
|
||||
}
|
||||
|
||||
boolean isNotNull() {
|
||||
return notNull;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,17 @@
|
|||
package us.codecraft.webmagic.annotation;
|
||||
|
||||
import java.lang.annotation.ElementType;
|
||||
import java.lang.annotation.Retention;
|
||||
import java.lang.annotation.Target;
|
||||
|
||||
/**
 * Declares URL patterns for a page model class, in addition to its target
 * URLs.
 *
 * <p>PageModelExtractor compiles each value into a regular expression
 * (escaping {@code .} and expanding {@code *}), and ObjectPageProcessor adds
 * the compiled patterns to the same set as the target URL patterns.
 *
 * @author yihua.huang@dianping.com <br>
 * @date: 13-8-1 <br>
 * Time: 8:40 PM <br>
 */
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
@Target(ElementType.TYPE)
public @interface HelpUrl {

    /**
     * The URL patterns; {@code *} acts as a wildcard.
     */
    String[] value();

}
|
|
@ -40,6 +40,7 @@ public class ObjectPageProcessor implements PageProcessor {
|
|||
targetUrlPatterns = new HashSet<Pattern>();
|
||||
for (PageModelExtractor pageModelExtractor : pageModelExtractorList) {
|
||||
targetUrlPatterns.addAll(pageModelExtractor.getTargetUrlPatterns());
|
||||
targetUrlPatterns.addAll(pageModelExtractor.getHelpUrlPatterns());
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -47,6 +48,9 @@ public class ObjectPageProcessor implements PageProcessor {
|
|||
public void process(Page page) {
|
||||
for (PageModelExtractor pageModelExtractor : pageModelExtractorList) {
|
||||
Object process = pageModelExtractor.process(page);
|
||||
if (process==null){
|
||||
page.getResultItems().setSkip(true);
|
||||
}
|
||||
postProcessPageModel(pageModelExtractor.getClazz(), process);
|
||||
page.putField(pageModelExtractor.getClazz().getCanonicalName(), process);
|
||||
}
|
||||
|
|
|
@ -24,6 +24,8 @@ class PageModelExtractor {
|
|||
|
||||
private List<Pattern> targetUrlPatterns;
|
||||
|
||||
private List<Pattern> helpUrlPatterns;
|
||||
|
||||
private Class clazz;
|
||||
|
||||
private List<FieldExtractor> fieldExtractors;
|
||||
|
@ -57,7 +59,7 @@ class PageModelExtractor {
|
|||
default:
|
||||
selector = new XpathSelector(value);
|
||||
}
|
||||
FieldExtractor fieldExtractor = new FieldExtractor(field, selector);
|
||||
FieldExtractor fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.Html, extractBy.notNull());
|
||||
Method setterMethod = getSetterMethod(clazz, field);
|
||||
if (setterMethod != null) {
|
||||
fieldExtractor.setSetterMethod(setterMethod);
|
||||
|
@ -70,7 +72,7 @@ class PageModelExtractor {
|
|||
if (regexPattern.trim().equals("")) {
|
||||
regexPattern = ".*";
|
||||
}
|
||||
FieldExtractor fieldExtractor = new FieldExtractor(field, new RegexSelector(regexPattern), FieldExtractor.Source.Url);
|
||||
FieldExtractor fieldExtractor = new FieldExtractor(field, new RegexSelector(regexPattern), FieldExtractor.Source.Url, extractByUrl.notNull());
|
||||
Method setterMethod = getSetterMethod(clazz, field);
|
||||
if (setterMethod != null) {
|
||||
fieldExtractor.setSetterMethod(setterMethod);
|
||||
|
@ -102,6 +104,14 @@ class PageModelExtractor {
|
|||
targetUrlPatterns.add(Pattern.compile(s.replace(".", "\\.").replace("*", "[^\"'#]*")));
|
||||
}
|
||||
}
|
||||
helpUrlPatterns = new ArrayList<Pattern>();
|
||||
annotation = clazz.getAnnotation(HelpUrl.class);
|
||||
if (annotation != null) {
|
||||
String[] value = ((HelpUrl) annotation).value();
|
||||
for (String s : value) {
|
||||
helpUrlPatterns.add(Pattern.compile(s.replace(".", "\\.").replace("*", "[^\"'#]*")));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public Object process(Page page) {
|
||||
|
@ -129,7 +139,10 @@ class PageModelExtractor {
|
|||
default:
|
||||
value = fieldExtractor.getSelector().select(page.getHtml().toString());
|
||||
}
|
||||
setField(o,fieldExtractor,value);
|
||||
if (value==null&&fieldExtractor.isNotNull()){
|
||||
page.getResultItems().setSkip(true);
|
||||
}
|
||||
setField(o, fieldExtractor, value);
|
||||
}
|
||||
} catch (InstantiationException e) {
|
||||
e.printStackTrace();
|
||||
|
@ -142,8 +155,8 @@ class PageModelExtractor {
|
|||
}
|
||||
|
||||
private void setField(Object o, FieldExtractor fieldExtractor, String value) throws IllegalAccessException, InvocationTargetException {
|
||||
if (fieldExtractor.getSetterMethod()!=null){
|
||||
fieldExtractor.getSetterMethod().invoke(o,value);
|
||||
if (fieldExtractor.getSetterMethod() != null) {
|
||||
fieldExtractor.getSetterMethod().invoke(o, value);
|
||||
}
|
||||
fieldExtractor.getField().set(o, value);
|
||||
}
|
||||
|
@ -155,4 +168,8 @@ class PageModelExtractor {
|
|||
List<Pattern> getTargetUrlPatterns() {
|
||||
return targetUrlPatterns;
|
||||
}
|
||||
|
||||
List<Pattern> getHelpUrlPatterns() {
|
||||
return helpUrlPatterns;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,34 @@
|
|||
package us.codecraft.webmagic.annotation.samples;
|
||||
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.Spider;
|
||||
import us.codecraft.webmagic.annotation.ExtractBy;
|
||||
import us.codecraft.webmagic.annotation.TargetUrl;
|
||||
|
||||
/**
|
||||
* @author yihua.huang@dianping.com <br>
|
||||
* @date: 13-8-2 <br>
|
||||
* Time: 上午7:52 <br>
|
||||
*/
|
||||
@TargetUrl("http://dengminhui.iteye.com/blog/*")
|
||||
public class IteyeBlog {
|
||||
|
||||
@ExtractBy("//title")
|
||||
private String title;
|
||||
|
||||
@ExtractBy(value = "div#blog_content",type = ExtractBy.Type.Css)
|
||||
private String content;
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "IteyeBlog{" +
|
||||
"title='" + title + '\'' +
|
||||
", content='" + content + '\'' +
|
||||
'}';
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
Spider.create(Site.me().addStartUrl("http://dengminhui.iteye.com/blog"),IteyeBlog.class).run();
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in New Issue