add list support
parent
d4de60a562
commit
65518f7672
|
@ -10,16 +10,17 @@ import java.lang.annotation.Target;
|
|||
* Time: 下午8:40 <br>
|
||||
*/
|
||||
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
|
||||
@Target({ElementType.FIELD})
|
||||
@Target({ElementType.FIELD,ElementType.TYPE})
|
||||
public @interface ExtractBy {
|
||||
|
||||
|
||||
//TODO: add list support
|
||||
String value();
|
||||
|
||||
public enum Type {XPath, Regex, Css};
|
||||
public enum Type {XPath2, XPath, Regex, Css}
|
||||
|
||||
Type type() default Type.XPath;
|
||||
Type type() default Type.XPath2;
|
||||
|
||||
boolean notNull() default true;
|
||||
|
||||
boolean multi() default false;
|
||||
|
||||
}
|
||||
|
|
|
@ -17,4 +17,6 @@ public @interface ExtractByUrl{
|
|||
|
||||
boolean notNull() default true;
|
||||
|
||||
boolean multi() default false;
|
||||
|
||||
}
|
||||
|
|
|
@ -0,0 +1,40 @@
|
|||
package us.codecraft.webmagic.oo;
|
||||
|
||||
import us.codecraft.webmagic.selector.Selector;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* @date: 13-8-1 <br>
|
||||
* Time: 下午9:48 <br>
|
||||
*/
|
||||
class Extractor {
|
||||
|
||||
protected final Selector selector;
|
||||
|
||||
protected final Source source;
|
||||
|
||||
protected final boolean notNull;
|
||||
|
||||
protected final boolean multi;
|
||||
|
||||
static enum Source {Html, Url}
|
||||
|
||||
public Extractor(Selector selector, Source source, boolean notNull, boolean multi) {
|
||||
this.selector = selector;
|
||||
this.source = source;
|
||||
this.notNull = notNull;
|
||||
this.multi = multi;
|
||||
}
|
||||
|
||||
Selector getSelector() {
|
||||
return selector;
|
||||
}
|
||||
|
||||
Source getSource() {
|
||||
return source;
|
||||
}
|
||||
|
||||
boolean isNotNull() {
|
||||
return notNull;
|
||||
}
|
||||
}
|
|
@ -10,25 +10,15 @@ import java.lang.reflect.Method;
|
|||
* @date: 13-8-1 <br>
|
||||
* Time: 下午9:48 <br>
|
||||
*/
|
||||
class FieldExtractor {
|
||||
class FieldExtractor extends Extractor{
|
||||
|
||||
private final Field field;
|
||||
|
||||
private final Selector selector;
|
||||
|
||||
private final Source source;
|
||||
|
||||
private Method setterMethod;
|
||||
|
||||
private final boolean notNull;
|
||||
|
||||
static enum Source {Html, Url}
|
||||
|
||||
public FieldExtractor(Field field, Selector selector, Source source, boolean notNull) {
|
||||
public FieldExtractor(Field field, Selector selector, Source source, boolean notNull,boolean multi) {
|
||||
super(selector, source, notNull,multi);
|
||||
this.field = field;
|
||||
this.selector = selector;
|
||||
this.source = source;
|
||||
this.notNull = notNull;
|
||||
}
|
||||
|
||||
Field getField() {
|
||||
|
|
|
@ -2,7 +2,6 @@ package us.codecraft.webmagic.oo;
|
|||
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.Spider;
|
||||
import us.codecraft.webmagic.pipeline.Pipeline;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com <br>
|
||||
|
@ -50,8 +49,4 @@ public class OOSpider extends Spider {
|
|||
return this;
|
||||
}
|
||||
|
||||
public Spider pipeline(Pipeline pipeline) {
|
||||
throw new UnsupportedOperationException("Sorry, OOSpider can only use ObjectPipeline");
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -2,10 +2,7 @@ package us.codecraft.webmagic.oo;
|
|||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.selector.CssSelector;
|
||||
import us.codecraft.webmagic.selector.RegexSelector;
|
||||
import us.codecraft.webmagic.selector.Selector;
|
||||
import us.codecraft.webmagic.selector.XpathSelector;
|
||||
import us.codecraft.webmagic.selector.*;
|
||||
|
||||
import java.lang.annotation.Annotation;
|
||||
import java.lang.reflect.Field;
|
||||
|
@ -42,20 +39,22 @@ class PageModelExtractor {
|
|||
this.clazz = clazz;
|
||||
initTargetUrlPatterns();
|
||||
fieldExtractors = new ArrayList<FieldExtractor>();
|
||||
if (clazz.isAssignableFrom(AfterExtractor.class)){
|
||||
if (clazz.isAssignableFrom(AfterExtractor.class)) {
|
||||
try {
|
||||
afterExtractor=(AfterExtractor)clazz.newInstance();
|
||||
afterExtractor = (AfterExtractor) clazz.newInstance();
|
||||
} catch (Exception e) {
|
||||
throw new IllegalArgumentException(e);
|
||||
}
|
||||
}
|
||||
for (Field field : clazz.getDeclaredFields()) {
|
||||
field.setAccessible(true);
|
||||
if (!field.getType().isAssignableFrom(String.class)){
|
||||
throw new IllegalStateException("Field "+field.getName()+" must be string");
|
||||
}
|
||||
ExtractBy extractBy = field.getAnnotation(ExtractBy.class);
|
||||
if (extractBy != null) {
|
||||
if (!extractBy.multi() && !field.getType().isAssignableFrom(String.class)) {
|
||||
throw new IllegalStateException("Field " + field.getName() + " must be string");
|
||||
} else if (extractBy.multi() && !field.getType().isAssignableFrom(List.class)) {
|
||||
throw new IllegalStateException("Field " + field.getName() + " must be list");
|
||||
}
|
||||
String value = extractBy.value();
|
||||
Selector selector;
|
||||
switch (extractBy.type()) {
|
||||
|
@ -68,10 +67,13 @@ class PageModelExtractor {
|
|||
case XPath:
|
||||
selector = new XpathSelector(value);
|
||||
break;
|
||||
case XPath2:
|
||||
selector = new Xpath2Selector(value);
|
||||
break;
|
||||
default:
|
||||
selector = new XpathSelector(value);
|
||||
selector = new Xpath2Selector(value);
|
||||
}
|
||||
FieldExtractor fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.Html, extractBy.notNull());
|
||||
FieldExtractor fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.Html, extractBy.notNull(), extractBy.multi());
|
||||
Method setterMethod = getSetterMethod(clazz, field);
|
||||
if (setterMethod != null) {
|
||||
fieldExtractor.setSetterMethod(setterMethod);
|
||||
|
@ -80,11 +82,16 @@ class PageModelExtractor {
|
|||
}
|
||||
ExtractByUrl extractByUrl = field.getAnnotation(ExtractByUrl.class);
|
||||
if (extractByUrl != null) {
|
||||
if (!extractByUrl.multi() && !field.getType().isAssignableFrom(String.class)) {
|
||||
throw new IllegalStateException("Field " + field.getName() + " must be string");
|
||||
} else if (extractByUrl.multi() && !field.getType().isAssignableFrom(List.class)) {
|
||||
throw new IllegalStateException("Field " + field.getName() + " must be list");
|
||||
}
|
||||
String regexPattern = extractByUrl.value();
|
||||
if (regexPattern.trim().equals("")) {
|
||||
regexPattern = ".*";
|
||||
}
|
||||
FieldExtractor fieldExtractor = new FieldExtractor(field, new RegexSelector(regexPattern), FieldExtractor.Source.Url, extractByUrl.notNull());
|
||||
FieldExtractor fieldExtractor = new FieldExtractor(field, new RegexSelector(regexPattern), FieldExtractor.Source.Url, extractByUrl.notNull(), extractByUrl.multi());
|
||||
Method setterMethod = getSetterMethod(clazz, field);
|
||||
if (setterMethod != null) {
|
||||
fieldExtractor.setSetterMethod(setterMethod);
|
||||
|
@ -138,24 +145,42 @@ class PageModelExtractor {
|
|||
try {
|
||||
o = clazz.newInstance();
|
||||
for (FieldExtractor fieldExtractor : fieldExtractors) {
|
||||
String value;
|
||||
switch (fieldExtractor.getSource()) {
|
||||
case Html:
|
||||
value = fieldExtractor.getSelector().select(page.getHtml().toString());
|
||||
break;
|
||||
case Url:
|
||||
value = fieldExtractor.getSelector().select(page.getUrl().toString());
|
||||
break;
|
||||
default:
|
||||
value = fieldExtractor.getSelector().select(page.getHtml().toString());
|
||||
if (fieldExtractor.multi) {
|
||||
List<String> value;
|
||||
switch (fieldExtractor.getSource()) {
|
||||
case Html:
|
||||
value = fieldExtractor.getSelector().selectList(page.getHtml().toString());
|
||||
break;
|
||||
case Url:
|
||||
value = fieldExtractor.getSelector().selectList(page.getUrl().toString());
|
||||
break;
|
||||
default:
|
||||
value = fieldExtractor.getSelector().selectList(page.getHtml().toString());
|
||||
}
|
||||
if ((value == null || value.size() == 0) && fieldExtractor.isNotNull()) {
|
||||
page.getResultItems().setSkip(true);
|
||||
}
|
||||
setField(o, fieldExtractor, value);
|
||||
} else {
|
||||
String value;
|
||||
switch (fieldExtractor.getSource()) {
|
||||
case Html:
|
||||
value = fieldExtractor.getSelector().select(page.getHtml().toString());
|
||||
break;
|
||||
case Url:
|
||||
value = fieldExtractor.getSelector().select(page.getUrl().toString());
|
||||
break;
|
||||
default:
|
||||
value = fieldExtractor.getSelector().select(page.getHtml().toString());
|
||||
}
|
||||
if (value == null && fieldExtractor.isNotNull()) {
|
||||
page.getResultItems().setSkip(true);
|
||||
}
|
||||
setField(o, fieldExtractor, value);
|
||||
}
|
||||
if (value==null&&fieldExtractor.isNotNull()){
|
||||
page.getResultItems().setSkip(true);
|
||||
}
|
||||
setField(o, fieldExtractor, value);
|
||||
}
|
||||
if (afterExtractor!=null){
|
||||
afterExtractor.afterProcess(page,o);
|
||||
if (afterExtractor != null) {
|
||||
afterExtractor.afterProcess(page, o);
|
||||
}
|
||||
} catch (InstantiationException e) {
|
||||
e.printStackTrace();
|
||||
|
@ -167,7 +192,7 @@ class PageModelExtractor {
|
|||
return o;
|
||||
}
|
||||
|
||||
private void setField(Object o, FieldExtractor fieldExtractor, String value) throws IllegalAccessException, InvocationTargetException {
|
||||
private void setField(Object o, FieldExtractor fieldExtractor, Object value) throws IllegalAccessException, InvocationTargetException {
|
||||
if (fieldExtractor.getSetterMethod() != null) {
|
||||
fieldExtractor.getSetterMethod().invoke(o, value);
|
||||
}
|
||||
|
|
|
@ -1,5 +1,7 @@
|
|||
package us.codecraft.webmagic.oo;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* @author yihua.huang@dianping.com <br>
|
||||
* @date: 13-8-1 <br>
|
||||
|
@ -11,7 +13,10 @@ public class OschinaBlog {
|
|||
@ExtractBy("//title")
|
||||
private String title;
|
||||
|
||||
@ExtractBy(value = "div.BlogContent",type = ExtractBy.Type.Css)
|
||||
@ExtractBy(value = "div.BlogContent", type = ExtractBy.Type.Css)
|
||||
private String content;
|
||||
|
||||
@ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true)
|
||||
private List<String> tags;
|
||||
|
||||
}
|
||||
|
|
|
@ -1,6 +1,5 @@
|
|||
package us.codecraft.webmagic.oo;
|
||||
|
||||
import org.junit.Ignore;
|
||||
import org.junit.Test;
|
||||
import us.codecraft.webmagic.Site;
|
||||
|
||||
|
@ -11,7 +10,7 @@ import us.codecraft.webmagic.Site;
|
|||
*/
|
||||
public class TestFetcher {
|
||||
|
||||
@Ignore("takes long")
|
||||
// @Ignore("takes long")
|
||||
@Test
|
||||
public void test() {
|
||||
OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog/145796"), OschinaBlog.class)
|
||||
|
|
Loading…
Reference in New Issue