add list support
parent
d4de60a562
commit
65518f7672
|
@ -10,16 +10,17 @@ import java.lang.annotation.Target;
|
||||||
* Time: 下午8:40 <br>
|
* Time: 下午8:40 <br>
|
||||||
*/
|
*/
|
||||||
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
|
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
|
||||||
@Target({ElementType.FIELD})
|
@Target({ElementType.FIELD,ElementType.TYPE})
|
||||||
public @interface ExtractBy {
|
public @interface ExtractBy {
|
||||||
|
|
||||||
|
|
||||||
//TODO: add list support
|
|
||||||
String value();
|
String value();
|
||||||
|
|
||||||
public enum Type {XPath, Regex, Css};
|
public enum Type {XPath2, XPath, Regex, Css}
|
||||||
|
|
||||||
Type type() default Type.XPath;
|
Type type() default Type.XPath2;
|
||||||
|
|
||||||
boolean notNull() default true;
|
boolean notNull() default true;
|
||||||
|
|
||||||
|
boolean multi() default false;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -17,4 +17,6 @@ public @interface ExtractByUrl{
|
||||||
|
|
||||||
boolean notNull() default true;
|
boolean notNull() default true;
|
||||||
|
|
||||||
|
boolean multi() default false;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,40 @@
|
||||||
|
package us.codecraft.webmagic.oo;
|
||||||
|
|
||||||
|
import us.codecraft.webmagic.selector.Selector;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author code4crafter@gmail.com <br>
|
||||||
|
* @date: 13-8-1 <br>
|
||||||
|
* Time: 下午9:48 <br>
|
||||||
|
*/
|
||||||
|
class Extractor {
|
||||||
|
|
||||||
|
protected final Selector selector;
|
||||||
|
|
||||||
|
protected final Source source;
|
||||||
|
|
||||||
|
protected final boolean notNull;
|
||||||
|
|
||||||
|
protected final boolean multi;
|
||||||
|
|
||||||
|
static enum Source {Html, Url}
|
||||||
|
|
||||||
|
public Extractor(Selector selector, Source source, boolean notNull, boolean multi) {
|
||||||
|
this.selector = selector;
|
||||||
|
this.source = source;
|
||||||
|
this.notNull = notNull;
|
||||||
|
this.multi = multi;
|
||||||
|
}
|
||||||
|
|
||||||
|
Selector getSelector() {
|
||||||
|
return selector;
|
||||||
|
}
|
||||||
|
|
||||||
|
Source getSource() {
|
||||||
|
return source;
|
||||||
|
}
|
||||||
|
|
||||||
|
boolean isNotNull() {
|
||||||
|
return notNull;
|
||||||
|
}
|
||||||
|
}
|
|
@ -10,25 +10,15 @@ import java.lang.reflect.Method;
|
||||||
* @date: 13-8-1 <br>
|
* @date: 13-8-1 <br>
|
||||||
* Time: 下午9:48 <br>
|
* Time: 下午9:48 <br>
|
||||||
*/
|
*/
|
||||||
class FieldExtractor {
|
class FieldExtractor extends Extractor{
|
||||||
|
|
||||||
private final Field field;
|
private final Field field;
|
||||||
|
|
||||||
private final Selector selector;
|
|
||||||
|
|
||||||
private final Source source;
|
|
||||||
|
|
||||||
private Method setterMethod;
|
private Method setterMethod;
|
||||||
|
|
||||||
private final boolean notNull;
|
public FieldExtractor(Field field, Selector selector, Source source, boolean notNull,boolean multi) {
|
||||||
|
super(selector, source, notNull,multi);
|
||||||
static enum Source {Html, Url}
|
|
||||||
|
|
||||||
public FieldExtractor(Field field, Selector selector, Source source, boolean notNull) {
|
|
||||||
this.field = field;
|
this.field = field;
|
||||||
this.selector = selector;
|
|
||||||
this.source = source;
|
|
||||||
this.notNull = notNull;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
Field getField() {
|
Field getField() {
|
||||||
|
|
|
@ -2,7 +2,6 @@ package us.codecraft.webmagic.oo;
|
||||||
|
|
||||||
import us.codecraft.webmagic.Site;
|
import us.codecraft.webmagic.Site;
|
||||||
import us.codecraft.webmagic.Spider;
|
import us.codecraft.webmagic.Spider;
|
||||||
import us.codecraft.webmagic.pipeline.Pipeline;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
|
@ -50,8 +49,4 @@ public class OOSpider extends Spider {
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
public Spider pipeline(Pipeline pipeline) {
|
|
||||||
throw new UnsupportedOperationException("Sorry, OOSpider can only use ObjectPipeline");
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -2,10 +2,7 @@ package us.codecraft.webmagic.oo;
|
||||||
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
import us.codecraft.webmagic.Page;
|
import us.codecraft.webmagic.Page;
|
||||||
import us.codecraft.webmagic.selector.CssSelector;
|
import us.codecraft.webmagic.selector.*;
|
||||||
import us.codecraft.webmagic.selector.RegexSelector;
|
|
||||||
import us.codecraft.webmagic.selector.Selector;
|
|
||||||
import us.codecraft.webmagic.selector.XpathSelector;
|
|
||||||
|
|
||||||
import java.lang.annotation.Annotation;
|
import java.lang.annotation.Annotation;
|
||||||
import java.lang.reflect.Field;
|
import java.lang.reflect.Field;
|
||||||
|
@ -42,20 +39,22 @@ class PageModelExtractor {
|
||||||
this.clazz = clazz;
|
this.clazz = clazz;
|
||||||
initTargetUrlPatterns();
|
initTargetUrlPatterns();
|
||||||
fieldExtractors = new ArrayList<FieldExtractor>();
|
fieldExtractors = new ArrayList<FieldExtractor>();
|
||||||
if (clazz.isAssignableFrom(AfterExtractor.class)){
|
if (clazz.isAssignableFrom(AfterExtractor.class)) {
|
||||||
try {
|
try {
|
||||||
afterExtractor=(AfterExtractor)clazz.newInstance();
|
afterExtractor = (AfterExtractor) clazz.newInstance();
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
throw new IllegalArgumentException(e);
|
throw new IllegalArgumentException(e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for (Field field : clazz.getDeclaredFields()) {
|
for (Field field : clazz.getDeclaredFields()) {
|
||||||
field.setAccessible(true);
|
field.setAccessible(true);
|
||||||
if (!field.getType().isAssignableFrom(String.class)){
|
|
||||||
throw new IllegalStateException("Field "+field.getName()+" must be string");
|
|
||||||
}
|
|
||||||
ExtractBy extractBy = field.getAnnotation(ExtractBy.class);
|
ExtractBy extractBy = field.getAnnotation(ExtractBy.class);
|
||||||
if (extractBy != null) {
|
if (extractBy != null) {
|
||||||
|
if (!extractBy.multi() && !field.getType().isAssignableFrom(String.class)) {
|
||||||
|
throw new IllegalStateException("Field " + field.getName() + " must be string");
|
||||||
|
} else if (extractBy.multi() && !field.getType().isAssignableFrom(List.class)) {
|
||||||
|
throw new IllegalStateException("Field " + field.getName() + " must be list");
|
||||||
|
}
|
||||||
String value = extractBy.value();
|
String value = extractBy.value();
|
||||||
Selector selector;
|
Selector selector;
|
||||||
switch (extractBy.type()) {
|
switch (extractBy.type()) {
|
||||||
|
@ -68,10 +67,13 @@ class PageModelExtractor {
|
||||||
case XPath:
|
case XPath:
|
||||||
selector = new XpathSelector(value);
|
selector = new XpathSelector(value);
|
||||||
break;
|
break;
|
||||||
|
case XPath2:
|
||||||
|
selector = new Xpath2Selector(value);
|
||||||
|
break;
|
||||||
default:
|
default:
|
||||||
selector = new XpathSelector(value);
|
selector = new Xpath2Selector(value);
|
||||||
}
|
}
|
||||||
FieldExtractor fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.Html, extractBy.notNull());
|
FieldExtractor fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.Html, extractBy.notNull(), extractBy.multi());
|
||||||
Method setterMethod = getSetterMethod(clazz, field);
|
Method setterMethod = getSetterMethod(clazz, field);
|
||||||
if (setterMethod != null) {
|
if (setterMethod != null) {
|
||||||
fieldExtractor.setSetterMethod(setterMethod);
|
fieldExtractor.setSetterMethod(setterMethod);
|
||||||
|
@ -80,11 +82,16 @@ class PageModelExtractor {
|
||||||
}
|
}
|
||||||
ExtractByUrl extractByUrl = field.getAnnotation(ExtractByUrl.class);
|
ExtractByUrl extractByUrl = field.getAnnotation(ExtractByUrl.class);
|
||||||
if (extractByUrl != null) {
|
if (extractByUrl != null) {
|
||||||
|
if (!extractByUrl.multi() && !field.getType().isAssignableFrom(String.class)) {
|
||||||
|
throw new IllegalStateException("Field " + field.getName() + " must be string");
|
||||||
|
} else if (extractByUrl.multi() && !field.getType().isAssignableFrom(List.class)) {
|
||||||
|
throw new IllegalStateException("Field " + field.getName() + " must be list");
|
||||||
|
}
|
||||||
String regexPattern = extractByUrl.value();
|
String regexPattern = extractByUrl.value();
|
||||||
if (regexPattern.trim().equals("")) {
|
if (regexPattern.trim().equals("")) {
|
||||||
regexPattern = ".*";
|
regexPattern = ".*";
|
||||||
}
|
}
|
||||||
FieldExtractor fieldExtractor = new FieldExtractor(field, new RegexSelector(regexPattern), FieldExtractor.Source.Url, extractByUrl.notNull());
|
FieldExtractor fieldExtractor = new FieldExtractor(field, new RegexSelector(regexPattern), FieldExtractor.Source.Url, extractByUrl.notNull(), extractByUrl.multi());
|
||||||
Method setterMethod = getSetterMethod(clazz, field);
|
Method setterMethod = getSetterMethod(clazz, field);
|
||||||
if (setterMethod != null) {
|
if (setterMethod != null) {
|
||||||
fieldExtractor.setSetterMethod(setterMethod);
|
fieldExtractor.setSetterMethod(setterMethod);
|
||||||
|
@ -138,24 +145,42 @@ class PageModelExtractor {
|
||||||
try {
|
try {
|
||||||
o = clazz.newInstance();
|
o = clazz.newInstance();
|
||||||
for (FieldExtractor fieldExtractor : fieldExtractors) {
|
for (FieldExtractor fieldExtractor : fieldExtractors) {
|
||||||
String value;
|
if (fieldExtractor.multi) {
|
||||||
switch (fieldExtractor.getSource()) {
|
List<String> value;
|
||||||
case Html:
|
switch (fieldExtractor.getSource()) {
|
||||||
value = fieldExtractor.getSelector().select(page.getHtml().toString());
|
case Html:
|
||||||
break;
|
value = fieldExtractor.getSelector().selectList(page.getHtml().toString());
|
||||||
case Url:
|
break;
|
||||||
value = fieldExtractor.getSelector().select(page.getUrl().toString());
|
case Url:
|
||||||
break;
|
value = fieldExtractor.getSelector().selectList(page.getUrl().toString());
|
||||||
default:
|
break;
|
||||||
value = fieldExtractor.getSelector().select(page.getHtml().toString());
|
default:
|
||||||
|
value = fieldExtractor.getSelector().selectList(page.getHtml().toString());
|
||||||
|
}
|
||||||
|
if ((value == null || value.size() == 0) && fieldExtractor.isNotNull()) {
|
||||||
|
page.getResultItems().setSkip(true);
|
||||||
|
}
|
||||||
|
setField(o, fieldExtractor, value);
|
||||||
|
} else {
|
||||||
|
String value;
|
||||||
|
switch (fieldExtractor.getSource()) {
|
||||||
|
case Html:
|
||||||
|
value = fieldExtractor.getSelector().select(page.getHtml().toString());
|
||||||
|
break;
|
||||||
|
case Url:
|
||||||
|
value = fieldExtractor.getSelector().select(page.getUrl().toString());
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
value = fieldExtractor.getSelector().select(page.getHtml().toString());
|
||||||
|
}
|
||||||
|
if (value == null && fieldExtractor.isNotNull()) {
|
||||||
|
page.getResultItems().setSkip(true);
|
||||||
|
}
|
||||||
|
setField(o, fieldExtractor, value);
|
||||||
}
|
}
|
||||||
if (value==null&&fieldExtractor.isNotNull()){
|
|
||||||
page.getResultItems().setSkip(true);
|
|
||||||
}
|
|
||||||
setField(o, fieldExtractor, value);
|
|
||||||
}
|
}
|
||||||
if (afterExtractor!=null){
|
if (afterExtractor != null) {
|
||||||
afterExtractor.afterProcess(page,o);
|
afterExtractor.afterProcess(page, o);
|
||||||
}
|
}
|
||||||
} catch (InstantiationException e) {
|
} catch (InstantiationException e) {
|
||||||
e.printStackTrace();
|
e.printStackTrace();
|
||||||
|
@ -167,7 +192,7 @@ class PageModelExtractor {
|
||||||
return o;
|
return o;
|
||||||
}
|
}
|
||||||
|
|
||||||
private void setField(Object o, FieldExtractor fieldExtractor, String value) throws IllegalAccessException, InvocationTargetException {
|
private void setField(Object o, FieldExtractor fieldExtractor, Object value) throws IllegalAccessException, InvocationTargetException {
|
||||||
if (fieldExtractor.getSetterMethod() != null) {
|
if (fieldExtractor.getSetterMethod() != null) {
|
||||||
fieldExtractor.getSetterMethod().invoke(o, value);
|
fieldExtractor.getSetterMethod().invoke(o, value);
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,5 +1,7 @@
|
||||||
package us.codecraft.webmagic.oo;
|
package us.codecraft.webmagic.oo;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @author yihua.huang@dianping.com <br>
|
* @author yihua.huang@dianping.com <br>
|
||||||
* @date: 13-8-1 <br>
|
* @date: 13-8-1 <br>
|
||||||
|
@ -11,7 +13,10 @@ public class OschinaBlog {
|
||||||
@ExtractBy("//title")
|
@ExtractBy("//title")
|
||||||
private String title;
|
private String title;
|
||||||
|
|
||||||
@ExtractBy(value = "div.BlogContent",type = ExtractBy.Type.Css)
|
@ExtractBy(value = "div.BlogContent", type = ExtractBy.Type.Css)
|
||||||
private String content;
|
private String content;
|
||||||
|
|
||||||
|
@ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true)
|
||||||
|
private List<String> tags;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,6 +1,5 @@
|
||||||
package us.codecraft.webmagic.oo;
|
package us.codecraft.webmagic.oo;
|
||||||
|
|
||||||
import org.junit.Ignore;
|
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
import us.codecraft.webmagic.Site;
|
import us.codecraft.webmagic.Site;
|
||||||
|
|
||||||
|
@ -11,7 +10,7 @@ import us.codecraft.webmagic.Site;
|
||||||
*/
|
*/
|
||||||
public class TestFetcher {
|
public class TestFetcher {
|
||||||
|
|
||||||
@Ignore("takes long")
|
// @Ignore("takes long")
|
||||||
@Test
|
@Test
|
||||||
public void test() {
|
public void test() {
|
||||||
OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog/145796"), OschinaBlog.class)
|
OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog/145796"), OschinaBlog.class)
|
||||||
|
|
Loading…
Reference in New Issue