add annotation ExtractByRaw
parent
1a50c64e33
commit
a5c85c3c8b
|
@ -0,0 +1,27 @@
|
|||
package us.codecraft.webmagic.model;
|
||||
|
||||
import java.lang.annotation.ElementType;
|
||||
import java.lang.annotation.Retention;
|
||||
import java.lang.annotation.Target;
|
||||
|
||||
/**
 * Declares that a value should be extracted from the raw (whole) page content.
 * <p>
 * Intended for model classes that already use {@code ExtractBy} at class level:
 * put this annotation on a field when that field should be extracted from the
 * entire content instead.
 * <p>
 * Date: 13-8-1, 8:40 PM
 *
 * @author code4crafter@gmail.com
 */
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
@Target({ElementType.FIELD, ElementType.TYPE})
public @interface ExtractByRaw {

    /**
     * The selector expression; how it is interpreted depends on {@link #type()}.
     */
    String value();

    /**
     * Supported kinds of selector expression.
     */
    enum Type {
        XPath2,
        XPath,
        Regex,
        Css
    }

    /**
     * Kind of expression held by {@link #value()}; defaults to {@link Type#XPath2}.
     */
    Type type() default Type.XPath2;

    /**
     * Defaults to {@code true}.
     * NOTE(review): presumably marks the extracted value as mandatory - confirm
     * against the code that consumes this flag.
     */
    boolean notNull() default true;

    /**
     * Whether the expression yields several results; defaults to {@code false}.
     * When {@code true} the annotated field must be a {@code List},
     * otherwise it must be a {@code String}.
     */
    boolean multi() default false;
}
|
|
@ -17,7 +17,7 @@ class Extractor {
|
|||
|
||||
protected final boolean multi;
|
||||
|
||||
static enum Source {Html, Url}
|
||||
static enum Source {Html, Url, RawHtml}
|
||||
|
||||
public Extractor(Selector selector, Source source, boolean notNull, boolean multi) {
|
||||
this.selector = selector;
|
||||
|
|
|
@ -46,56 +46,100 @@ class PageModelExtractor {
|
|||
fieldExtractors = new ArrayList<FieldExtractor>();
|
||||
for (Field field : clazz.getDeclaredFields()) {
|
||||
field.setAccessible(true);
|
||||
ExtractBy extractBy = field.getAnnotation(ExtractBy.class);
|
||||
if (extractBy != null) {
|
||||
if (!extractBy.multi() && !String.class.isAssignableFrom(field.getType())) {
|
||||
throw new IllegalStateException("Field " + field.getName() + " must be string");
|
||||
} else if (extractBy.multi() && !List.class.isAssignableFrom(field.getType())) {
|
||||
throw new IllegalStateException("Field " + field.getName() + " must be list");
|
||||
}
|
||||
String value = extractBy.value();
|
||||
Selector selector;
|
||||
switch (extractBy.type()) {
|
||||
case Css:
|
||||
selector = new CssSelector(value);
|
||||
break;
|
||||
case Regex:
|
||||
selector = new RegexSelector(value);
|
||||
break;
|
||||
case XPath:
|
||||
selector = new XpathSelector(value);
|
||||
break;
|
||||
case XPath2:
|
||||
selector = new Xpath2Selector(value);
|
||||
break;
|
||||
default:
|
||||
selector = new Xpath2Selector(value);
|
||||
}
|
||||
FieldExtractor fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.Html, extractBy.notNull(), extractBy.multi());
|
||||
Method setterMethod = getSetterMethod(clazz, field);
|
||||
if (setterMethod != null) {
|
||||
fieldExtractor.setSetterMethod(setterMethod);
|
||||
}
|
||||
fieldExtractors.add(fieldExtractor);
|
||||
getAnnotationExtractBy(clazz, field);
|
||||
getAnnotationExtractByRaw(clazz,field);
|
||||
getAnnotationExtractByUrl(clazz, field);
|
||||
}
|
||||
}
|
||||
|
||||
private void getAnnotationExtractByUrl(Class clazz, Field field) {
|
||||
ExtractByUrl extractByUrl = field.getAnnotation(ExtractByUrl.class);
|
||||
if (extractByUrl != null) {
|
||||
if (!extractByUrl.multi() && !String.class.isAssignableFrom(field.getType())) {
|
||||
throw new IllegalStateException("Field " + field.getName() + " must be string");
|
||||
} else if (extractByUrl.multi() && !List.class.isAssignableFrom(field.getType())) {
|
||||
throw new IllegalStateException("Field " + field.getName() + " must be list");
|
||||
}
|
||||
ExtractByUrl extractByUrl = field.getAnnotation(ExtractByUrl.class);
|
||||
if (extractByUrl != null) {
|
||||
if (!extractByUrl.multi() && !String.class.isAssignableFrom(field.getType())) {
|
||||
throw new IllegalStateException("Field " + field.getName() + " must be string");
|
||||
} else if (extractByUrl.multi() && !List.class.isAssignableFrom(field.getType())) {
|
||||
throw new IllegalStateException("Field " + field.getName() + " must be list");
|
||||
}
|
||||
String regexPattern = extractByUrl.value();
|
||||
if (regexPattern.trim().equals("")) {
|
||||
regexPattern = ".*";
|
||||
}
|
||||
FieldExtractor fieldExtractor = new FieldExtractor(field, new RegexSelector(regexPattern), FieldExtractor.Source.Url, extractByUrl.notNull(), extractByUrl.multi());
|
||||
Method setterMethod = getSetterMethod(clazz, field);
|
||||
if (setterMethod != null) {
|
||||
fieldExtractor.setSetterMethod(setterMethod);
|
||||
}
|
||||
fieldExtractors.add(fieldExtractor);
|
||||
String regexPattern = extractByUrl.value();
|
||||
if (regexPattern.trim().equals("")) {
|
||||
regexPattern = ".*";
|
||||
}
|
||||
FieldExtractor fieldExtractor = new FieldExtractor(field, new RegexSelector(regexPattern), FieldExtractor.Source.Url, extractByUrl.notNull(), extractByUrl.multi());
|
||||
Method setterMethod = getSetterMethod(clazz, field);
|
||||
if (setterMethod != null) {
|
||||
fieldExtractor.setSetterMethod(setterMethod);
|
||||
}
|
||||
fieldExtractors.add(fieldExtractor);
|
||||
}
|
||||
}
|
||||
|
||||
private void getAnnotationExtractBy(Class clazz, Field field) {
|
||||
ExtractBy extractBy = field.getAnnotation(ExtractBy.class);
|
||||
if (extractBy != null) {
|
||||
if (!extractBy.multi() && !String.class.isAssignableFrom(field.getType())) {
|
||||
throw new IllegalStateException("Field " + field.getName() + " must be string");
|
||||
} else if (extractBy.multi() && !List.class.isAssignableFrom(field.getType())) {
|
||||
throw new IllegalStateException("Field " + field.getName() + " must be list");
|
||||
}
|
||||
String value = extractBy.value();
|
||||
Selector selector;
|
||||
switch (extractBy.type()) {
|
||||
case Css:
|
||||
selector = new CssSelector(value);
|
||||
break;
|
||||
case Regex:
|
||||
selector = new RegexSelector(value);
|
||||
break;
|
||||
case XPath:
|
||||
selector = new XpathSelector(value);
|
||||
break;
|
||||
case XPath2:
|
||||
selector = new Xpath2Selector(value);
|
||||
break;
|
||||
default:
|
||||
selector = new Xpath2Selector(value);
|
||||
}
|
||||
FieldExtractor fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.Html, extractBy.notNull(), extractBy.multi());
|
||||
Method setterMethod = getSetterMethod(clazz, field);
|
||||
if (setterMethod != null) {
|
||||
fieldExtractor.setSetterMethod(setterMethod);
|
||||
}
|
||||
fieldExtractors.add(fieldExtractor);
|
||||
}
|
||||
}
|
||||
|
||||
private void getAnnotationExtractByRaw(Class clazz, Field field) {
|
||||
ExtractByRaw extractByRaw = field.getAnnotation(ExtractByRaw.class);
|
||||
if (extractByRaw != null) {
|
||||
if (!extractByRaw.multi() && !String.class.isAssignableFrom(field.getType())) {
|
||||
throw new IllegalStateException("Field " + field.getName() + " must be string");
|
||||
} else if (extractByRaw.multi() && !List.class.isAssignableFrom(field.getType())) {
|
||||
throw new IllegalStateException("Field " + field.getName() + " must be list");
|
||||
}
|
||||
String value = extractByRaw.value();
|
||||
Selector selector;
|
||||
switch (extractByRaw.type()) {
|
||||
case Css:
|
||||
selector = new CssSelector(value);
|
||||
break;
|
||||
case Regex:
|
||||
selector = new RegexSelector(value);
|
||||
break;
|
||||
case XPath:
|
||||
selector = new XpathSelector(value);
|
||||
break;
|
||||
case XPath2:
|
||||
selector = new Xpath2Selector(value);
|
||||
break;
|
||||
default:
|
||||
selector = new Xpath2Selector(value);
|
||||
}
|
||||
FieldExtractor fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.RawHtml, extractByRaw.notNull(), extractByRaw.multi());
|
||||
Method setterMethod = getSetterMethod(clazz, field);
|
||||
if (setterMethod != null) {
|
||||
fieldExtractor.setSetterMethod(setterMethod);
|
||||
}
|
||||
fieldExtractors.add(fieldExtractor);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -181,6 +225,9 @@ class PageModelExtractor {
|
|||
if (fieldExtractor.multi) {
|
||||
List<String> value;
|
||||
switch (fieldExtractor.getSource()) {
|
||||
case RawHtml:
|
||||
value = fieldExtractor.getSelector().selectList(page.getHtml().toString());
|
||||
break;
|
||||
case Html:
|
||||
value = fieldExtractor.getSelector().selectList(html);
|
||||
break;
|
||||
|
@ -197,6 +244,9 @@ class PageModelExtractor {
|
|||
} else {
|
||||
String value;
|
||||
switch (fieldExtractor.getSource()) {
|
||||
case RawHtml:
|
||||
value = fieldExtractor.getSelector().select(page.getHtml().toString());
|
||||
break;
|
||||
case Html:
|
||||
value = fieldExtractor.getSelector().select(html);
|
||||
break;
|
||||
|
|
|
@ -22,6 +22,9 @@ public class OschinaBlog implements AfterExtractor {
|
|||
@ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true)
|
||||
private List<String> tags;
|
||||
|
||||
@ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true)
|
||||
private List<String> comments;
|
||||
|
||||
@Override
|
||||
public void afterProcess(Page page) {
|
||||
System.out.println("title:\t"+title);
|
||||
|
|
|
@ -0,0 +1,13 @@
|
|||
package us.codecraft.webmagic.model;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* @date: 13-8-1 <br>
|
||||
* Time: 下午10:18 <br>
|
||||
*/
|
||||
@TargetUrl("http://my.oschina.net/flashsword/blog/*")
|
||||
public class OschinaBlogComment {
|
||||
|
||||
|
||||
|
||||
}
|
Loading…
Reference in New Issue