add combo extract to replace Extract2 Extract3...

master
yihua.huang 2013-08-17 17:23:11 +08:00
parent f946fcdfea
commit 3ba7a76f44
13 changed files with 213 additions and 230 deletions

View File

@ -5,8 +5,7 @@ import java.util.List;
/** /**
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* Date: 13-8-3 <br> * @since 0.2.0
* Time: 5:29 <br>
*/ */
public class AndSelector implements Selector { public class AndSelector implements Selector {
@ -18,6 +17,10 @@ public class AndSelector implements Selector {
} }
} }
public AndSelector(List<Selector> selectors) {
this.selectors = selectors;
}
@Override @Override
public String select(String text) { public String select(String text) {
for (Selector selector : selectors) { for (Selector selector : selectors) {

View File

@ -5,8 +5,7 @@ import java.util.List;
/** /**
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* Date: 13-8-3 <br> * @since 0.2.0
* Time: 5:29 <br>
*/ */
public class OrSelector implements Selector { public class OrSelector implements Selector {
@ -18,11 +17,15 @@ public class OrSelector implements Selector {
} }
} }
public OrSelector(List<Selector> selectors) {
this.selectors = selectors;
}
@Override @Override
public String select(String text) { public String select(String text) {
for (Selector selector : selectors) { for (Selector selector : selectors) {
text = selector.select(text); text = selector.select(text);
if (text!=null){ if (text != null) {
return text; return text;
} }
} }

View File

@ -4,6 +4,7 @@ import org.apache.commons.lang3.StringUtils;
import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.model.annotation.*; import us.codecraft.webmagic.model.annotation.*;
import us.codecraft.webmagic.selector.*; import us.codecraft.webmagic.selector.*;
import us.codecraft.webmagic.utils.ExtractorUtils;
import java.lang.annotation.Annotation; import java.lang.annotation.Annotation;
import java.lang.reflect.Field; import java.lang.reflect.Field;
@ -49,20 +50,15 @@ class PageModelExtractor {
for (Field field : clazz.getDeclaredFields()) { for (Field field : clazz.getDeclaredFields()) {
field.setAccessible(true); field.setAccessible(true);
FieldExtractor fieldExtractor = getAnnotationExtractBy(clazz, field); FieldExtractor fieldExtractor = getAnnotationExtractBy(clazz, field);
FieldExtractor fieldExtractorTmp = getAnnotationExtractByRaw(clazz, field); FieldExtractor fieldExtractorTmp = getAnnotationExtractCombo(clazz, field);
if (fieldExtractor != null && fieldExtractorTmp != null) { if (fieldExtractor != null && fieldExtractorTmp != null) {
throw new IllegalStateException("Only one of 'ExtractBy ExtractByRaw ExtractByUrl' can be added to a field!"); throw new IllegalStateException("Only one of 'ExtractBy ComboExtract ExtractByUrl' can be added to a field!");
} else if (fieldExtractor == null && fieldExtractorTmp != null) { } else if (fieldExtractor == null && fieldExtractorTmp != null) {
fieldExtractor = fieldExtractorTmp; fieldExtractor = fieldExtractorTmp;
} }
// ExtractBy2 & ExtractBy3
if (fieldExtractor!=null){
addAnnotationExtractBy2(fieldExtractor);
addAnnotationExtractBy3(fieldExtractor);
}
fieldExtractorTmp = getAnnotationExtractByUrl(clazz, field); fieldExtractorTmp = getAnnotationExtractByUrl(clazz, field);
if (fieldExtractor != null && fieldExtractorTmp != null) { if (fieldExtractor != null && fieldExtractorTmp != null) {
throw new IllegalStateException("Only one of 'ExtractBy ExtractByRaw ExtractByUrl' can be added to a field!"); throw new IllegalStateException("Only one of 'ExtractBy ComboExtract ExtractByUrl' can be added to a field!");
} else if (fieldExtractor == null && fieldExtractorTmp != null) { } else if (fieldExtractor == null && fieldExtractorTmp != null) {
fieldExtractor = fieldExtractorTmp; fieldExtractor = fieldExtractorTmp;
} }
@ -94,26 +90,23 @@ class PageModelExtractor {
return fieldExtractor; return fieldExtractor;
} }
private FieldExtractor getAnnotationExtractBy(Class clazz, Field field) { private FieldExtractor getAnnotationExtractCombo(Class clazz, Field field) {
FieldExtractor fieldExtractor = null; FieldExtractor fieldExtractor = null;
ExtractBy extractBy = field.getAnnotation(ExtractBy.class); ComboExtract comboExtract = field.getAnnotation(ComboExtract.class);
if (extractBy != null) { if (comboExtract != null) {
String value = extractBy.value(); ExtractBy[] extractBies = comboExtract.value();
Selector selector; Selector selector;
switch (extractBy.type()) { switch (comboExtract.op()) {
case Css: case And:
selector = new CssSelector(value); selector = new AndSelector(ExtractorUtils.getSelectors(extractBies));
break; break;
case Regex: case Or:
selector = new RegexSelector(value); selector = new OrSelector(ExtractorUtils.getSelectors(extractBies));
break;
case XPath:
selector = new XpathSelector(value);
break; break;
default: default:
selector = new XpathSelector(value); selector = new AndSelector(ExtractorUtils.getSelectors(extractBies));
} }
fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.Html, extractBy.notNull(), extractBy.multi()); fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.Html, comboExtract.notNull(), comboExtract.multi());
Method setterMethod = getSetterMethod(clazz, field); Method setterMethod = getSetterMethod(clazz, field);
if (setterMethod != null) { if (setterMethod != null) {
fieldExtractor.setSetterMethod(setterMethod); fieldExtractor.setSetterMethod(setterMethod);
@ -122,70 +115,12 @@ class PageModelExtractor {
return fieldExtractor; return fieldExtractor;
} }
private void addAnnotationExtractBy2(FieldExtractor fieldExtractor) { private FieldExtractor getAnnotationExtractBy(Class clazz, Field field) {
ExtractBy2 extractBy = fieldExtractor.getField().getAnnotation(ExtractBy2.class);
if (extractBy != null) {
String value = extractBy.value();
Selector selector;
switch (extractBy.type()) {
case Css:
selector = new CssSelector(value);
break;
case Regex:
selector = new RegexSelector(value);
break;
case XPath:
selector = new XpathSelector(value);
break;
default:
selector = new XpathSelector(value);
}
fieldExtractor.setSelector(new AndSelector(fieldExtractor.getSelector(), selector));
}
}
private void addAnnotationExtractBy3(FieldExtractor fieldExtractor) {
ExtractBy3 extractBy = fieldExtractor.getField().getAnnotation(ExtractBy3.class);
if (extractBy != null) {
String value = extractBy.value();
Selector selector;
switch (extractBy.type()) {
case Css:
selector = new CssSelector(value);
break;
case Regex:
selector = new RegexSelector(value);
break;
case XPath:
selector = new XpathSelector(value);
break;
default:
selector = new XpathSelector(value);
}
fieldExtractor.setSelector(new AndSelector(fieldExtractor.getSelector(), selector));
}
}
private FieldExtractor getAnnotationExtractByRaw(Class clazz, Field field) {
FieldExtractor fieldExtractor = null; FieldExtractor fieldExtractor = null;
ExtractByRaw extractByRaw = field.getAnnotation(ExtractByRaw.class); ExtractBy extractBy = field.getAnnotation(ExtractBy.class);
if (extractByRaw != null) { if (extractBy != null) {
String value = extractByRaw.value(); Selector selector = ExtractorUtils.getSelector(extractBy);
Selector selector; fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.Html, extractBy.notNull(), extractBy.multi());
switch (extractByRaw.type()) {
case Css:
selector = new CssSelector(value);
break;
case Regex:
selector = new RegexSelector(value);
break;
case XPath:
selector = new XpathSelector(value);
break;
default:
selector = new XpathSelector(value);
}
fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.RawHtml, extractByRaw.notNull(), extractByRaw.multi());
Method setterMethod = getSetterMethod(clazz, field); Method setterMethod = getSetterMethod(clazz, field);
if (setterMethod != null) { if (setterMethod != null) {
fieldExtractor.setSetterMethod(setterMethod); fieldExtractor.setSetterMethod(setterMethod);

View File

@ -5,14 +5,75 @@ import java.lang.annotation.Retention;
import java.lang.annotation.Target; import java.lang.annotation.Target;
/** /**
* Combo 'ExtractBy' extractor with and/or operator.
*
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* Date: 13-8-16 <br> * @since 0.2.1
* Time: 11:09 <br>
*/ */
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME) @Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
@Target({ElementType.FIELD, ElementType.TYPE}) @Target({ElementType.FIELD, ElementType.TYPE})
public @interface ComboExtract { public @interface ComboExtract {
/**
* The extractors to be combined.
*
* @return the extractors to be combined
*/
ExtractBy[] value();
enum Op {
/**
* All extractors will be arranged as a pipeline. <br>
* The next extractor uses the result of the previous as source.
*/
And,
/**
* All extractors will do extracting separately, <br>
* and the results of extractors will be combined as the final result.
*/
Or;
}
/**
* Combining operation of extractors.<br>
*
* @return combining operation of extractors
*/
Op op() default Op.And;
/**
* Define whether the field can be null.<br>
* If set to 'true' and the extractor get no result, the entire class will be discarded. <br>
*
* @return whether the field can be null
*/
boolean notNull() default false;
public enum Source {
/**
* extract from the content extracted by class extractor
*/
SelectedHtml,
/**
* extract from the raw html
*/
RawHtml
}
/**
* The source for extracting. <br>
* It works only if you already added 'ExtractBy' to Class. <br>
*
* @return the source for extracting
*/
Source source() default Source.SelectedHtml;
/**
* Define whether the extractor return more than one result.
* When set to 'true', the extractor return a list of string (so you should define the field as List). <br>
*
* @return whether the extractor return more than one result
*/
boolean multi() default false;
} }

View File

@ -5,45 +5,63 @@ import java.lang.annotation.Retention;
import java.lang.annotation.Target; import java.lang.annotation.Target;
/** /**
* <br> * Define the extractor for field or class<br>
* *
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* Date: 13-8-1 <br> * @since 0.2.0
* Time: 8:40 <br>
*/ */
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME) @Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
@Target({ElementType.FIELD, ElementType.TYPE}) @Target({ElementType.FIELD, ElementType.TYPE})
public @interface ExtractBy { public @interface ExtractBy {
/** /**
* * Extractor expression, support XPath, CSS Selector and regex.
* *
* @return * @return extractor expression
*/ */
String value(); String value();
public enum Type {XPath, Regex, Css} public enum Type {XPath, Regex, Css}
/** /**
* XPathCss selectorXPath * Extractor type, support XPath, CSS Selector and regex.
* *
* @return * @return extractor type
*/ */
Type type() default Type.XPath; Type type() default Type.XPath;
/** /**
* notNulltruefalse * Define whether the field can be null.<br>
* If set to 'true' and the extractor get no result, the entire class will be discarded. <br>
* *
* @return * @return whether the field can be null
*/ */
boolean notNull() default false; boolean notNull() default false;
public enum Source {
/**
* extract from the content extracted by class extractor
*/
SelectedHtml,
/**
* extract from the raw html
*/
RawHtml
}
/** /**
* <br> * The source for extracting. <br>
* List<String><br> * It works only if you already added 'ExtractBy' to Class. <br>
* <br>
* *
* @return * @return the source for extracting
*/
Source source() default Source.SelectedHtml;
/**
* Define whether the extractor return more than one result.
* When set to 'true', the extractor return a list of string (so you should define the field as List). <br>
*
* @return whether the extractor return more than one result
*/ */
boolean multi() default false; boolean multi() default false;

View File

@ -1,24 +0,0 @@
package us.codecraft.webmagic.model.annotation;
import java.lang.annotation.ElementType;
import java.lang.annotation.Retention;
import java.lang.annotation.Target;
/**
* Second extractor that can be added to a field next to its primary extractor. <br>
* PageModelExtractor.addAnnotationExtractBy2 reads this annotation from the field and replaces the
* field's selector with an AndSelector built from the current selector and the selector defined here. <br>
* It is only looked up when the field already has a primary extractor. <br>
* NOTE(review): the original header comment was garbled by a lost encoding; this text is
* reconstructed from the usage in PageModelExtractor - confirm.
*
* @author code4crafter@gmail.com <br>
* Date: 13-8-1 <br>
* Time: 8:40 <br>
*/
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
@Target({ElementType.FIELD})
public @interface ExtractBy2 {
/**
* Extractor expression, interpreted according to {@link #type()}.
*
* @return extractor expression
*/
String value();
/**
* Expression types that can be selected with {@link #type()}.
*/
public enum Type {XPath, Regex, Css}
/**
* Extractor type, XPath when not specified.
*
* @return extractor type
*/
Type type() default Type.XPath;
}

View File

@ -1,23 +0,0 @@
package us.codecraft.webmagic.model.annotation;
import java.lang.annotation.ElementType;
import java.lang.annotation.Retention;
import java.lang.annotation.Target;
/**
* Third extractor that can be added to a field next to its primary extractor. <br>
* PageModelExtractor.addAnnotationExtractBy3 reads this annotation from the field and replaces the
* field's selector with an AndSelector built from the current selector and the selector defined here;
* it is applied after ExtractBy2. <br>
* NOTE(review): the original header comment was garbled by a lost encoding; this text is
* reconstructed from the usage in PageModelExtractor - confirm.
*
* @author code4crafter@gmail.com <br>
* Date: 13-8-1 <br>
* Time: 8:40 <br>
*/
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
@Target({ElementType.FIELD})
public @interface ExtractBy3 {
/**
* Extractor expression, interpreted according to {@link #type()}.
*
* @return extractor expression
*/
String value();
/**
* Expression types that can be selected with {@link #type()}.
*/
public enum Type { XPath, Regex, Css}
/**
* Extractor type, XPath when not specified.
*
* @return extractor type
*/
Type type() default Type.XPath;
}

View File

@ -1,49 +0,0 @@
package us.codecraft.webmagic.model.annotation;
import java.lang.annotation.ElementType;
import java.lang.annotation.Retention;
import java.lang.annotation.Target;
/**
* Extractor whose field extractor is created with FieldExtractor.Source.RawHtml by
* PageModelExtractor.getAnnotationExtractByRaw. <br>
* NOTE(review): the original header comment was garbled by a lost encoding; presumably it is meant
* for fields of a class that itself already carries ExtractBy, so that the field can still be
* extracted from the raw html - confirm.
*
* @author code4crafter@gmail.com <br>
* Date: 13-8-1 <br>
* Time: 8:40 <br>
*/
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
@Target({ElementType.FIELD, ElementType.TYPE})
public @interface ExtractByRaw {
/**
* Extractor expression, interpreted according to {@link #type()}.
*
* @return extractor expression
*/
String value();
/**
* Expression types that can be selected with {@link #type()}.
*/
public enum Type {XPath, Regex, Css}
/**
* Extractor type, XPath when not specified.
*
* @return extractor type
*/
Type type() default Type.XPath;
/**
* Define whether the field can be null.<br>
* If set to 'true' and the extractor get no result, the entire class will be discarded. <br>
*
* @return whether the field can be null
*/
boolean notNull() default false;
/**
* Define whether the extractor return more than one result.
* When set to 'true', the extractor return a list of string (so you should define the field as List). <br>
*
* @return whether the extractor return more than one result
*/
boolean multi() default false;
}

View File

@ -5,35 +5,35 @@ import java.lang.annotation.Retention;
import java.lang.annotation.Target; import java.lang.annotation.Target;
/** /**
* (url)<br> * Define a extractor for url. Only regex can be used. <br>
*
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* Date: 13-8-1 <br> * @since 0.2.0
* Time: 8:40 <br>
*/ */
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME) @Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
@Target({ElementType.FIELD}) @Target({ElementType.FIELD})
public @interface ExtractByUrl{ public @interface ExtractByUrl {
/** /**
* * Extractor expression, only regex can be used
* *
* @return * @return extractor expression
*/ */
String value() default ""; String value() default "";
/** /**
* notNulltruefalse * Define whether the field can be null.<br>
* If set to 'true' and the extractor get no result, the entire class will be discarded. <br>
* *
* @return * @return whether the field can be null
*/ */
boolean notNull() default false; boolean notNull() default false;
/** /**
* <br> * Define whether the extractor return more than one result.
* List<String><br> * When set to 'true', the extractor return a list of string (so you should define the field as List). <br>
* <br>
* *
* @return * @return whether the extractor return more than one result
*/ */
boolean multi() default false; boolean multi() default false;

View File

@ -5,26 +5,32 @@ import java.lang.annotation.Retention;
import java.lang.annotation.Target; import java.lang.annotation.Target;
/** /**
* url<br> * Define the 'help' url patterns for class. <br>
* All urls matching the pattern will be crawled but not extracted for new objects. <br>
*
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* Date: 13-8-1 <br> * @since 0.2.0
* Time: 8:40 <br>
*/ */
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME) @Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
@Target({ElementType.TYPE}) @Target({ElementType.TYPE})
public @interface HelpUrl { public @interface HelpUrl {
/** /**
* URL<br> * The url patterns to crawl. <br>
* webmagic"."".""\*"".\*""http://\*.oschina.net/\*"oschinaURL<br> * Use regex expression with some changes: <br>
* "." stand for literal character "." instead of "any character". <br>
* "*" stand for any legal character for url in 0-n length ([^"'#]*) instead of "any length". <br>
* *
* @return * @return the url patterns for class
*/ */
String[] value(); String[] value();
/** /**
* URL(XPath) * Define the region for url extracting. <br>
* @return URL * Only support XPath.<br>
* When sourceRegion is set, the urls will be extracted only from the region instead of entire content. <br>
*
* @return the region for url extracting
*/ */
String sourceRegion() default ""; String sourceRegion() default "";
} }

View File

@ -5,27 +5,32 @@ import java.lang.annotation.Retention;
import java.lang.annotation.Target; import java.lang.annotation.Target;
/** /**
* sourceRegionxpath<br> * Define the url patterns for class. <br>
* All urls matching the pattern will be crawled and extracted for new objects. <br>
* *
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* Date: 13-8-1 <br> * @since 0.2.0
* Time: 8:40 <br>
*/ */
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME) @Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
@Target({ElementType.TYPE}) @Target({ElementType.TYPE})
public @interface TargetUrl { public @interface TargetUrl {
/** /**
* URL<br> * The url patterns for class.<br>
* webmagic"."".""\*"".\*""http://\*.oschina.net/\*"oschinaURL<br> * Use regex expression with some changes: <br>
* "." stand for literal character "." instead of "any character". <br>
* "*" stand for any legal character for url in 0-n length ([^"'#]*) instead of "any length". <br>
* *
* @return * @return the url patterns for class
*/ */
String[] value(); String[] value();
/** /**
* URL(XPath) * Define the region for url extracting. <br>
* @return URL * Only support XPath.<br>
* When sourceRegion is set, the urls will be extracted only from the region instead of entire content. <br>
*
* @return the region for url extracting
*/ */
String sourceRegion() default ""; String sourceRegion() default "";

View File

@ -1,5 +1,5 @@
<html> <html>
<body> <body>
webmagic注解抓取方式所定义的注解。 Annotations for defining a class.
</body> </body>
</html> </html>

View File

@ -0,0 +1,48 @@
package us.codecraft.webmagic.utils;
import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.selector.CssSelector;
import us.codecraft.webmagic.selector.RegexSelector;
import us.codecraft.webmagic.selector.Selector;
import us.codecraft.webmagic.selector.XpathSelector;
import java.util.ArrayList;
import java.util.List;
/**
 * Helpers that convert {@link ExtractBy} annotations into {@link Selector} instances. <br>
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.2.1
 */
public class ExtractorUtils {

    /**
     * Creates the selector described by one annotation.
     *
     * @param extractBy annotation providing the expression and its type
     * @return a CssSelector for Css, a RegexSelector for Regex, an XpathSelector otherwise
     */
    public static Selector getSelector(ExtractBy extractBy) {
        String expression = extractBy.value();
        switch (extractBy.type()) {
            case Css:
                return new CssSelector(expression);
            case Regex:
                return new RegexSelector(expression);
            default:
                // XPath, like any type not listed above, ends up as an XPath selector.
                return new XpathSelector(expression);
        }
    }

    /**
     * Creates one selector per annotation, keeping the order of the array.
     *
     * @param extractBies annotations to convert, may be null
     * @return the selectors in array order; an empty list when the array is null
     */
    public static List<Selector> getSelectors(ExtractBy[] extractBies) {
        List<Selector> converted = new ArrayList<Selector>();
        if (extractBies != null) {
            for (int i = 0; i < extractBies.length; i++) {
                converted.add(getSelector(extractBies[i]));
            }
        }
        return converted;
    }
}