add combo extract to replace Extract2 Extract3...
parent
f946fcdfea
commit
3ba7a76f44
|
@ -5,8 +5,7 @@ import java.util.List;
|
|||
|
||||
/**
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-8-3 <br>
|
||||
* Time: 下午5:29 <br>
|
||||
* @since 0.2.0
|
||||
*/
|
||||
public class AndSelector implements Selector {
|
||||
|
||||
|
@ -18,6 +17,10 @@ public class AndSelector implements Selector {
|
|||
}
|
||||
}
|
||||
|
||||
public AndSelector(List<Selector> selectors) {
|
||||
this.selectors = selectors;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String select(String text) {
|
||||
for (Selector selector : selectors) {
|
||||
|
|
|
@ -5,8 +5,7 @@ import java.util.List;
|
|||
|
||||
/**
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-8-3 <br>
|
||||
* Time: 下午5:29 <br>
|
||||
* @since 0.2.0
|
||||
*/
|
||||
public class OrSelector implements Selector {
|
||||
|
||||
|
@ -18,11 +17,15 @@ public class OrSelector implements Selector {
|
|||
}
|
||||
}
|
||||
|
||||
public OrSelector(List<Selector> selectors) {
|
||||
this.selectors = selectors;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String select(String text) {
|
||||
for (Selector selector : selectors) {
|
||||
text = selector.select(text);
|
||||
if (text!=null){
|
||||
if (text != null) {
|
||||
return text;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -4,6 +4,7 @@ import org.apache.commons.lang3.StringUtils;
|
|||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.model.annotation.*;
|
||||
import us.codecraft.webmagic.selector.*;
|
||||
import us.codecraft.webmagic.utils.ExtractorUtils;
|
||||
|
||||
import java.lang.annotation.Annotation;
|
||||
import java.lang.reflect.Field;
|
||||
|
@ -49,20 +50,15 @@ class PageModelExtractor {
|
|||
for (Field field : clazz.getDeclaredFields()) {
|
||||
field.setAccessible(true);
|
||||
FieldExtractor fieldExtractor = getAnnotationExtractBy(clazz, field);
|
||||
FieldExtractor fieldExtractorTmp = getAnnotationExtractByRaw(clazz, field);
|
||||
FieldExtractor fieldExtractorTmp = getAnnotationExtractCombo(clazz, field);
|
||||
if (fieldExtractor != null && fieldExtractorTmp != null) {
|
||||
throw new IllegalStateException("Only one of 'ExtractBy ExtractByRaw ExtractByUrl' can be added to a field!");
|
||||
throw new IllegalStateException("Only one of 'ExtractBy ComboExtract ExtractByUrl' can be added to a field!");
|
||||
} else if (fieldExtractor == null && fieldExtractorTmp != null) {
|
||||
fieldExtractor = fieldExtractorTmp;
|
||||
}
|
||||
// ExtractBy2 & ExtractBy3
|
||||
if (fieldExtractor!=null){
|
||||
addAnnotationExtractBy2(fieldExtractor);
|
||||
addAnnotationExtractBy3(fieldExtractor);
|
||||
}
|
||||
fieldExtractorTmp = getAnnotationExtractByUrl(clazz, field);
|
||||
if (fieldExtractor != null && fieldExtractorTmp != null) {
|
||||
throw new IllegalStateException("Only one of 'ExtractBy ExtractByRaw ExtractByUrl' can be added to a field!");
|
||||
throw new IllegalStateException("Only one of 'ExtractBy ComboExtract ExtractByUrl' can be added to a field!");
|
||||
} else if (fieldExtractor == null && fieldExtractorTmp != null) {
|
||||
fieldExtractor = fieldExtractorTmp;
|
||||
}
|
||||
|
@ -94,26 +90,23 @@ class PageModelExtractor {
|
|||
return fieldExtractor;
|
||||
}
|
||||
|
||||
private FieldExtractor getAnnotationExtractBy(Class clazz, Field field) {
|
||||
private FieldExtractor getAnnotationExtractCombo(Class clazz, Field field) {
|
||||
FieldExtractor fieldExtractor = null;
|
||||
ExtractBy extractBy = field.getAnnotation(ExtractBy.class);
|
||||
if (extractBy != null) {
|
||||
String value = extractBy.value();
|
||||
ComboExtract comboExtract = field.getAnnotation(ComboExtract.class);
|
||||
if (comboExtract != null) {
|
||||
ExtractBy[] extractBies = comboExtract.value();
|
||||
Selector selector;
|
||||
switch (extractBy.type()) {
|
||||
case Css:
|
||||
selector = new CssSelector(value);
|
||||
switch (comboExtract.op()) {
|
||||
case And:
|
||||
selector = new AndSelector(ExtractorUtils.getSelectors(extractBies));
|
||||
break;
|
||||
case Regex:
|
||||
selector = new RegexSelector(value);
|
||||
break;
|
||||
case XPath:
|
||||
selector = new XpathSelector(value);
|
||||
case Or:
|
||||
selector = new OrSelector(ExtractorUtils.getSelectors(extractBies));
|
||||
break;
|
||||
default:
|
||||
selector = new XpathSelector(value);
|
||||
selector = new AndSelector(ExtractorUtils.getSelectors(extractBies));
|
||||
}
|
||||
fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.Html, extractBy.notNull(), extractBy.multi());
|
||||
fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.Html, comboExtract.notNull(), comboExtract.multi());
|
||||
Method setterMethod = getSetterMethod(clazz, field);
|
||||
if (setterMethod != null) {
|
||||
fieldExtractor.setSetterMethod(setterMethod);
|
||||
|
@ -122,70 +115,12 @@ class PageModelExtractor {
|
|||
return fieldExtractor;
|
||||
}
|
||||
|
||||
private void addAnnotationExtractBy2(FieldExtractor fieldExtractor) {
|
||||
ExtractBy2 extractBy = fieldExtractor.getField().getAnnotation(ExtractBy2.class);
|
||||
if (extractBy != null) {
|
||||
String value = extractBy.value();
|
||||
Selector selector;
|
||||
switch (extractBy.type()) {
|
||||
case Css:
|
||||
selector = new CssSelector(value);
|
||||
break;
|
||||
case Regex:
|
||||
selector = new RegexSelector(value);
|
||||
break;
|
||||
case XPath:
|
||||
selector = new XpathSelector(value);
|
||||
break;
|
||||
default:
|
||||
selector = new XpathSelector(value);
|
||||
}
|
||||
fieldExtractor.setSelector(new AndSelector(fieldExtractor.getSelector(), selector));
|
||||
}
|
||||
}
|
||||
|
||||
private void addAnnotationExtractBy3(FieldExtractor fieldExtractor) {
|
||||
ExtractBy3 extractBy = fieldExtractor.getField().getAnnotation(ExtractBy3.class);
|
||||
if (extractBy != null) {
|
||||
String value = extractBy.value();
|
||||
Selector selector;
|
||||
switch (extractBy.type()) {
|
||||
case Css:
|
||||
selector = new CssSelector(value);
|
||||
break;
|
||||
case Regex:
|
||||
selector = new RegexSelector(value);
|
||||
break;
|
||||
case XPath:
|
||||
selector = new XpathSelector(value);
|
||||
break;
|
||||
default:
|
||||
selector = new XpathSelector(value);
|
||||
}
|
||||
fieldExtractor.setSelector(new AndSelector(fieldExtractor.getSelector(), selector));
|
||||
}
|
||||
}
|
||||
|
||||
private FieldExtractor getAnnotationExtractByRaw(Class clazz, Field field) {
|
||||
private FieldExtractor getAnnotationExtractBy(Class clazz, Field field) {
|
||||
FieldExtractor fieldExtractor = null;
|
||||
ExtractByRaw extractByRaw = field.getAnnotation(ExtractByRaw.class);
|
||||
if (extractByRaw != null) {
|
||||
String value = extractByRaw.value();
|
||||
Selector selector;
|
||||
switch (extractByRaw.type()) {
|
||||
case Css:
|
||||
selector = new CssSelector(value);
|
||||
break;
|
||||
case Regex:
|
||||
selector = new RegexSelector(value);
|
||||
break;
|
||||
case XPath:
|
||||
selector = new XpathSelector(value);
|
||||
break;
|
||||
default:
|
||||
selector = new XpathSelector(value);
|
||||
}
|
||||
fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.RawHtml, extractByRaw.notNull(), extractByRaw.multi());
|
||||
ExtractBy extractBy = field.getAnnotation(ExtractBy.class);
|
||||
if (extractBy != null) {
|
||||
Selector selector = ExtractorUtils.getSelector(extractBy);
|
||||
fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.Html, extractBy.notNull(), extractBy.multi());
|
||||
Method setterMethod = getSetterMethod(clazz, field);
|
||||
if (setterMethod != null) {
|
||||
fieldExtractor.setSetterMethod(setterMethod);
|
||||
|
|
|
@ -5,14 +5,75 @@ import java.lang.annotation.Retention;
|
|||
import java.lang.annotation.Target;
|
||||
|
||||
/**
|
||||
* Combo 'ExtractBy' extractor with and/or operator.
|
||||
*
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-8-16 <br>
|
||||
* Time: 下午11:09 <br>
|
||||
* @since 0.2.1
|
||||
*/
|
||||
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
|
||||
@Target({ElementType.FIELD, ElementType.TYPE})
|
||||
public @interface ComboExtract {
|
||||
|
||||
/**
|
||||
* The extractors to be combined.
|
||||
*
|
||||
* @return the extractors to be combined
|
||||
*/
|
||||
ExtractBy[] value();
|
||||
|
||||
enum Op {
|
||||
/**
|
||||
* All extractors will be arranged as a pipeline. <br>
|
||||
* The next extractor uses the result of the previous as source.
|
||||
*/
|
||||
And,
|
||||
/**
|
||||
* All extractors will do extracting separately, <br>
|
||||
* and the results of the extractors will be combined as the final result.
|
||||
*/
|
||||
Or;
|
||||
}
|
||||
|
||||
/**
|
||||
* Combining operation of extractors.<br>
|
||||
*
|
||||
* @return combining operation of extractors
|
||||
*/
|
||||
Op op() default Op.And;
|
||||
|
||||
/**
|
||||
* Define whether the field must not be null.<br>
|
||||
* If set to 'true' and the extractor gets no result, the entire class will be discarded. <br>
|
||||
*
|
||||
* @return whether the field must not be null
|
||||
*/
|
||||
boolean notNull() default false;
|
||||
|
||||
public enum Source {
|
||||
/**
|
||||
* extract from the content extracted by class extractor
|
||||
*/
|
||||
SelectedHtml,
|
||||
/**
|
||||
* extract from the raw html
|
||||
*/
|
||||
RawHtml
|
||||
}
|
||||
|
||||
/**
|
||||
* The source for extracting. <br>
|
||||
* It works only if you already added 'ExtractBy' to Class. <br>
|
||||
*
|
||||
* @return the source for extracting
|
||||
*/
|
||||
Source source() default Source.SelectedHtml;
|
||||
|
||||
/**
|
||||
* Define whether the extractor returns more than one result.
|
||||
* When set to 'true', the extractor returns a list of strings (so you should define the field as List). <br>
|
||||
*
|
||||
* @return whether the extractor returns more than one result
|
||||
*/
|
||||
boolean multi() default false;
|
||||
|
||||
}
|
||||
|
|
|
@ -5,45 +5,63 @@ import java.lang.annotation.Retention;
|
|||
import java.lang.annotation.Target;
|
||||
|
||||
/**
|
||||
* 定义类或者字段的抽取规则。<br>
|
||||
* Define the extractor for a field or class. <br>
|
||||
*
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-8-1 <br>
|
||||
* Time: 下午8:40 <br>
|
||||
* @since 0.2.0
|
||||
*/
|
||||
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
|
||||
@Target({ElementType.FIELD, ElementType.TYPE})
|
||||
public @interface ExtractBy {
|
||||
|
||||
/**
|
||||
* 抽取规则
|
||||
* Extractor expression, support XPath, CSS Selector and regex.
|
||||
*
|
||||
* @return 抽取规则
|
||||
* @return extractor expression
|
||||
*/
|
||||
String value();
|
||||
|
||||
public enum Type {XPath, Regex, Css}
|
||||
|
||||
/**
|
||||
* 抽取规则类型,支持XPath、Css selector、正则表达式,默认是XPath
|
||||
* Extractor type, support XPath, CSS Selector and regex.
|
||||
*
|
||||
* @return 抽取规则类型
|
||||
* @return extractor type
|
||||
*/
|
||||
Type type() default Type.XPath;
|
||||
|
||||
/**
|
||||
* 是否是不能为空的关键字段,若notNull为true,则对应字段抽取不到时,丢弃整个类,默认为false
|
||||
* Define whether the field must not be null.<br>
|
||||
* If set to 'true' and the extractor gets no result, the entire class will be discarded. <br>
|
||||
*
|
||||
* @return 是否是不能为空的关键字段
|
||||
* @return whether the field must not be null
|
||||
*/
|
||||
boolean notNull() default false;
|
||||
|
||||
public enum Source {
|
||||
/**
|
||||
* extract from the content extracted by class extractor
|
||||
*/
|
||||
SelectedHtml,
|
||||
/**
|
||||
* extract from the raw html
|
||||
*/
|
||||
RawHtml
|
||||
}
|
||||
|
||||
/**
|
||||
* 是否抽取多个结果<br>
|
||||
* 用于字段时,需要List<String>来盛放结果<br>
|
||||
* 用于类时,表示单页抽取多个对象<br>
|
||||
* The source for extracting. <br>
|
||||
* It works only if you already added 'ExtractBy' to Class. <br>
|
||||
*
|
||||
* @return 是否抽取多个结果
|
||||
* @return the source for extracting
|
||||
*/
|
||||
Source source() default Source.SelectedHtml;
|
||||
|
||||
/**
|
||||
* Define whether the extractor returns more than one result.
|
||||
* When set to 'true', the extractor returns a list of strings (so you should define the field as List). <br>
|
||||
*
|
||||
* @return whether the extractor returns more than one result
|
||||
*/
|
||||
boolean multi() default false;
|
||||
|
||||
|
|
|
@ -1,24 +0,0 @@
|
|||
package us.codecraft.webmagic.model.annotation;
|
||||
|
||||
import java.lang.annotation.ElementType;
|
||||
import java.lang.annotation.Retention;
|
||||
import java.lang.annotation.Target;
|
||||
|
||||
/**
|
||||
* 定义类或者字段的抽取规则,只能在Extract、ExtractByRaw之后使用。<br>
|
||||
*
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-8-1 <br>
|
||||
* Time: 下午8:40 <br>
|
||||
*/
|
||||
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
|
||||
@Target({ElementType.FIELD})
|
||||
public @interface ExtractBy2 {
|
||||
|
||||
String value();
|
||||
|
||||
public enum Type {XPath, Regex, Css}
|
||||
|
||||
Type type() default Type.XPath;
|
||||
|
||||
}
|
|
@ -1,23 +0,0 @@
|
|||
package us.codecraft.webmagic.model.annotation;
|
||||
|
||||
import java.lang.annotation.ElementType;
|
||||
import java.lang.annotation.Retention;
|
||||
import java.lang.annotation.Target;
|
||||
|
||||
/**
|
||||
* 定义类或者字段的抽取规则,只能在Extract、ExtractByRaw之后使用。<br>
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-8-1 <br>
|
||||
* Time: 下午8:40 <br>
|
||||
*/
|
||||
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
|
||||
@Target({ElementType.FIELD})
|
||||
public @interface ExtractBy3 {
|
||||
|
||||
String value();
|
||||
|
||||
public enum Type { XPath, Regex, Css}
|
||||
|
||||
Type type() default Type.XPath;
|
||||
|
||||
}
|
|
@ -1,49 +0,0 @@
|
|||
package us.codecraft.webmagic.model.annotation;
|
||||
|
||||
import java.lang.annotation.ElementType;
|
||||
import java.lang.annotation.Retention;
|
||||
import java.lang.annotation.Target;
|
||||
|
||||
/**
|
||||
* 对于在Class级别就使用过ExtractBy的类,在字段中想抽取全部内容可使用此方法。<br>
|
||||
*
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-8-1 <br>
|
||||
* Time: 下午8:40 <br>
|
||||
*/
|
||||
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
|
||||
@Target({ElementType.FIELD, ElementType.TYPE})
|
||||
public @interface ExtractByRaw {
|
||||
|
||||
/**
|
||||
* 抽取规则
|
||||
*
|
||||
* @return 抽取规则
|
||||
*/
|
||||
String value();
|
||||
|
||||
public enum Type {XPath, Regex, Css}
|
||||
|
||||
/**
|
||||
* 抽取规则类型,支持XPath、Css selector、正则表达式,默认是XPath
|
||||
*
|
||||
* @return 抽取规则类型
|
||||
*/
|
||||
Type type() default Type.XPath;
|
||||
|
||||
/**
|
||||
* 是否是不能为空的关键字段,若notNull为true,则对应字段抽取不到时,丢弃整个类,默认为false
|
||||
*
|
||||
* @return 是否是不能为空的关键字段
|
||||
*/
|
||||
boolean notNull() default false;
|
||||
|
||||
/**
|
||||
* 是否抽取多个结果<br>
|
||||
* 需要List<String>来盛放结果<br>
|
||||
*
|
||||
* @return 是否抽取多个结果
|
||||
*/
|
||||
boolean multi() default false;
|
||||
|
||||
}
|
|
@ -5,35 +5,35 @@ import java.lang.annotation.Retention;
|
|||
import java.lang.annotation.Target;
|
||||
|
||||
/**
|
||||
* 定义类或者字段的抽取规则(从url中抽取,只支持正则表达式)。<br>
|
||||
* Define an extractor for the url. Only regex can be used. <br>
|
||||
*
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-8-1 <br>
|
||||
* Time: 下午8:40 <br>
|
||||
* @since 0.2.0
|
||||
*/
|
||||
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
|
||||
@Target({ElementType.FIELD})
|
||||
public @interface ExtractByUrl{
|
||||
public @interface ExtractByUrl {
|
||||
|
||||
/**
|
||||
* 抽取规则,支持正则表达式
|
||||
* Extractor expression, only regex can be used
|
||||
*
|
||||
* @return 抽取规则
|
||||
* @return extractor expression
|
||||
*/
|
||||
String value() default "";
|
||||
|
||||
/**
|
||||
* 是否是不能为空的关键字段,若notNull为true,则对应字段抽取不到时,丢弃整个类,默认为false
|
||||
* Define whether the field must not be null.<br>
|
||||
* If set to 'true' and the extractor gets no result, the entire class will be discarded. <br>
|
||||
*
|
||||
* @return 是否是不能为空的关键字段
|
||||
* @return whether the field must not be null
|
||||
*/
|
||||
boolean notNull() default false;
|
||||
|
||||
/**
|
||||
* 是否抽取多个结果<br>
|
||||
* 用于字段时,需要List<String>来盛放结果<br>
|
||||
* 用于类时,表示单页抽取多个对象<br>
|
||||
* Define whether the extractor returns more than one result.
|
||||
* When set to 'true', the extractor returns a list of strings (so you should define the field as List). <br>
|
||||
*
|
||||
* @return 是否抽取多个结果
|
||||
* @return whether the extractor returns more than one result
|
||||
*/
|
||||
boolean multi() default false;
|
||||
|
||||
|
|
|
@ -5,26 +5,32 @@ import java.lang.annotation.Retention;
|
|||
import java.lang.annotation.Target;
|
||||
|
||||
/**
|
||||
* 定义辅助爬取的url。<br>
|
||||
* Define the 'help' url patterns for class. <br>
|
||||
* All urls matching the pattern will be crawled but not extracted for new objects. <br>
|
||||
*
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-8-1 <br>
|
||||
* Time: 下午8:40 <br>
|
||||
* @since 0.2.0
|
||||
*/
|
||||
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
|
||||
@Target({ElementType.TYPE})
|
||||
public @interface HelpUrl {
|
||||
|
||||
/**
|
||||
* 某个类对应的URL规则列表<br>
|
||||
* webmagic对正则表达式进行了修改,"."仅表示字符"."而不代表任意字符,而"\*"则代表了".\*",例如"http://\*.oschina.net/\*"代表了oschina所有的二级域名下的URL。<br>
|
||||
* The url patterns to crawl. <br>
|
||||
* Use regex expression with some changes: <br>
|
||||
* "." stand for literal character "." instead of "any character". <br>
|
||||
* "*" stand for any legal character for url in 0-n length ([^"'#]*) instead of "any length". <br>
|
||||
*
|
||||
* @return 抽取规则
|
||||
* @return the url patterns for class
|
||||
*/
|
||||
String[] value();
|
||||
|
||||
/**
|
||||
* 指定提取URL的区域(仅支持XPath)
|
||||
* @return 指定提取URL的区域
|
||||
* Define the region for url extracting. <br>
|
||||
* Only support XPath.<br>
|
||||
* When sourceRegion is set, the urls will be extracted only from the region instead of entire content. <br>
|
||||
*
|
||||
* @return the region for url extracting
|
||||
*/
|
||||
String sourceRegion() default "";
|
||||
}
|
||||
|
|
|
@ -5,27 +5,32 @@ import java.lang.annotation.Retention;
|
|||
import java.lang.annotation.Target;
|
||||
|
||||
/**
|
||||
* 定义某个类抽取的范围和来源,sourceRegion可以用xpath语法限定抽取区域。<br>
|
||||
* Define the url patterns for class. <br>
|
||||
* All urls matching the pattern will be crawled and extracted for new objects. <br>
|
||||
*
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-8-1 <br>
|
||||
* Time: 下午8:40 <br>
|
||||
* @since 0.2.0
|
||||
*/
|
||||
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
|
||||
@Target({ElementType.TYPE})
|
||||
public @interface TargetUrl {
|
||||
|
||||
/**
|
||||
* 某个类对应的URL规则列表<br>
|
||||
* webmagic对正则表达式进行了修改,"."仅表示字符"."而不代表任意字符,而"\*"则代表了".\*",例如"http://\*.oschina.net/\*"代表了oschina所有的二级域名下的URL。<br>
|
||||
* The url patterns for class.<br>
|
||||
* Use regex expression with some changes: <br>
|
||||
* "." stand for literal character "." instead of "any character". <br>
|
||||
* "*" stand for any legal character for url in 0-n length ([^"'#]*) instead of "any length". <br>
|
||||
*
|
||||
* @return 抽取规则
|
||||
* @return the url patterns for class
|
||||
*/
|
||||
String[] value();
|
||||
|
||||
/**
|
||||
* 指定提取URL的区域(仅支持XPath)
|
||||
* @return 指定提取URL的区域
|
||||
* Define the region for url extracting. <br>
|
||||
* Only support XPath.<br>
|
||||
* When sourceRegion is set, the urls will be extracted only from the region instead of entire content. <br>
|
||||
*
|
||||
* @return the region for url extracting
|
||||
*/
|
||||
String sourceRegion() default "";
|
||||
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
<html>
|
||||
<body>
|
||||
webmagic注解抓取方式所定义的注解。
|
||||
Annotations used by webmagic's annotation-based crawling mode.
|
||||
</body>
|
||||
</html>
|
||||
|
|
|
@ -0,0 +1,48 @@
|
|||
package us.codecraft.webmagic.utils;
|
||||
|
||||
import us.codecraft.webmagic.model.annotation.ExtractBy;
|
||||
import us.codecraft.webmagic.selector.CssSelector;
|
||||
import us.codecraft.webmagic.selector.RegexSelector;
|
||||
import us.codecraft.webmagic.selector.Selector;
|
||||
import us.codecraft.webmagic.selector.XpathSelector;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Tools for annotation converting. <br>
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* @since 0.2.1
|
||||
*/
|
||||
public class ExtractorUtils {
|
||||
|
||||
public static Selector getSelector(ExtractBy extractBy) {
|
||||
String value = extractBy.value();
|
||||
Selector selector;
|
||||
switch (extractBy.type()) {
|
||||
case Css:
|
||||
selector = new CssSelector(value);
|
||||
break;
|
||||
case Regex:
|
||||
selector = new RegexSelector(value);
|
||||
break;
|
||||
case XPath:
|
||||
selector = new XpathSelector(value);
|
||||
break;
|
||||
default:
|
||||
selector = new XpathSelector(value);
|
||||
}
|
||||
return selector;
|
||||
}
|
||||
|
||||
public static List<Selector> getSelectors(ExtractBy[] extractBies) {
|
||||
List<Selector> selectors = new ArrayList<Selector>();
|
||||
if (extractBies==null){
|
||||
return selectors;
|
||||
}
|
||||
for (ExtractBy extractBy : extractBies) {
|
||||
selectors.add(getSelector(extractBy));
|
||||
}
|
||||
return selectors;
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue