Add ComboExtract annotation to replace ExtractBy2, ExtractBy3...

2013-08-17 17:23:11 +08:00 · 2013-08-17 17:23:11 +08:00 · 3ba7a76f44
parent f946fcdfea
commit 3ba7a76f44
13 changed files with 213 additions and 230 deletions
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AndSelector.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AndSelector.java
@ -5,8 +5,7 @@ import java.util.List;
 /**
 * @author code4crafter@gmail.com <br>
- * Date: 13-8-3 <br>
+ * @since 0.2.0
 * Time: 下午5:29 <br>
 */
 public class AndSelector implements Selector {
@ -18,6 +17,10 @@ public class AndSelector implements Selector {
        }
    }
    /**
     * Creates a selector backed by an existing list of selectors. <br>
     * The list reference is stored as-is; no defensive copy is made.
     *
     * @param selectors the selectors that {@link #select(String)} iterates over
     */
    public AndSelector(List<Selector> selectors) {
        this.selectors = selectors;
    }
    @Override
    public String select(String text) {
        for (Selector selector : selectors) {
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/OrSelector.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/OrSelector.java
@ -5,8 +5,7 @@ import java.util.List;
 /**
 * @author code4crafter@gmail.com <br>
- * Date: 13-8-3 <br>
+ * @since 0.2.0
 * Time: 下午5:29 <br>
 */
 public class OrSelector implements Selector {
@ -18,11 +17,15 @@ public class OrSelector implements Selector {
        }
    }
    /**
     * Creates a selector backed by an existing list of selectors. <br>
     * The list reference is stored as-is; no defensive copy is made.
     *
     * @param selectors the selectors that {@link #select(String)} iterates over
     */
    public OrSelector(List<Selector> selectors) {
        this.selectors = selectors;
    }
    @Override
    public String select(String text) {
        for (Selector selector : selectors) {
            text = selector.select(text);
-            if (text!=null){
+            if (text != null) {
                return text;
            }
        }
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java
@ -4,6 +4,7 @@ import org.apache.commons.lang3.StringUtils;
 import us.codecraft.webmagic.Page;
 import us.codecraft.webmagic.model.annotation.*;
 import us.codecraft.webmagic.selector.*;
 import us.codecraft.webmagic.utils.ExtractorUtils;
 import java.lang.annotation.Annotation;
 import java.lang.reflect.Field;
@ -49,20 +50,15 @@ class PageModelExtractor {
        for (Field field : clazz.getDeclaredFields()) {
            field.setAccessible(true);
            FieldExtractor fieldExtractor = getAnnotationExtractBy(clazz, field);
-            FieldExtractor fieldExtractorTmp = getAnnotationExtractByRaw(clazz, field);
+            FieldExtractor fieldExtractorTmp = getAnnotationExtractCombo(clazz, field);
            if (fieldExtractor != null && fieldExtractorTmp != null) {
-                throw new IllegalStateException("Only one of 'ExtractBy ExtractByRaw ExtractByUrl' can be added to a field!");
+                throw new IllegalStateException("Only one of 'ExtractBy ComboExtract ExtractByUrl' can be added to a field!");
            } else if (fieldExtractor == null && fieldExtractorTmp != null) {
                fieldExtractor = fieldExtractorTmp;
            }
            // ExtractBy2 & ExtractBy3
            if (fieldExtractor!=null){
                addAnnotationExtractBy2(fieldExtractor);
                addAnnotationExtractBy3(fieldExtractor);
            }
            fieldExtractorTmp = getAnnotationExtractByUrl(clazz, field);
            if (fieldExtractor != null && fieldExtractorTmp != null) {
-                throw new IllegalStateException("Only one of 'ExtractBy ExtractByRaw ExtractByUrl' can be added to a field!");
+                throw new IllegalStateException("Only one of 'ExtractBy ComboExtract ExtractByUrl' can be added to a field!");
            } else if (fieldExtractor == null && fieldExtractorTmp != null) {
                fieldExtractor = fieldExtractorTmp;
            }
@ -94,26 +90,23 @@ class PageModelExtractor {
        return fieldExtractor;
    }
-    private FieldExtractor getAnnotationExtractBy(Class clazz, Field field) {
+    private FieldExtractor getAnnotationExtractCombo(Class clazz, Field field) {
        FieldExtractor fieldExtractor = null;
-        ExtractBy extractBy = field.getAnnotation(ExtractBy.class);
+        ComboExtract comboExtract = field.getAnnotation(ComboExtract.class);
-        if (extractBy != null) {
+        if (comboExtract != null) {
-            String value = extractBy.value();
+            ExtractBy[] extractBies = comboExtract.value();
            Selector selector;
-            switch (extractBy.type()) {
+            switch (comboExtract.op()) {
-                case Css:
+                case And:
-                    selector = new CssSelector(value);
+                    selector = new AndSelector(ExtractorUtils.getSelectors(extractBies));
                    break;
-                case Regex:
+                case Or:
-                    selector = new RegexSelector(value);
+                    selector = new OrSelector(ExtractorUtils.getSelectors(extractBies));
                    break;
                case XPath:
                    selector = new XpathSelector(value);
                    break;
                default:
-                    selector = new XpathSelector(value);
+                    selector = new AndSelector(ExtractorUtils.getSelectors(extractBies));
            }
-            fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.Html, extractBy.notNull(), extractBy.multi());
+            fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.Html, comboExtract.notNull(), comboExtract.multi());
            Method setterMethod = getSetterMethod(clazz, field);
            if (setterMethod != null) {
                fieldExtractor.setSetterMethod(setterMethod);
@ -122,70 +115,12 @@ class PageModelExtractor {
        return fieldExtractor;
    }
-    private void addAnnotationExtractBy2(FieldExtractor fieldExtractor) {
+    private FieldExtractor getAnnotationExtractBy(Class clazz, Field field) {
        ExtractBy2 extractBy = fieldExtractor.getField().getAnnotation(ExtractBy2.class);
        if (extractBy != null) {
            String value = extractBy.value();
            Selector selector;
            switch (extractBy.type()) {
                case Css:
                    selector = new CssSelector(value);
                    break;
                case Regex:
                    selector = new RegexSelector(value);
                    break;
                case XPath:
                    selector = new XpathSelector(value);
                    break;
                default:
                    selector = new XpathSelector(value);
            }
            fieldExtractor.setSelector(new AndSelector(fieldExtractor.getSelector(), selector));
        }
    }
    private void addAnnotationExtractBy3(FieldExtractor fieldExtractor) {
        ExtractBy3 extractBy = fieldExtractor.getField().getAnnotation(ExtractBy3.class);
        if (extractBy != null) {
            String value = extractBy.value();
            Selector selector;
            switch (extractBy.type()) {
                case Css:
                    selector = new CssSelector(value);
                    break;
                case Regex:
                    selector = new RegexSelector(value);
                    break;
                case XPath:
                    selector = new XpathSelector(value);
                    break;
                default:
                    selector = new XpathSelector(value);
            }
            fieldExtractor.setSelector(new AndSelector(fieldExtractor.getSelector(), selector));
        }
    }
    private FieldExtractor getAnnotationExtractByRaw(Class clazz, Field field) {
        FieldExtractor fieldExtractor = null;
-        ExtractByRaw extractByRaw = field.getAnnotation(ExtractByRaw.class);
+        ExtractBy extractBy = field.getAnnotation(ExtractBy.class);
-        if (extractByRaw != null) {
+        if (extractBy != null) {
-            String value = extractByRaw.value();
+            Selector selector = ExtractorUtils.getSelector(extractBy);
-            Selector selector;
+            fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.Html, extractBy.notNull(), extractBy.multi());
            switch (extractByRaw.type()) {
                case Css:
                    selector = new CssSelector(value);
                    break;
                case Regex:
                    selector = new RegexSelector(value);
                    break;
                case XPath:
                    selector = new XpathSelector(value);
                    break;
                default:
                    selector = new XpathSelector(value);
            }
            fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.RawHtml, extractByRaw.notNull(), extractByRaw.multi());
            Method setterMethod = getSetterMethod(clazz, field);
            if (setterMethod != null) {
                fieldExtractor.setSetterMethod(setterMethod);
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ComboExtract.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ComboExtract.java
@ -5,14 +5,75 @@ import java.lang.annotation.Retention;
 import java.lang.annotation.Target;
 /**
 * Combo 'ExtractBy' extractor with and/or operator.
 *
 * @author code4crafter@gmail.com <br>
- *         Date: 13-8-16 <br>
+ * @since 0.2.1
 *         Time: 下午11:09 <br>
 */
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
@Target({ElementType.FIELD, ElementType.TYPE})
 public @interface ComboExtract {
    /**
     * The extractors to be combined.
     *
     * @return the extractors to be combined
     */
    ExtractBy[] value();
    /**
     * Operators that decide how the extractors in {@link #value()} are combined.
     */
    enum Op {
        /**
         * All extractors will be arranged as a pipeline. <br>
         * Each extractor uses the result of the previous one as its source.
         */
        And,
        /**
         * All extractors will do their extracting separately, <br>
         * and the results of the extractors will be combined as the final result. <br>
         * NOTE(review): OrSelector.select(String) returns the first non-null result only;
         * confirm how results are combined for multi-value extraction.
         */
        Or;
    }
    /**
     * Combining operation of extractors.<br>
     * Defaults to {@link Op#And}.
     *
     * @return combining operation of extractors
     */
    Op op() default Op.And;
    /**
     * Define whether the field must not be null.<br>
     * If set to 'true' and the extractor gets no result, the entire class will be discarded. <br>
     *
     * @return whether the field must not be null
     */
    boolean notNull() default false;
    /**
     * The content an extractor reads from.
     */
    public enum Source {
        /**
         * extract from the content extracted by class extractor
         */
        SelectedHtml,
        /**
         * extract from the raw html
         */
        RawHtml
    }
    /**
     * The source for extracting. <br>
     * It works only if you have already added 'ExtractBy' to the class. <br>
     * NOTE(review): confirm that PageModelExtractor actually reads this value when it
     * builds the FieldExtractor for a ComboExtract field.
     *
     * @return the source for extracting
     */
    Source source() default Source.SelectedHtml;
    /**
     * Define whether the extractor returns more than one result.
     * When set to 'true', the extractor returns a list of strings (so you should define the field as List). <br>
     *
     * @return whether the extractor returns more than one result
     */
    boolean multi() default false;
 }
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy.java
@ -5,45 +5,63 @@ import java.lang.annotation.Retention;
 import java.lang.annotation.Target;
 /**
- * 定义类或者字段的抽取规则。<br>
+ * Define the extractor for a field or class.<br>
 *
 * @author code4crafter@gmail.com <br>
- * Date: 13-8-1 <br>
+ * @since 0.2.0
 * Time: 下午8:40 <br>
 */
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
@Target({ElementType.FIELD, ElementType.TYPE})
 public @interface ExtractBy {
    /**
-     * 抽取规则
+     * Extractor expression, support XPath, CSS Selector and regex.
     *
-     * @return 抽取规则
+     * @return extractor expression
     */
    String value();
    public enum Type {XPath, Regex, Css}
    /**
-     * 抽取规则类型，支持XPath、Css selector、正则表达式，默认是XPath
+     * Extractor type, support XPath, CSS Selector and regex.
     *
-     * @return 抽取规则类型
+     * @return extractor type
     */
    Type type() default Type.XPath;
    /**
-     * 是否是不能为空的关键字段，若notNull为true，则对应字段抽取不到时，丢弃整个类，默认为false
+     * Define whether the field must not be null.<br>
     * If set to 'true' and the extractor get no result, the entire class will be discarded. <br>
     *
-     * @return 是否是不能为空的关键字段
+     * @return whether the field must not be null
     */
    boolean notNull() default false;
    public enum Source {
        /**
         * extract from the content extracted by class extractor
         */
        SelectedHtml,
        /**
         * extract from the raw html
         */
        RawHtml
    }
    /**
-     * 是否抽取多个结果<br>
+     * The source for extracting. <br>
-     * 用于字段时，需要List<String>来盛放结果<br>
+     * It works only if you already added 'ExtractBy' to Class. <br>
     * 用于类时，表示单页抽取多个对象<br>
     *
-     * @return 是否抽取多个结果
+     * @return the source for extracting
     */
    Source source() default Source.SelectedHtml;
    /**
     * Define whether the extractor return more than one result.
     * When set to 'true', the extractor return a list of string (so you should define the field as List). <br>
     *
     * @return whether the extractor return more than one result
     */
    boolean multi() default false;
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy2.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy2.java
@ -1,24 +0,0 @@
 package us.codecraft.webmagic.model.annotation;
 import java.lang.annotation.ElementType;
 import java.lang.annotation.Retention;
 import java.lang.annotation.Target;
 /**
 * Defines an extraction rule for a class or field; it can only be used after Extract / ExtractByRaw.<br>
 * (The annotation itself is declared with a FIELD-only target.)
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-8-1 <br>
 * Time: 8:40 PM <br>
 */
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
@Target({ElementType.FIELD})
 public @interface ExtractBy2 {
    /** Extraction expression, interpreted according to {@link #type()}. */
    String value();
    /** Supported kinds of extraction expression. */
    public enum Type {XPath, Regex, Css}
    /** Kind of the expression in {@link #value()}; defaults to XPath. */
    Type type() default Type.XPath;
 }
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy3.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy3.java
@ -1,23 +0,0 @@
 package us.codecraft.webmagic.model.annotation;
 import java.lang.annotation.ElementType;
 import java.lang.annotation.Retention;
 import java.lang.annotation.Target;
 /**
 * Defines an extraction rule for a class or field; it can only be used after Extract / ExtractByRaw.<br>
 * (The annotation itself is declared with a FIELD-only target.)
 * @author code4crafter@gmail.com <br>
 * Date: 13-8-1 <br>
 * Time: 8:40 PM <br>
 */
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
@Target({ElementType.FIELD})
 public @interface ExtractBy3 {
    /** Extraction expression, interpreted according to {@link #type()}. */
    String value();
    /** Supported kinds of extraction expression. */
    public enum Type { XPath, Regex, Css}
    /** Kind of the expression in {@link #value()}; defaults to XPath. */
    Type type() default Type.XPath;
 }
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByRaw.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByRaw.java
@ -1,49 +0,0 @@
 package us.codecraft.webmagic.model.annotation;
 import java.lang.annotation.ElementType;
 import java.lang.annotation.Retention;
 import java.lang.annotation.Target;
 /**
 * For classes that already use ExtractBy at the class level, this annotation can be put on a field
 * to extract from the whole content instead.<br>
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-8-1 <br>
 * Time: 8:40 PM <br>
 */
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
@Target({ElementType.FIELD, ElementType.TYPE})
 public @interface ExtractByRaw {
    /**
     * Extraction rule.
     *
     * @return the extraction rule
     */
    String value();
    /** Supported kinds of extraction rule. */
    public enum Type {XPath, Regex, Css}
    /**
     * Type of the extraction rule; supports XPath, CSS selector and regular expression. Defaults to XPath.
     *
     * @return the type of the extraction rule
     */
    Type type() default Type.XPath;
    /**
     * Whether this is a key field that must not be null. If notNull is true and nothing can be
     * extracted for the field, the whole class is discarded. Defaults to false.
     *
     * @return whether this is a key field that must not be null
     */
    boolean notNull() default false;
    /**
     * Whether to extract multiple results.<br>
     * A List<String> is needed to hold the results.<br>
     *
     * @return whether to extract multiple results
     */
    boolean multi() default false;
 }
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByUrl.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByUrl.java
@ -5,35 +5,35 @@ import java.lang.annotation.Retention;
 import java.lang.annotation.Target;
 /**
- * 定义类或者字段的抽取规则(从url中抽取，只支持正则表达式)。<br>
+ * Define an extractor for the url. Only regex can be used. <br>
 *
 * @author code4crafter@gmail.com <br>
- * Date: 13-8-1 <br>
+ * @since 0.2.0
 * Time: 下午8:40 <br>
 */
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
@Target({ElementType.FIELD})
-public @interface ExtractByUrl{
+public @interface ExtractByUrl {
    /**
-     * 抽取规则，支持正则表达式
+     * Extractor expression, only regex can be used
     *
-     * @return 抽取规则
+     * @return extractor expression
     */
    String value() default "";
    /**
-     * 是否是不能为空的关键字段，若notNull为true，则对应字段抽取不到时，丢弃整个类，默认为false
+     * Define whether the field must not be null.<br>
     * If set to 'true' and the extractor get no result, the entire class will be discarded. <br>
     *
-     * @return 是否是不能为空的关键字段
+     * @return whether the field must not be null
     */
    boolean notNull() default false;
    /**
-     * 是否抽取多个结果<br>
+     * Define whether the extractor return more than one result.
-     * 用于字段时，需要List<String>来盛放结果<br>
+     * When set to 'true', the extractor return a list of string (so you should define the field as List). <br>
     * 用于类时，表示单页抽取多个对象<br>
     *
-     * @return 是否抽取多个结果
+     * @return whether the extractor return more than one result
     */
    boolean multi() default false;
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/HelpUrl.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/HelpUrl.java
@ -5,26 +5,32 @@ import java.lang.annotation.Retention;
 import java.lang.annotation.Target;
 /**
- * 定义辅助爬取的url。<br>
+ * Define the 'help' url patterns for class. <br>
 * All urls matching the pattern will be crawled but not extracted for new objects. <br>
 *
 * @author code4crafter@gmail.com <br>
- * Date: 13-8-1 <br>
+ * @since 0.2.0
 * Time: 下午8:40 <br>
 */
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
@Target({ElementType.TYPE})
 public @interface HelpUrl {
    /**
-     * 某个类对应的URL规则列表<br>
+     * The url patterns to crawl. <br>
-     * webmagic对正则表达式进行了修改，"."仅表示字符"."而不代表任意字符，而"\*"则代表了".\*"，例如"http://\*.oschina.net/\*"代表了oschina所有的二级域名下的URL。<br>
+     * Use regex expression with some changes: <br>
     *      "." stands for the literal character "." instead of "any character". <br>
     *      "*" stands for any legal url character repeated 0-n times ([^"'#]*) instead of "any length". <br>
     *
-     * @return 抽取规则
+     * @return the url patterns for class
     */
    String[] value();
    /**
-     * 指定提取URL的区域(仅支持XPath)
+     * Define the region for url extracting. <br>
-     * @return 指定提取URL的区域
+     * Only support XPath.<br>
     * When sourceRegion is set, the urls will be extracted only from the region instead of entire content. <br>
     *
     * @return the region for url extracting
     */
    String sourceRegion() default "";
 }
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/TargetUrl.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/TargetUrl.java
@ -5,27 +5,32 @@ import java.lang.annotation.Retention;
 import java.lang.annotation.Target;
 /**
- * 定义某个类抽取的范围和来源，sourceRegion可以用xpath语法限定抽取区域。<br>
+ * Define the url patterns for class. <br>
 * All urls matching the pattern will be crawled and extracted for new objects. <br>
 *
 * @author code4crafter@gmail.com <br>
- * Date: 13-8-1 <br>
+ * @since 0.2.0
 * Time: 下午8:40 <br>
 */
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
@Target({ElementType.TYPE})
 public @interface TargetUrl {
    /**
-     * 某个类对应的URL规则列表<br>
+     * The url patterns for class.<br>
-     * webmagic对正则表达式进行了修改，"."仅表示字符"."而不代表任意字符，而"\*"则代表了".\*"，例如"http://\*.oschina.net/\*"代表了oschina所有的二级域名下的URL。<br>
+     * Use regex expression with some changes: <br>
     *      "." stands for the literal character "." instead of "any character". <br>
     *      "*" stands for any legal url character repeated 0-n times ([^"'#]*) instead of "any length". <br>
     *
-     * @return 抽取规则
+     * @return the url patterns for class
     */
    String[] value();
    /**
-     * 指定提取URL的区域(仅支持XPath)
+     * Define the region for url extracting. <br>
-     * @return 指定提取URL的区域
+     * Only support XPath.<br>
     * When sourceRegion is set, the urls will be extracted only from the region instead of entire content. <br>
     *
     * @return the region for url extracting
     */
    String sourceRegion() default "";
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/package.html
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/package.html
@ -1,5 +1,5 @@
 <html>
 	<body>
-webmagic注解抓取方式所定义的注解。
+Annotations for defining a model class.
 	</body>
 </html>
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/ExtractorUtils.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/ExtractorUtils.java
@ -0,0 +1,48 @@
 package us.codecraft.webmagic.utils;
 import us.codecraft.webmagic.model.annotation.ExtractBy;
 import us.codecraft.webmagic.selector.CssSelector;
 import us.codecraft.webmagic.selector.RegexSelector;
 import us.codecraft.webmagic.selector.Selector;
 import us.codecraft.webmagic.selector.XpathSelector;
 import java.util.ArrayList;
 import java.util.List;
 /**
 * Helpers that convert {@link ExtractBy} annotations into {@link Selector} instances. <br>
 * @author code4crafter@gmail.com <br>
 * @since 0.2.1
 */
 public class ExtractorUtils {
    /**
     * Builds the selector described by a single {@link ExtractBy} annotation.
     *
     * @param extractBy annotation providing the expression ({@code value()}) and its kind ({@code type()})
     * @return a CssSelector for Css, a RegexSelector for Regex, and an XpathSelector otherwise
     */
    public static Selector getSelector(ExtractBy extractBy) {
        final String expression = extractBy.value();
        switch (extractBy.type()) {
            case Css:
                return new CssSelector(expression);
            case Regex:
                return new RegexSelector(expression);
            case XPath:
            default:
                // XPath is both the explicit choice and the fallback.
                return new XpathSelector(expression);
        }
    }
    /**
     * Builds one selector per annotation, keeping the order of the array.
     *
     * @param extractBies annotations to convert; may be null
     * @return a new mutable list of selectors, empty when {@code extractBies} is null or empty
     */
    public static List<Selector> getSelectors(ExtractBy[] extractBies) {
        final List<Selector> converted = new ArrayList<Selector>();
        if (extractBies != null) {
            for (int i = 0; i < extractBies.length; i++) {
                converted.add(getSelector(extractBies[i]));
            }
        }
        return converted;
    }
 }