From 629f8ac2d11925016142bbd25af6eef573f30c82 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Mon, 5 Aug 2013 20:45:34 +0800 Subject: [PATCH] add extractors chain --- .../codecraft/webmagic/model/ExtractBy2.java | 23 ++++ .../codecraft/webmagic/model/ExtractBy3.java | 23 ++++ .../codecraft/webmagic/model/Extractor.java | 10 +- .../webmagic/model/PageModelExtractor.java | 126 +++++++++++++----- 4 files changed, 148 insertions(+), 34 deletions(-) create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/model/ExtractBy2.java create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/model/ExtractBy3.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/model/ExtractBy2.java b/webmagic-core/src/main/java/us/codecraft/webmagic/model/ExtractBy2.java new file mode 100644 index 0000000..55d5dfa --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/model/ExtractBy2.java @@ -0,0 +1,23 @@ +package us.codecraft.webmagic.model; + +import java.lang.annotation.ElementType; +import java.lang.annotation.Retention; +import java.lang.annotation.Target; + +/** + * 定义类或者字段的抽取规则。
+ * @author code4crafter@gmail.com
+ * @date: 13-8-1
+ * Time: 8:40 PM
+ */ +@Retention(java.lang.annotation.RetentionPolicy.RUNTIME) +@Target({ElementType.FIELD}) +public @interface ExtractBy2 { + + String value(); + + public enum Type {XPath2, XPath, Regex, Css} + + Type type() default Type.XPath2; + +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/model/ExtractBy3.java b/webmagic-core/src/main/java/us/codecraft/webmagic/model/ExtractBy3.java new file mode 100644 index 0000000..10f6a9f --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/model/ExtractBy3.java @@ -0,0 +1,23 @@ +package us.codecraft.webmagic.model; + +import java.lang.annotation.ElementType; +import java.lang.annotation.Retention; +import java.lang.annotation.Target; + +/** + * 定义类或者字段的抽取规则。
+ * @author code4crafter@gmail.com
+ * @date: 13-8-1
+ * Time: 8:40 PM
+ */ +@Retention(java.lang.annotation.RetentionPolicy.RUNTIME) +@Target({ElementType.FIELD}) +public @interface ExtractBy3 { + + String value(); + + public enum Type {XPath2, XPath, Regex, Css} + + Type type() default Type.XPath2; + +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/model/Extractor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/model/Extractor.java index 498aba9..82c7dbb 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/model/Extractor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/model/Extractor.java @@ -9,7 +9,7 @@ import us.codecraft.webmagic.selector.Selector; */ class Extractor { - protected final Selector selector; + protected Selector selector; protected final Source source; @@ -37,4 +37,12 @@ class Extractor { boolean isNotNull() { return notNull; } + + boolean isMulti() { + return multi; + } + + void setSelector(Selector selector) { + this.selector = selector; + } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java index 9694c4e..b2c2bb0 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java @@ -14,6 +14,7 @@ import java.util.regex.Pattern; /** * Model主要逻辑类。将一个带注解的POJO转换为一个PageModelExtractor。
+ * * @author code4crafter@gmail.com
* @date: 13-8-1
* Time: 下午9:33
@@ -46,41 +47,54 @@ class PageModelExtractor { fieldExtractors = new ArrayList(); for (Field field : clazz.getDeclaredFields()) { field.setAccessible(true); - getAnnotationExtractBy(clazz, field); - getAnnotationExtractByRaw(clazz,field); - getAnnotationExtractByUrl(clazz, field); + FieldExtractor fieldExtractor = getAnnotationExtractBy(clazz, field); + FieldExtractor fieldExtractorTmp = getAnnotationExtractByRaw(clazz, field); + if (fieldExtractor != null && fieldExtractorTmp != null) { + throw new IllegalStateException("Only one of 'ExtractBy ExtractByRaw ExtractByUrl' can be added to a field!"); + } else if (fieldExtractor == null && fieldExtractorTmp != null) { + fieldExtractor = fieldExtractorTmp; + } + // ExtractBy2 & ExtractBy3 + addAnnotationExtractBy2(clazz, fieldExtractor); + addAnnotationExtractBy3(clazz, fieldExtractor); + fieldExtractorTmp = getAnnotationExtractByUrl(clazz, field); + if (fieldExtractor != null && fieldExtractorTmp != null) { + throw new IllegalStateException("Only one of 'ExtractBy ExtractByRaw ExtractByUrl' can be added to a field!"); + } else if (fieldExtractor == null && fieldExtractorTmp != null) { + fieldExtractor = fieldExtractorTmp; + } + if (fieldExtractor != null) { + if (!fieldExtractor.isMulti() && !String.class.isAssignableFrom(field.getType())) { + throw new IllegalStateException("Field " + field.getName() + " must be string"); + } else if (fieldExtractor.isMulti() && !List.class.isAssignableFrom(field.getType())) { + throw new IllegalStateException("Field " + field.getName() + " must be list"); + } + } + } } - private void getAnnotationExtractByUrl(Class clazz, Field field) { + private FieldExtractor getAnnotationExtractByUrl(Class clazz, Field field) { + FieldExtractor fieldExtractor = null; ExtractByUrl extractByUrl = field.getAnnotation(ExtractByUrl.class); if (extractByUrl != null) { - if (!extractByUrl.multi() && !String.class.isAssignableFrom(field.getType())) { - throw new IllegalStateException("Field " + 
field.getName() + " must be string"); - } else if (extractByUrl.multi() && !List.class.isAssignableFrom(field.getType())) { - throw new IllegalStateException("Field " + field.getName() + " must be list"); - } String regexPattern = extractByUrl.value(); if (regexPattern.trim().equals("")) { regexPattern = ".*"; } - FieldExtractor fieldExtractor = new FieldExtractor(field, new RegexSelector(regexPattern), FieldExtractor.Source.Url, extractByUrl.notNull(), extractByUrl.multi()); + fieldExtractor = new FieldExtractor(field, new RegexSelector(regexPattern), FieldExtractor.Source.Url, extractByUrl.notNull(), extractByUrl.multi()); Method setterMethod = getSetterMethod(clazz, field); if (setterMethod != null) { fieldExtractor.setSetterMethod(setterMethod); } - fieldExtractors.add(fieldExtractor); } + return fieldExtractor; } - private void getAnnotationExtractBy(Class clazz, Field field) { + private FieldExtractor getAnnotationExtractBy(Class clazz, Field field) { + FieldExtractor fieldExtractor = null; ExtractBy extractBy = field.getAnnotation(ExtractBy.class); if (extractBy != null) { - if (!extractBy.multi() && !String.class.isAssignableFrom(field.getType())) { - throw new IllegalStateException("Field " + field.getName() + " must be string"); - } else if (extractBy.multi() && !List.class.isAssignableFrom(field.getType())) { - throw new IllegalStateException("Field " + field.getName() + " must be list"); - } String value = extractBy.value(); Selector selector; switch (extractBy.type()) { @@ -99,23 +113,69 @@ class PageModelExtractor { default: selector = new Xpath2Selector(value); } - FieldExtractor fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.Html, extractBy.notNull(), extractBy.multi()); + fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.Html, extractBy.notNull(), extractBy.multi()); Method setterMethod = getSetterMethod(clazz, field); if (setterMethod != null) { fieldExtractor.setSetterMethod(setterMethod); } - 
fieldExtractors.add(fieldExtractor); + } + return fieldExtractor; + } + + private void addAnnotationExtractBy2(Class clazz, FieldExtractor fieldExtractor) { + ExtractBy2 extractBy = fieldExtractor.getField().getAnnotation(ExtractBy2.class); + if (extractBy != null) { + String value = extractBy.value(); + Selector selector; + switch (extractBy.type()) { + case Css: + selector = new CssSelector(value); + break; + case Regex: + selector = new RegexSelector(value); + break; + case XPath: + selector = new XpathSelector(value); + break; + case XPath2: + selector = new Xpath2Selector(value); + break; + default: + selector = new Xpath2Selector(value); + } + fieldExtractor.setSelector(new AndSelector(fieldExtractor.getSelector(), selector)); } } - private void getAnnotationExtractByRaw(Class clazz, Field field) { + private void addAnnotationExtractBy3(Class clazz, FieldExtractor fieldExtractor) { + ExtractBy3 extractBy = fieldExtractor.getField().getAnnotation(ExtractBy3.class); + if (extractBy != null) { + String value = extractBy.value(); + Selector selector; + switch (extractBy.type()) { + case Css: + selector = new CssSelector(value); + break; + case Regex: + selector = new RegexSelector(value); + break; + case XPath: + selector = new XpathSelector(value); + break; + case XPath2: + selector = new Xpath2Selector(value); + break; + default: + selector = new Xpath2Selector(value); + } + fieldExtractor.setSelector(new AndSelector(fieldExtractor.getSelector(), selector)); + } + } + + private FieldExtractor getAnnotationExtractByRaw(Class clazz, Field field) { + FieldExtractor fieldExtractor = null; ExtractByRaw extractByRaw = field.getAnnotation(ExtractByRaw.class); if (extractByRaw != null) { - if (!extractByRaw.multi() && !String.class.isAssignableFrom(field.getType())) { - throw new IllegalStateException("Field " + field.getName() + " must be string"); - } else if (extractByRaw.multi() && !List.class.isAssignableFrom(field.getType())) { - throw new 
IllegalStateException("Field " + field.getName() + " must be list"); - } String value = extractByRaw.value(); Selector selector; switch (extractByRaw.type()) { @@ -134,13 +194,13 @@ class PageModelExtractor { default: selector = new Xpath2Selector(value); } - FieldExtractor fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.RawHtml, extractByRaw.notNull(), extractByRaw.multi()); + fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.RawHtml, extractByRaw.notNull(), extractByRaw.multi()); Method setterMethod = getSetterMethod(clazz, field); if (setterMethod != null) { fieldExtractor.setSetterMethod(setterMethod); } - fieldExtractors.add(fieldExtractor); } + return fieldExtractor; } public static Method getSetterMethod(Class clazz, Field field) { @@ -197,19 +257,19 @@ class PageModelExtractor { return null; } if (extractor == null) { - return processSingle(page,page.getHtml().toString()); + return processSingle(page, page.getHtml().toString()); } else { - if (extractor.multi){ + if (extractor.multi) { List os = new ArrayList(); List list = extractor.getSelector().selectList(page.getHtml().toString()); for (String s : list) { Object o = processSingle(page, s); - if (o!=null){ + if (o != null) { os.add(o); } } return os; - }else { + } else { String select = extractor.getSelector().select(page.getHtml().toString()); Object o = processSingle(page, select); return o; @@ -217,12 +277,12 @@ class PageModelExtractor { } } - private Object processSingle(Page page,String html) { + private Object processSingle(Page page, String html) { Object o = null; try { o = clazz.newInstance(); for (FieldExtractor fieldExtractor : fieldExtractors) { - if (fieldExtractor.multi) { + if (fieldExtractor.isMulti()) { List value; switch (fieldExtractor.getSource()) { case RawHtml: