diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/model/ExtractBy2.java b/webmagic-core/src/main/java/us/codecraft/webmagic/model/ExtractBy2.java
new file mode 100644
index 0000000..55d5dfa
--- /dev/null
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/model/ExtractBy2.java
@@ -0,0 +1,23 @@
+package us.codecraft.webmagic.model;
+
+import java.lang.annotation.ElementType;
+import java.lang.annotation.Retention;
+import java.lang.annotation.Target;
+
+/**
+ * 定义类或者字段的抽取规则。
+ * @author code4crafter@gmail.com
+ * @date: 13-8-1
+ * Time: 下午8:40
+ */
+@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
+@Target({ElementType.FIELD})
+public @interface ExtractBy2 {
+
+ String value();
+
+ public enum Type {XPath2, XPath, Regex, Css}
+
+ Type type() default Type.XPath2;
+
+}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/model/ExtractBy3.java b/webmagic-core/src/main/java/us/codecraft/webmagic/model/ExtractBy3.java
new file mode 100644
index 0000000..10f6a9f
--- /dev/null
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/model/ExtractBy3.java
@@ -0,0 +1,23 @@
+package us.codecraft.webmagic.model;
+
+import java.lang.annotation.ElementType;
+import java.lang.annotation.Retention;
+import java.lang.annotation.Target;
+
+/**
+ * 定义类或者字段的抽取规则。
+ * @author code4crafter@gmail.com
+ * @date: 13-8-1
+ * Time: 下午8:40
+ */
+@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
+@Target({ElementType.FIELD})
+public @interface ExtractBy3 {
+
+ String value();
+
+ public enum Type {XPath2, XPath, Regex, Css}
+
+ Type type() default Type.XPath2;
+
+}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/model/Extractor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/model/Extractor.java
index 498aba9..82c7dbb 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/model/Extractor.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/model/Extractor.java
@@ -9,7 +9,7 @@ import us.codecraft.webmagic.selector.Selector;
*/
class Extractor {
- protected final Selector selector;
+ protected Selector selector;
protected final Source source;
@@ -37,4 +37,12 @@ class Extractor {
boolean isNotNull() {
return notNull;
}
+
+ boolean isMulti() {
+ return multi;
+ }
+
+ void setSelector(Selector selector) {
+ this.selector = selector;
+ }
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java
index 9694c4e..b2c2bb0 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java
@@ -14,6 +14,7 @@ import java.util.regex.Pattern;
/**
* Model主要逻辑类。将一个带注解的POJO转换为一个PageModelExtractor。
+ *
* @author code4crafter@gmail.com
* @date: 13-8-1
* Time: 下午9:33
@@ -46,41 +47,54 @@ class PageModelExtractor {
fieldExtractors = new ArrayList();
for (Field field : clazz.getDeclaredFields()) {
field.setAccessible(true);
- getAnnotationExtractBy(clazz, field);
- getAnnotationExtractByRaw(clazz,field);
- getAnnotationExtractByUrl(clazz, field);
+ FieldExtractor fieldExtractor = getAnnotationExtractBy(clazz, field);
+ FieldExtractor fieldExtractorTmp = getAnnotationExtractByRaw(clazz, field);
+ if (fieldExtractor != null && fieldExtractorTmp != null) {
+ throw new IllegalStateException("Only one of 'ExtractBy ExtractByRaw ExtractByUrl' can be added to a field!");
+ } else if (fieldExtractor == null && fieldExtractorTmp != null) {
+ fieldExtractor = fieldExtractorTmp;
+ }
+ // ExtractBy2 & ExtractBy3
+ addAnnotationExtractBy2(clazz, fieldExtractor);
+ addAnnotationExtractBy3(clazz, fieldExtractor);
+ fieldExtractorTmp = getAnnotationExtractByUrl(clazz, field);
+ if (fieldExtractor != null && fieldExtractorTmp != null) {
+ throw new IllegalStateException("Only one of 'ExtractBy ExtractByRaw ExtractByUrl' can be added to a field!");
+ } else if (fieldExtractor == null && fieldExtractorTmp != null) {
+ fieldExtractor = fieldExtractorTmp;
+ }
+ if (fieldExtractor != null) {
+ if (!fieldExtractor.isMulti() && !String.class.isAssignableFrom(field.getType())) {
+ throw new IllegalStateException("Field " + field.getName() + " must be string");
+ } else if (fieldExtractor.isMulti() && !List.class.isAssignableFrom(field.getType())) {
+ throw new IllegalStateException("Field " + field.getName() + " must be list");
+ }
+ }
+
}
}
- private void getAnnotationExtractByUrl(Class clazz, Field field) {
+ private FieldExtractor getAnnotationExtractByUrl(Class clazz, Field field) {
+ FieldExtractor fieldExtractor = null;
ExtractByUrl extractByUrl = field.getAnnotation(ExtractByUrl.class);
if (extractByUrl != null) {
- if (!extractByUrl.multi() && !String.class.isAssignableFrom(field.getType())) {
- throw new IllegalStateException("Field " + field.getName() + " must be string");
- } else if (extractByUrl.multi() && !List.class.isAssignableFrom(field.getType())) {
- throw new IllegalStateException("Field " + field.getName() + " must be list");
- }
String regexPattern = extractByUrl.value();
if (regexPattern.trim().equals("")) {
regexPattern = ".*";
}
- FieldExtractor fieldExtractor = new FieldExtractor(field, new RegexSelector(regexPattern), FieldExtractor.Source.Url, extractByUrl.notNull(), extractByUrl.multi());
+ fieldExtractor = new FieldExtractor(field, new RegexSelector(regexPattern), FieldExtractor.Source.Url, extractByUrl.notNull(), extractByUrl.multi());
Method setterMethod = getSetterMethod(clazz, field);
if (setterMethod != null) {
fieldExtractor.setSetterMethod(setterMethod);
}
- fieldExtractors.add(fieldExtractor);
}
+ return fieldExtractor;
}
- private void getAnnotationExtractBy(Class clazz, Field field) {
+ private FieldExtractor getAnnotationExtractBy(Class clazz, Field field) {
+ FieldExtractor fieldExtractor = null;
ExtractBy extractBy = field.getAnnotation(ExtractBy.class);
if (extractBy != null) {
- if (!extractBy.multi() && !String.class.isAssignableFrom(field.getType())) {
- throw new IllegalStateException("Field " + field.getName() + " must be string");
- } else if (extractBy.multi() && !List.class.isAssignableFrom(field.getType())) {
- throw new IllegalStateException("Field " + field.getName() + " must be list");
- }
String value = extractBy.value();
Selector selector;
switch (extractBy.type()) {
@@ -99,23 +113,69 @@ class PageModelExtractor {
default:
selector = new Xpath2Selector(value);
}
- FieldExtractor fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.Html, extractBy.notNull(), extractBy.multi());
+ fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.Html, extractBy.notNull(), extractBy.multi());
Method setterMethod = getSetterMethod(clazz, field);
if (setterMethod != null) {
fieldExtractor.setSetterMethod(setterMethod);
}
- fieldExtractors.add(fieldExtractor);
+ }
+ return fieldExtractor;
+ }
+
+ private void addAnnotationExtractBy2(Class clazz, FieldExtractor fieldExtractor) {
+ ExtractBy2 extractBy = fieldExtractor.getField().getAnnotation(ExtractBy2.class);
+ if (extractBy != null) {
+ String value = extractBy.value();
+ Selector selector;
+ switch (extractBy.type()) {
+ case Css:
+ selector = new CssSelector(value);
+ break;
+ case Regex:
+ selector = new RegexSelector(value);
+ break;
+ case XPath:
+ selector = new XpathSelector(value);
+ break;
+ case XPath2:
+ selector = new Xpath2Selector(value);
+ break;
+ default:
+ selector = new Xpath2Selector(value);
+ }
+ fieldExtractor.setSelector(new AndSelector(fieldExtractor.getSelector(), selector));
}
}
- private void getAnnotationExtractByRaw(Class clazz, Field field) {
+ private void addAnnotationExtractBy3(Class clazz, FieldExtractor fieldExtractor) {
+ ExtractBy3 extractBy = fieldExtractor.getField().getAnnotation(ExtractBy3.class);
+ if (extractBy != null) {
+ String value = extractBy.value();
+ Selector selector;
+ switch (extractBy.type()) {
+ case Css:
+ selector = new CssSelector(value);
+ break;
+ case Regex:
+ selector = new RegexSelector(value);
+ break;
+ case XPath:
+ selector = new XpathSelector(value);
+ break;
+ case XPath2:
+ selector = new Xpath2Selector(value);
+ break;
+ default:
+ selector = new Xpath2Selector(value);
+ }
+ fieldExtractor.setSelector(new AndSelector(fieldExtractor.getSelector(), selector));
+ }
+ }
+
+ private FieldExtractor getAnnotationExtractByRaw(Class clazz, Field field) {
+ FieldExtractor fieldExtractor = null;
ExtractByRaw extractByRaw = field.getAnnotation(ExtractByRaw.class);
if (extractByRaw != null) {
- if (!extractByRaw.multi() && !String.class.isAssignableFrom(field.getType())) {
- throw new IllegalStateException("Field " + field.getName() + " must be string");
- } else if (extractByRaw.multi() && !List.class.isAssignableFrom(field.getType())) {
- throw new IllegalStateException("Field " + field.getName() + " must be list");
- }
String value = extractByRaw.value();
Selector selector;
switch (extractByRaw.type()) {
@@ -134,13 +194,13 @@ class PageModelExtractor {
default:
selector = new Xpath2Selector(value);
}
- FieldExtractor fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.RawHtml, extractByRaw.notNull(), extractByRaw.multi());
+ fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.RawHtml, extractByRaw.notNull(), extractByRaw.multi());
Method setterMethod = getSetterMethod(clazz, field);
if (setterMethod != null) {
fieldExtractor.setSetterMethod(setterMethod);
}
- fieldExtractors.add(fieldExtractor);
}
+ return fieldExtractor;
}
public static Method getSetterMethod(Class clazz, Field field) {
@@ -197,19 +257,19 @@ class PageModelExtractor {
return null;
}
if (extractor == null) {
- return processSingle(page,page.getHtml().toString());
+ return processSingle(page, page.getHtml().toString());
} else {
- if (extractor.multi){
+ if (extractor.multi) {
List