diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/model/ExtractByRaw.java b/webmagic-core/src/main/java/us/codecraft/webmagic/model/ExtractByRaw.java
new file mode 100644
index 0000000..5dca8e1
--- /dev/null
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/model/ExtractByRaw.java
@@ -0,0 +1,27 @@
+package us.codecraft.webmagic.model;
+
+import java.lang.annotation.ElementType;
+import java.lang.annotation.Retention;
+import java.lang.annotation.Target;
+
+/**
+ * For classes that already use ExtractBy at the class level: annotate a field with this to extract from the entire page content (PageModelExtractor applies the selector to page.getHtml().toString()).
+ * @author code4crafter@gmail.com
+ * @date: 13-8-1
+ * Time: 8:40 PM
+ */
+@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
+@Target({ElementType.FIELD,ElementType.TYPE})
+public @interface ExtractByRaw {
+ /** Selector expression; PageModelExtractor hands it to the selector implementation chosen by {@link #type()}. */
+ String value();
+ /** Selector kinds; PageModelExtractor maps them to Xpath2Selector, XpathSelector, RegexSelector and CssSelector respectively. */
+ public enum Type {XPath2, XPath, Regex, Css}
+ /** Which selector implementation evaluates {@link #value()}; XPath2 unless overridden. */
+ Type type() default Type.XPath2;
+ /** Forwarded unchanged to the FieldExtractor that PageModelExtractor builds for the annotated field. */
+ boolean notNull() default true;
+ /** When true the annotated field must be a List, otherwise a String; PageModelExtractor throws IllegalStateException on a mismatch. */
+ boolean multi() default false;
+
+}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/model/Extractor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/model/Extractor.java
index c8feef4..498aba9 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/model/Extractor.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/model/Extractor.java
@@ -17,7 +17,7 @@ class Extractor {
protected final boolean multi;
- static enum Source {Html, Url}
+ static enum Source {Html, Url, RawHtml}
public Extractor(Selector selector, Source source, boolean notNull, boolean multi) {
this.selector = selector;
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java
index cf0eeac..9694c4e 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java
@@ -46,56 +46,100 @@ class PageModelExtractor {
fieldExtractors = new ArrayList();
for (Field field : clazz.getDeclaredFields()) {
field.setAccessible(true);
- ExtractBy extractBy = field.getAnnotation(ExtractBy.class);
- if (extractBy != null) {
- if (!extractBy.multi() && !String.class.isAssignableFrom(field.getType())) {
- throw new IllegalStateException("Field " + field.getName() + " must be string");
- } else if (extractBy.multi() && !List.class.isAssignableFrom(field.getType())) {
- throw new IllegalStateException("Field " + field.getName() + " must be list");
- }
- String value = extractBy.value();
- Selector selector;
- switch (extractBy.type()) {
- case Css:
- selector = new CssSelector(value);
- break;
- case Regex:
- selector = new RegexSelector(value);
- break;
- case XPath:
- selector = new XpathSelector(value);
- break;
- case XPath2:
- selector = new Xpath2Selector(value);
- break;
- default:
- selector = new Xpath2Selector(value);
- }
- FieldExtractor fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.Html, extractBy.notNull(), extractBy.multi());
- Method setterMethod = getSetterMethod(clazz, field);
- if (setterMethod != null) {
- fieldExtractor.setSetterMethod(setterMethod);
- }
- fieldExtractors.add(fieldExtractor);
+ getAnnotationExtractBy(clazz, field);
+ getAnnotationExtractByRaw(clazz,field);
+ getAnnotationExtractByUrl(clazz, field);
+ }
+ }
+ /** If {@code field} carries @ExtractByUrl, adds a Url-sourced, regex-based FieldExtractor for it to fieldExtractors; throws IllegalStateException when the field is not a String (multi=false) or a List (multi=true). */
+ private void getAnnotationExtractByUrl(Class clazz, Field field) {
+ ExtractByUrl extractByUrl = field.getAnnotation(ExtractByUrl.class);
+ if (extractByUrl != null) {
+ if (!extractByUrl.multi() && !String.class.isAssignableFrom(field.getType())) {
+ throw new IllegalStateException("Field " + field.getName() + " must be string");
+ } else if (extractByUrl.multi() && !List.class.isAssignableFrom(field.getType())) {
+ throw new IllegalStateException("Field " + field.getName() + " must be list");
}
- ExtractByUrl extractByUrl = field.getAnnotation(ExtractByUrl.class);
- if (extractByUrl != null) {
- if (!extractByUrl.multi() && !String.class.isAssignableFrom(field.getType())) {
- throw new IllegalStateException("Field " + field.getName() + " must be string");
- } else if (extractByUrl.multi() && !List.class.isAssignableFrom(field.getType())) {
- throw new IllegalStateException("Field " + field.getName() + " must be list");
- }
- String regexPattern = extractByUrl.value();
- if (regexPattern.trim().equals("")) {
- regexPattern = ".*";
- }
- FieldExtractor fieldExtractor = new FieldExtractor(field, new RegexSelector(regexPattern), FieldExtractor.Source.Url, extractByUrl.notNull(), extractByUrl.multi());
- Method setterMethod = getSetterMethod(clazz, field);
- if (setterMethod != null) {
- fieldExtractor.setSetterMethod(setterMethod);
- }
- fieldExtractors.add(fieldExtractor);
+ String regexPattern = extractByUrl.value();
+ if (regexPattern.trim().equals("")) {
+ regexPattern = ".*"; // a blank annotation value falls back to the catch-all pattern
}
+ FieldExtractor fieldExtractor = new FieldExtractor(field, new RegexSelector(regexPattern), FieldExtractor.Source.Url, extractByUrl.notNull(), extractByUrl.multi());
+ Method setterMethod = getSetterMethod(clazz, field);
+ if (setterMethod != null) { // the setter is optional; it is attached only when getSetterMethod returns one
+ fieldExtractor.setSetterMethod(setterMethod);
+ }
+ fieldExtractors.add(fieldExtractor);
+ }
+ }
+ /** If {@code field} carries @ExtractBy, adds an Html-sourced FieldExtractor for it to fieldExtractors, using the selector picked by extractBy.type(); throws IllegalStateException when the field is not a String (multi=false) or a List (multi=true). */
+ private void getAnnotationExtractBy(Class clazz, Field field) {
+ ExtractBy extractBy = field.getAnnotation(ExtractBy.class);
+ if (extractBy != null) {
+ if (!extractBy.multi() && !String.class.isAssignableFrom(field.getType())) {
+ throw new IllegalStateException("Field " + field.getName() + " must be string");
+ } else if (extractBy.multi() && !List.class.isAssignableFrom(field.getType())) {
+ throw new IllegalStateException("Field " + field.getName() + " must be list");
+ }
+ String value = extractBy.value();
+ Selector selector;
+ switch (extractBy.type()) { // map the annotation's selector kind to its Selector implementation
+ case Css:
+ selector = new CssSelector(value);
+ break;
+ case Regex:
+ selector = new RegexSelector(value);
+ break;
+ case XPath:
+ selector = new XpathSelector(value);
+ break;
+ case XPath2:
+ selector = new Xpath2Selector(value);
+ break;
+ default: // any other kind is treated like XPath2
+ selector = new Xpath2Selector(value);
+ }
+ FieldExtractor fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.Html, extractBy.notNull(), extractBy.multi());
+ Method setterMethod = getSetterMethod(clazz, field);
+ if (setterMethod != null) { // the setter is optional; it is attached only when getSetterMethod returns one
+ fieldExtractor.setSetterMethod(setterMethod);
+ }
+ fieldExtractors.add(fieldExtractor);
+ }
+ }
+ /** If {@code field} carries @ExtractByRaw, adds a RawHtml-sourced FieldExtractor for it to fieldExtractors, using the selector picked by extractByRaw.type(); throws IllegalStateException when the field is not a String (multi=false) or a List (multi=true). */
+ private void getAnnotationExtractByRaw(Class clazz, Field field) {
+ ExtractByRaw extractByRaw = field.getAnnotation(ExtractByRaw.class);
+ if (extractByRaw != null) {
+ if (!extractByRaw.multi() && !String.class.isAssignableFrom(field.getType())) {
+ throw new IllegalStateException("Field " + field.getName() + " must be string");
+ } else if (extractByRaw.multi() && !List.class.isAssignableFrom(field.getType())) {
+ throw new IllegalStateException("Field " + field.getName() + " must be list");
+ }
+ String value = extractByRaw.value();
+ Selector selector;
+ switch (extractByRaw.type()) { // NOTE(review): same kind-to-selector mapping as in getAnnotationExtractBy; candidate for a shared helper
+ case Css:
+ selector = new CssSelector(value);
+ break;
+ case Regex:
+ selector = new RegexSelector(value);
+ break;
+ case XPath:
+ selector = new XpathSelector(value);
+ break;
+ case XPath2:
+ selector = new Xpath2Selector(value);
+ break;
+ default: // any other kind is treated like XPath2
+ selector = new Xpath2Selector(value);
+ }
+ FieldExtractor fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.RawHtml, extractByRaw.notNull(), extractByRaw.multi());
+ Method setterMethod = getSetterMethod(clazz, field);
+ if (setterMethod != null) { // the setter is optional; it is attached only when getSetterMethod returns one
+ fieldExtractor.setSetterMethod(setterMethod);
+ }
+ fieldExtractors.add(fieldExtractor);
}
}
@@ -181,6 +225,9 @@ class PageModelExtractor {
if (fieldExtractor.multi) {
List value;
switch (fieldExtractor.getSource()) {
+ case RawHtml:
+ value = fieldExtractor.getSelector().selectList(page.getHtml().toString());
+ break;
case Html:
value = fieldExtractor.getSelector().selectList(html);
break;
@@ -197,6 +244,9 @@ class PageModelExtractor {
} else {
String value;
switch (fieldExtractor.getSource()) {
+ case RawHtml:
+ value = fieldExtractor.getSelector().select(page.getHtml().toString());
+ break;
case Html:
value = fieldExtractor.getSelector().select(html);
break;
diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/model/OschinaBlog.java b/webmagic-core/src/test/java/us/codecraft/webmagic/model/OschinaBlog.java
index 1bb219f..2552104 100644
--- a/webmagic-core/src/test/java/us/codecraft/webmagic/model/OschinaBlog.java
+++ b/webmagic-core/src/test/java/us/codecraft/webmagic/model/OschinaBlog.java
@@ -22,6 +22,9 @@ public class OschinaBlog implements AfterExtractor {
@ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true)
private List tags;
+ @ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true)
+ private List comments;
+
@Override
public void afterProcess(Page page) {
System.out.println("title:\t"+title);
diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/model/OschinaBlogComment.java b/webmagic-core/src/test/java/us/codecraft/webmagic/model/OschinaBlogComment.java
new file mode 100644
index 0000000..a1e5843
--- /dev/null
+++ b/webmagic-core/src/test/java/us/codecraft/webmagic/model/OschinaBlogComment.java
@@ -0,0 +1,13 @@
+package us.codecraft.webmagic.model;
+
+/**
+ * @author code4crafter@gmail.com
+ * @date: 13-8-1
+ * Time: 10:18 PM
+ */
+@TargetUrl("http://my.oschina.net/flashsword/blog/*")
+public class OschinaBlogComment {
+ // NOTE(review): no fields or extraction annotations are declared yet; only the @TargetUrl pattern above is set.
+
+
+}
\ No newline at end of file