diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/model/ExtractByRaw.java b/webmagic-core/src/main/java/us/codecraft/webmagic/model/ExtractByRaw.java new file mode 100644 index 0000000..5dca8e1 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/model/ExtractByRaw.java @@ -0,0 +1,27 @@ +package us.codecraft.webmagic.model; + +import java.lang.annotation.ElementType; +import java.lang.annotation.Retention; +import java.lang.annotation.Target; + +/** + * 对于在Class级别就使用过ExtractBy的类,在字段中想抽取全部内容可使用此方法。
+ * @author code4crafter@gmail.com
+ * @date: 13-8-1
+ * Time: 8:40 PM
+ */ +@Retention(java.lang.annotation.RetentionPolicy.RUNTIME) +@Target({ElementType.FIELD,ElementType.TYPE}) +public @interface ExtractByRaw { + + String value(); + + public enum Type {XPath2, XPath, Regex, Css} + + Type type() default Type.XPath2; + + boolean notNull() default true; + + boolean multi() default false; + +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/model/Extractor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/model/Extractor.java index c8feef4..498aba9 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/model/Extractor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/model/Extractor.java @@ -17,7 +17,7 @@ class Extractor { protected final boolean multi; - static enum Source {Html, Url} + static enum Source {Html, Url, RawHtml} public Extractor(Selector selector, Source source, boolean notNull, boolean multi) { this.selector = selector; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java index cf0eeac..9694c4e 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java @@ -46,56 +46,100 @@ class PageModelExtractor { fieldExtractors = new ArrayList(); for (Field field : clazz.getDeclaredFields()) { field.setAccessible(true); - ExtractBy extractBy = field.getAnnotation(ExtractBy.class); - if (extractBy != null) { - if (!extractBy.multi() && !String.class.isAssignableFrom(field.getType())) { - throw new IllegalStateException("Field " + field.getName() + " must be string"); - } else if (extractBy.multi() && !List.class.isAssignableFrom(field.getType())) { - throw new IllegalStateException("Field " + field.getName() + " must be list"); - } - String value = extractBy.value(); - Selector selector; - switch (extractBy.type()) { - case Css: - selector = new 
CssSelector(value); - break; - case Regex: - selector = new RegexSelector(value); - break; - case XPath: - selector = new XpathSelector(value); - break; - case XPath2: - selector = new Xpath2Selector(value); - break; - default: - selector = new Xpath2Selector(value); - } - FieldExtractor fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.Html, extractBy.notNull(), extractBy.multi()); - Method setterMethod = getSetterMethod(clazz, field); - if (setterMethod != null) { - fieldExtractor.setSetterMethod(setterMethod); - } - fieldExtractors.add(fieldExtractor); + getAnnotationExtractBy(clazz, field); + getAnnotationExtractByRaw(clazz,field); + getAnnotationExtractByUrl(clazz, field); + } + } + + private void getAnnotationExtractByUrl(Class clazz, Field field) { + ExtractByUrl extractByUrl = field.getAnnotation(ExtractByUrl.class); + if (extractByUrl != null) { + if (!extractByUrl.multi() && !String.class.isAssignableFrom(field.getType())) { + throw new IllegalStateException("Field " + field.getName() + " must be string"); + } else if (extractByUrl.multi() && !List.class.isAssignableFrom(field.getType())) { + throw new IllegalStateException("Field " + field.getName() + " must be list"); } - ExtractByUrl extractByUrl = field.getAnnotation(ExtractByUrl.class); - if (extractByUrl != null) { - if (!extractByUrl.multi() && !String.class.isAssignableFrom(field.getType())) { - throw new IllegalStateException("Field " + field.getName() + " must be string"); - } else if (extractByUrl.multi() && !List.class.isAssignableFrom(field.getType())) { - throw new IllegalStateException("Field " + field.getName() + " must be list"); - } - String regexPattern = extractByUrl.value(); - if (regexPattern.trim().equals("")) { - regexPattern = ".*"; - } - FieldExtractor fieldExtractor = new FieldExtractor(field, new RegexSelector(regexPattern), FieldExtractor.Source.Url, extractByUrl.notNull(), extractByUrl.multi()); - Method setterMethod = getSetterMethod(clazz, field); - 
if (setterMethod != null) { - fieldExtractor.setSetterMethod(setterMethod); - } - fieldExtractors.add(fieldExtractor); + String regexPattern = extractByUrl.value(); + if (regexPattern.trim().equals("")) { + regexPattern = ".*"; } + FieldExtractor fieldExtractor = new FieldExtractor(field, new RegexSelector(regexPattern), FieldExtractor.Source.Url, extractByUrl.notNull(), extractByUrl.multi()); + Method setterMethod = getSetterMethod(clazz, field); + if (setterMethod != null) { + fieldExtractor.setSetterMethod(setterMethod); + } + fieldExtractors.add(fieldExtractor); + } + } + + private void getAnnotationExtractBy(Class clazz, Field field) { + ExtractBy extractBy = field.getAnnotation(ExtractBy.class); + if (extractBy != null) { + if (!extractBy.multi() && !String.class.isAssignableFrom(field.getType())) { + throw new IllegalStateException("Field " + field.getName() + " must be string"); + } else if (extractBy.multi() && !List.class.isAssignableFrom(field.getType())) { + throw new IllegalStateException("Field " + field.getName() + " must be list"); + } + String value = extractBy.value(); + Selector selector; + switch (extractBy.type()) { + case Css: + selector = new CssSelector(value); + break; + case Regex: + selector = new RegexSelector(value); + break; + case XPath: + selector = new XpathSelector(value); + break; + case XPath2: + selector = new Xpath2Selector(value); + break; + default: + selector = new Xpath2Selector(value); + } + FieldExtractor fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.Html, extractBy.notNull(), extractBy.multi()); + Method setterMethod = getSetterMethod(clazz, field); + if (setterMethod != null) { + fieldExtractor.setSetterMethod(setterMethod); + } + fieldExtractors.add(fieldExtractor); + } + } + + private void getAnnotationExtractByRaw(Class clazz, Field field) { + ExtractByRaw extractByRaw = field.getAnnotation(ExtractByRaw.class); + if (extractByRaw != null) { + if (!extractByRaw.multi() && 
!String.class.isAssignableFrom(field.getType())) { + throw new IllegalStateException("Field " + field.getName() + " must be string"); + } else if (extractByRaw.multi() && !List.class.isAssignableFrom(field.getType())) { + throw new IllegalStateException("Field " + field.getName() + " must be list"); + } + String value = extractByRaw.value(); + Selector selector; + switch (extractByRaw.type()) { + case Css: + selector = new CssSelector(value); + break; + case Regex: + selector = new RegexSelector(value); + break; + case XPath: + selector = new XpathSelector(value); + break; + case XPath2: + selector = new Xpath2Selector(value); + break; + default: + selector = new Xpath2Selector(value); + } + FieldExtractor fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.RawHtml, extractByRaw.notNull(), extractByRaw.multi()); + Method setterMethod = getSetterMethod(clazz, field); + if (setterMethod != null) { + fieldExtractor.setSetterMethod(setterMethod); + } + fieldExtractors.add(fieldExtractor); } } @@ -181,6 +225,9 @@ class PageModelExtractor { if (fieldExtractor.multi) { List value; switch (fieldExtractor.getSource()) { + case RawHtml: + value = fieldExtractor.getSelector().selectList(page.getHtml().toString()); + break; case Html: value = fieldExtractor.getSelector().selectList(html); break; @@ -197,6 +244,9 @@ class PageModelExtractor { } else { String value; switch (fieldExtractor.getSource()) { + case RawHtml: + value = fieldExtractor.getSelector().select(page.getHtml().toString()); + break; case Html: value = fieldExtractor.getSelector().select(html); break; diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/model/OschinaBlog.java b/webmagic-core/src/test/java/us/codecraft/webmagic/model/OschinaBlog.java index 1bb219f..2552104 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/model/OschinaBlog.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/model/OschinaBlog.java @@ -22,6 +22,9 @@ public class OschinaBlog 
implements AfterExtractor { @ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true) private List tags; + @ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true) + private List comments; + @Override public void afterProcess(Page page) { System.out.println("title:\t"+title); diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/model/OschinaBlogComment.java b/webmagic-core/src/test/java/us/codecraft/webmagic/model/OschinaBlogComment.java new file mode 100644 index 0000000..a1e5843 --- /dev/null +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/model/OschinaBlogComment.java @@ -0,0 +1,13 @@ +package us.codecraft.webmagic.model; + +/** + * @author code4crafter@gmail.com
+ * @date: 13-8-1
+ * Time: 10:18 PM
+ */ +@TargetUrl("http://my.oschina.net/flashsword/blog/*") +public class OschinaBlogComment { + + + +} \ No newline at end of file