diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/GithubRepoApi.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/GithubRepoApi.java index 34608fd..4181bb9 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/GithubRepoApi.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/GithubRepoApi.java @@ -15,19 +15,19 @@ import java.util.List; */ public class GithubRepoApi implements HasKey { - @ExtractBy(type = ExtractBy.Type.JsonPath, value = "$.name") + @ExtractBy(type = ExtractBy.Type.JsonPath, value = "$.name", source = ExtractBy.Source.RawText) private String name; - @ExtractBy(type = ExtractBy.Type.JsonPath, value = "$..owner.login") + @ExtractBy(type = ExtractBy.Type.JsonPath, value = "$..owner.login", source = ExtractBy.Source.RawText) private String author; - @ExtractBy(type = ExtractBy.Type.JsonPath, value = "$.language",multi = true) + @ExtractBy(type = ExtractBy.Type.JsonPath, value = "$.language",multi = true, source = ExtractBy.Source.RawText) private List language; - @ExtractBy(type = ExtractBy.Type.JsonPath, value = "$.stargazers_count") + @ExtractBy(type = ExtractBy.Type.JsonPath, value = "$.stargazers_count", source = ExtractBy.Source.RawText) private int star; - @ExtractBy(type = ExtractBy.Type.JsonPath, value = "$.homepage") + @ExtractBy(type = ExtractBy.Type.JsonPath, value = "$.forks_count", source = ExtractBy.Source.RawText) private int fork; @ExtractByUrl diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/Extractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/Extractor.java index 32f561e..f1d2f84 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/Extractor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/Extractor.java @@ -17,7 +17,7 @@ class Extractor { protected final boolean multi; - static enum Source {Html, Url, RawHtml} + static enum Source {Html, Url, RawHtml, RawText} public Extractor(Selector selector, Source source, boolean notNull, boolean multi) { this.selector = selector; diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java index 9816c71..a1da94b 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java @@ -179,7 +179,24 @@ class PageModelExtractor { ExtractBy extractBy = field.getAnnotation(ExtractBy.class); if (extractBy != null) { Selector selector = ExtractorUtils.getSelector(extractBy); - fieldExtractor = new FieldExtractor(field, selector, extractBy.source() == ExtractBy.Source.RawHtml ? FieldExtractor.Source.RawHtml : FieldExtractor.Source.Html, + + FieldExtractor.Source source = null; + switch (extractBy.source()){ + case RawText: + source = FieldExtractor.Source.RawText; + break; + case RawHtml: + source = FieldExtractor.Source.RawHtml; + break; + case SelectedHtml: + source =FieldExtractor.Source.Html; + break; + default: + source =FieldExtractor.Source.Html; + + } + + fieldExtractor = new FieldExtractor(field, selector, source, extractBy.notNull(), extractBy.multi() || List.class.isAssignableFrom(field.getType())); Method setterMethod = getSetterMethod(clazz, field); if (setterMethod != null) { @@ -284,6 +301,9 @@ class PageModelExtractor { case Url: value = fieldExtractor.getSelector().selectList(page.getUrl().toString()); break; + case RawText: + value = fieldExtractor.getSelector().selectList(page.getRawText()); + break; default: value = fieldExtractor.getSelector().selectList(html); } @@ -312,6 +332,9 @@ class PageModelExtractor { case Url: value = fieldExtractor.getSelector().select(page.getUrl().toString()); break; + case RawText: + value = fieldExtractor.getSelector().select(page.getRawText()); + break; default: value = fieldExtractor.getSelector().select(html); } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy.java index 2e23aa0..8e02895 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy.java @@ -52,7 +52,8 @@ public @interface ExtractBy { /** * extract from the raw html */ - RawHtml + RawHtml, + RawText } /**