From 59ad4cad27676e8e551377df0bfc98fb5acffbb6 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Thu, 28 Nov 2013 08:25:16 +0800 Subject: [PATCH] #42 Add jsonpath in annotation mode for json result --- .../us/codecraft/webmagic/selector/Html.java | 25 +++++++++++++++---- .../codecraft/webmagic/example/AppStore.java | 24 ++++++++++++++++++ .../webmagic/model/PageModelExtractor.java | 4 +-- .../webmagic/model/annotation/ExtractBy.java | 2 +- .../model/annotation/ExtractByUrl.java | 2 +- .../webmagic/utils/ExtractorUtils.java | 3 +++ 6 files changed, 51 insertions(+), 9 deletions(-) create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/example/AppStore.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java index b9b7f02..1dce782 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java @@ -9,7 +9,7 @@ import java.util.ArrayList; import java.util.List; /** - * Selectable plain text.
+ * Selectable html.
* * @author code4crafter@gmail.com
* @since 0.1.0 @@ -23,16 +23,28 @@ public class Html extends PlainText { */ private Document document; + private boolean init = false; + public Html(List strings) { super(strings); } public Html(String text) { super(text); - try { - this.document = Jsoup.parse(text); - } catch (Exception e) { - logger.warn("parse document error ", e); + } + + /** + * lazy init + */ + private void initDocument() { + if (this.document == null && !init) { + init = true; + //just init once whether the parsing succeeds or not + try { + this.document = Jsoup.parse(getText()); + } catch (Exception e) { + logger.warn("parse document error ", e); + } } } @@ -47,6 +59,7 @@ public class Html extends PlainText { @Override protected Selectable select(Selector selector, List strings) { + initDocument(); List results = new ArrayList(); for (String string : strings) { String result = selector.select(string); @@ -59,6 +72,7 @@ public class Html extends PlainText { @Override protected Selectable selectList(Selector selector, List strings) { + initDocument(); List results = new ArrayList(); for (String string : strings) { List result = selector.selectList(string); @@ -69,6 +83,7 @@ public class Html extends PlainText { @Override public Selectable smartContent() { + initDocument(); SmartContentSelector smartContentSelector = Selectors.smartContent(); return select(smartContentSelector, strings); } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/AppStore.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/AppStore.java new file mode 100644 index 0000000..504e6d2 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/AppStore.java @@ -0,0 +1,24 @@ +package us.codecraft.webmagic.example; + +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.model.OOSpider; +import us.codecraft.webmagic.model.annotation.ExtractBy; + +/** + * @author code4crafter@gmail.com + * @since 0.4.1 + */ +public class AppStore { + + 
@ExtractBy(type = ExtractBy.Type.JsonPath, value = "$..trackName") + private String trackName; + + @ExtractBy(type = ExtractBy.Type.JsonPath, value = "$..description") + private String description; + + public static void main(String[] args) { + AppStore appStore = OOSpider.create(Site.me(), AppStore.class).get("http://itunes.apple.com/lookup?id=653350791&country=cn&entity=software"); + System.out.println(appStore.trackName); + System.out.println(appStore.description); + } +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java index a079988..d7da0c9 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java @@ -239,7 +239,7 @@ class PageModelExtractor { } else { if (objectExtractor.multi) { List os = new ArrayList(); - List list = objectExtractor.getSelector().selectList(page.getHtml().toString()); + List list = objectExtractor.getSelector().selectList(page.getRawText()); for (String s : list) { Object o = processSingle(page, s, false); if (o != null) { @@ -248,7 +248,7 @@ class PageModelExtractor { } return os; } else { - String select = objectExtractor.getSelector().select(page.getHtml().toString()); + String select = objectExtractor.getSelector().select(page.getRawText()); Object o = processSingle(page, select, false); return o; } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy.java index 4f66deb..8fddccf 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy.java @@ -24,7 +24,7 @@ public @interface ExtractBy { /** * types of extractor 
expressions */ - public static enum Type {XPath, Regex, Css} + public static enum Type {XPath, Regex, Css, JsonPath} /** * Extractor type, support XPath, CSS Selector and regex. diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByUrl.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByUrl.java index 416bd89..328c079 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByUrl.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByUrl.java @@ -5,7 +5,7 @@ import java.lang.annotation.Retention; import java.lang.annotation.Target; /** - * Define a extractor for url. Only regex can be used.
+ * Define an extractor that extracts data from the URL of the current page. Only regex can be used.
* * @author code4crafter@gmail.com
* @since 0.2.0 diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/ExtractorUtils.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/ExtractorUtils.java index 2d9fd51..0818fde 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/ExtractorUtils.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/ExtractorUtils.java @@ -27,6 +27,9 @@ public class ExtractorUtils { case XPath: selector = getXpathSelector(value); break; + case JsonPath: + selector = new JsonPathSelector(value); + break; default: selector = getXpathSelector(value); }