diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java
index b9b7f02..1dce782 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java
@@ -9,7 +9,7 @@ import java.util.ArrayList;
import java.util.List;
/**
- * Selectable plain text.
+ * Selectable HTML.
*
* @author code4crafter@gmail.com
* @since 0.1.0
@@ -23,16 +23,28 @@ public class Html extends PlainText {
*/
private Document document;
+ private boolean init = false;
+
public Html(List strings) {
super(strings);
}
public Html(String text) {
super(text);
- try {
- this.document = Jsoup.parse(text);
- } catch (Exception e) {
- logger.warn("parse document error ", e);
+ }
+
+ /**
+ * Lazily parses {@code getText()} with Jsoup into {@code document}; the parse is attempted at most once.
+ */
+ private void initDocument() {
+ if (this.document == null && !init) {
+ init = true;
+ // Flag is set before parsing, so a failed parse (logged below, document left unset) is never retried.
+ try {
+ this.document = Jsoup.parse(getText());
+ } catch (Exception e) {
+ logger.warn("parse document error ", e);
+ }
}
}
@@ -47,6 +59,7 @@ public class Html extends PlainText {
@Override
protected Selectable select(Selector selector, List strings) {
+ initDocument();
List results = new ArrayList();
for (String string : strings) {
String result = selector.select(string);
@@ -59,6 +72,7 @@ public class Html extends PlainText {
@Override
protected Selectable selectList(Selector selector, List strings) {
+ initDocument();
List results = new ArrayList();
for (String string : strings) {
List result = selector.selectList(string);
@@ -69,6 +83,7 @@ public class Html extends PlainText {
@Override
public Selectable smartContent() {
+ initDocument(); // NOTE(review): select(Selector, List) in this class also calls initDocument(), so this call looks redundant — confirm
SmartContentSelector smartContentSelector = Selectors.smartContent();
return select(smartContentSelector, strings);
}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/AppStore.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/AppStore.java
new file mode 100644
index 0000000..504e6d2
--- /dev/null
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/AppStore.java
@@ -0,0 +1,24 @@
+package us.codecraft.webmagic.example;
+
+import us.codecraft.webmagic.Site;
+import us.codecraft.webmagic.model.OOSpider;
+import us.codecraft.webmagic.model.annotation.ExtractBy;
+
+/** Example page model: fields carry JsonPath ExtractBy annotations; main() requests an iTunes lookup URL through OOSpider and prints them.
+ * @author code4crafter@gmail.com
+ * @since 0.4.1
+ */
+public class AppStore {
+
+ @ExtractBy(type = ExtractBy.Type.JsonPath, value = "$..trackName") // JsonPath ".." is recursive descent: matches "trackName" at any depth
+ private String trackName;
+
+ @ExtractBy(type = ExtractBy.Type.JsonPath, value = "$..description") // same recursive-descent lookup for "description"
+ private String description;
+
+ public static void main(String[] args) {
+ AppStore appStore = OOSpider.create(Site.me(), AppStore.class).get("http://itunes.apple.com/lookup?id=653350791&country=cn&entity=software");
+ System.out.println(appStore.trackName); // NOTE(review): throws NullPointerException if get(...) can return null — confirm OOSpider.get's contract
+ System.out.println(appStore.description);
+ }
+}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java
index a079988..d7da0c9 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java
@@ -239,7 +239,7 @@ class PageModelExtractor {
} else {
if (objectExtractor.multi) {
List