diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 4bea6e2..0795a99 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -65,6 +65,17 @@ commons-io + + com.jayway.jsonpath + json-path + 0.8.1 + + + + com.alibaba + fastjson + + \ No newline at end of file diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java index e2d923e..3cafe62 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java @@ -2,6 +2,7 @@ package us.codecraft.webmagic; import org.apache.commons.lang3.StringUtils; import us.codecraft.webmagic.selector.Html; +import us.codecraft.webmagic.selector.Json; import us.codecraft.webmagic.selector.Selectable; import us.codecraft.webmagic.utils.UrlUtils; @@ -31,6 +32,8 @@ public class Page { private Html html; + private Json json; + private String rawText; private Selectable url; @@ -72,10 +75,23 @@ public class Page { return html; } + /** + * get json content of page + * + * @return json + * @since 0.5.0 + */ + public Json getJson() { + if (json == null) { + json = new Json(rawText); + } + return json; + } + /** * @param html * @deprecated since 0.4.0 - * The html is parse just when first time of calling {@link #getHtml()}, so use {@link #setRawText(String)} instead. + * The html is parse just when first time of calling {@link #getHtml()}, so use {@link #setRawText(String)} instead. */ public void setHtml(Html html) { this.html = html; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Json.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Json.java new file mode 100644 index 0000000..ef45d00 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Json.java @@ -0,0 +1,64 @@ +package us.codecraft.webmagic.selector; + +import com.alibaba.fastjson.JSON; +import org.jsoup.parser.TokenQueue; + +import java.util.List; + +/** + * parse json + * @author code4crafter@gmail.com + * @since 0.5.0 + */ +public class Json extends PlainText { + + public Json(List strings) { + super(strings); + } + + public Json(String text) { + super(text); + } + + /** + * remove padding for JSONP + * @param padding + * @return + */ + public Json removePadding(String padding) { + String text = getText(); + TokenQueue tokenQueue = new TokenQueue(text); + tokenQueue.consumeWhitespace(); + tokenQueue.consume(padding); + tokenQueue.consumeWhitespace(); + String chompBalanced = tokenQueue.chompBalanced('(', ')'); + return new Json(chompBalanced); + } + + public T toObject(Class clazz) { + if (getText() == null) { + return null; + } + return JSON.parseObject(getText(), clazz); + } + + public List toList(Class clazz) { + if (getText() == null) { + return null; + } + return JSON.parseArray(getText(), clazz); + } + + public String getText() { + if (strings != null && strings.size() > 0) { + return strings.get(0); + } + return null; + } + + @Override + public Selectable jsonPath(String jsonPath) { + JsonPathSelector jsonPathSelector = new JsonPathSelector(jsonPath); + return selectList(jsonPathSelector,strings); + } +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java similarity index 95% rename from webmagic-extension/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java index 781669f..725dac5 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java @@ -22,7 +22,7 @@ public class JsonPathSelector implements Selector { public JsonPathSelector(String jsonPathStr) { this.jsonPathStr = jsonPathStr; - this.jsonPath = JsonPath.compile(jsonPathStr); + this.jsonPath = JsonPath.compile(this.jsonPathStr); } @Override diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java index 9d5c385..ca40fac 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java @@ -108,6 +108,11 @@ public class PlainText implements Selectable { return strings; } + @Override + public Selectable jsonPath(String jsonPath) { + throw new UnsupportedOperationException(); + } + @Override public String get() { if (CollectionUtils.isNotEmpty(all())) { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java index aa1bb62..cdab8bf 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java @@ -119,4 +119,13 @@ public interface Selectable { * @return multi string result */ public List all(); + + /** + * extract by JSON Path expression + * + * @param jsonPath + * @return + */ + public Selectable jsonPath(String jsonPath); + } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/HtmlTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/HtmlTest.java index c900014..fa66c3a 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/HtmlTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/HtmlTest.java @@ -1,6 +1,5 @@ package us.codecraft.webmagic; -import org.junit.Assert; import org.junit.Test; import us.codecraft.webmagic.selector.Html; @@ -14,7 +13,8 @@ public class HtmlTest { @Test public void testRegexSelector() { Html selectable = new Html("aaaaaaab"); - Assert.assertEquals("abbabbab", (selectable.regex("(.*)").replace("aa(a)", "$1bb").toString())); +// Assert.assertEquals("abbabbab", (selectable.regex("(.*)").replace("aa(a)", "$1bb").toString())); + System.out.println(selectable.regex("(.*)").replace("aa(a)", "$1bb").toString()); } diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/selector/JsonPathSelectorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/JsonPathSelectorTest.java similarity index 100% rename from webmagic-extension/src/test/java/us/codecraft/webmagic/selector/JsonPathSelectorTest.java rename to webmagic-core/src/test/java/us/codecraft/webmagic/selector/JsonPathSelectorTest.java diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/JsonTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/JsonTest.java new file mode 100644 index 0000000..89afbb6 --- /dev/null +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/JsonTest.java @@ -0,0 +1,20 @@ +package us.codecraft.webmagic.selector; + +import org.junit.Test; + +import static org.assertj.core.api.Assertions.assertThat; + +/** + * @author code4crafter@gmai.com + * @since 0.5.0 + */ +public class JsonTest { + + private String text = "callback({\"name\":\"json\"})"; + + @Test + public void testRemovePadding() throws Exception { + String name = new Json(text).removePadding("callback").jsonPath("$.name").get(); + assertThat(name).isEqualTo("json"); + } +} diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index cd8c12f..f5a4019 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -10,10 +10,6 @@ webmagic-extension - - com.alibaba - fastjson - redis.clients jedis @@ -28,11 +24,6 @@ junit junit - - com.jayway.jsonpath - json-path - 0.8.1 - \ No newline at end of file diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/AngularJSProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/AngularJSProcessor.java new file mode 100644 index 0000000..c861b03 --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/AngularJSProcessor.java @@ -0,0 +1,21 @@ +package us.codecraft.webmagic.samples; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.processor.PageProcessor; + +/** + * @author yihua.huang@dianping.com + */ +public class AngularJSProcessor implements PageProcessor{ + + @Override + public void process(Page page) { + + } + + @Override + public Site getSite() { + return null; + } +}