#42 Add jsonpath in annotation mode for json result
parent
c2d6d495b3
commit
59ad4cad27
|
@ -9,7 +9,7 @@ import java.util.ArrayList;
|
|||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Selectable plain text.<br>
|
||||
* Selectable html.<br>
|
||||
*
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* @since 0.1.0
|
||||
|
@ -23,16 +23,28 @@ public class Html extends PlainText {
|
|||
*/
|
||||
private Document document;
|
||||
|
||||
private boolean init = false;
|
||||
|
||||
/**
 * Builds an instance from a list of strings, handing the list straight to the
 * superclass constructor. No parsing is performed here.
 *
 * @param strings the strings passed through to the superclass
 */
public Html(List<String> strings) {
    super(strings);
}
|
||||
|
||||
public Html(String text) {
|
||||
super(text);
|
||||
try {
|
||||
this.document = Jsoup.parse(text);
|
||||
} catch (Exception e) {
|
||||
logger.warn("parse document error ", e);
|
||||
}
|
||||
|
||||
/**
|
||||
* lazy init
|
||||
*/
|
||||
private void initDocument() {
|
||||
if (this.document == null && !init) {
|
||||
init = true;
|
||||
//just init once whether the parsing succeeds or not
|
||||
try {
|
||||
this.document = Jsoup.parse(getText());
|
||||
} catch (Exception e) {
|
||||
logger.warn("parse document error ", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -47,6 +59,7 @@ public class Html extends PlainText {
|
|||
|
||||
@Override
|
||||
protected Selectable select(Selector selector, List<String> strings) {
|
||||
initDocument();
|
||||
List<String> results = new ArrayList<String>();
|
||||
for (String string : strings) {
|
||||
String result = selector.select(string);
|
||||
|
@ -59,6 +72,7 @@ public class Html extends PlainText {
|
|||
|
||||
@Override
|
||||
protected Selectable selectList(Selector selector, List<String> strings) {
|
||||
initDocument();
|
||||
List<String> results = new ArrayList<String>();
|
||||
for (String string : strings) {
|
||||
List<String> result = selector.selectList(string);
|
||||
|
@ -69,6 +83,7 @@ public class Html extends PlainText {
|
|||
|
||||
@Override
|
||||
public Selectable smartContent() {
|
||||
initDocument();
|
||||
SmartContentSelector smartContentSelector = Selectors.smartContent();
|
||||
return select(smartContentSelector, strings);
|
||||
}
|
||||
|
|
|
@ -0,0 +1,24 @@
|
|||
package us.codecraft.webmagic.example;
|
||||
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.model.OOSpider;
|
||||
import us.codecraft.webmagic.model.annotation.ExtractBy;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com
|
||||
* @since 0.4.1
|
||||
*/
|
||||
public class AppStore {
|
||||
|
||||
@ExtractBy(type = ExtractBy.Type.JsonPath, value = "$..trackName")
|
||||
private String trackName;
|
||||
|
||||
@ExtractBy(type = ExtractBy.Type.JsonPath, value = "$..description")
|
||||
private String description;
|
||||
|
||||
public static void main(String[] args) {
|
||||
AppStore appStore = OOSpider.create(Site.me(), AppStore.class).<AppStore>get("http://itunes.apple.com/lookup?id=653350791&country=cn&entity=software");
|
||||
System.out.println(appStore.trackName);
|
||||
System.out.println(appStore.description);
|
||||
}
|
||||
}
|
|
@ -239,7 +239,7 @@ class PageModelExtractor {
|
|||
} else {
|
||||
if (objectExtractor.multi) {
|
||||
List<Object> os = new ArrayList<Object>();
|
||||
List<String> list = objectExtractor.getSelector().selectList(page.getHtml().toString());
|
||||
List<String> list = objectExtractor.getSelector().selectList(page.getRawText());
|
||||
for (String s : list) {
|
||||
Object o = processSingle(page, s, false);
|
||||
if (o != null) {
|
||||
|
@ -248,7 +248,7 @@ class PageModelExtractor {
|
|||
}
|
||||
return os;
|
||||
} else {
|
||||
String select = objectExtractor.getSelector().select(page.getHtml().toString());
|
||||
String select = objectExtractor.getSelector().select(page.getRawText());
|
||||
Object o = processSingle(page, select, false);
|
||||
return o;
|
||||
}
|
||||
|
|
|
@ -24,7 +24,7 @@ public @interface ExtractBy {
|
|||
/**
|
||||
* types of extractor expressions
|
||||
*/
|
||||
public static enum Type {XPath, Regex, Css}
|
||||
public static enum Type {XPath, Regex, Css, JsonPath}
|
||||
|
||||
/**
|
||||
* Extractor type, supports XPath, CSS Selector, regex and JsonPath.
|
||||
|
|
|
@ -5,7 +5,7 @@ import java.lang.annotation.Retention;
|
|||
import java.lang.annotation.Target;
|
||||
|
||||
/**
|
||||
* Define a extractor for url. Only regex can be used. <br>
|
||||
* Define an extractor to extract data from the URL of the current page. Only regex can be used. <br>
|
||||
*
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* @since 0.2.0
|
||||
|
|
|
@ -27,6 +27,9 @@ public class ExtractorUtils {
|
|||
case XPath:
|
||||
selector = getXpathSelector(value);
|
||||
break;
|
||||
case JsonPath:
|
||||
selector = new JsonPathSelector(value);
|
||||
break;
|
||||
default:
|
||||
selector = getXpathSelector(value);
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue