#42 Add jsonpath in annotation mode for json result
parent
c2d6d495b3
commit
59ad4cad27
|
@ -9,7 +9,7 @@ import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Selectable plain text.<br>
|
* Selectable html.<br>
|
||||||
*
|
*
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
* @since 0.1.0
|
* @since 0.1.0
|
||||||
|
@ -23,18 +23,30 @@ public class Html extends PlainText {
|
||||||
*/
|
*/
|
||||||
private Document document;
|
private Document document;
|
||||||
|
|
||||||
|
private boolean init = false;
|
||||||
|
|
||||||
public Html(List<String> strings) {
|
public Html(List<String> strings) {
|
||||||
super(strings);
|
super(strings);
|
||||||
}
|
}
|
||||||
|
|
||||||
public Html(String text) {
|
public Html(String text) {
|
||||||
super(text);
|
super(text);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* lazy init
|
||||||
|
*/
|
||||||
|
private void initDocument() {
|
||||||
|
if (this.document == null && !init) {
|
||||||
|
init = true;
|
||||||
|
//just init once whether the parsing succeeds or not
|
||||||
try {
|
try {
|
||||||
this.document = Jsoup.parse(text);
|
this.document = Jsoup.parse(getText());
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
logger.warn("parse document error ", e);
|
logger.warn("parse document error ", e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
public Html(Document document) {
|
public Html(Document document) {
|
||||||
super(document.html());
|
super(document.html());
|
||||||
|
@ -47,6 +59,7 @@ public class Html extends PlainText {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected Selectable select(Selector selector, List<String> strings) {
|
protected Selectable select(Selector selector, List<String> strings) {
|
||||||
|
initDocument();
|
||||||
List<String> results = new ArrayList<String>();
|
List<String> results = new ArrayList<String>();
|
||||||
for (String string : strings) {
|
for (String string : strings) {
|
||||||
String result = selector.select(string);
|
String result = selector.select(string);
|
||||||
|
@ -59,6 +72,7 @@ public class Html extends PlainText {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected Selectable selectList(Selector selector, List<String> strings) {
|
protected Selectable selectList(Selector selector, List<String> strings) {
|
||||||
|
initDocument();
|
||||||
List<String> results = new ArrayList<String>();
|
List<String> results = new ArrayList<String>();
|
||||||
for (String string : strings) {
|
for (String string : strings) {
|
||||||
List<String> result = selector.selectList(string);
|
List<String> result = selector.selectList(string);
|
||||||
|
@ -69,6 +83,7 @@ public class Html extends PlainText {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Selectable smartContent() {
|
public Selectable smartContent() {
|
||||||
|
initDocument();
|
||||||
SmartContentSelector smartContentSelector = Selectors.smartContent();
|
SmartContentSelector smartContentSelector = Selectors.smartContent();
|
||||||
return select(smartContentSelector, strings);
|
return select(smartContentSelector, strings);
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,24 @@
|
||||||
|
package us.codecraft.webmagic.example;
|
||||||
|
|
||||||
|
import us.codecraft.webmagic.Site;
|
||||||
|
import us.codecraft.webmagic.model.OOSpider;
|
||||||
|
import us.codecraft.webmagic.model.annotation.ExtractBy;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author code4crafter@gmail.com
|
||||||
|
* @since 0.4.1
|
||||||
|
*/
|
||||||
|
public class AppStore {
|
||||||
|
|
||||||
|
@ExtractBy(type = ExtractBy.Type.JsonPath, value = "$..trackName")
|
||||||
|
private String trackName;
|
||||||
|
|
||||||
|
@ExtractBy(type = ExtractBy.Type.JsonPath, value = "$..description")
|
||||||
|
private String description;
|
||||||
|
|
||||||
|
public static void main(String[] args) {
|
||||||
|
AppStore appStore = OOSpider.create(Site.me(), AppStore.class).<AppStore>get("http://itunes.apple.com/lookup?id=653350791&country=cn&entity=software");
|
||||||
|
System.out.println(appStore.trackName);
|
||||||
|
System.out.println(appStore.description);
|
||||||
|
}
|
||||||
|
}
|
|
@ -239,7 +239,7 @@ class PageModelExtractor {
|
||||||
} else {
|
} else {
|
||||||
if (objectExtractor.multi) {
|
if (objectExtractor.multi) {
|
||||||
List<Object> os = new ArrayList<Object>();
|
List<Object> os = new ArrayList<Object>();
|
||||||
List<String> list = objectExtractor.getSelector().selectList(page.getHtml().toString());
|
List<String> list = objectExtractor.getSelector().selectList(page.getRawText());
|
||||||
for (String s : list) {
|
for (String s : list) {
|
||||||
Object o = processSingle(page, s, false);
|
Object o = processSingle(page, s, false);
|
||||||
if (o != null) {
|
if (o != null) {
|
||||||
|
@ -248,7 +248,7 @@ class PageModelExtractor {
|
||||||
}
|
}
|
||||||
return os;
|
return os;
|
||||||
} else {
|
} else {
|
||||||
String select = objectExtractor.getSelector().select(page.getHtml().toString());
|
String select = objectExtractor.getSelector().select(page.getRawText());
|
||||||
Object o = processSingle(page, select, false);
|
Object o = processSingle(page, select, false);
|
||||||
return o;
|
return o;
|
||||||
}
|
}
|
||||||
|
|
|
@ -24,7 +24,7 @@ public @interface ExtractBy {
|
||||||
/**
|
/**
|
||||||
* types of extractor expressions
|
* types of extractor expressions
|
||||||
*/
|
*/
|
||||||
public static enum Type {XPath, Regex, Css}
|
public static enum Type {XPath, Regex, Css, JsonPath}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Extractor type, support XPath, CSS Selector and regex.
|
* Extractor type, supports XPath, CSS Selector, regex and JsonPath.
|
||||||
|
|
|
@ -5,7 +5,7 @@ import java.lang.annotation.Retention;
|
||||||
import java.lang.annotation.Target;
|
import java.lang.annotation.Target;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Define a extractor for url. Only regex can be used. <br>
|
* Defines an extractor that extracts data from the URL of the current page. Only regex can be used. <br>
|
||||||
*
|
*
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
* @since 0.2.0
|
* @since 0.2.0
|
||||||
|
|
|
@ -27,6 +27,9 @@ public class ExtractorUtils {
|
||||||
case XPath:
|
case XPath:
|
||||||
selector = getXpathSelector(value);
|
selector = getXpathSelector(value);
|
||||||
break;
|
break;
|
||||||
|
case JsonPath:
|
||||||
|
selector = new JsonPathSelector(value);
|
||||||
|
break;
|
||||||
default:
|
default:
|
||||||
selector = getXpathSelector(value);
|
selector = getXpathSelector(value);
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue