update
parent
2c3574537a
commit
326b97c65a
|
@ -1,6 +1,7 @@
|
|||
package us.codecraft.webmagic;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import us.codecraft.webmagic.selector.Html;
|
||||
import us.codecraft.webmagic.selector.Selectable;
|
||||
import us.codecraft.webmagic.utils.UrlUtils;
|
||||
|
||||
|
@ -28,7 +29,7 @@ public class Page {
|
|||
|
||||
private ResultItems resultItems = new ResultItems();
|
||||
|
||||
private Selectable html;
|
||||
private Html html;
|
||||
|
||||
private Selectable url;
|
||||
|
||||
|
@ -58,11 +59,11 @@ public class Page {
|
|||
*
|
||||
* @return html
|
||||
*/
|
||||
public Selectable getHtml() {
|
||||
public Html getHtml() {
|
||||
return html;
|
||||
}
|
||||
|
||||
public void setHtml(Selectable html) {
|
||||
public void setHtml(Html html) {
|
||||
this.html = html;
|
||||
}
|
||||
|
||||
|
|
|
@ -0,0 +1,36 @@
|
|||
package us.codecraft.webmagic.selector;
|
||||
|
||||
import org.jsoup.nodes.Element;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Cache parsed element for extract.
|
||||
*
|
||||
* @author code4crafter@gmail.com
|
||||
* @since 0.2.2
|
||||
*/
|
||||
public class CacheElement {
|
||||
|
||||
public String text;
|
||||
|
||||
public Element element;
|
||||
|
||||
public String select(Selector selector) {
|
||||
if (selector instanceof ElementSelector) {
|
||||
ElementSelector elementSelector = (ElementSelector) selector;
|
||||
return elementSelector.select(getElement());
|
||||
} else {
|
||||
return selector.select(getText());
|
||||
}
|
||||
}
|
||||
|
||||
public List<String> selectList(Selector selector) {
|
||||
if (selector instanceof ElementSelector) {
|
||||
ElementSelector elementSelector = (ElementSelector) selector;
|
||||
return elementSelector.selectList(getElement());
|
||||
} else {
|
||||
return selector.selectList(getText());
|
||||
}
|
||||
}
|
||||
}
|
|
@ -97,4 +97,34 @@ public class Html extends PlainText {
|
|||
return selectList(cssSelector, strings);
|
||||
}
|
||||
|
||||
public Document getDocument() {
|
||||
return document;
|
||||
}
|
||||
|
||||
public String getText() {
|
||||
return document.html();
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param selector
|
||||
* @return
|
||||
*/
|
||||
public String select(Selector selector) {
|
||||
if (selector instanceof ElementSelector) {
|
||||
ElementSelector elementSelector = (ElementSelector) selector;
|
||||
return elementSelector.select(getDocument());
|
||||
} else {
|
||||
return selector.select(getText());
|
||||
}
|
||||
}
|
||||
|
||||
public List<String> selectList(Selector selector) {
|
||||
if (selector instanceof ElementSelector) {
|
||||
ElementSelector elementSelector = (ElementSelector) selector;
|
||||
return elementSelector.selectList(getDocument());
|
||||
} else {
|
||||
return selector.selectList(getText());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
package us.codecraft.webmagic.model;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.jsoup.nodes.Element;
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.model.annotation.*;
|
||||
import us.codecraft.webmagic.selector.*;
|
||||
|
@ -34,7 +35,7 @@ class PageModelExtractor {
|
|||
|
||||
private List<FieldExtractor> fieldExtractors;
|
||||
|
||||
private Extractor extractor;
|
||||
private Extractor objectExtractor;
|
||||
|
||||
public static PageModelExtractor create(Class clazz) {
|
||||
PageModelExtractor pageModelExtractor = new PageModelExtractor();
|
||||
|
@ -169,7 +170,7 @@ class PageModelExtractor {
|
|||
annotation = clazz.getAnnotation(ExtractBy.class);
|
||||
if (annotation != null) {
|
||||
ExtractBy extractBy = (ExtractBy) annotation;
|
||||
extractor = new Extractor(new XpathSelector(extractBy.value()), Extractor.Source.Html, extractBy.notNull(), extractBy.multi());
|
||||
objectExtractor = new Extractor(new XpathSelector(extractBy.value()), Extractor.Source.Html, extractBy.notNull(), extractBy.multi());
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -183,12 +184,12 @@ class PageModelExtractor {
|
|||
if (!matched) {
|
||||
return null;
|
||||
}
|
||||
if (extractor == null) {
|
||||
if (objectExtractor == null) {
|
||||
return processSingle(page, page.getHtml().toString());
|
||||
} else {
|
||||
if (extractor.multi) {
|
||||
if (objectExtractor.multi) {
|
||||
List<Object> os = new ArrayList<Object>();
|
||||
List<String> list = extractor.getSelector().selectList(page.getHtml().toString());
|
||||
List<String> list = objectExtractor.getSelector().selectList(page.getHtml().toString());
|
||||
for (String s : list) {
|
||||
Object o = processSingle(page, s);
|
||||
if (o != null) {
|
||||
|
@ -197,13 +198,19 @@ class PageModelExtractor {
|
|||
}
|
||||
return os;
|
||||
} else {
|
||||
String select = extractor.getSelector().select(page.getHtml().toString());
|
||||
String select = objectExtractor.getSelector().select(page.getHtml().toString());
|
||||
Object o = processSingle(page, select);
|
||||
return o;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private List<String> select(Selector selector,Element element,String html){
|
||||
if (selector instanceof ElementSelector){
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
private Object processSingle(Page page, String html) {
|
||||
Object o = null;
|
||||
try {
|
||||
|
|
Loading…
Reference in New Issue