update
parent
2c3574537a
commit
326b97c65a
|
@ -1,6 +1,7 @@
|
||||||
package us.codecraft.webmagic;
|
package us.codecraft.webmagic;
|
||||||
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
import us.codecraft.webmagic.selector.Html;
|
||||||
import us.codecraft.webmagic.selector.Selectable;
|
import us.codecraft.webmagic.selector.Selectable;
|
||||||
import us.codecraft.webmagic.utils.UrlUtils;
|
import us.codecraft.webmagic.utils.UrlUtils;
|
||||||
|
|
||||||
|
@ -28,7 +29,7 @@ public class Page {
|
||||||
|
|
||||||
private ResultItems resultItems = new ResultItems();
|
private ResultItems resultItems = new ResultItems();
|
||||||
|
|
||||||
private Selectable html;
|
private Html html;
|
||||||
|
|
||||||
private Selectable url;
|
private Selectable url;
|
||||||
|
|
||||||
|
@ -58,11 +59,11 @@ public class Page {
|
||||||
*
|
*
|
||||||
* @return html
|
* @return html
|
||||||
*/
|
*/
|
||||||
public Selectable getHtml() {
|
public Html getHtml() {
|
||||||
return html;
|
return html;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setHtml(Selectable html) {
|
public void setHtml(Html html) {
|
||||||
this.html = html;
|
this.html = html;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,36 @@
|
||||||
|
package us.codecraft.webmagic.selector;
|
||||||
|
|
||||||
|
import org.jsoup.nodes.Element;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Cache parsed element for extract.
|
||||||
|
*
|
||||||
|
* @author code4crafter@gmail.com
|
||||||
|
* @since 0.2.2
|
||||||
|
*/
|
||||||
|
public class CacheElement {
|
||||||
|
|
||||||
|
public String text;
|
||||||
|
|
||||||
|
public Element element;
|
||||||
|
|
||||||
|
public String select(Selector selector) {
|
||||||
|
if (selector instanceof ElementSelector) {
|
||||||
|
ElementSelector elementSelector = (ElementSelector) selector;
|
||||||
|
return elementSelector.select(getElement());
|
||||||
|
} else {
|
||||||
|
return selector.select(getText());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<String> selectList(Selector selector) {
|
||||||
|
if (selector instanceof ElementSelector) {
|
||||||
|
ElementSelector elementSelector = (ElementSelector) selector;
|
||||||
|
return elementSelector.selectList(getElement());
|
||||||
|
} else {
|
||||||
|
return selector.selectList(getText());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -97,4 +97,34 @@ public class Html extends PlainText {
|
||||||
return selectList(cssSelector, strings);
|
return selectList(cssSelector, strings);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public Document getDocument() {
|
||||||
|
return document;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getText() {
|
||||||
|
return document.html();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @param selector
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public String select(Selector selector) {
|
||||||
|
if (selector instanceof ElementSelector) {
|
||||||
|
ElementSelector elementSelector = (ElementSelector) selector;
|
||||||
|
return elementSelector.select(getDocument());
|
||||||
|
} else {
|
||||||
|
return selector.select(getText());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<String> selectList(Selector selector) {
|
||||||
|
if (selector instanceof ElementSelector) {
|
||||||
|
ElementSelector elementSelector = (ElementSelector) selector;
|
||||||
|
return elementSelector.selectList(getDocument());
|
||||||
|
} else {
|
||||||
|
return selector.selectList(getText());
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
package us.codecraft.webmagic.model;
|
package us.codecraft.webmagic.model;
|
||||||
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
import org.jsoup.nodes.Element;
|
||||||
import us.codecraft.webmagic.Page;
|
import us.codecraft.webmagic.Page;
|
||||||
import us.codecraft.webmagic.model.annotation.*;
|
import us.codecraft.webmagic.model.annotation.*;
|
||||||
import us.codecraft.webmagic.selector.*;
|
import us.codecraft.webmagic.selector.*;
|
||||||
|
@ -34,7 +35,7 @@ class PageModelExtractor {
|
||||||
|
|
||||||
private List<FieldExtractor> fieldExtractors;
|
private List<FieldExtractor> fieldExtractors;
|
||||||
|
|
||||||
private Extractor extractor;
|
private Extractor objectExtractor;
|
||||||
|
|
||||||
public static PageModelExtractor create(Class clazz) {
|
public static PageModelExtractor create(Class clazz) {
|
||||||
PageModelExtractor pageModelExtractor = new PageModelExtractor();
|
PageModelExtractor pageModelExtractor = new PageModelExtractor();
|
||||||
|
@ -169,7 +170,7 @@ class PageModelExtractor {
|
||||||
annotation = clazz.getAnnotation(ExtractBy.class);
|
annotation = clazz.getAnnotation(ExtractBy.class);
|
||||||
if (annotation != null) {
|
if (annotation != null) {
|
||||||
ExtractBy extractBy = (ExtractBy) annotation;
|
ExtractBy extractBy = (ExtractBy) annotation;
|
||||||
extractor = new Extractor(new XpathSelector(extractBy.value()), Extractor.Source.Html, extractBy.notNull(), extractBy.multi());
|
objectExtractor = new Extractor(new XpathSelector(extractBy.value()), Extractor.Source.Html, extractBy.notNull(), extractBy.multi());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -183,12 +184,12 @@ class PageModelExtractor {
|
||||||
if (!matched) {
|
if (!matched) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
if (extractor == null) {
|
if (objectExtractor == null) {
|
||||||
return processSingle(page, page.getHtml().toString());
|
return processSingle(page, page.getHtml().toString());
|
||||||
} else {
|
} else {
|
||||||
if (extractor.multi) {
|
if (objectExtractor.multi) {
|
||||||
List<Object> os = new ArrayList<Object>();
|
List<Object> os = new ArrayList<Object>();
|
||||||
List<String> list = extractor.getSelector().selectList(page.getHtml().toString());
|
List<String> list = objectExtractor.getSelector().selectList(page.getHtml().toString());
|
||||||
for (String s : list) {
|
for (String s : list) {
|
||||||
Object o = processSingle(page, s);
|
Object o = processSingle(page, s);
|
||||||
if (o != null) {
|
if (o != null) {
|
||||||
|
@ -197,13 +198,19 @@ class PageModelExtractor {
|
||||||
}
|
}
|
||||||
return os;
|
return os;
|
||||||
} else {
|
} else {
|
||||||
String select = extractor.getSelector().select(page.getHtml().toString());
|
String select = objectExtractor.getSelector().select(page.getHtml().toString());
|
||||||
Object o = processSingle(page, select);
|
Object o = processSingle(page, select);
|
||||||
return o;
|
return o;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private List<String> select(Selector selector,Element element,String html){
|
||||||
|
if (selector instanceof ElementSelector){
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
private Object processSingle(Page page, String html) {
|
private Object processSingle(Page page, String html) {
|
||||||
Object o = null;
|
Object o = null;
|
||||||
try {
|
try {
|
||||||
|
|
Loading…
Reference in New Issue