master
yihua.huang 2013-09-04 00:15:54 +08:00
parent 2c3574537a
commit 326b97c65a
4 changed files with 83 additions and 9 deletions

View File

@ -1,6 +1,7 @@
package us.codecraft.webmagic;
import org.apache.commons.lang3.StringUtils;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Selectable;
import us.codecraft.webmagic.utils.UrlUtils;
@ -28,7 +29,7 @@ public class Page {
private ResultItems resultItems = new ResultItems();
private Selectable html;
private Html html;
private Selectable url;
@ -58,11 +59,11 @@ public class Page {
*
* @return html
*/
public Selectable getHtml() {
public Html getHtml() {
return html;
}
public void setHtml(Selectable html) {
public void setHtml(Html html) {
this.html = html;
}

View File

@ -0,0 +1,36 @@
package us.codecraft.webmagic.selector;
import org.jsoup.nodes.Element;
import java.util.List;
/**
 * Cache parsed element for extract.
 * <p>
 * Holds a fragment in two forms, raw text and a parsed jsoup {@link Element}, and hands each
 * selector the form it operates on: an {@link ElementSelector} receives the element, any other
 * {@link Selector} receives the text.
 * <p>
 * NOTE(review): nothing here keeps {@code text} and {@code element} in sync; callers are
 * presumably expected to populate both from the same fragment. Confirm against the call sites.
 *
 * @author code4crafter@gmail.com
 * @since 0.2.2
 */
public class CacheElement {

    /** Raw text handed to selectors that are not {@link ElementSelector}s. */
    public String text;

    /** Parsed element handed to {@link ElementSelector}s. */
    public Element element;

    /**
     * @return the raw text stored in {@link #text}
     */
    public String getText() {
        return text;
    }

    /**
     * @return the parsed element stored in {@link #element}
     */
    public Element getElement() {
        return element;
    }

    /**
     * Apply the selector and return a single result.
     *
     * @param selector selector to apply; an {@link ElementSelector} is run against
     *                 {@link #getElement()}, anything else against {@link #getText()}
     * @return the value returned by the selector's {@code select} call
     */
    public String select(Selector selector) {
        if (selector instanceof ElementSelector) {
            ElementSelector elementSelector = (ElementSelector) selector;
            return elementSelector.select(getElement());
        } else {
            return selector.select(getText());
        }
    }

    /**
     * Apply the selector and return all results.
     *
     * @param selector selector to apply; an {@link ElementSelector} is run against
     *                 {@link #getElement()}, anything else against {@link #getText()}
     * @return the list returned by the selector's {@code selectList} call
     */
    public List<String> selectList(Selector selector) {
        if (selector instanceof ElementSelector) {
            ElementSelector elementSelector = (ElementSelector) selector;
            return elementSelector.selectList(getElement());
        } else {
            return selector.selectList(getText());
        }
    }
}

View File

@ -97,4 +97,34 @@ public class Html extends PlainText {
return selectList(cssSelector, strings);
}
/**
 * Accessor for the parsed document backing this object.
 *
 * @return the {@code document} field
 */
public Document getDocument() {
    return this.document;
}
/**
 * Render the backing document as a string.
 *
 * @return the result of calling {@code html()} on the {@code document} field
 */
public String getText() {
    final String markup = this.document.html();
    return markup;
}
/**
 * Apply the selector and return a single result.
 * <p>
 * An {@link ElementSelector} is run against the parsed document from {@link #getDocument()};
 * every other selector is run against the string from {@link #getText()}.
 *
 * @param selector selector to apply
 * @return the value returned by the selector's {@code select} call
 */
public String select(Selector selector) {
    if (!(selector instanceof ElementSelector)) {
        // Not element-aware: hand over the string form.
        return selector.select(getText());
    }
    return ((ElementSelector) selector).select(getDocument());
}
/**
 * Apply the selector and return all results.
 * <p>
 * An {@link ElementSelector} is run against the parsed document from {@link #getDocument()};
 * every other selector is run against the string from {@link #getText()}.
 *
 * @param selector selector to apply
 * @return the list returned by the selector's {@code selectList} call
 */
public List<String> selectList(Selector selector) {
    if (!(selector instanceof ElementSelector)) {
        // Not element-aware: hand over the string form.
        return selector.selectList(getText());
    }
    return ((ElementSelector) selector).selectList(getDocument());
}
}

View File

@ -1,6 +1,7 @@
package us.codecraft.webmagic.model;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.nodes.Element;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.model.annotation.*;
import us.codecraft.webmagic.selector.*;
@ -34,7 +35,7 @@ class PageModelExtractor {
private List<FieldExtractor> fieldExtractors;
private Extractor extractor;
private Extractor objectExtractor;
public static PageModelExtractor create(Class clazz) {
PageModelExtractor pageModelExtractor = new PageModelExtractor();
@ -169,7 +170,7 @@ class PageModelExtractor {
annotation = clazz.getAnnotation(ExtractBy.class);
if (annotation != null) {
ExtractBy extractBy = (ExtractBy) annotation;
extractor = new Extractor(new XpathSelector(extractBy.value()), Extractor.Source.Html, extractBy.notNull(), extractBy.multi());
objectExtractor = new Extractor(new XpathSelector(extractBy.value()), Extractor.Source.Html, extractBy.notNull(), extractBy.multi());
}
}
@ -183,12 +184,12 @@ class PageModelExtractor {
if (!matched) {
return null;
}
if (extractor == null) {
if (objectExtractor == null) {
return processSingle(page, page.getHtml().toString());
} else {
if (extractor.multi) {
if (objectExtractor.multi) {
List<Object> os = new ArrayList<Object>();
List<String> list = extractor.getSelector().selectList(page.getHtml().toString());
List<String> list = objectExtractor.getSelector().selectList(page.getHtml().toString());
for (String s : list) {
Object o = processSingle(page, s);
if (o != null) {
@ -197,13 +198,19 @@ class PageModelExtractor {
}
return os;
} else {
String select = extractor.getSelector().select(page.getHtml().toString());
String select = objectExtractor.getSelector().select(page.getHtml().toString());
Object o = processSingle(page, select);
return o;
}
}
}
/**
 * Run a selector against whichever form of a fragment it operates on.
 * <p>
 * Uses the same dispatch as the {@code selectList} methods on {@code CacheElement} and
 * {@code Html}: an {@link ElementSelector} is applied to the parsed element, any other
 * selector to the raw string.
 *
 * @param selector selector to apply
 * @param element  parsed form of the fragment, used only for {@link ElementSelector}s
 * @param html     string form of the fragment, used for all other selectors
 * @return the list returned by the selector's {@code selectList} call
 */
private List<String> select(Selector selector, Element element, String html) {
    if (selector instanceof ElementSelector) {
        return ((ElementSelector) selector).selectList(element);
    }
    return selector.selectList(html);
}
private Object processSingle(Page page, String html) {
Object o = null;
try {