master
yihua.huang 2013-09-04 00:15:54 +08:00
parent 2c3574537a
commit 326b97c65a
4 changed files with 83 additions and 9 deletions

View File

@ -1,6 +1,7 @@
package us.codecraft.webmagic; package us.codecraft.webmagic;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Selectable; import us.codecraft.webmagic.selector.Selectable;
import us.codecraft.webmagic.utils.UrlUtils; import us.codecraft.webmagic.utils.UrlUtils;
@ -28,7 +29,7 @@ public class Page {
private ResultItems resultItems = new ResultItems(); private ResultItems resultItems = new ResultItems();
private Selectable html; private Html html;
private Selectable url; private Selectable url;
@ -58,11 +59,11 @@ public class Page {
* *
* @return html * @return html
*/ */
public Selectable getHtml() { public Html getHtml() {
return html; return html;
} }
public void setHtml(Selectable html) { public void setHtml(Html html) {
this.html = html; this.html = html;
} }

View File

@ -0,0 +1,36 @@
package us.codecraft.webmagic.selector;
import org.jsoup.nodes.Element;
import java.util.List;
/**
 * Caches a parsed element for extraction.
 *
 * <p>Holds a textual form and a parsed jsoup form of some content so that a
 * {@link Selector} can be applied to whichever form it accepts:
 * {@link ElementSelector}s are given the {@link Element}, every other selector
 * is given the text. NOTE(review): nothing in this class keeps {@code text} and
 * {@code element} in sync -- presumably callers populate both from the same
 * fragment; confirm at the call sites.
 *
 * @author code4crafter@gmail.com
 * @since 0.2.2
 */
public class CacheElement {

    /** Textual form handed to selectors that are not {@link ElementSelector}s. */
    public String text;

    /** Parsed form handed to {@link ElementSelector}s. */
    public Element element;

    /**
     * Returns the textual form used for text-based selectors.
     *
     * @return the current value of {@link #text}
     */
    public String getText() {
        return text;
    }

    /**
     * Returns the parsed form used for element-based selectors.
     *
     * @return the current value of {@link #element}
     */
    public Element getElement() {
        return element;
    }

    /**
     * Applies the selector and returns its single result.
     *
     * @param selector selector to apply; an {@link ElementSelector} is run against
     *                 the parsed element, any other selector against the text
     * @return whatever the selector's {@code select} call returns
     */
    public String select(Selector selector) {
        if (selector instanceof ElementSelector) {
            ElementSelector elementSelector = (ElementSelector) selector;
            return elementSelector.select(getElement());
        } else {
            return selector.select(getText());
        }
    }

    /**
     * Applies the selector and returns all of its results.
     *
     * @param selector selector to apply; an {@link ElementSelector} is run against
     *                 the parsed element, any other selector against the text
     * @return whatever the selector's {@code selectList} call returns
     */
    public List<String> selectList(Selector selector) {
        if (selector instanceof ElementSelector) {
            ElementSelector elementSelector = (ElementSelector) selector;
            return elementSelector.selectList(getElement());
        } else {
            return selector.selectList(getText());
        }
    }
}

View File

@ -97,4 +97,34 @@ public class Html extends PlainText {
return selectList(cssSelector, strings); return selectList(cssSelector, strings);
} }
/**
 * Exposes the document held by this object.
 *
 * @return the {@code document} field, as is
 */
public Document getDocument() {
    return this.document;
}
/**
 * Renders the held document as a string.
 *
 * @return the result of calling {@code html()} on the document
 */
public String getText() {
    final String rendered = this.document.html();
    return rendered;
}
/**
 * Applies a selector to this page and returns its single result.
 *
 * <p>An {@link ElementSelector} is run against the document itself
 * ({@link #getDocument()}); any other selector is run against the string form
 * ({@link #getText()}).
 *
 * @param selector the selector to apply
 * @return whatever the selector's {@code select} call returns
 */
public String select(Selector selector) {
    if (!(selector instanceof ElementSelector)) {
        // Text-based selector: hand it the rendered string.
        return selector.select(getText());
    }
    return ((ElementSelector) selector).select(getDocument());
}
/**
 * Applies a selector to this page and returns all of its results.
 *
 * <p>An {@link ElementSelector} is run against the document itself
 * ({@link #getDocument()}); any other selector is run against the string form
 * ({@link #getText()}).
 *
 * @param selector the selector to apply
 * @return whatever the selector's {@code selectList} call returns
 */
public List<String> selectList(Selector selector) {
    if (!(selector instanceof ElementSelector)) {
        // Text-based selector: hand it the rendered string.
        return selector.selectList(getText());
    }
    return ((ElementSelector) selector).selectList(getDocument());
}
} }

View File

@ -1,6 +1,7 @@
package us.codecraft.webmagic.model; package us.codecraft.webmagic.model;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.jsoup.nodes.Element;
import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.model.annotation.*; import us.codecraft.webmagic.model.annotation.*;
import us.codecraft.webmagic.selector.*; import us.codecraft.webmagic.selector.*;
@ -34,7 +35,7 @@ class PageModelExtractor {
private List<FieldExtractor> fieldExtractors; private List<FieldExtractor> fieldExtractors;
private Extractor extractor; private Extractor objectExtractor;
public static PageModelExtractor create(Class clazz) { public static PageModelExtractor create(Class clazz) {
PageModelExtractor pageModelExtractor = new PageModelExtractor(); PageModelExtractor pageModelExtractor = new PageModelExtractor();
@ -169,7 +170,7 @@ class PageModelExtractor {
annotation = clazz.getAnnotation(ExtractBy.class); annotation = clazz.getAnnotation(ExtractBy.class);
if (annotation != null) { if (annotation != null) {
ExtractBy extractBy = (ExtractBy) annotation; ExtractBy extractBy = (ExtractBy) annotation;
extractor = new Extractor(new XpathSelector(extractBy.value()), Extractor.Source.Html, extractBy.notNull(), extractBy.multi()); objectExtractor = new Extractor(new XpathSelector(extractBy.value()), Extractor.Source.Html, extractBy.notNull(), extractBy.multi());
} }
} }
@ -183,12 +184,12 @@ class PageModelExtractor {
if (!matched) { if (!matched) {
return null; return null;
} }
if (extractor == null) { if (objectExtractor == null) {
return processSingle(page, page.getHtml().toString()); return processSingle(page, page.getHtml().toString());
} else { } else {
if (extractor.multi) { if (objectExtractor.multi) {
List<Object> os = new ArrayList<Object>(); List<Object> os = new ArrayList<Object>();
List<String> list = extractor.getSelector().selectList(page.getHtml().toString()); List<String> list = objectExtractor.getSelector().selectList(page.getHtml().toString());
for (String s : list) { for (String s : list) {
Object o = processSingle(page, s); Object o = processSingle(page, s);
if (o != null) { if (o != null) {
@ -197,13 +198,19 @@ class PageModelExtractor {
} }
return os; return os;
} else { } else {
String select = extractor.getSelector().select(page.getHtml().toString()); String select = objectExtractor.getSelector().select(page.getHtml().toString());
Object o = processSingle(page, select); Object o = processSingle(page, select);
return o; return o;
} }
} }
} }
/**
 * Runs a selector against whichever form of the input it accepts and returns
 * all of its results.
 *
 * <p>The original stub had an empty branch and no return statement, so it did
 * not compile. It is completed with the same dispatch the other
 * {@code selectList} helpers in this change use.
 *
 * @param selector the selector to apply
 * @param element  parsed input, used when {@code selector} is an {@link ElementSelector}
 * @param html     textual input, used for every other selector
 * @return whatever the selector's {@code selectList} call returns
 */
private List<String> select(Selector selector, Element element, String html) {
    if (selector instanceof ElementSelector) {
        ElementSelector elementSelector = (ElementSelector) selector;
        return elementSelector.selectList(element);
    }
    return selector.selectList(html);
}
private Object processSingle(Page page, String html) { private Object processSingle(Page page, String html) {
Object o = null; Object o = null;
try { try {