diff --git a/pom.xml b/pom.xml index 96b27fa..0e49cd3 100644 --- a/pom.xml +++ b/pom.xml @@ -89,7 +89,7 @@ us.codecraft xsoup - 0.1.0 + 0.2.0 net.sf.saxon diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/BaiduBaikePageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/BaiduBaikePageProcessor.java index 1827e06..1dc3352 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/BaiduBaikePageProcessor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/BaiduBaikePageProcessor.java @@ -20,7 +20,7 @@ public class BaiduBaikePageProcessor implements PageProcessor { @Override public void process(Page page) { - page.putField("name", page.getHtml().$("h1.title div.lemmaTitleH1","text").toString()); + page.putField("name", page.getHtml().css("h1.title div.lemmaTitleH1","text").toString()); page.putField("description", page.getHtml().xpath("//div[@id='lemmaContent-0']//div[@class='para']/allText()")); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java index 9c7032c..185db74 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java @@ -2,6 +2,8 @@ package us.codecraft.webmagic.selector; import org.apache.commons.collections.CollectionUtils; import org.jsoup.nodes.Element; +import org.jsoup.nodes.Node; +import org.jsoup.nodes.TextNode; import org.jsoup.select.Elements; import java.util.ArrayList; @@ -33,11 +35,26 @@ public class CssSelector extends BaseElementSelector { return element.outerHtml(); } else if ("innerHtml".equalsIgnoreCase(attrName)) { return element.html(); + } else if ("text".equalsIgnoreCase(attrName)) { + return getText(element); + } else if ("allText".equalsIgnoreCase(attrName)) { + return element.text(); } else { return element.attr(attrName); } } + protected String getText(Element element) { + StringBuilder accum = new StringBuilder(); + for (Node node : element.childNodes()) { + if (node instanceof TextNode) { + TextNode textNode = (TextNode) node; + accum.append(textNode.text()); + } + } + return accum.toString(); + } + @Override public String select(Element element) { Elements elements = element.select(selectorText);