add attribute 'text' support for CssSelector #66
parent
88b50d4182
commit
55368919df
2
pom.xml
2
pom.xml
|
@ -89,7 +89,7 @@
|
|||
<dependency>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>xsoup</artifactId>
|
||||
<version>0.1.0</version>
|
||||
<version>0.2.0</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>net.sf.saxon</groupId>
|
||||
|
|
|
@ -20,7 +20,7 @@ public class BaiduBaikePageProcessor implements PageProcessor {
|
|||
|
||||
@Override
|
||||
public void process(Page page) {
|
||||
page.putField("name", page.getHtml().$("h1.title div.lemmaTitleH1","text").toString());
|
||||
page.putField("name", page.getHtml().css("h1.title div.lemmaTitleH1","text").toString());
|
||||
page.putField("description", page.getHtml().xpath("//div[@id='lemmaContent-0']//div[@class='para']/allText()"));
|
||||
}
|
||||
|
||||
|
|
|
@ -2,6 +2,8 @@ package us.codecraft.webmagic.selector;
|
|||
|
||||
import org.apache.commons.collections.CollectionUtils;
|
||||
import org.jsoup.nodes.Element;
|
||||
import org.jsoup.nodes.Node;
|
||||
import org.jsoup.nodes.TextNode;
|
||||
import org.jsoup.select.Elements;
|
||||
|
||||
import java.util.ArrayList;
|
||||
|
@ -33,11 +35,26 @@ public class CssSelector extends BaseElementSelector {
|
|||
return element.outerHtml();
|
||||
} else if ("innerHtml".equalsIgnoreCase(attrName)) {
|
||||
return element.html();
|
||||
} else if ("text".equalsIgnoreCase(attrName)) {
|
||||
return getText(element);
|
||||
} else if ("allText".equalsIgnoreCase(attrName)) {
|
||||
return element.text();
|
||||
} else {
|
||||
return element.attr(attrName);
|
||||
}
|
||||
}
|
||||
|
||||
protected String getText(Element element) {
|
||||
StringBuilder accum = new StringBuilder();
|
||||
for (Node node : element.childNodes()) {
|
||||
if (node instanceof TextNode) {
|
||||
TextNode textNode = (TextNode) node;
|
||||
accum.append(textNode.text());
|
||||
}
|
||||
}
|
||||
return accum.toString();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String select(Element element) {
|
||||
Elements elements = element.select(selectorText);
|
||||
|
|
Loading…
Reference in New Issue