add attribute 'text' support for CssSelector #66
parent
88b50d4182
commit
55368919df
2
pom.xml
2
pom.xml
|
@ -89,7 +89,7 @@
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<artifactId>xsoup</artifactId>
|
<artifactId>xsoup</artifactId>
|
||||||
<version>0.1.0</version>
|
<version>0.2.0</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>net.sf.saxon</groupId>
|
<groupId>net.sf.saxon</groupId>
|
||||||
|
|
|
@ -20,7 +20,7 @@ public class BaiduBaikePageProcessor implements PageProcessor {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void process(Page page) {
|
public void process(Page page) {
|
||||||
page.putField("name", page.getHtml().$("h1.title div.lemmaTitleH1","text").toString());
|
page.putField("name", page.getHtml().css("h1.title div.lemmaTitleH1","text").toString());
|
||||||
page.putField("description", page.getHtml().xpath("//div[@id='lemmaContent-0']//div[@class='para']/allText()"));
|
page.putField("description", page.getHtml().xpath("//div[@id='lemmaContent-0']//div[@class='para']/allText()"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -2,6 +2,8 @@ package us.codecraft.webmagic.selector;
|
||||||
|
|
||||||
import org.apache.commons.collections.CollectionUtils;
|
import org.apache.commons.collections.CollectionUtils;
|
||||||
import org.jsoup.nodes.Element;
|
import org.jsoup.nodes.Element;
|
||||||
|
import org.jsoup.nodes.Node;
|
||||||
|
import org.jsoup.nodes.TextNode;
|
||||||
import org.jsoup.select.Elements;
|
import org.jsoup.select.Elements;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
@ -33,11 +35,26 @@ public class CssSelector extends BaseElementSelector {
|
||||||
return element.outerHtml();
|
return element.outerHtml();
|
||||||
} else if ("innerHtml".equalsIgnoreCase(attrName)) {
|
} else if ("innerHtml".equalsIgnoreCase(attrName)) {
|
||||||
return element.html();
|
return element.html();
|
||||||
|
} else if ("text".equalsIgnoreCase(attrName)) {
|
||||||
|
return getText(element);
|
||||||
|
} else if ("allText".equalsIgnoreCase(attrName)) {
|
||||||
|
return element.text();
|
||||||
} else {
|
} else {
|
||||||
return element.attr(attrName);
|
return element.attr(attrName);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
protected String getText(Element element) {
|
||||||
|
StringBuilder accum = new StringBuilder();
|
||||||
|
for (Node node : element.childNodes()) {
|
||||||
|
if (node instanceof TextNode) {
|
||||||
|
TextNode textNode = (TextNode) node;
|
||||||
|
accum.append(textNode.text());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return accum.toString();
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String select(Element element) {
|
public String select(Element element) {
|
||||||
Elements elements = element.select(selectorText);
|
Elements elements = element.select(selectorText);
|
||||||
|
|
Loading…
Reference in New Issue