Downgrade htmlcleaner from 2.24 back to 2.5 so that the Xpath2Selector test cases pass.
parent
683db09133
commit
124c52b988
2
pom.xml
2
pom.xml
|
@@ -171,7 +171,7 @@
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>net.sourceforge.htmlcleaner</groupId>
|
<groupId>net.sourceforge.htmlcleaner</groupId>
|
||||||
<artifactId>htmlcleaner</artifactId>
|
<artifactId>htmlcleaner</artifactId>
|
||||||
<version>2.24</version>
|
<version>2.5</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>com.github.detro</groupId>
|
<groupId>com.github.detro</groupId>
|
||||||
|
|
|
@@ -1,16 +1,11 @@
|
||||||
package us.codecraft.webmagic.selector;
|
package us.codecraft.webmagic.selector;
|
||||||
|
|
||||||
import net.sf.saxon.lib.NamespaceConstant;
|
import java.io.StringWriter;
|
||||||
import net.sf.saxon.xpath.XPathEvaluator;
|
import java.util.ArrayList;
|
||||||
import org.htmlcleaner.CleanerProperties;
|
import java.util.Iterator;
|
||||||
import org.htmlcleaner.DomSerializer;
|
import java.util.List;
|
||||||
import org.htmlcleaner.HtmlCleaner;
|
import java.util.Map;
|
||||||
import org.htmlcleaner.TagNode;
|
import java.util.concurrent.ConcurrentHashMap;
|
||||||
import org.slf4j.Logger;
|
|
||||||
import org.slf4j.LoggerFactory;
|
|
||||||
import org.w3c.dom.Document;
|
|
||||||
import org.w3c.dom.Node;
|
|
||||||
import org.w3c.dom.NodeList;
|
|
||||||
|
|
||||||
import javax.xml.namespace.NamespaceContext;
|
import javax.xml.namespace.NamespaceContext;
|
||||||
import javax.xml.transform.OutputKeys;
|
import javax.xml.transform.OutputKeys;
|
||||||
|
@@ -21,12 +16,19 @@ import javax.xml.transform.stream.StreamResult;
|
||||||
import javax.xml.xpath.XPathConstants;
|
import javax.xml.xpath.XPathConstants;
|
||||||
import javax.xml.xpath.XPathExpression;
|
import javax.xml.xpath.XPathExpression;
|
||||||
import javax.xml.xpath.XPathExpressionException;
|
import javax.xml.xpath.XPathExpressionException;
|
||||||
import java.io.StringWriter;
|
|
||||||
import java.util.ArrayList;
|
import org.htmlcleaner.CleanerProperties;
|
||||||
import java.util.Iterator;
|
import org.htmlcleaner.DomSerializer;
|
||||||
import java.util.List;
|
import org.htmlcleaner.HtmlCleaner;
|
||||||
import java.util.Map;
|
import org.htmlcleaner.TagNode;
|
||||||
import java.util.concurrent.ConcurrentHashMap;
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
import org.w3c.dom.Document;
|
||||||
|
import org.w3c.dom.Node;
|
||||||
|
import org.w3c.dom.NodeList;
|
||||||
|
|
||||||
|
import net.sf.saxon.lib.NamespaceConstant;
|
||||||
|
import net.sf.saxon.xpath.XPathEvaluator;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 支持xpath2.0的选择器。包装了HtmlCleaner和Saxon HE。<br>
|
* 支持xpath2.0的选择器。包装了HtmlCleaner和Saxon HE。<br>
|
||||||
|
|
|
@@ -1,5 +1,7 @@
|
||||||
package us.codecraft.webmagic.selector;
|
package us.codecraft.webmagic.selector;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
import org.htmlcleaner.HtmlCleaner;
|
import org.htmlcleaner.HtmlCleaner;
|
||||||
import org.htmlcleaner.TagNode;
|
import org.htmlcleaner.TagNode;
|
||||||
import org.htmlcleaner.XPatherException;
|
import org.htmlcleaner.XPatherException;
|
||||||
|
@@ -1368,15 +1370,19 @@ public class XpathSelectorTest {
|
||||||
public void testXPath2() {
|
public void testXPath2() {
|
||||||
String text = "<h1>眉山:扎实推进农业农村工作 促农持续增收<br>\n" +
|
String text = "<h1>眉山:扎实推进农业农村工作 促农持续增收<br>\n" +
|
||||||
"<span>2013-07-31 23:29:45 来源:<a href=\"http://www.mshw.net\" target=\"_blank\" style=\"color:#AAA\">眉山网</a> 责任编辑:张斯炜</span></h1>";
|
"<span>2013-07-31 23:29:45 来源:<a href=\"http://www.mshw.net\" target=\"_blank\" style=\"color:#AAA\">眉山网</a> 责任编辑:张斯炜</span></h1>";
|
||||||
XpathSelector xpathSelector = new XpathSelector("//h1/text()");
|
Xpath2Selector xpathSelector = new Xpath2Selector("//h1/text()");
|
||||||
Assert.assertEquals("眉山:扎实推进农业农村工作 促农持续增收 ", xpathSelector.select(text));
|
Assert.assertEquals("眉山:扎实推进农业农村工作 促农持续增收", xpathSelector.select(text));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testXpath2Selector() {
|
public void testXpath2Selector() {
|
||||||
Xpath2Selector xpath2Selector = new Xpath2Selector("//a/@href");
|
Xpath2Selector xpath2Selector = new Xpath2Selector("//a/@href");
|
||||||
String select = xpath2Selector.select(html);
|
String select = xpath2Selector.select(html);
|
||||||
Assert.assertNotNull(select);
|
Assert.assertEquals("http://www.oschina.net/", select);
|
||||||
|
|
||||||
|
List<String> selectList = xpath2Selector.selectList(html);
|
||||||
|
Assert.assertEquals(113, selectList.size());
|
||||||
|
Assert.assertEquals("http://www.oschina.net/", selectList.get(0));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Ignore("take long time")
|
@Ignore("take long time")
|
||||||
|
|
Loading…
Reference in New Issue