Downgrade htmlcleaner from 2.24 back to 2.5 so that the Xpath2Selector test cases pass.

master
Sutra Zhou 2021-01-11 01:25:41 +08:00
parent 683db09133
commit 124c52b988
3 changed files with 29 additions and 21 deletions

View File

@ -171,7 +171,7 @@
<dependency> <dependency>
<groupId>net.sourceforge.htmlcleaner</groupId> <groupId>net.sourceforge.htmlcleaner</groupId>
<artifactId>htmlcleaner</artifactId> <artifactId>htmlcleaner</artifactId>
<version>2.24</version> <version>2.5</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>com.github.detro</groupId> <groupId>com.github.detro</groupId>

View File

@ -1,16 +1,11 @@
package us.codecraft.webmagic.selector; package us.codecraft.webmagic.selector;
import net.sf.saxon.lib.NamespaceConstant; import java.io.StringWriter;
import net.sf.saxon.xpath.XPathEvaluator; import java.util.ArrayList;
import org.htmlcleaner.CleanerProperties; import java.util.Iterator;
import org.htmlcleaner.DomSerializer; import java.util.List;
import org.htmlcleaner.HtmlCleaner; import java.util.Map;
import org.htmlcleaner.TagNode; import java.util.concurrent.ConcurrentHashMap;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import javax.xml.namespace.NamespaceContext; import javax.xml.namespace.NamespaceContext;
import javax.xml.transform.OutputKeys; import javax.xml.transform.OutputKeys;
@ -21,12 +16,19 @@ import javax.xml.transform.stream.StreamResult;
import javax.xml.xpath.XPathConstants; import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression; import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException; import javax.xml.xpath.XPathExpressionException;
import java.io.StringWriter;
import java.util.ArrayList; import org.htmlcleaner.CleanerProperties;
import java.util.Iterator; import org.htmlcleaner.DomSerializer;
import java.util.List; import org.htmlcleaner.HtmlCleaner;
import java.util.Map; import org.htmlcleaner.TagNode;
import java.util.concurrent.ConcurrentHashMap; import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import net.sf.saxon.lib.NamespaceConstant;
import net.sf.saxon.xpath.XPathEvaluator;
/** /**
* XPath 2.0 selector that wraps HtmlCleaner and Saxon HE.<br> * XPath 2.0 selector that wraps HtmlCleaner and Saxon HE.<br>

View File

@ -1,5 +1,7 @@
package us.codecraft.webmagic.selector; package us.codecraft.webmagic.selector;
import java.util.List;
import org.htmlcleaner.HtmlCleaner; import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode; import org.htmlcleaner.TagNode;
import org.htmlcleaner.XPatherException; import org.htmlcleaner.XPatherException;
@ -1368,15 +1370,19 @@ public class XpathSelectorTest {
public void testXPath2() { public void testXPath2() {
String text = "<h1>眉山:扎实推进农业农村工作 促农持续增收<br>\n" + String text = "<h1>眉山:扎实推进农业农村工作 促农持续增收<br>\n" +
"<span>2013-07-31 23:29:45&nbsp;&nbsp;&nbsp;来源:<a href=\"http://www.mshw.net\" target=\"_blank\" style=\"color:#AAA\">眉山网</a>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;责任编辑:张斯炜</span></h1>"; "<span>2013-07-31 23:29:45&nbsp;&nbsp;&nbsp;来源:<a href=\"http://www.mshw.net\" target=\"_blank\" style=\"color:#AAA\">眉山网</a>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;责任编辑:张斯炜</span></h1>";
XpathSelector xpathSelector = new XpathSelector("//h1/text()"); Xpath2Selector xpathSelector = new Xpath2Selector("//h1/text()");
Assert.assertEquals("眉山:扎实推进农业农村工作 促农持续增收 ", xpathSelector.select(text)); Assert.assertEquals("眉山:扎实推进农业农村工作 促农持续增收", xpathSelector.select(text));
} }
@Test @Test
public void testXpath2Selector() { public void testXpath2Selector() {
Xpath2Selector xpath2Selector = new Xpath2Selector("//a/@href"); Xpath2Selector xpath2Selector = new Xpath2Selector("//a/@href");
String select = xpath2Selector.select(html); String select = xpath2Selector.select(html);
Assert.assertNotNull(select); Assert.assertEquals("http://www.oschina.net/", select);
List<String> selectList = xpath2Selector.selectList(html);
Assert.assertEquals(113, selectList.size());
Assert.assertEquals("http://www.oschina.net/", selectList.get(0));
} }
@Ignore("take long time") @Ignore("take long time")