From 124c52b9884b1c855e47cfcdddbc1e7d9c613dbe Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Mon, 11 Jan 2021 01:25:41 +0800 Subject: [PATCH] Downgrade htmlcleaner from 2.24 back to 2.5, to make Xpath2Selector pass the test cases. --- pom.xml | 2 +- .../webmagic/selector/Xpath2Selector.java | 36 ++++++++++--------- .../webmagic/selector/XpathSelectorTest.java | 12 +++++-- 3 files changed, 29 insertions(+), 21 deletions(-) diff --git a/pom.xml b/pom.xml index 97897db..16e14cf 100644 --- a/pom.xml +++ b/pom.xml @@ -171,7 +171,7 @@ net.sourceforge.htmlcleaner htmlcleaner - 2.24 + 2.5 com.github.detro diff --git a/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java b/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java index d8aab6c..1f1f0a5 100644 --- a/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java +++ b/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java @@ -1,16 +1,11 @@ package us.codecraft.webmagic.selector; -import net.sf.saxon.lib.NamespaceConstant; -import net.sf.saxon.xpath.XPathEvaluator; -import org.htmlcleaner.CleanerProperties; -import org.htmlcleaner.DomSerializer; -import org.htmlcleaner.HtmlCleaner; -import org.htmlcleaner.TagNode; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.w3c.dom.Document; -import org.w3c.dom.Node; -import org.w3c.dom.NodeList; +import java.io.StringWriter; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; import javax.xml.namespace.NamespaceContext; import javax.xml.transform.OutputKeys; @@ -21,12 +16,19 @@ import javax.xml.transform.stream.StreamResult; import javax.xml.xpath.XPathConstants; import javax.xml.xpath.XPathExpression; import javax.xml.xpath.XPathExpressionException; -import java.io.StringWriter; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; -import java.util.Map; -import java.util.concurrent.ConcurrentHashMap; + +import org.htmlcleaner.CleanerProperties; +import org.htmlcleaner.DomSerializer; +import org.htmlcleaner.HtmlCleaner; +import org.htmlcleaner.TagNode; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.w3c.dom.Document; +import org.w3c.dom.Node; +import org.w3c.dom.NodeList; + +import net.sf.saxon.lib.NamespaceConstant; +import net.sf.saxon.xpath.XPathEvaluator; /** * 支持xpath2.0的选择器。包装了HtmlCleaner和Saxon HE。
diff --git a/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java b/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java index 38aac15..32906b5 100644 --- a/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java +++ b/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java @@ -1,5 +1,7 @@ package us.codecraft.webmagic.selector; +import java.util.List; + import org.htmlcleaner.HtmlCleaner; import org.htmlcleaner.TagNode; import org.htmlcleaner.XPatherException; @@ -1368,15 +1370,19 @@ public class XpathSelectorTest { public void testXPath2() { String text = "

眉山:扎实推进农业农村工作 促农持续增收
\n" + "2013-07-31 23:29:45   来源:眉山网      责任编辑:张斯炜

"; - XpathSelector xpathSelector = new XpathSelector("//h1/text()"); - Assert.assertEquals("眉山:扎实推进农业农村工作 促农持续增收 ", xpathSelector.select(text)); + Xpath2Selector xpathSelector = new Xpath2Selector("//h1/text()"); + Assert.assertEquals("眉山:扎实推进农业农村工作 促农持续增收", xpathSelector.select(text)); } @Test public void testXpath2Selector() { Xpath2Selector xpath2Selector = new Xpath2Selector("//a/@href"); String select = xpath2Selector.select(html); - Assert.assertNotNull(select); + Assert.assertEquals("http://www.oschina.net/", select); + + List selectList = xpath2Selector.selectList(html); + Assert.assertEquals(113, selectList.size()); + Assert.assertEquals("http://www.oschina.net/", selectList.get(0)); } @Ignore("take long time")