test saxon and invite XPath2.0 support
parent
3fe3d8f044
commit
d7899e94ae
7
pom.xml
7
pom.xml
|
@ -27,6 +27,11 @@
|
|||
<artifactId>httpclient</artifactId>
|
||||
<version>4.2.4</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>net.sf.saxon</groupId>
|
||||
<artifactId>Saxon-HE</artifactId>
|
||||
<version>9.5.1-1</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>log4j</groupId>
|
||||
<artifactId>log4j</artifactId>
|
||||
|
@ -45,7 +50,7 @@
|
|||
<dependency>
|
||||
<groupId>net.sourceforge.htmlcleaner</groupId>
|
||||
<artifactId>htmlcleaner</artifactId>
|
||||
<version>2.4</version>
|
||||
<version>2.5</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.commons</groupId>
|
||||
|
|
|
@ -27,6 +27,11 @@
|
|||
<artifactId>commons-lang3</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>net.sf.saxon</groupId>
|
||||
<artifactId>Saxon-HE</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>log4j</groupId>
|
||||
<artifactId>log4j</artifactId>
|
||||
|
|
|
@ -0,0 +1,45 @@
|
|||
package us.codecraft.webmagic.selector;
|
||||
|
||||
import net.sf.saxon.xpath.XPathFactoryImpl;
|
||||
import org.htmlcleaner.CleanerProperties;
|
||||
import org.htmlcleaner.DomSerializer;
|
||||
import org.htmlcleaner.HtmlCleaner;
|
||||
import org.htmlcleaner.TagNode;
|
||||
import org.junit.Test;
|
||||
import org.w3c.dom.Document;
|
||||
import org.w3c.dom.NodeList;
|
||||
|
||||
import javax.xml.xpath.XPath;
|
||||
import javax.xml.xpath.XPathConstants;
|
||||
import javax.xml.xpath.XPathExpression;
|
||||
import javax.xml.xpath.XPathFactoryConfigurationException;
|
||||
|
||||
/**
|
||||
* @author yihua.huang@dianping.com <br>
|
||||
* @date: 13-8-2 <br>
|
||||
* Time: 下午5:48 <br>
|
||||
*/
|
||||
public class SaxonTest {
|
||||
|
||||
@Test
|
||||
public void test() throws XPathFactoryConfigurationException {
|
||||
// System.setProperty("javax.xml.xpath.XPathFactory:" + NamespaceConstant.OBJECT_MODEL_SAXON, "net.sf.saxon.xpath.XPathFactoryImpl");
|
||||
// XPathFactory xpf = XPathFactory.newInstance(NamespaceConstant.OBJECT_MODEL_SAXON);
|
||||
String xml = "<root><a>#BBB#</a><a>#CCC#</a><b><a>#DDD#</a></b></root>";
|
||||
try {
|
||||
HtmlCleaner htmlCleaner = new HtmlCleaner();
|
||||
TagNode tagNode = htmlCleaner.clean("");
|
||||
Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode);
|
||||
|
||||
javax.xml.xpath.XPathFactory factory = XPathFactoryImpl.newInstance();
|
||||
XPath xpath = factory.newXPath();
|
||||
XPathExpression expr = xpath.compile("//a[matches(.,'#...#')]");
|
||||
|
||||
Object result = expr.evaluate(document, XPathConstants.NODESET);
|
||||
NodeList nodes = (NodeList) result;
|
||||
System.out.println(nodes);
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
}
|
|
@ -1,7 +1,24 @@
|
|||
package us.codecraft.webmagic.selector;
|
||||
|
||||
import net.sf.saxon.Configuration;
|
||||
import net.sf.saxon.lib.NamespaceConstant;
|
||||
import net.sf.saxon.om.NamespaceResolver;
|
||||
import net.sf.saxon.pull.NamespaceContextImpl;
|
||||
import net.sf.saxon.xpath.JAXPXPathStaticContext;
|
||||
import net.sf.saxon.xpath.XPathEvaluator;
|
||||
import net.sf.saxon.xpath.XPathFactoryImpl;
|
||||
import org.htmlcleaner.CleanerProperties;
|
||||
import org.htmlcleaner.DomSerializer;
|
||||
import org.htmlcleaner.HtmlCleaner;
|
||||
import org.htmlcleaner.TagNode;
|
||||
import org.junit.Assert;
|
||||
import org.junit.Test;
|
||||
import org.w3c.dom.Document;
|
||||
import org.w3c.dom.NodeList;
|
||||
|
||||
import javax.xml.xpath.*;
|
||||
import java.util.Collections;
|
||||
import java.util.Iterator;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com <br> Date: 13-4-21 Time: 上午10:06
|
||||
|
@ -1354,4 +1371,50 @@ public class XpathSelectorTest {
|
|||
Assert.assertNotNull(html1.$("a[href]").xpath("//@href").all());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testXPath2() {
|
||||
String text = "<h1>眉山:扎实推进农业农村工作 促农持续增收<br>\n" +
|
||||
"<span>2013-07-31 23:29:45 来源:<a href=\"http://www.mshw.net\" target=\"_blank\" style=\"color:#AAA\">眉山网</a> 责任编辑:张斯炜</span></h1>";
|
||||
XpathSelector xpathSelector = new XpathSelector("//h1/text()");
|
||||
System.out.println(xpathSelector.select(text));
|
||||
}
|
||||
|
||||
//http://sourceforge.net/mailarchive/forum.php?thread_name=4EA92A8A.6080202%40saxonica.com&forum_name=saxon-help
|
||||
@Test
|
||||
public void testSaxon() throws XPathFactoryConfigurationException {
|
||||
System.setProperty("javax.xml.xpath.XPathFactory:" + NamespaceConstant.OBJECT_MODEL_SAXON, "net.sf.saxon.xpath.XPathFactoryImpl");
|
||||
System.setProperty("javax.xml.xpath.XPathFactory:" + NamespaceConstant.FN, "net.sf.saxon.xpath.XPathFactoryImpl");
|
||||
XPathFactory xpf = XPathFactory.newInstance(NamespaceConstant.OBJECT_MODEL_SAXON);
|
||||
String text = "<h1>眉山:扎实推进农业农村工作 促农持续增收<br>\n" +
|
||||
"<span>2013-07-31 23:29:45 来源:<a href=\"http://www.mshw.net\" target=\"_blank\" style=\"color:#AAA\">眉山网</a> 责任编辑:张斯炜</span></h1>";
|
||||
try {
|
||||
HtmlCleaner htmlCleaner = new HtmlCleaner();
|
||||
TagNode tagNode = htmlCleaner.clean(text);
|
||||
Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode);
|
||||
javax.xml.xpath.XPathFactory factory = XPathFactoryImpl.newInstance(NamespaceConstant.OBJECT_MODEL_SAXON);
|
||||
Configuration config = Configuration.newConfiguration();
|
||||
XPathEvaluator xPathEvaluator = new XPathEvaluator(config);
|
||||
JAXPXPathStaticContext context = new JAXPXPathStaticContext(config);
|
||||
context.setNamespaceContext(new NamespaceContextImpl(new NamespaceResolver() {
|
||||
|
||||
|
||||
@Override
|
||||
public String getURIForPrefix(String s, boolean b) {
|
||||
return NamespaceConstant.FN;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Iterator<String> iteratePrefixes() {
|
||||
return Collections.singletonList("fn").iterator();
|
||||
}
|
||||
}));
|
||||
xPathEvaluator.setStaticContext(context);
|
||||
XPathExpression expr = xPathEvaluator.compile("fn:substring-before(//h1,'\n')");
|
||||
Object result = expr.evaluate(document, XPathConstants.STRING);
|
||||
System.out.println(result);
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue