diff --git a/pom.xml b/pom.xml
index 5974eae..fa369f4 100644
--- a/pom.xml
+++ b/pom.xml
@@ -27,6 +27,11 @@
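                 <artifactId>httpclient</artifactId>
                 <version>4.2.4</version>
             </dependency>
+            <dependency>
+                <groupId>net.sf.saxon</groupId>
+                <artifactId>Saxon-HE</artifactId>
+                <version>9.5.1-1</version>
+            </dependency>
             <dependency>
                 <groupId>log4j</groupId>
                 <artifactId>log4j</artifactId>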
@@ -45,7 +50,7 @@
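             <dependency>
                 <groupId>net.sourceforge.htmlcleaner</groupId>
                 <artifactId>htmlcleaner</artifactId>
-                <version>2.4</version>
+                <version>2.5</version>
             </dependency>
             <dependency>
                 <groupId>org.apache.commons</groupId>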
diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml
index 60c37c0..a5fbd75 100644
--- a/webmagic-core/pom.xml
+++ b/webmagic-core/pom.xml
@@ -27,6 +27,11 @@
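             <artifactId>commons-lang3</artifactId>
         </dependency>
+        <dependency>
+            <groupId>net.sf.saxon</groupId>
+            <artifactId>Saxon-HE</artifactId>
+        </dependency>
+
         <dependency>
             <groupId>log4j</groupId>
             <artifactId>log4j</artifactId>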
diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/SaxonTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/SaxonTest.java
new file mode 100644
index 0000000..509be44
--- /dev/null
+++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/SaxonTest.java
@@ -0,0 +1,45 @@
+package us.codecraft.webmagic.selector;
+
+import net.sf.saxon.xpath.XPathFactoryImpl;
+import org.htmlcleaner.CleanerProperties;
+import org.htmlcleaner.DomSerializer;
+import org.htmlcleaner.HtmlCleaner;
+import org.htmlcleaner.TagNode;
+import org.junit.Test;
+import org.w3c.dom.Document;
+import org.w3c.dom.NodeList;
+
+import javax.xml.xpath.XPath;
+import javax.xml.xpath.XPathConstants;
+import javax.xml.xpath.XPathExpression;
+import javax.xml.xpath.XPathFactoryConfigurationException;
+
+/**
+ * Exploratory test: runs an XPath 2.0 expression through Saxon-HE over a DOM built by HtmlCleaner.
+ * @author yihua.huang@dianping.com
+ * @date: 13-8-2 Time: 5:48 PM
+ */
+public class SaxonTest {
+
+    /**
+     * Compiles a query using the XPath 2.0 {@code matches()} function and evaluates it as a node set.
+     * NOTE(review): the fixture holds no {@code <a>} markup as written, so no match is expected - confirm.
+     */
+    @Test
+    public void test() throws XPathFactoryConfigurationException {
+        String xml = "#BBB##CCC##DDD#";
+        try {
+            // Parse the fixture; cleaning "" left xml unused and the document without content.
+            TagNode tagNode = new HtmlCleaner().clean(xml);
+            Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode);
+            // Static XPathFactoryImpl.newInstance() is XPathFactory.newInstance(), not guaranteed to yield Saxon.
+            XPath xpath = new XPathFactoryImpl().newXPath();
+            XPathExpression expr = xpath.compile("//a[matches(.,'#...#')]");
+            NodeList nodes = (NodeList) expr.evaluate(document, XPathConstants.NODESET);
+            org.junit.Assert.assertNotNull(nodes);
+            System.out.println(nodes);
+        } catch (Exception e) {
+            throw new AssertionError(e); // fail the test instead of printing and passing
+        }
+    }
+}
diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java
index 6f1c21e..c2cc7ec 100644
--- a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java
+++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java
@@ -1,7 +1,24 @@
package us.codecraft.webmagic.selector;
+import net.sf.saxon.Configuration;
+import net.sf.saxon.lib.NamespaceConstant;
+import net.sf.saxon.om.NamespaceResolver;
+import net.sf.saxon.pull.NamespaceContextImpl;
+import net.sf.saxon.xpath.JAXPXPathStaticContext;
+import net.sf.saxon.xpath.XPathEvaluator;
+import net.sf.saxon.xpath.XPathFactoryImpl;
+import org.htmlcleaner.CleanerProperties;
+import org.htmlcleaner.DomSerializer;
+import org.htmlcleaner.HtmlCleaner;
+import org.htmlcleaner.TagNode;
import org.junit.Assert;
import org.junit.Test;
+import org.w3c.dom.Document;
+import org.w3c.dom.NodeList;
+
+import javax.xml.xpath.*;
+import java.util.Collections;
+import java.util.Iterator;
/**
* @author code4crafter@gmail.com
Date: 13-4-21 Time: 上午10:06
@@ -1354,4 +1371,50 @@ public class XpathSelectorTest {
Assert.assertNotNull(html1.$("a[href]").xpath("//@href").all());
}
+ @Test
+ public void testXPath2() {
+ String text = "眉山:扎实推进农业农村工作 促农持续增收
\n" +
+ "2013-07-31 23:29:45 来源:眉山网 责任编辑:张斯炜
";
+ XpathSelector xpathSelector = new XpathSelector("//h1/text()");
+ System.out.println(xpathSelector.select(text));
+ }
+
+    /**
+     * Evaluates {@code fn:substring-before(//h1,'\n')} with a directly built Saxon {@link XPathEvaluator} over an HtmlCleaner DOM.
+     * Background: http://sourceforge.net/mailarchive/forum.php?thread_name=4EA92A8A.6080202%40saxonica.com&forum_name=saxon-help
+     * NOTE(review): the {@code <h1>} wrapper in the literal was restored by hand; confirm against upstream.
+     */
+    @Test
+    public void testSaxon() throws XPathFactoryConfigurationException {
+        String text = "<h1>眉山:扎实推进农业农村工作 促农持续增收\n" +
+                "2013-07-31 23:29:45 来源:眉山网 责任编辑:张斯炜</h1>";
+        try {
+            TagNode tagNode = new HtmlCleaner().clean(text);
+            Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode);
+            // The evaluator is built straight from a Configuration, so the unused JAXP factory
+            // lookups (and the JVM-wide system properties that only served them) are dropped.
+            Configuration config = Configuration.newConfiguration();
+            JAXPXPathStaticContext context = new JAXPXPathStaticContext(config);
+            // Resolver that answers the functions namespace for any prefix and advertises only "fn".
+            context.setNamespaceContext(new NamespaceContextImpl(new NamespaceResolver() {
+                @Override
+                public String getURIForPrefix(String prefix, boolean useDefault) {
+                    return NamespaceConstant.FN;
+                }
+                @Override
+                public Iterator<String> iteratePrefixes() {
+                    return Collections.singletonList("fn").iterator();
+                }
+            }));
+            XPathEvaluator xPathEvaluator = new XPathEvaluator(config);
+            xPathEvaluator.setStaticContext(context);
+            XPathExpression expr = xPathEvaluator.compile("fn:substring-before(//h1,'\n')");
+            Object result = expr.evaluate(document, XPathConstants.STRING);
+            Assert.assertNotNull(result);
+            System.out.println(result);
+        } catch (Exception e) {
+            throw new AssertionError(e); // surface failures instead of printing and passing
+        }
+    }
+
}