diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java
new file mode 100644
index 0000000..99112ca
--- /dev/null
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java
@@ -0,0 +1,167 @@
+package us.codecraft.webmagic.selector;
+
+import net.sf.saxon.lib.NamespaceConstant;
+import net.sf.saxon.xpath.XPathEvaluator;
+import org.apache.log4j.Logger;
+import org.htmlcleaner.CleanerProperties;
+import org.htmlcleaner.DomSerializer;
+import org.htmlcleaner.HtmlCleaner;
+import org.htmlcleaner.TagNode;
+import org.w3c.dom.Document;
+import org.w3c.dom.NodeList;
+
+import javax.xml.namespace.NamespaceContext;
+import javax.xml.transform.OutputKeys;
+import javax.xml.transform.Transformer;
+import javax.xml.transform.TransformerFactory;
+import javax.xml.transform.dom.DOMSource;
+import javax.xml.transform.stream.StreamResult;
+import javax.xml.xpath.XPathConstants;
+import javax.xml.xpath.XPathExpression;
+import javax.xml.xpath.XPathExpressionException;
+import java.io.StringWriter;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+
+/**
+ * 支持xpath2.0的选择器。包装了HtmlCleaner和Saxon HE。
+ *
+ * @author code4crafter@gmail.com
+ * Date: 13-4-21
+ * Time: 上午9:39
+ */
+public class Xpath2Selector implements Selector {
+
+ private String xpathStr;
+
+ private XPathExpression xPathExpression;
+
+ private Logger logger = Logger.getLogger(getClass());
+
+ public Xpath2Selector(String xpathStr) {
+ this.xpathStr = xpathStr;
+ try {
+ init();
+ } catch (XPathExpressionException e) {
+ throw new IllegalArgumentException("XPath error!", e);
+ }
+ }
+
+ enum XPath2NamespaceContext implements NamespaceContext {
+
+ INSTANCE;
+
+ private final Map prefix2NamespaceMap = new ConcurrentHashMap();
+
+ private final Map> namespace2PrefixMap = new ConcurrentHashMap>();
+
+ private void put(String prefix, String namespaceURI) {
+ prefix2NamespaceMap.put(prefix, namespaceURI);
+ List prefixes = namespace2PrefixMap.get(namespaceURI);
+ if (prefixes == null) {
+ prefixes = new ArrayList();
+ namespace2PrefixMap.put(namespaceURI, prefixes);
+ }
+ prefixes.add(prefix);
+ }
+
+ private XPath2NamespaceContext() {
+ put("fn", NamespaceConstant.FN);
+ put("xslt",NamespaceConstant.XSLT);
+ }
+
+ @Override
+ public String getNamespaceURI(String prefix) {
+ return prefix2NamespaceMap.get(prefix);
+ }
+
+ @Override
+ public String getPrefix(String namespaceURI) {
+ List prefixes = namespace2PrefixMap.get(namespaceURI);
+ if (prefixes == null || prefixes.size() < 1) {
+ return null;
+ }
+ return prefixes.get(0);
+ }
+
+ @Override
+ public Iterator getPrefixes(String namespaceURI) {
+ List prefixes = namespace2PrefixMap.get(namespaceURI);
+ if (prefixes == null || prefixes.size() < 1) {
+ return null;
+ }
+ return prefixes.iterator();
+ }
+ }
+
+ private void init() throws XPathExpressionException {
+ XPathEvaluator xPathEvaluator = new XPathEvaluator();
+ xPathEvaluator.setNamespaceContext(XPath2NamespaceContext.INSTANCE);
+ xPathExpression = xPathEvaluator.compile(xpathStr);
+ }
+
+ @Override
+ public String select(String text) {
+ try {
+ HtmlCleaner htmlCleaner = new HtmlCleaner();
+ TagNode tagNode = htmlCleaner.clean(text);
+ Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode);
+ Object result;
+ try {
+ result = xPathExpression.evaluate(document, XPathConstants.NODESET);
+ } catch (XPathExpressionException e) {
+ result = xPathExpression.evaluate(document, XPathConstants.STRING);
+ }
+ if (result instanceof NodeList) {
+ StreamResult xmlOutput = new StreamResult(new StringWriter());
+ Transformer transformer = TransformerFactory.newInstance().newTransformer();
+ transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
+ NodeList nodeList = (NodeList) result;
+ if (nodeList.getLength() == 0) {
+ return null;
+ }
+ transformer.transform(new DOMSource(nodeList.item(0)), xmlOutput);
+ return xmlOutput.getWriter().toString();
+ }
+ return result.toString();
+ } catch (Exception e) {
+ logger.error("select text error! " + xpathStr, e);
+ }
+ return null;
+ }
+
+ @Override
+ public List selectList(String text) {
+ List results = new ArrayList();
+ try {
+ HtmlCleaner htmlCleaner = new HtmlCleaner();
+ TagNode tagNode = htmlCleaner.clean(text);
+ Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode);
+ Object result;
+ try {
+ result = xPathExpression.evaluate(document, XPathConstants.NODESET);
+ } catch (XPathExpressionException e) {
+ result = xPathExpression.evaluate(document, XPathConstants.STRING);
+ }
+ if (result instanceof NodeList) {
+ NodeList nodeList = (NodeList) result;
+ Transformer transformer = TransformerFactory.newInstance().newTransformer();
+ StreamResult xmlOutput = new StreamResult();
+ transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
+ for (int i = 0; i < nodeList.getLength(); i++) {
+ xmlOutput.setWriter(new StringWriter());
+ transformer.transform(new DOMSource(nodeList.item(i)), xmlOutput);
+ results.add(xmlOutput.getWriter().toString());
+ }
+ } else {
+ results.add(result.toString());
+ }
+ } catch (Exception e) {
+ logger.error("select text error! " + xpathStr, e);
+ }
+ return results;
+ }
+}
diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java
index 6544e9e..2b8e15d 100644
--- a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java
+++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java
@@ -12,6 +12,7 @@ import org.htmlcleaner.DomSerializer;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.junit.Assert;
+import org.junit.Ignore;
import org.junit.Test;
import org.w3c.dom.Document;
import org.w3c.dom.NodeList;
@@ -1381,9 +1382,7 @@ public class XpathSelectorTest {
//http://sourceforge.net/mailarchive/forum.php?thread_name=4EA92A8A.6080202%40saxonica.com&forum_name=saxon-help
@Test
- public void testSaxon() throws XPathFactoryConfigurationException {
- System.setProperty("javax.xml.xpath.XPathFactory:" + NamespaceConstant.OBJECT_MODEL_SAXON, "net.sf.saxon.xpath.XPathFactoryImpl");
- System.setProperty("javax.xml.xpath.XPathFactory:" + NamespaceConstant.FN, "net.sf.saxon.xpath.XPathFactoryImpl");
+ public void testSaxon() {
String text = "眉山:扎实推进农业农村工作 促农持续增收
\n" +
"2013-07-31 23:29:45 来源:眉山网 责任编辑:张斯炜
";
try {
@@ -1406,10 +1405,44 @@ public class XpathSelectorTest {
}));
XPathExpression expr = xPathEvaluator.compile("fn:substring-before(//h1,'\n')");
Object result = expr.evaluate(document, XPathConstants.STRING);
- System.out.println(result);
+ Assert.assertNotNull(result);
} catch (Exception e) {
e.printStackTrace();
}
+ Xpath2Selector xpath2Selector = new Xpath2Selector("fn:substring-before(//h1,'\n')");
+ String select = xpath2Selector.select(text);
+ Assert.assertNotNull(select);
+ Assert.assertNotNull(xpath2Selector.selectList(text));
+
+ }
+
+    // Smoke test: selecting "//a" from the html text must yield a non-null first match.
+    @Test
+    public void testXpath2Selector() {
+        String firstMatch = new Xpath2Selector("//a").select(html);
+        Assert.assertNotNull(firstMatch);
+    }
+
+    @Ignore("take long time")
+    @Test
+    public void performanceTest() {
+        Xpath2Selector saxonSelector = new Xpath2Selector("//a");
+        long startedAt = System.currentTimeMillis(); // pass 1: Xpath2Selector
+        for (int remaining = 1000; remaining > 0; remaining--) {
+            saxonSelector.selectList(html);
+        }
+        System.out.println(System.currentTimeMillis() - startedAt);
+        XpathSelector baselineSelector = new XpathSelector("//a");
+        startedAt = System.currentTimeMillis(); // pass 2: XpathSelector, for comparison
+        for (int remaining = 1000; remaining > 0; remaining--) {
+            baselineSelector.selectList(html);
+        }
+        System.out.println(System.currentTimeMillis() - startedAt);
+        startedAt = System.currentTimeMillis(); // pass 3: Xpath2Selector again, after both earlier passes
+        for (int remaining = 1000; remaining > 0; remaining--) {
+            saxonSelector.selectList(html);
+        }
+        System.out.println(System.currentTimeMillis() - startedAt);
     }
}