From 7c9e9ce8694693082d179c47427b053020f899aa Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 3 Aug 2013 07:28:46 +0800 Subject: [PATCH] xpath2.0 --- .../webmagic/selector/Xpath2Selector.java | 167 ++++++++++++++++++ .../webmagic/selector/XpathSelectorTest.java | 41 ++++- 2 files changed, 204 insertions(+), 4 deletions(-) create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java new file mode 100644 index 0000000..99112ca --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java @@ -0,0 +1,167 @@ +package us.codecraft.webmagic.selector; + +import net.sf.saxon.lib.NamespaceConstant; +import net.sf.saxon.xpath.XPathEvaluator; +import org.apache.log4j.Logger; +import org.htmlcleaner.CleanerProperties; +import org.htmlcleaner.DomSerializer; +import org.htmlcleaner.HtmlCleaner; +import org.htmlcleaner.TagNode; +import org.w3c.dom.Document; +import org.w3c.dom.NodeList; + +import javax.xml.namespace.NamespaceContext; +import javax.xml.transform.OutputKeys; +import javax.xml.transform.Transformer; +import javax.xml.transform.TransformerFactory; +import javax.xml.transform.dom.DOMSource; +import javax.xml.transform.stream.StreamResult; +import javax.xml.xpath.XPathConstants; +import javax.xml.xpath.XPathExpression; +import javax.xml.xpath.XPathExpressionException; +import java.io.StringWriter; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; + +/** + * Selector supporting XPath 2.0. Wraps HtmlCleaner and Saxon HE.
+ * + * @author code4crafter@gmail.com
+ * Date: 13-4-21 + * Time: 9:39 AM + */ +public class Xpath2Selector implements Selector { + + private String xpathStr; + + private XPathExpression xPathExpression; + + private Logger logger = Logger.getLogger(getClass()); + + public Xpath2Selector(String xpathStr) { + this.xpathStr = xpathStr; + try { + init(); + } catch (XPathExpressionException e) { + throw new IllegalArgumentException("XPath error!", e); + } + } + + enum XPath2NamespaceContext implements NamespaceContext { + + INSTANCE; + + private final Map prefix2NamespaceMap = new ConcurrentHashMap(); + + private final Map> namespace2PrefixMap = new ConcurrentHashMap>(); + + private void put(String prefix, String namespaceURI) { + prefix2NamespaceMap.put(prefix, namespaceURI); + List prefixes = namespace2PrefixMap.get(namespaceURI); + if (prefixes == null) { + prefixes = new ArrayList(); + namespace2PrefixMap.put(namespaceURI, prefixes); + } + prefixes.add(prefix); + } + + private XPath2NamespaceContext() { + put("fn", NamespaceConstant.FN); + put("xslt",NamespaceConstant.XSLT); + } + + @Override + public String getNamespaceURI(String prefix) { + return prefix2NamespaceMap.get(prefix); + } + + @Override + public String getPrefix(String namespaceURI) { + List prefixes = namespace2PrefixMap.get(namespaceURI); + if (prefixes == null || prefixes.size() < 1) { + return null; + } + return prefixes.get(0); + } + + @Override + public Iterator getPrefixes(String namespaceURI) { + List prefixes = namespace2PrefixMap.get(namespaceURI); + if (prefixes == null || prefixes.size() < 1) { + return null; + } + return prefixes.iterator(); + } + } + + private void init() throws XPathExpressionException { + XPathEvaluator xPathEvaluator = new XPathEvaluator(); + xPathEvaluator.setNamespaceContext(XPath2NamespaceContext.INSTANCE); + xPathExpression = xPathEvaluator.compile(xpathStr); + } + + @Override + public String select(String text) { + try { + HtmlCleaner htmlCleaner = new HtmlCleaner(); + TagNode tagNode = 
htmlCleaner.clean(text); + Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode); + Object result; + try { + result = xPathExpression.evaluate(document, XPathConstants.NODESET); + } catch (XPathExpressionException e) { + result = xPathExpression.evaluate(document, XPathConstants.STRING); + } + if (result instanceof NodeList) { + StreamResult xmlOutput = new StreamResult(new StringWriter()); + Transformer transformer = TransformerFactory.newInstance().newTransformer(); + transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes"); + NodeList nodeList = (NodeList) result; + if (nodeList.getLength() == 0) { + return null; + } + transformer.transform(new DOMSource(nodeList.item(0)), xmlOutput); + return xmlOutput.getWriter().toString(); + } + return result.toString(); + } catch (Exception e) { + logger.error("select text error! " + xpathStr, e); + } + return null; + } + + @Override + public List selectList(String text) { + List results = new ArrayList(); + try { + HtmlCleaner htmlCleaner = new HtmlCleaner(); + TagNode tagNode = htmlCleaner.clean(text); + Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode); + Object result; + try { + result = xPathExpression.evaluate(document, XPathConstants.NODESET); + } catch (XPathExpressionException e) { + result = xPathExpression.evaluate(document, XPathConstants.STRING); + } + if (result instanceof NodeList) { + NodeList nodeList = (NodeList) result; + Transformer transformer = TransformerFactory.newInstance().newTransformer(); + StreamResult xmlOutput = new StreamResult(); + transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes"); + for (int i = 0; i < nodeList.getLength(); i++) { + xmlOutput.setWriter(new StringWriter()); + transformer.transform(new DOMSource(nodeList.item(i)), xmlOutput); + results.add(xmlOutput.getWriter().toString()); + } + } else { + results.add(result.toString()); + } + } catch (Exception e) { + logger.error("select text 
error! " + xpathStr, e); + } + return results; + } +} diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java index 6544e9e..2b8e15d 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java @@ -12,6 +12,7 @@ import org.htmlcleaner.DomSerializer; import org.htmlcleaner.HtmlCleaner; import org.htmlcleaner.TagNode; import org.junit.Assert; +import org.junit.Ignore; import org.junit.Test; import org.w3c.dom.Document; import org.w3c.dom.NodeList; @@ -1381,9 +1382,7 @@ public class XpathSelectorTest { //http://sourceforge.net/mailarchive/forum.php?thread_name=4EA92A8A.6080202%40saxonica.com&forum_name=saxon-help @Test - public void testSaxon() throws XPathFactoryConfigurationException { - System.setProperty("javax.xml.xpath.XPathFactory:" + NamespaceConstant.OBJECT_MODEL_SAXON, "net.sf.saxon.xpath.XPathFactoryImpl"); - System.setProperty("javax.xml.xpath.XPathFactory:" + NamespaceConstant.FN, "net.sf.saxon.xpath.XPathFactoryImpl"); + public void testSaxon() { String text = "

眉山:扎实推进农业农村工作 促农持续增收
\n" + "2013-07-31 23:29:45   来源:眉山网      责任编辑:张斯炜

"; try { @@ -1406,10 +1405,44 @@ public class XpathSelectorTest { })); XPathExpression expr = xPathEvaluator.compile("fn:substring-before(//h1,'\n')"); Object result = expr.evaluate(document, XPathConstants.STRING); - System.out.println(result); + Assert.assertNotNull(result); } catch (Exception e) { e.printStackTrace(); } + Xpath2Selector xpath2Selector = new Xpath2Selector("fn:substring-before(//h1,'\n')"); + String select = xpath2Selector.select(text); + Assert.assertNotNull(select); + Assert.assertNotNull(xpath2Selector.selectList(text)); + + } + + @Test + public void testXpath2Selector() { + Xpath2Selector xpath2Selector = new Xpath2Selector("//a"); + String select = xpath2Selector.select(html); + Assert.assertNotNull(select); + } + + @Ignore("take long time") + @Test + public void performanceTest() { + Xpath2Selector xpath2Selector = new Xpath2Selector("//a"); + long time =System.currentTimeMillis(); + for (int i = 0; i < 1000; i++) { + xpath2Selector.selectList(html); + } + System.out.println(System.currentTimeMillis()-time); + XpathSelector xpathSelector = new XpathSelector("//a"); + time =System.currentTimeMillis(); + for (int i = 0; i < 1000; i++) { + xpathSelector.selectList(html); + } + System.out.println(System.currentTimeMillis()-time); + time =System.currentTimeMillis(); + for (int i = 0; i < 1000; i++) { + xpath2Selector.selectList(html); + } + System.out.println(System.currentTimeMillis()-time); } }