# Patch summary (free text ahead of the first "diff --git" header; everything below is unchanged).
#
# This copy of the patch has its line breaks collapsed, so hunk line counts cannot be
# checked by eye. Do not hand-edit the hunks; regenerate the patch from the working tree.
#
# File 1: webmagic-saxon/pom.xml (hunk -17,6 +17,11)
#   Adds five lines after the webmagic-core entry; the visible values are us.codecraft,
#   xsoup and 0.0.1-SNAPSHOT.
#   NOTE(review): the XML element tags are not visible in this copy, presumably stripped
#   during extraction; confirm against the real pom.xml.
#   NOTE(review): the added version is a SNAPSHOT; confirm that is intended.
#
# File 2: webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java
#   Hunk -1,8 +1,15: adds imports for HtmlCleaner, TagNode, XPatherException, Jsoup,
#     Document, XPathEvaluator and Xsoup.
#   Hunk -1353,6 +1360,7: adds the call Selectors.xpath("/abc/").select("") after the
#     existing assertions. Its result is not asserted or stored.
#     NOTE(review): intent is unclear from the patch; confirm what this call should verify.
#   Hunk -1379,17 +1387,86:
#     - In the existing timing code, adds blank lines, a println for the last
#       xpath2Selector loop, and a 1000-iteration CssSelector("a") loop whose elapsed
#       time is printed with the prefix "css ".
#     - Adds parserPerformanceTest(), annotated @Ignore("take long time") and @Test. It
#       prints html.length(), then prints elapsed milliseconds for 2000 iterations each of:
#       htmlCleaner.clean(html), tagNode.evaluateXPath("//a"), Jsoup.parse(html),
#       document.select("a"), and evaluate(document) on Xsoup.compile("//a").
#       NOTE(review): the htmlCleaner.clean / tagNode.evaluateXPath pair is timed twice
#       (before and after the Jsoup section); confirm the repetition is deliberate.
diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index efa8291..1c4e745 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -17,6 +17,11 @@ webmagic-core ${project.version} + + us.codecraft + xsoup + 0.0.1-SNAPSHOT + net.sf.saxon Saxon-HE diff --git a/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java b/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java index b623040..6c19c8a 100644 --- a/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java +++ b/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java @@ -1,8 +1,15 @@ package us.codecraft.webmagic.selector; +import org.htmlcleaner.HtmlCleaner; +import org.htmlcleaner.TagNode; +import org.htmlcleaner.XPatherException; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; import org.junit.Assert; import org.junit.Ignore; import org.junit.Test; +import us.codecraft.xsoup.XPathEvaluator; +import us.codecraft.xsoup.Xsoup; /** * @author code4crafter@gmail.com
Date: 13-4-21 Time: 上午10:06 @@ -1353,6 +1360,7 @@ public class XpathSelectorTest { Html html1 = new Html(html); Assert.assertEquals("再次吐槽easyui", html1.xpath(".//*[@class='QTitle']/h1/a").toString()); Assert.assertNotNull(html1.$("a[href]").xpath("//@href").all()); + Selectors.xpath("/abc/").select(""); } @Test @@ -1379,17 +1387,86 @@ public class XpathSelectorTest { xpath2Selector.selectList(html); } System.out.println(System.currentTimeMillis()-time); + XpathSelector xpathSelector = new XpathSelector("//a"); time =System.currentTimeMillis(); for (int i = 0; i < 1000; i++) { xpathSelector.selectList(html); } System.out.println(System.currentTimeMillis()-time); + time =System.currentTimeMillis(); for (int i = 0; i < 1000; i++) { xpath2Selector.selectList(html); } + System.out.println(System.currentTimeMillis() - time); + + CssSelector cssSelector = new CssSelector("a"); + time =System.currentTimeMillis(); + for (int i = 0; i < 1000; i++) { + cssSelector.selectList(html); + } + System.out.println("css "+(System.currentTimeMillis()-time)); + } + + @Ignore("take long time") + @Test + public void parserPerformanceTest() throws XPatherException { + System.out.println(html.length()); + + HtmlCleaner htmlCleaner = new HtmlCleaner(); + TagNode tagNode = htmlCleaner.clean(html); + Document document = Jsoup.parse(html); + + long time =System.currentTimeMillis(); + for (int i = 0; i < 2000; i++) { + htmlCleaner.clean(html); + } System.out.println(System.currentTimeMillis()-time); + + time =System.currentTimeMillis(); + for (int i = 0; i < 2000; i++) { + tagNode.evaluateXPath("//a"); + } + System.out.println(System.currentTimeMillis()-time); + + System.out.println("============="); + + time =System.currentTimeMillis(); + for (int i = 0; i < 2000; i++) { + Jsoup.parse(html); + } + System.out.println(System.currentTimeMillis()-time); + + time =System.currentTimeMillis(); + for (int i = 0; i < 2000; i++) { + document.select("a"); + } + 
System.out.println(System.currentTimeMillis()-time); + + System.out.println("============="); + + time =System.currentTimeMillis(); + for (int i = 0; i < 2000; i++) { + htmlCleaner.clean(html); + } + System.out.println(System.currentTimeMillis()-time); + + time =System.currentTimeMillis(); + for (int i = 0; i < 2000; i++) { + tagNode.evaluateXPath("//a"); + } + System.out.println(System.currentTimeMillis()-time); + + System.out.println("============="); + + XPathEvaluator compile = Xsoup.compile("//a"); + time =System.currentTimeMillis(); + for (int i = 0; i < 2000; i++) { + compile.evaluate(document); + } + System.out.println(System.currentTimeMillis()-time); + } }