From 12ce86425f4f5b09be06e49f0d19e84dfa10c54b Mon Sep 17 00:00:00 2001 From: hooyantsing Date: Fri, 3 Feb 2023 22:48:58 +0800 Subject: [PATCH 1/2] =?UTF-8?q?BugFix:=20Jsoup=20=E5=92=8C=20HtmlCleaner?= =?UTF-8?q?=20=E6=9E=84=E5=BB=BA=20Dom=20=E6=97=B6=EF=BC=8C=E8=8B=A5?= =?UTF-8?q?=E7=BC=BA=E5=A4=B1=20table=20=E6=A0=87=E7=AD=BE=EF=BC=8C?= =?UTF-8?q?=E5=88=99=E6=97=A0=E6=B3=95=E6=AD=A3=E5=B8=B8=E8=A7=A3=E6=9E=90?= =?UTF-8?q?=20tr=20=E5=92=8C=20td=20=E6=A0=87=E7=AD=BE=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../selector/BaseElementSelector.java | 10 ++----- .../webmagic/utils/BaseSelectorUtils.java | 23 +++++++++++++++ .../webmagic/selector/Xpath2Selector.java | 28 +++++++++++-------- .../webmagic/selector/XpathSelectorTest.java | 19 +++++++++++++ 4 files changed, 60 insertions(+), 20 deletions(-) create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/utils/BaseSelectorUtils.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java index b267d5b..6001767 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java @@ -3,6 +3,7 @@ package us.codecraft.webmagic.selector; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; +import us.codecraft.webmagic.utils.BaseSelectorUtils; import java.util.ArrayList; import java.util.List; @@ -13,16 +14,9 @@ import java.util.List; */ public abstract class BaseElementSelector implements Selector, ElementSelector { private Document parse(String text) { - if (text == null) { - return null; - } - // Jsoup could not parse or tag directly // https://stackoverflow.com/questions/63607740/jsoup-couldnt-parse-tr-tag - if ((text.startsWith("") && text.endsWith("")) - || (text.startsWith("") && text.endsWith(""))) { - text = "" + text + "
"; - } + text = BaseSelectorUtils.preParse(text); return Jsoup.parse(text); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/BaseSelectorUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/BaseSelectorUtils.java new file mode 100644 index 0000000..04c0651 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/BaseSelectorUtils.java @@ -0,0 +1,23 @@ +package us.codecraft.webmagic.utils; + +/** + * @author hooy + */ +public class BaseSelectorUtils { + + /** + * Jsoup/HtmlCleaner could not parse "tr" or "td" tag directly + * https://stackoverflow.com/questions/63607740/jsoup-couldnt-parse-tr-tag + * + * @param text - the html string + * @return text + */ + public static String preParse(String text) { + if (((text.startsWith("") || text.startsWith("")) + || ((text.startsWith("") || text.startsWith(""))) { + text = "" + text + "
"; + } + return text; + } + +} diff --git a/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java b/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java index 9d5eef9..b63213b 100644 --- a/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java +++ b/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java @@ -8,6 +8,7 @@ import java.util.Map; import java.util.concurrent.ConcurrentHashMap; import javax.xml.namespace.NamespaceContext; +import javax.xml.parsers.ParserConfigurationException; import javax.xml.transform.OutputKeys; import javax.xml.transform.Transformer; import javax.xml.transform.TransformerFactory; @@ -29,13 +30,14 @@ import org.w3c.dom.NodeList; import net.sf.saxon.lib.NamespaceConstant; import net.sf.saxon.xpath.XPathEvaluator; +import us.codecraft.webmagic.utils.BaseSelectorUtils; /** * 支持xpath2.0的选择器。包装了HtmlCleaner和Saxon HE。
* * @author code4crafter@gmail.com
- * Date: 13-4-21 - * Time: 上午9:39 + * Date: 13-4-21 + * Time: 上午9:39 */ public class Xpath2Selector implements Selector { @@ -111,14 +113,11 @@ public class Xpath2Selector implements Selector { @Override public String select(String text) { try { - HtmlCleaner htmlCleaner = new HtmlCleaner(); - TagNode tagNode = htmlCleaner.clean(text); - Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode); Object result; try { - result = xPathExpression.evaluate(document, XPathConstants.NODESET); + result = xPathExpression.evaluate(parse(text), XPathConstants.NODESET); } catch (XPathExpressionException e) { - result = xPathExpression.evaluate(document, XPathConstants.STRING); + result = xPathExpression.evaluate(parse(text), XPathConstants.STRING); } if (result instanceof NodeList) { NodeList nodeList = (NodeList) result; @@ -147,14 +146,11 @@ public class Xpath2Selector implements Selector { public List selectList(String text) { List results = new ArrayList(); try { - HtmlCleaner htmlCleaner = new HtmlCleaner(); - TagNode tagNode = htmlCleaner.clean(text); - Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode); Object result; try { - result = xPathExpression.evaluate(document, XPathConstants.NODESET); + result = xPathExpression.evaluate(parse(text), XPathConstants.NODESET); } catch (XPathExpressionException e) { - result = xPathExpression.evaluate(document, XPathConstants.STRING); + result = xPathExpression.evaluate(parse(text), XPathConstants.STRING); } if (result instanceof NodeList) { NodeList nodeList = (NodeList) result; @@ -179,4 +175,12 @@ public class Xpath2Selector implements Selector { } return results; } + + private Document parse(String text) throws ParserConfigurationException { + // HtmlCleaner could not parse or tag directly + text = BaseSelectorUtils.preParse(text); + HtmlCleaner htmlCleaner = new HtmlCleaner(); + TagNode tagNode = htmlCleaner.clean(text); + return new DomSerializer(new CleanerProperties()).createDOM(tagNode); + } } diff --git a/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java b/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java index 1661883..c2025e7 100644 --- a/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java +++ b/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java @@ -11,6 +11,9 @@ import org.junit.Assert; import org.junit.Ignore; import org.junit.Test; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.xsoup.XPathEvaluator; import us.codecraft.xsoup.Xsoup; @@ -1385,6 +1388,22 @@ public class XpathSelectorTest { Assert.assertEquals("http://www.oschina.net/", selectList.get(0)); } + @Ignore("test parse
tag") + @Test + public void htmlCleanerParseTest() { + Spider.create(new RuoxiaPageProcessor()).addUrl("http://www.ruoxia.com/top/dianji/month").thread(1).run(); + } + class RuoxiaPageProcessor implements PageProcessor { + @Override + public void process(Page page) { + List nodes = page.getHtml().xpath("//div[@class=\"bd\"]//tbody/tr").nodes(); + for (Selectable node:nodes) { + String name = node.xpath("//td[3]/div/a[1]/text()").get(); + System.out.println(name); + } + } + } + @Ignore("take long time") @Test public void performanceTest() { From 08f4a4046b4cb13a81684533534a7d51640c3e04 Mon Sep 17 00:00:00 2001 From: hooyantsing Date: Fri, 3 Feb 2023 22:59:56 +0800 Subject: [PATCH 2/2] =?UTF-8?q?Update:=20=E6=8F=90=E4=BE=9B=E6=B5=8B?= =?UTF-8?q?=E8=AF=95=E7=94=A8=E4=BE=8B=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../webmagic/selector/XpathSelectorTest.java | 49 ++++++++++--------- 1 file changed, 25 insertions(+), 24 deletions(-) diff --git a/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java b/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java index c2025e7..8ac7219 100644 --- a/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java +++ b/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java @@ -1393,12 +1393,13 @@ public class XpathSelectorTest { public void htmlCleanerParseTest() { Spider.create(new RuoxiaPageProcessor()).addUrl("http://www.ruoxia.com/top/dianji/month").thread(1).run(); } + class RuoxiaPageProcessor implements PageProcessor { @Override public void process(Page page) { - List nodes = page.getHtml().xpath("//div[@class=\"bd\"]//tbody/tr").nodes(); - for (Selectable node:nodes) { - String name = node.xpath("//td[3]/div/a[1]/text()").get(); + List items = new Xpath2Selector("//div[@class=\"bd\"]//tbody/tr").selectList(page.getRawText()); + for (String item : items) { + String name = new Xpath2Selector("//td[3]/div/a[1]/text()").select(item); System.out.println(name); } } @@ -1408,31 +1409,31 @@ public class XpathSelectorTest { @Test public void performanceTest() { Xpath2Selector xpath2Selector = new Xpath2Selector("//a"); - long time =System.currentTimeMillis(); + long time = System.currentTimeMillis(); for (int i = 0; i < 1000; i++) { xpath2Selector.selectList(html); } - System.out.println(System.currentTimeMillis()-time); + System.out.println(System.currentTimeMillis() - time); XpathSelector xpathSelector = new XpathSelector("//a"); - time =System.currentTimeMillis(); + time = System.currentTimeMillis(); for (int i = 0; i < 1000; i++) { xpathSelector.selectList(html); } - System.out.println(System.currentTimeMillis()-time); + System.out.println(System.currentTimeMillis() - time); - time =System.currentTimeMillis(); + time = System.currentTimeMillis(); for (int i = 0; i < 1000; i++) { xpath2Selector.selectList(html); } System.out.println(System.currentTimeMillis() - time); CssSelector cssSelector = new CssSelector("a"); - time =System.currentTimeMillis(); + time = System.currentTimeMillis(); for (int i = 0; i < 1000; i++) { cssSelector.selectList(html); } - System.out.println("css "+(System.currentTimeMillis()-time)); + System.out.println("css " + (System.currentTimeMillis() - time)); } @Ignore("take long time") @@ -1444,54 +1445,54 @@ public class XpathSelectorTest { TagNode tagNode = htmlCleaner.clean(html); Document document = Jsoup.parse(html); - long time =System.currentTimeMillis(); + long time = System.currentTimeMillis(); for (int i = 0; i < 2000; i++) { htmlCleaner.clean(html); } - System.out.println(System.currentTimeMillis()-time); + System.out.println(System.currentTimeMillis() - time); - time =System.currentTimeMillis(); + time = System.currentTimeMillis(); for (int i = 0; i < 2000; i++) { tagNode.evaluateXPath("//a"); } - System.out.println(System.currentTimeMillis()-time); + System.out.println(System.currentTimeMillis() - time); System.out.println("============="); - time =System.currentTimeMillis(); + time = System.currentTimeMillis(); for (int i = 0; i < 2000; i++) { Jsoup.parse(html); } - System.out.println(System.currentTimeMillis()-time); + System.out.println(System.currentTimeMillis() - time); - time =System.currentTimeMillis(); + time = System.currentTimeMillis(); for (int i = 0; i < 2000; i++) { document.select("a"); } - System.out.println(System.currentTimeMillis()-time); + System.out.println(System.currentTimeMillis() - time); System.out.println("============="); - time =System.currentTimeMillis(); + time = System.currentTimeMillis(); for (int i = 0; i < 2000; i++) { htmlCleaner.clean(html); } - System.out.println(System.currentTimeMillis()-time); + System.out.println(System.currentTimeMillis() - time); - time =System.currentTimeMillis(); + time = System.currentTimeMillis(); for (int i = 0; i < 2000; i++) { tagNode.evaluateXPath("//a"); } - System.out.println(System.currentTimeMillis()-time); + System.out.println(System.currentTimeMillis() - time); System.out.println("============="); XPathEvaluator compile = Xsoup.compile("//a"); - time =System.currentTimeMillis(); + time = System.currentTimeMillis(); for (int i = 0; i < 2000; i++) { compile.evaluate(document); } - System.out.println(System.currentTimeMillis()-time); + System.out.println(System.currentTimeMillis() - time); }