diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java index b267d5b..6001767 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java @@ -3,6 +3,7 @@ package us.codecraft.webmagic.selector; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; +import us.codecraft.webmagic.utils.BaseSelectorUtils; import java.util.ArrayList; import java.util.List; @@ -13,16 +14,9 @@ import java.util.List; */ public abstract class BaseElementSelector implements Selector, ElementSelector { private Document parse(String text) { - if (text == null) { - return null; - } - // Jsoup could not parse or tag directly // https://stackoverflow.com/questions/63607740/jsoup-couldnt-parse-tr-tag - if ((text.startsWith("") && text.endsWith("")) - || (text.startsWith("") && text.endsWith(""))) { - text = "" + text + "
"; - } + text = BaseSelectorUtils.preParse(text); return Jsoup.parse(text); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/BaseSelectorUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/BaseSelectorUtils.java new file mode 100644 index 0000000..04c0651 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/BaseSelectorUtils.java @@ -0,0 +1,23 @@ +package us.codecraft.webmagic.utils; + +/** + * @author hooy + */ +public class BaseSelectorUtils { + + /** + * Jsoup/HtmlCleaner could not parse "tr" or "td" tag directly + * https://stackoverflow.com/questions/63607740/jsoup-couldnt-parse-tr-tag + * + * @param text - the html string + * @return text + */ + public static String preParse(String text) { + if (((text.startsWith("") || text.startsWith("")) + || ((text.startsWith("") || text.startsWith(""))) { + text = "" + text + "
"; + } + return text; + } + +} diff --git a/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java b/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java index 9d5eef9..b63213b 100644 --- a/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java +++ b/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java @@ -8,6 +8,7 @@ import java.util.Map; import java.util.concurrent.ConcurrentHashMap; import javax.xml.namespace.NamespaceContext; +import javax.xml.parsers.ParserConfigurationException; import javax.xml.transform.OutputKeys; import javax.xml.transform.Transformer; import javax.xml.transform.TransformerFactory; @@ -29,13 +30,14 @@ import org.w3c.dom.NodeList; import net.sf.saxon.lib.NamespaceConstant; import net.sf.saxon.xpath.XPathEvaluator; +import us.codecraft.webmagic.utils.BaseSelectorUtils; /** * 支持xpath2.0的选择器。包装了HtmlCleaner和Saxon HE。
* * @author code4crafter@gmail.com
- * Date: 13-4-21 - * Time: 上午9:39 + * Date: 13-4-21 + * Time: 上午9:39 */ public class Xpath2Selector implements Selector { @@ -111,14 +113,11 @@ public class Xpath2Selector implements Selector { @Override public String select(String text) { try { - HtmlCleaner htmlCleaner = new HtmlCleaner(); - TagNode tagNode = htmlCleaner.clean(text); - Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode); Object result; try { - result = xPathExpression.evaluate(document, XPathConstants.NODESET); + result = xPathExpression.evaluate(parse(text), XPathConstants.NODESET); } catch (XPathExpressionException e) { - result = xPathExpression.evaluate(document, XPathConstants.STRING); + result = xPathExpression.evaluate(parse(text), XPathConstants.STRING); } if (result instanceof NodeList) { NodeList nodeList = (NodeList) result; @@ -147,14 +146,11 @@ public class Xpath2Selector implements Selector { public List selectList(String text) { List results = new ArrayList(); try { - HtmlCleaner htmlCleaner = new HtmlCleaner(); - TagNode tagNode = htmlCleaner.clean(text); - Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode); Object result; try { - result = xPathExpression.evaluate(document, XPathConstants.NODESET); + result = xPathExpression.evaluate(parse(text), XPathConstants.NODESET); } catch (XPathExpressionException e) { - result = xPathExpression.evaluate(document, XPathConstants.STRING); + result = xPathExpression.evaluate(parse(text), XPathConstants.STRING); } if (result instanceof NodeList) { NodeList nodeList = (NodeList) result; @@ -179,4 +175,12 @@ public class Xpath2Selector implements Selector { } return results; } + + private Document parse(String text) throws ParserConfigurationException { + // HtmlCleaner could not parse or tag directly + text = BaseSelectorUtils.preParse(text); + HtmlCleaner htmlCleaner = new HtmlCleaner(); + TagNode tagNode = htmlCleaner.clean(text); + return new DomSerializer(new CleanerProperties()).createDOM(tagNode); + } } diff --git a/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java b/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java index 1661883..c2025e7 100644 --- a/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java +++ b/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java @@ -11,6 +11,9 @@ import org.junit.Assert; import org.junit.Ignore; import org.junit.Test; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.xsoup.XPathEvaluator; import us.codecraft.xsoup.Xsoup; @@ -1385,6 +1388,22 @@ public class XpathSelectorTest { Assert.assertEquals("http://www.oschina.net/", selectList.get(0)); } + @Ignore("test parse
tag") + @Test + public void htmlCleanerParseTest() { + Spider.create(new RuoxiaPageProcessor()).addUrl("http://www.ruoxia.com/top/dianji/month").thread(1).run(); + } + class RuoxiaPageProcessor implements PageProcessor { + @Override + public void process(Page page) { + List nodes = page.getHtml().xpath("//div[@class=\"bd\"]//tbody/tr").nodes(); + for (Selectable node:nodes) { + String name = node.xpath("//td[3]/div/a[1]/text()").get(); + System.out.println(name); + } + } + } + @Ignore("take long time") @Test public void performanceTest() {