From 7a64847a3c15a5825a686d5e4bc6c7f849684540 Mon Sep 17 00:00:00 2001
From: "yihua.huang"
Date: Tue, 3 Jun 2014 20:03:33 +0800
Subject: [PATCH] Bugfix: selector does not works well in element #113

---
 .../codecraft/webmagic/selector/HtmlNode.java | 28 +++++++++++++++++--
 .../webmagic/selector/SelectorTest.java       |  3 +-
 2 files changed, 27 insertions(+), 4 deletions(-)

diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java
index 3749686..e41267b 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java
@@ -1,9 +1,11 @@
 package us.codecraft.webmagic.selector;

+import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;

 import java.util.ArrayList;
 import java.util.List;
+import java.util.ListIterator;

 /**
  * @author code4crafer@gmail.com
@@ -48,9 +50,11 @@ public class HtmlNode extends AbstractSelectable {
      * @return
      */
     protected Selectable selectElements(BaseElementSelector elementSelector) {
+        ListIterator<Element> elementIterator = getElements().listIterator();
         if (!elementSelector.hasAttribute()) {
             List<Element> resultElements = new ArrayList<Element>();
-            for (Element element : getElements()) {
+            while (elementIterator.hasNext()) {
+                Element element = checkElementAndConvert(elementIterator);
                 List<Element> selectElements = elementSelector.selectElements(element);
                 resultElements.addAll(selectElements);
             }
@@ -58,7 +62,8 @@ public class HtmlNode extends AbstractSelectable {
         } else {
             // has attribute, consider as plaintext
             List<String> resultStrings = new ArrayList<String>();
-            for (Element element : getElements()) {
+            while (elementIterator.hasNext()) {
+                Element element = checkElementAndConvert(elementIterator);
                 List<String> selectList = elementSelector.selectList(element);
                 resultStrings.addAll(selectList);
             }
@@ -67,6 +72,25 @@ public class HtmlNode extends AbstractSelectable {
         }
     }

+    /**
+     * Only a Document can be selected; any other element is wrapped in a new Document.
+     * See: https://github.com/code4craft/webmagic/issues/113
+     *
+     * @param elementIterator iterator whose next element is checked; a wrapped element replaces it in the list
+     * @return the element itself if it is a Document, otherwise a new Document holding a clone of it
+     */
+    private Element checkElementAndConvert(ListIterator<Element> elementIterator) {
+        Element element = elementIterator.next();
+        if (!(element instanceof Document)) {
+            Document root = new Document(element.ownerDocument().baseUri());
+            Element clone = element.clone();
+            root.appendChild(clone);
+            elementIterator.set(root);
+            return root;
+        }
+        return element;
+    }
+
     @Override
     public Selectable $(String selector) {
         CssSelector cssSelector = Selectors.$(selector);
diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/SelectorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/SelectorTest.java
index 04158aa..4ec692d 100644
--- a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/SelectorTest.java
+++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/SelectorTest.java
@@ -28,7 +28,6 @@ public class SelectorTest {
     public void testNodes() throws Exception {
         Html selectable = new Html(html);
         List<Selectable> links = selectable.xpath("//a").nodes();
-        assertThat(links.get(0).xpath("/@href").get()).isEqualTo("http://whatever.com/aaa");
-        assertThat(links.get(1).xpath("/@href").get()).isEqualTo("http://whatever.com/bbb");
+        assertThat(links.get(0).links().get()).isEqualTo("http://whatever.com/aaa");
     }
 }