Bugfix: selector does not work well on an element #113
parent
8d67fd0357
commit
7a64847a3c
|
@ -1,9 +1,11 @@
|
||||||
package us.codecraft.webmagic.selector;
|
package us.codecraft.webmagic.selector;
|
||||||
|
|
||||||
|
import org.jsoup.nodes.Document;
|
||||||
import org.jsoup.nodes.Element;
|
import org.jsoup.nodes.Element;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.ListIterator;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @author code4crafer@gmail.com
|
* @author code4crafer@gmail.com
|
||||||
|
@ -48,9 +50,11 @@ public class HtmlNode extends AbstractSelectable {
|
||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
protected Selectable selectElements(BaseElementSelector elementSelector) {
|
protected Selectable selectElements(BaseElementSelector elementSelector) {
|
||||||
|
ListIterator<Element> elementIterator = getElements().listIterator();
|
||||||
if (!elementSelector.hasAttribute()) {
|
if (!elementSelector.hasAttribute()) {
|
||||||
List<Element> resultElements = new ArrayList<Element>();
|
List<Element> resultElements = new ArrayList<Element>();
|
||||||
for (Element element : getElements()) {
|
while (elementIterator.hasNext()) {
|
||||||
|
Element element = checkElementAndConvert(elementIterator);
|
||||||
List<Element> selectElements = elementSelector.selectElements(element);
|
List<Element> selectElements = elementSelector.selectElements(element);
|
||||||
resultElements.addAll(selectElements);
|
resultElements.addAll(selectElements);
|
||||||
}
|
}
|
||||||
|
@ -58,7 +62,8 @@ public class HtmlNode extends AbstractSelectable {
|
||||||
} else {
|
} else {
|
||||||
// has attribute, consider as plaintext
|
// has attribute, consider as plaintext
|
||||||
List<String> resultStrings = new ArrayList<String>();
|
List<String> resultStrings = new ArrayList<String>();
|
||||||
for (Element element : getElements()) {
|
while (elementIterator.hasNext()) {
|
||||||
|
Element element = checkElementAndConvert(elementIterator);
|
||||||
List<String> selectList = elementSelector.selectList(element);
|
List<String> selectList = elementSelector.selectList(element);
|
||||||
resultStrings.addAll(selectList);
|
resultStrings.addAll(selectList);
|
||||||
}
|
}
|
||||||
|
@ -67,6 +72,25 @@ public class HtmlNode extends AbstractSelectable {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Only a Document can be selected from, so any other element is cloned into a new Document, which replaces it in the list.
|
||||||
|
* See: https://github.com/code4craft/webmagic/issues/113
|
||||||
|
*
|
||||||
|
* @param elementIterator
|
||||||
|
* @return the next element itself if it is a Document, otherwise a new Document wrapping a clone of it
|
||||||
|
*/
|
||||||
|
private Element checkElementAndConvert(ListIterator<Element> elementIterator) {
|
||||||
|
Element element = elementIterator.next();
|
||||||
|
if (!(element instanceof Document)) {
|
||||||
|
Document root = new Document(element.ownerDocument().baseUri());
|
||||||
|
Element clone = element.clone();
|
||||||
|
root.appendChild(clone);
|
||||||
|
elementIterator.set(root);
|
||||||
|
return root;
|
||||||
|
}
|
||||||
|
return element;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Selectable $(String selector) {
|
public Selectable $(String selector) {
|
||||||
CssSelector cssSelector = Selectors.$(selector);
|
CssSelector cssSelector = Selectors.$(selector);
|
||||||
|
|
|
@ -28,7 +28,6 @@ public class SelectorTest {
|
||||||
public void testNodes() throws Exception {
|
public void testNodes() throws Exception {
|
||||||
Html selectable = new Html(html);
|
Html selectable = new Html(html);
|
||||||
List<Selectable> links = selectable.xpath("//a").nodes();
|
List<Selectable> links = selectable.xpath("//a").nodes();
|
||||||
assertThat(links.get(0).xpath("/@href").get()).isEqualTo("http://whatever.com/aaa");
|
assertThat(links.get(0).links().get()).isEqualTo("http://whatever.com/aaa");
|
||||||
assertThat(links.get(1).xpath("/@href").get()).isEqualTo("http://whatever.com/bbb");
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue