#523 remove fixAllRelativeHrefs by default, get absolute urls for links()
parent
abd020b45b
commit
db67db8103
|
@ -1,16 +1,13 @@
|
|||
package us.codecraft.webmagic;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.http.Header;
|
||||
|
||||
import us.codecraft.webmagic.selector.Html;
|
||||
import us.codecraft.webmagic.selector.Json;
|
||||
import us.codecraft.webmagic.selector.Selectable;
|
||||
import us.codecraft.webmagic.utils.UrlUtils;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
|
@ -76,7 +73,7 @@ public class Page {
|
|||
*/
|
||||
public Html getHtml() {
|
||||
if (html == null) {
|
||||
html = new Html(UrlUtils.fixAllRelativeHrefs(rawText, request.getUrl()));
|
||||
html = new Html(rawText, request.getUrl());
|
||||
}
|
||||
return html;
|
||||
}
|
||||
|
|
|
@ -44,6 +44,16 @@ public class Html extends HtmlNode {
|
|||
*/
|
||||
private Document document;
|
||||
|
||||
/**
 * Builds an Html from raw markup, passing {@code url} to jsoup as the base URI.
 * If parsing throws, the failure is logged as a warning and the document is
 * left {@code null}.
 *
 * @param text raw html text to parse
 * @param url  base URI handed to {@code Jsoup.parse}
 */
public Html(String text, String url) {
    Document parsed = null;
    try {
        disableJsoupHtmlEntityEscape();
        parsed = Jsoup.parse(text, url);
    } catch (Exception e) {
        // Best effort: keep the instance usable with a null document.
        logger.warn("parse document error ", e);
    }
    this.document = parsed;
}
|
||||
|
||||
public Html(String text) {
|
||||
try {
|
||||
disableJsoupHtmlEntityEscape();
|
||||
|
|
|
@ -34,7 +34,7 @@ public class HtmlNode extends AbstractSelectable {
|
|||
|
||||
@Override
|
||||
public Selectable links() {
|
||||
return xpath("//a/@href");
|
||||
return selectElements(new LinksSelector());
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -0,0 +1,51 @@
|
|||
package us.codecraft.webmagic.selector;
|
||||
|
||||
import org.jsoup.helper.StringUtil;
|
||||
import org.jsoup.nodes.Element;
|
||||
import org.jsoup.select.Elements;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Links selector based on jsoup. Use absolute url. <br>
|
||||
*
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* @since 0.7.0
|
||||
*/
|
||||
public class LinksSelector extends BaseElementSelector {
|
||||
|
||||
@Override
|
||||
public String select(Element element) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<String> selectList(Element element) {
|
||||
Elements elements = element.select("a");
|
||||
List<String> links = new ArrayList<String>(elements.size());
|
||||
for (Element element0 : elements) {
|
||||
if (!StringUtil.isBlank(element0.baseUri())) {
|
||||
links.add(element0.attr("abs:href"));
|
||||
} else {
|
||||
links.add(element0.attr("href"));
|
||||
}
|
||||
}
|
||||
return links;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Element selectElement(Element element) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<Element> selectElements(Element element) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasAttribute() {
|
||||
return true;
|
||||
}
|
||||
}
|
|
@ -48,4 +48,14 @@ public class HtmlTest {
|
|||
Selectable selectable = html.xpath("//a[1]").nodes().get(0);
|
||||
assertThat(selectable.xpath("/a/@href").get()).isEqualTo("/xx/xx");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testGetHrefsByJsoup(){
|
||||
Html html = new Html("<html><a href='issues'>issues</a><img src='webmagic.jpg'/></html>","https://github.com/code4craft/webmagic/");
|
||||
assertThat(html.xpath("//a[1]/@abs:href").get()).isEqualTo("https://github.com/code4craft/webmagic/issues");
|
||||
assertThat(html.xpath("//img/@abs:src").get()).isEqualTo("https://github.com/code4craft/webmagic/webmagic.jpg");
|
||||
html = new Html("<html><base href='https://github.com/code4craft/webmagic/'><a href='issues'>issues</a><img src='webmagic.jpg'/></base></html>");
|
||||
assertThat(html.xpath("//a[1]/@abs:href").get()).isEqualTo("https://github.com/code4craft/webmagic/issues");
|
||||
assertThat(html.xpath("//img/@abs:src").get()).isEqualTo("https://github.com/code4craft/webmagic/webmagic.jpg");
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,21 @@
|
|||
package us.codecraft.webmagic.selector;
|
||||
|
||||
import org.junit.Test;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com
|
||||
* Date: 17/4/8
|
||||
* Time: 下午9:41
|
||||
*/
|
||||
public class LinksSelectorTest {
|
||||
|
||||
private String html = "<div><a href='http://whatever.com/aaa'></a></div><div><a href='http://whatever.com/bbb'></a></div>";
|
||||
|
||||
@Test
|
||||
public void testLinks() throws Exception {
|
||||
List<String> links = new LinksSelector().selectList(html);
|
||||
System.out.println(links);
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue