#523 Stop applying fixAllRelativeHrefs by default; links() now returns absolute URLs
parent
abd020b45b
commit
db67db8103
|
@ -1,16 +1,13 @@
|
||||||
package us.codecraft.webmagic;
|
package us.codecraft.webmagic;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
import org.apache.http.Header;
|
|
||||||
|
|
||||||
import us.codecraft.webmagic.selector.Html;
|
import us.codecraft.webmagic.selector.Html;
|
||||||
import us.codecraft.webmagic.selector.Json;
|
import us.codecraft.webmagic.selector.Json;
|
||||||
import us.codecraft.webmagic.selector.Selectable;
|
import us.codecraft.webmagic.selector.Selectable;
|
||||||
import us.codecraft.webmagic.utils.UrlUtils;
|
import us.codecraft.webmagic.utils.UrlUtils;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -76,7 +73,7 @@ public class Page {
|
||||||
*/
|
*/
|
||||||
public Html getHtml() {
|
public Html getHtml() {
|
||||||
if (html == null) {
|
if (html == null) {
|
||||||
html = new Html(UrlUtils.fixAllRelativeHrefs(rawText, request.getUrl()));
|
html = new Html(rawText, request.getUrl());
|
||||||
}
|
}
|
||||||
return html;
|
return html;
|
||||||
}
|
}
|
||||||
|
|
|
@ -44,6 +44,16 @@ public class Html extends HtmlNode {
|
||||||
*/
|
*/
|
||||||
private Document document;
|
private Document document;
|
||||||
|
|
||||||
|
public Html(String text, String url) {
|
||||||
|
try {
|
||||||
|
disableJsoupHtmlEntityEscape();
|
||||||
|
this.document = Jsoup.parse(text, url);
|
||||||
|
} catch (Exception e) {
|
||||||
|
this.document = null;
|
||||||
|
logger.warn("parse document error ", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
public Html(String text) {
|
public Html(String text) {
|
||||||
try {
|
try {
|
||||||
disableJsoupHtmlEntityEscape();
|
disableJsoupHtmlEntityEscape();
|
||||||
|
|
|
@ -34,7 +34,7 @@ public class HtmlNode extends AbstractSelectable {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Selectable links() {
|
public Selectable links() {
|
||||||
return xpath("//a/@href");
|
return selectElements(new LinksSelector());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -0,0 +1,51 @@
|
||||||
|
package us.codecraft.webmagic.selector;
|
||||||
|
|
||||||
|
import org.jsoup.helper.StringUtil;
|
||||||
|
import org.jsoup.nodes.Element;
|
||||||
|
import org.jsoup.select.Elements;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Links selector based on jsoup. Use absolute url. <br>
|
||||||
|
*
|
||||||
|
* @author code4crafter@gmail.com <br>
|
||||||
|
* @since 0.7.0
|
||||||
|
*/
|
||||||
|
public class LinksSelector extends BaseElementSelector {
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String select(Element element) {
|
||||||
|
throw new UnsupportedOperationException();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public List<String> selectList(Element element) {
|
||||||
|
Elements elements = element.select("a");
|
||||||
|
List<String> links = new ArrayList<String>(elements.size());
|
||||||
|
for (Element element0 : elements) {
|
||||||
|
if (!StringUtil.isBlank(element0.baseUri())) {
|
||||||
|
links.add(element0.attr("abs:href"));
|
||||||
|
} else {
|
||||||
|
links.add(element0.attr("href"));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return links;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Element selectElement(Element element) {
|
||||||
|
throw new UnsupportedOperationException();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public List<Element> selectElements(Element element) {
|
||||||
|
throw new UnsupportedOperationException();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean hasAttribute() {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
|
@ -48,4 +48,14 @@ public class HtmlTest {
|
||||||
Selectable selectable = html.xpath("//a[1]").nodes().get(0);
|
Selectable selectable = html.xpath("//a[1]").nodes().get(0);
|
||||||
assertThat(selectable.xpath("/a/@href").get()).isEqualTo("/xx/xx");
|
assertThat(selectable.xpath("/a/@href").get()).isEqualTo("/xx/xx");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testGetHrefsByJsoup(){
|
||||||
|
Html html = new Html("<html><a href='issues'>issues</a><img src='webmagic.jpg'/></html>","https://github.com/code4craft/webmagic/");
|
||||||
|
assertThat(html.xpath("//a[1]/@abs:href").get()).isEqualTo("https://github.com/code4craft/webmagic/issues");
|
||||||
|
assertThat(html.xpath("//img/@abs:src").get()).isEqualTo("https://github.com/code4craft/webmagic/webmagic.jpg");
|
||||||
|
html = new Html("<html><base href='https://github.com/code4craft/webmagic/'><a href='issues'>issues</a><img src='webmagic.jpg'/></base></html>");
|
||||||
|
assertThat(html.xpath("//a[1]/@abs:href").get()).isEqualTo("https://github.com/code4craft/webmagic/issues");
|
||||||
|
assertThat(html.xpath("//img/@abs:src").get()).isEqualTo("https://github.com/code4craft/webmagic/webmagic.jpg");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,21 @@
|
||||||
|
package us.codecraft.webmagic.selector;
|
||||||
|
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author code4crafter@gmail.com
|
||||||
|
* Date: 17/4/8
|
||||||
|
* Time: 下午9:41
|
||||||
|
*/
|
||||||
|
public class LinksSelectorTest {
|
||||||
|
|
||||||
|
private String html = "<div><a href='http://whatever.com/aaa'></a></div><div><a href='http://whatever.com/bbb'></a></div>";
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testLinks() throws Exception {
|
||||||
|
List<String> links = new LinksSelector().selectList(html);
|
||||||
|
System.out.println(links);
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue