From db67db8103b05506d131c74ce0036a5c875fb92a Mon Sep 17 00:00:00 2001
From: "yihua.huang"
Date: Sat, 8 Apr 2017 22:06:18 +0800
Subject: [PATCH] #523 remove fixAllRelativeHrefs by default, get absolute urls for links()

---
 .../main/java/us/codecraft/webmagic/Page.java |  9 ++--
 .../us/codecraft/webmagic/selector/Html.java  | 10 ++++
 .../codecraft/webmagic/selector/HtmlNode.java |  2 +-
 .../webmagic/selector/LinksSelector.java      | 51 +++++++++++++++++++
 .../java/us/codecraft/webmagic/HtmlTest.java  | 10 ++++
 .../webmagic/selector/LinksSelectorTest.java  | 21 ++++++++
 6 files changed, 96 insertions(+), 7 deletions(-)
 create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/selector/LinksSelector.java
 create mode 100644 webmagic-core/src/test/java/us/codecraft/webmagic/selector/LinksSelectorTest.java

diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java
index d24ceba..7dd48f8 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java
@@ -1,16 +1,13 @@
 package us.codecraft.webmagic;
 
-import java.util.ArrayList;
-import java.util.List;
-
 import org.apache.commons.lang3.StringUtils;
-import org.apache.http.Header;
-
 import us.codecraft.webmagic.selector.Html;
 import us.codecraft.webmagic.selector.Json;
 import us.codecraft.webmagic.selector.Selectable;
 import us.codecraft.webmagic.utils.UrlUtils;
+import java.util.ArrayList;
+import java.util.List;
 import java.util.Map;
 
 /**
@@ -76,7 +73,7 @@ public class Page {
      */
     public Html getHtml() {
         if (html == null) {
-            html = new Html(UrlUtils.fixAllRelativeHrefs(rawText, request.getUrl()));
+            html = new Html(rawText, request.getUrl());
         }
         return html;
     }
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java
index d80e8b4..7b22639 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java
@@ -44,6 +44,16 @@ public class Html extends HtmlNode {
      */
     private Document document;
 
+    public Html(String text, String url) {
+        try {
+            disableJsoupHtmlEntityEscape();
+            this.document = Jsoup.parse(text, url);
+        } catch (Exception e) {
+            this.document = null;
+            logger.warn("parse document error ", e);
+        }
+    }
+
     public Html(String text) {
         try {
             disableJsoupHtmlEntityEscape();
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java
index 030522f..89de5a6 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java
@@ -34,7 +34,7 @@ public class HtmlNode extends AbstractSelectable {
 
     @Override
     public Selectable links() {
-        return xpath("//a/@href");
+        return selectElements(new LinksSelector());
     }
 
     @Override
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/LinksSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/LinksSelector.java
new file mode 100644
index 0000000..5296a74
--- /dev/null
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/LinksSelector.java
@@ -0,0 +1,51 @@
+package us.codecraft.webmagic.selector;
+
+import org.jsoup.helper.StringUtil;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Links selector based on jsoup. Use absolute url.
+ *
+ * @author code4crafter@gmail.com
+ * @since 0.7.0
+ */
+public class LinksSelector extends BaseElementSelector {
+
+    @Override
+    public String select(Element element) {
+        throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public List selectList(Element element) {
+        Elements elements = element.select("a");
+        List links = new ArrayList(elements.size());
+        for (Element element0 : elements) {
+            if (!StringUtil.isBlank(element0.baseUri())) {
+                links.add(element0.attr("abs:href"));
+            } else {
+                links.add(element0.attr("href"));
+            }
+        }
+        return links;
+    }
+
+    @Override
+    public Element selectElement(Element element) {
+        throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public List selectElements(Element element) {
+        throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public boolean hasAttribute() {
+        return true;
+    }
+}
diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/HtmlTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/HtmlTest.java
index 6cf5382..faf249f 100644
--- a/webmagic-core/src/test/java/us/codecraft/webmagic/HtmlTest.java
+++ b/webmagic-core/src/test/java/us/codecraft/webmagic/HtmlTest.java
@@ -48,4 +48,14 @@ public class HtmlTest {
         Selectable selectable = html.xpath("//a[1]").nodes().get(0);
         assertThat(selectable.xpath("/a/@href").get()).isEqualTo("/xx/xx");
     }
+
+    @Test
+    public void testGetHrefsByJsoup(){
+        Html html = new Html("issues","https://github.com/code4craft/webmagic/");
+        assertThat(html.xpath("//a[1]/@abs:href").get()).isEqualTo("https://github.com/code4craft/webmagic/issues");
+        assertThat(html.xpath("//img/@abs:src").get()).isEqualTo("https://github.com/code4craft/webmagic/webmagic.jpg");
+        html = new Html("issues");
+        assertThat(html.xpath("//a[1]/@abs:href").get()).isEqualTo("https://github.com/code4craft/webmagic/issues");
+        assertThat(html.xpath("//img/@abs:src").get()).isEqualTo("https://github.com/code4craft/webmagic/webmagic.jpg");
+    }
 }
diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/LinksSelectorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/LinksSelectorTest.java
new file mode 100644
index 0000000..3fcb71b
--- /dev/null
+++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/LinksSelectorTest.java
@@ -0,0 +1,21 @@
+package us.codecraft.webmagic.selector;
+
+import org.junit.Test;
+
+import java.util.List;
+
+/**
+ * @author code4crafter@gmail.com
+ * Date: 17/4/8
+ * Time: 下午9:41
+ */
+public class LinksSelectorTest {
+
+    private String html = "";
+
+    @Test
+    public void testLinks() throws Exception {
+        List links = new LinksSelector().selectList(html);
+        System.out.println(links);
+    }
+}