diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java
index d24ceba..7dd48f8 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java
@@ -1,16 +1,13 @@
package us.codecraft.webmagic;
-import java.util.ArrayList;
-import java.util.List;
-
import org.apache.commons.lang3.StringUtils;
-import org.apache.http.Header;
-
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Json;
import us.codecraft.webmagic.selector.Selectable;
import us.codecraft.webmagic.utils.UrlUtils;
+import java.util.ArrayList;
+import java.util.List;
import java.util.Map;
/**
@@ -76,7 +73,7 @@ public class Page {
*/
public Html getHtml() {
if (html == null) {
- html = new Html(UrlUtils.fixAllRelativeHrefs(rawText, request.getUrl()));
+ // Pass the request URL to Html as the base URI instead of pre-processing rawText
+ // with UrlUtils.fixAllRelativeHrefs; the two-arg Html constructor forwards it to Jsoup.parse.
+ html = new Html(rawText, request.getUrl());
}
return html;
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java
index d80e8b4..7b22639 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java
@@ -44,6 +44,16 @@ public class Html extends HtmlNode {
*/
private Document document;
+    /**
+     * Parses raw markup into a jsoup document, using {@code url} as the base URI
+     * handed to {@code Jsoup.parse}.
+     * Any exception raised while parsing is logged as a warning and leaves the
+     * document {@code null}.
+     *
+     * @param text raw HTML text to parse
+     * @param url  base URI passed to jsoup for this document
+     */
+    public Html(String text, String url) {
+        Document parsed = null;
+        try {
+            disableJsoupHtmlEntityEscape();
+            parsed = Jsoup.parse(text, url);
+        } catch (Exception parseFailure) {
+            // Best effort: keep the instance usable for callers that only check for null.
+            logger.warn("parse document error ", parseFailure);
+        }
+        this.document = parsed;
+    }
+
public Html(String text) {
try {
disableJsoupHtmlEntityEscape();
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java
index 030522f..89de5a6 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java
@@ -34,7 +34,7 @@ public class HtmlNode extends AbstractSelectable {
@Override
public Selectable links() {
- return xpath("//a/@href");
+ // Delegate to LinksSelector rather than the "//a/@href" XPath: it reads "abs:href"
+ // whenever the anchor has a base URI, so links come back absolute.
+ // NOTE(review): presumably selectElements routes to LinksSelector.selectList because
+ // hasAttribute() returns true (its selectElements throws) - confirm in selectElements.
+ return selectElements(new LinksSelector());
}
@Override
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/LinksSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/LinksSelector.java
new file mode 100644
index 0000000..5296a74
--- /dev/null
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/LinksSelector.java
@@ -0,0 +1,51 @@
+package us.codecraft.webmagic.selector;
+
+import org.jsoup.helper.StringUtil;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Links selector based on jsoup. Returns absolute URLs when the element has a base URI, and the raw href values otherwise.
+ *
+ * @author code4crafter@gmail.com
+ * @since 0.7.0
+ */
+public class LinksSelector extends BaseElementSelector {
+
+    /**
+     * Not supported: this selector only produces a list of links.
+     *
+     * @throws UnsupportedOperationException always
+     */
+    @Override
+    public String select(Element element) {
+        throw new UnsupportedOperationException("LinksSelector only supports selectList");
+    }
+
+    /**
+     * Returns one entry for every element matched by {@code element.select("a")}.
+     * When the anchor has a non-blank base URI the value of {@code abs:href} is used,
+     * otherwise the raw {@code href} attribute. jsoup returns an empty string for a
+     * missing attribute, so anchors without an href contribute {@code ""}.
+     *
+     * @param element root element to search for anchors
+     * @return the collected link strings, in document order
+     */
+    @Override
+    public List<String> selectList(Element element) {
+        Elements anchors = element.select("a");
+        List<String> links = new ArrayList<String>(anchors.size());
+        for (Element anchor : anchors) {
+            // "abs:href" can only be resolved against a base URI; without one keep the raw value.
+            String attributeKey = StringUtil.isBlank(anchor.baseUri()) ? "href" : "abs:href";
+            links.add(anchor.attr(attributeKey));
+        }
+        return links;
+    }
+
+    /**
+     * Not supported: links are returned as strings, not elements.
+     *
+     * @throws UnsupportedOperationException always
+     */
+    @Override
+    public Element selectElement(Element element) {
+        throw new UnsupportedOperationException("LinksSelector only supports selectList");
+    }
+
+    /**
+     * Not supported: links are returned as strings, not elements.
+     *
+     * @throws UnsupportedOperationException always
+     */
+    @Override
+    public List<Element> selectElements(Element element) {
+        throw new UnsupportedOperationException("LinksSelector only supports selectList");
+    }
+
+    /**
+     * @return always {@code true}; this selector yields attribute values rather than elements
+     */
+    @Override
+    public boolean hasAttribute() {
+        return true;
+    }
+}
diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/HtmlTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/HtmlTest.java
index 6cf5382..faf249f 100644
--- a/webmagic-core/src/test/java/us/codecraft/webmagic/HtmlTest.java
+++ b/webmagic-core/src/test/java/us/codecraft/webmagic/HtmlTest.java
@@ -48,4 +48,14 @@ public class HtmlTest {
Selectable selectable = html.xpath("//a[1]").nodes().get(0);
assertThat(selectable.xpath("/a/@href").get()).isEqualTo("/xx/xx");
}
+
+    /**
+     * Relative {@code href}/{@code src} values must resolve to absolute URLs through the
+     * {@code abs:} attribute prefix, both when the base URI is passed to the constructor
+     * and when it is declared inside the markup.
+     */
+    @Test
+    public void testGetHrefsByJsoup(){
+        // NOTE(review): the markup in both literals was lost from the original text and is
+        // reconstructed from what the assertions below require - confirm against the upstream test.
+        Html html = new Html("<html><a href='issues'>issues</a><img src='webmagic.jpg'/></html>","https://github.com/code4craft/webmagic/");
+        assertThat(html.xpath("//a[1]/@abs:href").get()).isEqualTo("https://github.com/code4craft/webmagic/issues");
+        assertThat(html.xpath("//img/@abs:src").get()).isEqualTo("https://github.com/code4craft/webmagic/webmagic.jpg");
+        // No base URI argument here: the <base href> tag has to supply it.
+        html = new Html("<html><base href='https://github.com/code4craft/webmagic/'><a href='issues'>issues</a><img src='webmagic.jpg'/></html>");
+        assertThat(html.xpath("//a[1]/@abs:href").get()).isEqualTo("https://github.com/code4craft/webmagic/issues");
+        assertThat(html.xpath("//img/@abs:src").get()).isEqualTo("https://github.com/code4craft/webmagic/webmagic.jpg");
+    }
}
diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/LinksSelectorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/LinksSelectorTest.java
new file mode 100644
index 0000000..3fcb71b
--- /dev/null
+++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/LinksSelectorTest.java
@@ -0,0 +1,21 @@
+package us.codecraft.webmagic.selector;
+
+import org.junit.Test;
+
+import java.util.Arrays;
+import java.util.List;
+
+import static org.junit.Assert.assertEquals;
+
+/**
+ * @author code4crafter@gmail.com
+ * Date: 17/4/8
+ * Time: 9:41 PM
+ */
+public class LinksSelectorTest {
+
+    // Absolute hrefs on purpose: the expected links are then the same whether or not
+    // the parsed document carries a base URI.
+    private String html = "<div><a href='http://whatever.com/aaa'></a></div>"
+            + "<div><a href='http://whatever.com/bbb'></a></div>";
+
+    /**
+     * Every anchor in the fixture must be returned, in document order.
+     * NOTE(review): relies on the inherited selectList(String) parsing the text and
+     * delegating to LinksSelector.selectList(Element) - confirm in BaseElementSelector.
+     */
+    @Test
+    public void testLinks() throws Exception {
+        List<String> links = new LinksSelector().selectList(html);
+        assertEquals(Arrays.asList("http://whatever.com/aaa", "http://whatever.com/bbb"), links);
+    }
+}