diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java
index 456b3cc..99b71e0 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java
@@ -73,18 +73,37 @@ public class UrlUtils {
return domain;
}
- private static Pattern patternForHref = Pattern.compile("(]*href=)[\"']{0,1}([^\"'<>\\s]*)[\"']{0,1}", Pattern.CASE_INSENSITIVE);
+ /**
+ * Matches a quoted href value; whitespace is allowed inside the quotes.
+ */
+ private static Pattern patternForHrefWithQuote = Pattern.compile("(]*href=)[\"']([^\"'<>]*)[\"']", Pattern.CASE_INSENSITIVE);
+
+ /**
+ * Matches an unquoted href value; the value may not contain whitespace.
+ */
+ private static Pattern patternForHrefWithoutQuote = Pattern.compile("(]*href=)([^\"'<>\\s]+)", Pattern.CASE_INSENSITIVE);
public static String fixAllRelativeHrefs(String html, String url) {
+ html = replaceByPattern(html, url, patternForHrefWithQuote);
+ html = replaceByPattern(html, url, patternForHrefWithoutQuote);
+ return html;
+ }
+
+ public static String replaceByPattern(String html, String url, Pattern pattern) {
StringBuilder stringBuilder = new StringBuilder();
- Matcher matcher = patternForHref.matcher(html);
+ Matcher matcher = pattern.matcher(html);
int lastEnd = 0;
+ boolean modified = false;
while (matcher.find()) {
+ modified = true;
stringBuilder.append(StringUtils.substring(html, lastEnd, matcher.start()));
stringBuilder.append(matcher.group(1));
stringBuilder.append("\"").append(canonicalizeUrl(matcher.group(2), url)).append("\"");
lastEnd = matcher.end();
}
+ if (!modified) {
+ return html;
+ }
stringBuilder.append(StringUtils.substring(html, lastEnd));
return stringBuilder.toString();
}
diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java
index abe6adc..1e403c4 100644
--- a/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java
+++ b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java
@@ -3,6 +3,8 @@ package us.codecraft.webmagic.utils;
import org.junit.Assert;
import org.junit.Test;
+import static org.assertj.core.api.Assertions.assertThat;
+
/**
* @author code4crafter@gmail.com
* Date: 13-4-21
@@ -12,19 +14,39 @@ public class UrlUtilsTest {
@Test
public void testFixRelativeUrl() {
- String fixrelativeurl = UrlUtils.canonicalizeUrl("aa", "http://www.dianping.com/sh/ss/com");
- System.out.println("fix: " + fixrelativeurl);
- Assert.assertEquals("http://www.dianping.com/sh/ss/aa", fixrelativeurl);
+ String absoluteUrl = UrlUtils.canonicalizeUrl("aa", "http://www.dianping.com/sh/ss/com");
+ assertThat(absoluteUrl).isEqualTo("http://www.dianping.com/sh/ss/aa");
- fixrelativeurl = UrlUtils.canonicalizeUrl("../aa", "http://www.dianping.com/sh/ss/com");
- Assert.assertEquals("http://www.dianping.com/sh/aa", fixrelativeurl);
+ absoluteUrl = UrlUtils.canonicalizeUrl("../aa", "http://www.dianping.com/sh/ss/com");
+ assertThat(absoluteUrl).isEqualTo("http://www.dianping.com/sh/aa");
- fixrelativeurl = UrlUtils.canonicalizeUrl("..aa", "http://www.dianping.com/sh/ss/com");
- Assert.assertEquals("http://www.dianping.com/sh/ss/..aa", fixrelativeurl);
- fixrelativeurl = UrlUtils.canonicalizeUrl("../../aa", "http://www.dianping.com/sh/ss/com/");
- Assert.assertEquals("http://www.dianping.com/sh/aa", fixrelativeurl);
- fixrelativeurl = UrlUtils.canonicalizeUrl("../../aa", "http://www.dianping.com/sh/ss/com");
- Assert.assertEquals("http://www.dianping.com/aa", fixrelativeurl);
+ absoluteUrl = UrlUtils.canonicalizeUrl("..aa", "http://www.dianping.com/sh/ss/com");
+ assertThat(absoluteUrl).isEqualTo("http://www.dianping.com/sh/ss/..aa");
+
+ absoluteUrl = UrlUtils.canonicalizeUrl("../../aa", "http://www.dianping.com/sh/ss/com/");
+ assertThat(absoluteUrl).isEqualTo("http://www.dianping.com/sh/aa");
+
+ absoluteUrl = UrlUtils.canonicalizeUrl("../../aa", "http://www.dianping.com/sh/ss/com");
+ assertThat(absoluteUrl).isEqualTo("http://www.dianping.com/aa");
+ }
+
+ @Test
+ public void testFixAllRelativeHrefs() {
+ String originHtml = "";
+ String replacedHtml = UrlUtils.fixAllRelativeHrefs(originHtml, "http://www.dianping.com/");
+ assertThat(replacedHtml).isEqualTo("");
+
+ originHtml = "";
+ replacedHtml = UrlUtils.fixAllRelativeHrefs(originHtml, "http://www.dianping.com/");
+ assertThat(replacedHtml).isEqualTo("");
+
+ originHtml = "";
+ replacedHtml = UrlUtils.fixAllRelativeHrefs(originHtml, "http://www.dianping.com/");
+ assertThat(replacedHtml).isEqualTo("");
+
+ originHtml = "";
+ replacedHtml = UrlUtils.fixAllRelativeHrefs(originHtml, "http://www.dianping.com/");
+ assertThat(replacedHtml).isEqualTo("");
}
@Test