diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java index 456b3cc..99b71e0 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java @@ -73,18 +73,37 @@ public class UrlUtils { return domain; } - private static Pattern patternForHref = Pattern.compile("(]*href=)[\"']{0,1}([^\"'<>\\s]*)[\"']{0,1}", Pattern.CASE_INSENSITIVE); + /** + * allow blank space in quote + */ + private static Pattern patternForHrefWithQuote = Pattern.compile("(]*href=)[\"']([^\"'<>]*)[\"']", Pattern.CASE_INSENSITIVE); + + /** + * disallow blank space without quote + */ + private static Pattern patternForHrefWithoutQuote = Pattern.compile("(]*href=)([^\"'<>\\s]+)", Pattern.CASE_INSENSITIVE); public static String fixAllRelativeHrefs(String html, String url) { + html = replaceByPattern(html, url, patternForHrefWithQuote); + html = replaceByPattern(html, url, patternForHrefWithoutQuote); + return html; + } + + public static String replaceByPattern(String html, String url, Pattern pattern) { StringBuilder stringBuilder = new StringBuilder(); - Matcher matcher = patternForHref.matcher(html); + Matcher matcher = pattern.matcher(html); int lastEnd = 0; + boolean modified = false; while (matcher.find()) { + modified = true; stringBuilder.append(StringUtils.substring(html, lastEnd, matcher.start())); stringBuilder.append(matcher.group(1)); stringBuilder.append("\"").append(canonicalizeUrl(matcher.group(2), url)).append("\""); lastEnd = matcher.end(); } + if (!modified) { + return html; + } stringBuilder.append(StringUtils.substring(html, lastEnd)); return stringBuilder.toString(); } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java index abe6adc..1e403c4 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java @@ -3,6 +3,8 @@ package us.codecraft.webmagic.utils; import org.junit.Assert; import org.junit.Test; +import static org.assertj.core.api.Assertions.assertThat; + /** * @author code4crafter@gmail.com
* Date: 13-4-21 @@ -12,19 +14,39 @@ public class UrlUtilsTest { @Test public void testFixRelativeUrl() { - String fixrelativeurl = UrlUtils.canonicalizeUrl("aa", "http://www.dianping.com/sh/ss/com"); - System.out.println("fix: " + fixrelativeurl); - Assert.assertEquals("http://www.dianping.com/sh/ss/aa", fixrelativeurl); + String absoluteUrl = UrlUtils.canonicalizeUrl("aa", "http://www.dianping.com/sh/ss/com"); + assertThat(absoluteUrl).isEqualTo("http://www.dianping.com/sh/ss/aa"); - fixrelativeurl = UrlUtils.canonicalizeUrl("../aa", "http://www.dianping.com/sh/ss/com"); - Assert.assertEquals("http://www.dianping.com/sh/aa", fixrelativeurl); + absoluteUrl = UrlUtils.canonicalizeUrl("../aa", "http://www.dianping.com/sh/ss/com"); + assertThat(absoluteUrl).isEqualTo("http://www.dianping.com/sh/aa"); - fixrelativeurl = UrlUtils.canonicalizeUrl("..aa", "http://www.dianping.com/sh/ss/com"); - Assert.assertEquals("http://www.dianping.com/sh/ss/..aa", fixrelativeurl); - fixrelativeurl = UrlUtils.canonicalizeUrl("../../aa", "http://www.dianping.com/sh/ss/com/"); - Assert.assertEquals("http://www.dianping.com/sh/aa", fixrelativeurl); - fixrelativeurl = UrlUtils.canonicalizeUrl("../../aa", "http://www.dianping.com/sh/ss/com"); - Assert.assertEquals("http://www.dianping.com/aa", fixrelativeurl); + absoluteUrl = UrlUtils.canonicalizeUrl("..aa", "http://www.dianping.com/sh/ss/com"); + assertThat(absoluteUrl).isEqualTo("http://www.dianping.com/sh/ss/..aa"); + + absoluteUrl = UrlUtils.canonicalizeUrl("../../aa", "http://www.dianping.com/sh/ss/com/"); + assertThat(absoluteUrl).isEqualTo("http://www.dianping.com/sh/aa"); + + absoluteUrl = UrlUtils.canonicalizeUrl("../../aa", "http://www.dianping.com/sh/ss/com"); + assertThat(absoluteUrl).isEqualTo("http://www.dianping.com/aa"); + } + + @Test + public void testFixAllRelativeHrefs() { + String originHtml = ""; + String replacedHtml = UrlUtils.fixAllRelativeHrefs(originHtml, "http://www.dianping.com/"); + assertThat(replacedHtml).isEqualTo(""); + + originHtml = ""; + replacedHtml = UrlUtils.fixAllRelativeHrefs(originHtml, "http://www.dianping.com/"); + assertThat(replacedHtml).isEqualTo(""); + + originHtml = ""; + replacedHtml = UrlUtils.fixAllRelativeHrefs(originHtml, "http://www.dianping.com/"); + assertThat(replacedHtml).isEqualTo(""); + + originHtml = ""; + replacedHtml = UrlUtils.fixAllRelativeHrefs(originHtml, "http://www.dianping.com/"); + assertThat(replacedHtml).isEqualTo(""); } @Test