diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java index 99b71e0..60eacee 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java @@ -43,12 +43,22 @@ public class UrlUtils { if (url.startsWith("?")) url = base.getPath() + url; URL abs = new URL(base, url); - return abs.toExternalForm(); + return encodeIllegalCharacterInUrl(abs.toExternalForm()); } catch (MalformedURLException e) { return ""; } } + /** + * Replaces every space character in the given URL with its percent-encoded form, %20. + * @param url the URL string to encode + * @return the URL with each space replaced by %20 + */ + public static String encodeIllegalCharacterInUrl(String url) { + //TODO support more illegal characters + return url.replace(" ", "%20"); + } + public static String getHost(String url) { String host = url; int i = StringUtils.ordinalIndexOf(url, "/", 3); @@ -101,9 +111,9 @@ public class UrlUtils { stringBuilder.append("\"").append(canonicalizeUrl(matcher.group(2), url)).append("\""); lastEnd = matcher.end(); } - if (!modified) { - return html; - } + if (!modified) { + return html; + } stringBuilder.append(StringUtils.substring(html, lastEnd)); return stringBuilder.toString(); } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java index ac01926..e6fe5ae 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java @@ -8,6 +8,8 @@ import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.selector.Html; +import java.io.UnsupportedEncodingException; + import static org.assertj.core.api.Assertions.assertThat; import static org.junit.Assert.assertTrue; @@ -32,6 +34,12 @@ public class HttpClientDownloaderTest { 
assertTrue(!html.getText().isEmpty()); } + @Test(expected = IllegalArgumentException.class) + public void testDownloaderInIllegalUrl() throws UnsupportedEncodingException { + HttpClientDownloader httpClientDownloader = new HttpClientDownloader(); + httpClientDownloader.download("http://www.oschina.net/>"); + } + @Test public void testCycleTriedTimes() { HttpClientDownloader httpClientDownloader = new HttpClientDownloader(); diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java index 1e403c4..565fde4 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java @@ -38,17 +38,22 @@ public class UrlUtilsTest { originHtml = ""; replacedHtml = UrlUtils.fixAllRelativeHrefs(originHtml, "http://www.dianping.com/"); - assertThat(replacedHtml).isEqualTo(""); + assertThat(replacedHtml).isEqualTo(""); originHtml = ""; replacedHtml = UrlUtils.fixAllRelativeHrefs(originHtml, "http://www.dianping.com/"); - assertThat(replacedHtml).isEqualTo(""); + assertThat(replacedHtml).isEqualTo(""); originHtml = ""; replacedHtml = UrlUtils.fixAllRelativeHrefs(originHtml, "http://www.dianping.com/"); assertThat(replacedHtml).isEqualTo(""); } + @Test + public void test(){ + UrlUtils.canonicalizeUrl("start tag", "http://www.dianping.com/"); + } + @Test public void testGetDomain(){ String url = "http://www.dianping.com/aa/";