diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java
index 99b71e0..60eacee 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java
@@ -43,12 +43,22 @@ public class UrlUtils {
if (url.startsWith("?"))
url = base.getPath() + url;
URL abs = new URL(base, url);
- return abs.toExternalForm();
+ return encodeIllegalCharacterInUrl(abs.toExternalForm());
} catch (MalformedURLException e) {
return "";
}
}
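+ /**
+ * Percent-encodes characters that are not legal in a URL; currently only spaces are handled.
+ * @param url the URL string to encode
+ * @return the URL with every space replaced by "%20"
+ */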
+ public static String encodeIllegalCharacterInUrl(String url) {
+ //TODO more character support
+ return url.replace(" ", "%20");
+ }
+
public static String getHost(String url) {
String host = url;
int i = StringUtils.ordinalIndexOf(url, "/", 3);
@@ -101,9 +111,9 @@ public class UrlUtils {
stringBuilder.append("\"").append(canonicalizeUrl(matcher.group(2), url)).append("\"");
lastEnd = matcher.end();
}
- if (!modified) {
- return html;
- }
+ if (!modified) {
+ return html;
+ }
stringBuilder.append(StringUtils.substring(html, lastEnd));
return stringBuilder.toString();
}
diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java
index ac01926..e6fe5ae 100644
--- a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java
+++ b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java
@@ -8,6 +8,8 @@ import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.selector.Html;
+import java.io.UnsupportedEncodingException;
+
import static org.assertj.core.api.Assertions.assertThat;
import static org.junit.Assert.assertTrue;
@@ -32,6 +34,12 @@ public class HttpClientDownloaderTest {
assertTrue(!html.getText().isEmpty());
}
+ @Test(expected = IllegalArgumentException.class)
+ public void testDownloaderInIllegalUrl() throws UnsupportedEncodingException {
+ HttpClientDownloader httpClientDownloader = new HttpClientDownloader();
+ httpClientDownloader.download("http://www.oschina.net/>");
+ }
+
@Test
public void testCycleTriedTimes() {
HttpClientDownloader httpClientDownloader = new HttpClientDownloader();
diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java
index 1e403c4..565fde4 100644
--- a/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java
+++ b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java
@@ -38,17 +38,22 @@ public class UrlUtilsTest {
originHtml = "";
replacedHtml = UrlUtils.fixAllRelativeHrefs(originHtml, "http://www.dianping.com/");
- assertThat(replacedHtml).isEqualTo("");
+ assertThat(replacedHtml).isEqualTo("");
originHtml = "";
replacedHtml = UrlUtils.fixAllRelativeHrefs(originHtml, "http://www.dianping.com/");
- assertThat(replacedHtml).isEqualTo("");
+ assertThat(replacedHtml).isEqualTo("");
originHtml = "";
replacedHtml = UrlUtils.fixAllRelativeHrefs(originHtml, "http://www.dianping.com/");
assertThat(replacedHtml).isEqualTo("");
}
+ @Test
+ public void test(){
+ UrlUtils.canonicalizeUrl("start tag", "http://www.dianping.com/");
+ }
+
@Test
public void testGetDomain(){
String url = "http://www.dianping.com/aa/";