encode illegal characters in url #80

master
yihua.huang 2014-04-01 22:14:30 +08:00
parent 2780423e60
commit 01848301d4
3 changed files with 29 additions and 6 deletions

View File

@ -43,12 +43,22 @@ public class UrlUtils {
if (url.startsWith("?")) if (url.startsWith("?"))
url = base.getPath() + url; url = base.getPath() + url;
URL abs = new URL(base, url); URL abs = new URL(base, url);
return abs.toExternalForm(); return encodeIllegalCharacterInUrl(abs.toExternalForm());
} catch (MalformedURLException e) { } catch (MalformedURLException e) {
return ""; return "";
} }
} }
/**
 * Encodes characters that are not legal inside a URL.
 * At present only the space character is handled: each occurrence is
 * replaced by {@code %20}; all other characters are left untouched.
 *
 * @param url the URL text to encode
 * @return the given text with every space replaced by {@code %20}
 */
public static String encodeIllegalCharacterInUrl(String url) {
    // TODO: support more characters
    final String encodedSpace = "%20";
    return url.replace(" ", encodedSpace);
}
public static String getHost(String url) { public static String getHost(String url) {
String host = url; String host = url;
int i = StringUtils.ordinalIndexOf(url, "/", 3); int i = StringUtils.ordinalIndexOf(url, "/", 3);
@ -101,9 +111,9 @@ public class UrlUtils {
stringBuilder.append("\"").append(canonicalizeUrl(matcher.group(2), url)).append("\""); stringBuilder.append("\"").append(canonicalizeUrl(matcher.group(2), url)).append("\"");
lastEnd = matcher.end(); lastEnd = matcher.end();
} }
if (!modified) { if (!modified) {
return html; return html;
} }
stringBuilder.append(StringUtils.substring(html, lastEnd)); stringBuilder.append(StringUtils.substring(html, lastEnd));
return stringBuilder.toString(); return stringBuilder.toString();
} }

View File

@ -8,6 +8,8 @@ import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task; import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.selector.Html; import us.codecraft.webmagic.selector.Html;
import java.io.UnsupportedEncodingException;
import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThat;
import static org.junit.Assert.assertTrue; import static org.junit.Assert.assertTrue;
@ -32,6 +34,12 @@ public class HttpClientDownloaderTest {
assertTrue(!html.getText().isEmpty()); assertTrue(!html.getText().isEmpty());
} }
/**
 * Downloading a URL that ends with a '>' character is expected to fail
 * with an {@link IllegalArgumentException}.
 */
@Test(expected = IllegalArgumentException.class)
public void testDownloaderInIllegalUrl() throws UnsupportedEncodingException {
    final String illegalUrl = "http://www.oschina.net/>";
    new HttpClientDownloader().download(illegalUrl);
}
@Test @Test
public void testCycleTriedTimes() { public void testCycleTriedTimes() {
HttpClientDownloader httpClientDownloader = new HttpClientDownloader(); HttpClientDownloader httpClientDownloader = new HttpClientDownloader();

View File

@ -38,17 +38,22 @@ public class UrlUtilsTest {
originHtml = "<a href=\"/start a\">"; originHtml = "<a href=\"/start a\">";
replacedHtml = UrlUtils.fixAllRelativeHrefs(originHtml, "http://www.dianping.com/"); replacedHtml = UrlUtils.fixAllRelativeHrefs(originHtml, "http://www.dianping.com/");
assertThat(replacedHtml).isEqualTo("<a href=\"http://www.dianping.com/start a\">"); assertThat(replacedHtml).isEqualTo("<a href=\"http://www.dianping.com/start%20a\">");
originHtml = "<a href='/start a'>"; originHtml = "<a href='/start a'>";
replacedHtml = UrlUtils.fixAllRelativeHrefs(originHtml, "http://www.dianping.com/"); replacedHtml = UrlUtils.fixAllRelativeHrefs(originHtml, "http://www.dianping.com/");
assertThat(replacedHtml).isEqualTo("<a href=\"http://www.dianping.com/start a\">"); assertThat(replacedHtml).isEqualTo("<a href=\"http://www.dianping.com/start%20a\">");
originHtml = "<a href=/start tag>"; originHtml = "<a href=/start tag>";
replacedHtml = UrlUtils.fixAllRelativeHrefs(originHtml, "http://www.dianping.com/"); replacedHtml = UrlUtils.fixAllRelativeHrefs(originHtml, "http://www.dianping.com/");
assertThat(replacedHtml).isEqualTo("<a href=\"http://www.dianping.com/start\" tag>"); assertThat(replacedHtml).isEqualTo("<a href=\"http://www.dianping.com/start\" tag>");
} }
/**
 * Resolves a relative URL containing a space against a base URL and checks
 * that the space comes back percent-encoded in the canonical form.
 */
@Test
public void test(){
    String canonicalUrl = UrlUtils.canonicalizeUrl("start tag", "http://www.dianping.com/");
    // Previously the result was discarded, so the test could only fail on an exception.
    assertThat(canonicalUrl).isEqualTo("http://www.dianping.com/start%20tag");
}
@Test @Test
public void testGetDomain(){ public void testGetDomain(){
String url = "http://www.dianping.com/aa/"; String url = "http://www.dianping.com/aa/";