encode illegal characters in url #80
parent
2780423e60
commit
01848301d4
|
@ -43,12 +43,22 @@ public class UrlUtils {
|
||||||
if (url.startsWith("?"))
|
if (url.startsWith("?"))
|
||||||
url = base.getPath() + url;
|
url = base.getPath() + url;
|
||||||
URL abs = new URL(base, url);
|
URL abs = new URL(base, url);
|
||||||
return abs.toExternalForm();
|
return encodeIllegalCharacterInUrl(abs.toExternalForm());
|
||||||
} catch (MalformedURLException e) {
|
} catch (MalformedURLException e) {
|
||||||
return "";
|
return "";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @param url
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public static String encodeIllegalCharacterInUrl(String url) {
|
||||||
|
//TODO more charator support
|
||||||
|
return url.replace(" ", "%20");
|
||||||
|
}
|
||||||
|
|
||||||
public static String getHost(String url) {
|
public static String getHost(String url) {
|
||||||
String host = url;
|
String host = url;
|
||||||
int i = StringUtils.ordinalIndexOf(url, "/", 3);
|
int i = StringUtils.ordinalIndexOf(url, "/", 3);
|
||||||
|
@ -101,9 +111,9 @@ public class UrlUtils {
|
||||||
stringBuilder.append("\"").append(canonicalizeUrl(matcher.group(2), url)).append("\"");
|
stringBuilder.append("\"").append(canonicalizeUrl(matcher.group(2), url)).append("\"");
|
||||||
lastEnd = matcher.end();
|
lastEnd = matcher.end();
|
||||||
}
|
}
|
||||||
if (!modified) {
|
if (!modified) {
|
||||||
return html;
|
return html;
|
||||||
}
|
}
|
||||||
stringBuilder.append(StringUtils.substring(html, lastEnd));
|
stringBuilder.append(StringUtils.substring(html, lastEnd));
|
||||||
return stringBuilder.toString();
|
return stringBuilder.toString();
|
||||||
}
|
}
|
||||||
|
|
|
@ -8,6 +8,8 @@ import us.codecraft.webmagic.Site;
|
||||||
import us.codecraft.webmagic.Task;
|
import us.codecraft.webmagic.Task;
|
||||||
import us.codecraft.webmagic.selector.Html;
|
import us.codecraft.webmagic.selector.Html;
|
||||||
|
|
||||||
|
import java.io.UnsupportedEncodingException;
|
||||||
|
|
||||||
import static org.assertj.core.api.Assertions.assertThat;
|
import static org.assertj.core.api.Assertions.assertThat;
|
||||||
import static org.junit.Assert.assertTrue;
|
import static org.junit.Assert.assertTrue;
|
||||||
|
|
||||||
|
@ -32,6 +34,12 @@ public class HttpClientDownloaderTest {
|
||||||
assertTrue(!html.getText().isEmpty());
|
assertTrue(!html.getText().isEmpty());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Expects HttpClientDownloader.download to throw IllegalArgumentException
// when it is handed a URL that contains an unencoded '>' character.
@Test(expected = IllegalArgumentException.class)
|
||||||
|
public void testDownloaderInIllegalUrl() throws UnsupportedEncodingException {
|
||||||
|
HttpClientDownloader httpClientDownloader = new HttpClientDownloader();
|
||||||
|
httpClientDownloader.download("http://www.oschina.net/>");
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testCycleTriedTimes() {
|
public void testCycleTriedTimes() {
|
||||||
HttpClientDownloader httpClientDownloader = new HttpClientDownloader();
|
HttpClientDownloader httpClientDownloader = new HttpClientDownloader();
|
||||||
|
|
|
@ -38,17 +38,22 @@ public class UrlUtilsTest {
|
||||||
|
|
||||||
originHtml = "<a href=\"/start a\">";
|
originHtml = "<a href=\"/start a\">";
|
||||||
replacedHtml = UrlUtils.fixAllRelativeHrefs(originHtml, "http://www.dianping.com/");
|
replacedHtml = UrlUtils.fixAllRelativeHrefs(originHtml, "http://www.dianping.com/");
|
||||||
assertThat(replacedHtml).isEqualTo("<a href=\"http://www.dianping.com/start a\">");
|
assertThat(replacedHtml).isEqualTo("<a href=\"http://www.dianping.com/start%20a\">");
|
||||||
|
|
||||||
originHtml = "<a href='/start a'>";
|
originHtml = "<a href='/start a'>";
|
||||||
replacedHtml = UrlUtils.fixAllRelativeHrefs(originHtml, "http://www.dianping.com/");
|
replacedHtml = UrlUtils.fixAllRelativeHrefs(originHtml, "http://www.dianping.com/");
|
||||||
assertThat(replacedHtml).isEqualTo("<a href=\"http://www.dianping.com/start a\">");
|
assertThat(replacedHtml).isEqualTo("<a href=\"http://www.dianping.com/start%20a\">");
|
||||||
|
|
||||||
originHtml = "<a href=/start tag>";
|
originHtml = "<a href=/start tag>";
|
||||||
replacedHtml = UrlUtils.fixAllRelativeHrefs(originHtml, "http://www.dianping.com/");
|
replacedHtml = UrlUtils.fixAllRelativeHrefs(originHtml, "http://www.dianping.com/");
|
||||||
assertThat(replacedHtml).isEqualTo("<a href=\"http://www.dianping.com/start\" tag>");
|
assertThat(replacedHtml).isEqualTo("<a href=\"http://www.dianping.com/start\" tag>");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Smoke test: resolves the relative URL "start tag" (which contains a space)
// against the base URL. The result is discarded and nothing is asserted.
@Test
|
||||||
|
public void test(){
|
||||||
|
UrlUtils.canonicalizeUrl("start tag", "http://www.dianping.com/");
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testGetDomain(){
|
public void testGetDomain(){
|
||||||
String url = "http://www.dianping.com/aa/";
|
String url = "http://www.dianping.com/aa/";
|
||||||
|
|
Loading…
Reference in New Issue