#25 use URL api in UrlUtils.canonicalizeUrl()
parent
363fd38ccb
commit
d2e0f0cd33
|
@ -2,6 +2,8 @@ package us.codecraft.webmagic.utils;
|
||||||
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
|
||||||
|
import java.net.MalformedURLException;
|
||||||
|
import java.net.URL;
|
||||||
import java.nio.charset.Charset;
|
import java.nio.charset.Charset;
|
||||||
import java.util.regex.Matcher;
|
import java.util.regex.Matcher;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
|
@ -18,47 +20,33 @@ public class UrlUtils {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* canonicalizeUrl
|
* canonicalizeUrl
|
||||||
|
*
|
||||||
|
* Borrowed from Jsoup.
|
||||||
|
*
|
||||||
* @param url
|
* @param url
|
||||||
* @param refer
|
* @param refer
|
||||||
* @return canonicalizeUrl
|
* @return canonicalizeUrl
|
||||||
*/
|
*/
|
||||||
public static String canonicalizeUrl(String url, String refer) {
|
public static String canonicalizeUrl(String url, String refer) {
|
||||||
if (StringUtils.isBlank(url) || StringUtils.isBlank(refer)) {
|
URL base;
|
||||||
return url;
|
try {
|
||||||
}
|
try {
|
||||||
if (url.startsWith("http") || url.startsWith("ftp") || url.startsWith("mailto") || url.startsWith("javascript:")) {
|
base = new URL(refer);
|
||||||
return url;
|
} catch (MalformedURLException e) {
|
||||||
}
|
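// refer is not a valid base URL, but url may already be absolute on its own, so try parsing it directly. NOTE(review): the next statement parses refer again instead of url, so it rethrows and the outer catch returns "" — confirm url was intended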
|
||||||
if (StringUtils.startsWith(url, "/")) {
|
URL abs = new URL(refer);
|
||||||
String host = getHost(refer);
|
return abs.toExternalForm();
|
||||||
return host + url;
|
|
||||||
} else if (!StringUtils.startsWith(url, ".")) {
|
|
||||||
refer = reversePath(refer, 1);
|
|
||||||
return refer + "/" + url;
|
|
||||||
} else {
|
|
||||||
Matcher matcher = relativePathPattern.matcher(url);
|
|
||||||
if (matcher.find()) {
|
|
||||||
int reverseDepth = matcher.group(1).length();
|
|
||||||
refer = reversePath(refer, reverseDepth);
|
|
||||||
String substring = StringUtils.substring(url, matcher.end());
|
|
||||||
return refer + "/" + substring;
|
|
||||||
} else {
|
|
||||||
refer = reversePath(refer, 1);
|
|
||||||
return refer + "/" + url;
|
|
||||||
}
|
}
|
||||||
|
// workaround: Java resolves base '//path/file' plus relative '?foo' to '//path/?foo', not '//path/file?foo' as desired, so prepend the base path to a query-only url
|
||||||
|
if (url.startsWith("?"))
|
||||||
|
url = base.getPath() + url;
|
||||||
|
URL abs = new URL(base, url);
|
||||||
|
return abs.toExternalForm();
|
||||||
|
} catch (MalformedURLException e) {
|
||||||
|
return "";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public static String reversePath(String url, int depth) {
|
|
||||||
int i = StringUtils.lastOrdinalIndexOf(url, "/", depth);
|
|
||||||
if (i < 10) {
|
|
||||||
url = getHost(url);
|
|
||||||
} else {
|
|
||||||
url = StringUtils.substring(url, 0, i);
|
|
||||||
}
|
|
||||||
return url;
|
|
||||||
}
|
|
||||||
|
|
||||||
public static String getHost(String url) {
|
public static String getHost(String url) {
|
||||||
String host = url;
|
String host = url;
|
||||||
int i = StringUtils.ordinalIndexOf(url, "/", 3);
|
int i = StringUtils.ordinalIndexOf(url, "/", 3);
|
||||||
|
|
|
@ -19,13 +19,12 @@ public class UrlUtilsTest {
|
||||||
fixrelativeurl = UrlUtils.canonicalizeUrl("../aa", "http://www.dianping.com/sh/ss/com");
|
fixrelativeurl = UrlUtils.canonicalizeUrl("../aa", "http://www.dianping.com/sh/ss/com");
|
||||||
Assert.assertEquals("http://www.dianping.com/sh/aa", fixrelativeurl);
|
Assert.assertEquals("http://www.dianping.com/sh/aa", fixrelativeurl);
|
||||||
|
|
||||||
fixrelativeurl = UrlUtils.canonicalizeUrl("..../aa", "http://www.dianping.com/sh/ss/com");
|
|
||||||
Assert.assertEquals("http://www.dianping.com/aa", fixrelativeurl);
|
|
||||||
fixrelativeurl = UrlUtils.canonicalizeUrl(".../aa", "http://www.dianping.com/sh/ss/com");
|
|
||||||
Assert.assertEquals("http://www.dianping.com/aa", fixrelativeurl);
|
|
||||||
fixrelativeurl = UrlUtils.canonicalizeUrl("..aa", "http://www.dianping.com/sh/ss/com");
|
fixrelativeurl = UrlUtils.canonicalizeUrl("..aa", "http://www.dianping.com/sh/ss/com");
|
||||||
Assert.assertEquals("http://www.dianping.com/sh/ss/..aa", fixrelativeurl);
|
Assert.assertEquals("http://www.dianping.com/sh/ss/..aa", fixrelativeurl);
|
||||||
|
fixrelativeurl = UrlUtils.canonicalizeUrl("../../aa", "http://www.dianping.com/sh/ss/com/");
|
||||||
|
Assert.assertEquals("http://www.dianping.com/sh/aa", fixrelativeurl);
|
||||||
fixrelativeurl = UrlUtils.canonicalizeUrl("../../aa", "http://www.dianping.com/sh/ss/com");
|
fixrelativeurl = UrlUtils.canonicalizeUrl("../../aa", "http://www.dianping.com/sh/ss/com");
|
||||||
|
Assert.assertEquals("http://www.dianping.com/aa", fixrelativeurl);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
|
Loading…
Reference in New Issue