Allow blank space inside quoted href values in UrlUtils.fixAllRelativeHrefs #80
parent
97b6f46280
commit
2780423e60
|
@ -73,18 +73,37 @@ public class UrlUtils {
|
||||||
return domain;
|
return domain;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static Pattern patternForHref = Pattern.compile("(<a[^<>]*href=)[\"']{0,1}([^\"'<>\\s]*)[\"']{0,1}", Pattern.CASE_INSENSITIVE);
|
/**
|
||||||
|
* Matches a quoted href value; blank space is allowed inside the quotes.
|
||||||
|
*/
|
||||||
|
private static Pattern patternForHrefWithQuote = Pattern.compile("(<a[^<>]*href=)[\"']([^\"'<>]*)[\"']", Pattern.CASE_INSENSITIVE);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Matches an unquoted href value, which cannot contain blank space.
|
||||||
|
*/
|
||||||
|
private static Pattern patternForHrefWithoutQuote = Pattern.compile("(<a[^<>]*href=)([^\"'<>\\s]+)", Pattern.CASE_INSENSITIVE);
|
||||||
|
|
||||||
public static String fixAllRelativeHrefs(String html, String url) {
|
public static String fixAllRelativeHrefs(String html, String url) {
|
||||||
|
html = replaceByPattern(html, url, patternForHrefWithQuote);
|
||||||
|
html = replaceByPattern(html, url, patternForHrefWithoutQuote);
|
||||||
|
return html;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static String replaceByPattern(String html, String url, Pattern pattern) {
|
||||||
StringBuilder stringBuilder = new StringBuilder();
|
StringBuilder stringBuilder = new StringBuilder();
|
||||||
Matcher matcher = patternForHref.matcher(html);
|
Matcher matcher = pattern.matcher(html);
|
||||||
int lastEnd = 0;
|
int lastEnd = 0;
|
||||||
|
boolean modified = false;
|
||||||
while (matcher.find()) {
|
while (matcher.find()) {
|
||||||
|
modified = true;
|
||||||
stringBuilder.append(StringUtils.substring(html, lastEnd, matcher.start()));
|
stringBuilder.append(StringUtils.substring(html, lastEnd, matcher.start()));
|
||||||
stringBuilder.append(matcher.group(1));
|
stringBuilder.append(matcher.group(1));
|
||||||
stringBuilder.append("\"").append(canonicalizeUrl(matcher.group(2), url)).append("\"");
|
stringBuilder.append("\"").append(canonicalizeUrl(matcher.group(2), url)).append("\"");
|
||||||
lastEnd = matcher.end();
|
lastEnd = matcher.end();
|
||||||
}
|
}
|
||||||
|
if (!modified) {
|
||||||
|
return html;
|
||||||
|
}
|
||||||
stringBuilder.append(StringUtils.substring(html, lastEnd));
|
stringBuilder.append(StringUtils.substring(html, lastEnd));
|
||||||
return stringBuilder.toString();
|
return stringBuilder.toString();
|
||||||
}
|
}
|
||||||
|
|
|
@ -3,6 +3,8 @@ package us.codecraft.webmagic.utils;
|
||||||
import org.junit.Assert;
|
import org.junit.Assert;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
|
||||||
|
import static org.assertj.core.api.Assertions.assertThat;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
* Date: 13-4-21
|
* Date: 13-4-21
|
||||||
|
@ -12,19 +14,39 @@ public class UrlUtilsTest {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testFixRelativeUrl() {
|
public void testFixRelativeUrl() {
|
||||||
String fixrelativeurl = UrlUtils.canonicalizeUrl("aa", "http://www.dianping.com/sh/ss/com");
|
String absoluteUrl = UrlUtils.canonicalizeUrl("aa", "http://www.dianping.com/sh/ss/com");
|
||||||
System.out.println("fix: " + fixrelativeurl);
|
assertThat(absoluteUrl).isEqualTo("http://www.dianping.com/sh/ss/aa");
|
||||||
Assert.assertEquals("http://www.dianping.com/sh/ss/aa", fixrelativeurl);
|
|
||||||
|
|
||||||
fixrelativeurl = UrlUtils.canonicalizeUrl("../aa", "http://www.dianping.com/sh/ss/com");
|
absoluteUrl = UrlUtils.canonicalizeUrl("../aa", "http://www.dianping.com/sh/ss/com");
|
||||||
Assert.assertEquals("http://www.dianping.com/sh/aa", fixrelativeurl);
|
assertThat(absoluteUrl).isEqualTo("http://www.dianping.com/sh/aa");
|
||||||
|
|
||||||
fixrelativeurl = UrlUtils.canonicalizeUrl("..aa", "http://www.dianping.com/sh/ss/com");
|
absoluteUrl = UrlUtils.canonicalizeUrl("..aa", "http://www.dianping.com/sh/ss/com");
|
||||||
Assert.assertEquals("http://www.dianping.com/sh/ss/..aa", fixrelativeurl);
|
assertThat(absoluteUrl).isEqualTo("http://www.dianping.com/sh/ss/..aa");
|
||||||
fixrelativeurl = UrlUtils.canonicalizeUrl("../../aa", "http://www.dianping.com/sh/ss/com/");
|
|
||||||
Assert.assertEquals("http://www.dianping.com/sh/aa", fixrelativeurl);
|
absoluteUrl = UrlUtils.canonicalizeUrl("../../aa", "http://www.dianping.com/sh/ss/com/");
|
||||||
fixrelativeurl = UrlUtils.canonicalizeUrl("../../aa", "http://www.dianping.com/sh/ss/com");
|
assertThat(absoluteUrl).isEqualTo("http://www.dianping.com/sh/aa");
|
||||||
Assert.assertEquals("http://www.dianping.com/aa", fixrelativeurl);
|
|
||||||
|
absoluteUrl = UrlUtils.canonicalizeUrl("../../aa", "http://www.dianping.com/sh/ss/com");
|
||||||
|
assertThat(absoluteUrl).isEqualTo("http://www.dianping.com/aa");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testFixAllRelativeHrefs() {
|
||||||
|
String originHtml = "<a href=\"/start\">";
|
||||||
|
String replacedHtml = UrlUtils.fixAllRelativeHrefs(originHtml, "http://www.dianping.com/");
|
||||||
|
assertThat(replacedHtml).isEqualTo("<a href=\"http://www.dianping.com/start\">");
|
||||||
|
|
||||||
|
originHtml = "<a href=\"/start a\">";
|
||||||
|
replacedHtml = UrlUtils.fixAllRelativeHrefs(originHtml, "http://www.dianping.com/");
|
||||||
|
assertThat(replacedHtml).isEqualTo("<a href=\"http://www.dianping.com/start a\">");
|
||||||
|
|
||||||
|
originHtml = "<a href='/start a'>";
|
||||||
|
replacedHtml = UrlUtils.fixAllRelativeHrefs(originHtml, "http://www.dianping.com/");
|
||||||
|
assertThat(replacedHtml).isEqualTo("<a href=\"http://www.dianping.com/start a\">");
|
||||||
|
|
||||||
|
originHtml = "<a href=/start tag>";
|
||||||
|
replacedHtml = UrlUtils.fixAllRelativeHrefs(originHtml, "http://www.dianping.com/");
|
||||||
|
assertThat(replacedHtml).isEqualTo("<a href=\"http://www.dianping.com/start\" tag>");
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
|
Loading…
Reference in New Issue