删除 fixAllRelativeHrefs 并修复 SeleniumDownloader 对 fixAllRelativeHrefs 的依赖
parent
bc6e81e00f
commit
ce3f0ac239
|
@ -92,41 +92,6 @@ public class UrlUtils {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* allow blank space in quote
|
|
||||||
*/
|
|
||||||
private static Pattern patternForHrefWithQuote = Pattern.compile("(<a[^<>]*href=)[\"']([^\"'<>]*)[\"']", Pattern.CASE_INSENSITIVE);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* disallow blank space without quote
|
|
||||||
*/
|
|
||||||
private static Pattern patternForHrefWithoutQuote = Pattern.compile("(<a[^<>]*href=)([^\"'<>\\s]+)", Pattern.CASE_INSENSITIVE);
|
|
||||||
|
|
||||||
public static String fixAllRelativeHrefs(String html, String url) {
|
|
||||||
html = replaceByPattern(html, url, patternForHrefWithQuote);
|
|
||||||
html = replaceByPattern(html, url, patternForHrefWithoutQuote);
|
|
||||||
return html;
|
|
||||||
}
|
|
||||||
|
|
||||||
public static String replaceByPattern(String html, String url, Pattern pattern) {
|
|
||||||
StringBuilder stringBuilder = new StringBuilder();
|
|
||||||
Matcher matcher = pattern.matcher(html);
|
|
||||||
int lastEnd = 0;
|
|
||||||
boolean modified = false;
|
|
||||||
while (matcher.find()) {
|
|
||||||
modified = true;
|
|
||||||
stringBuilder.append(StringUtils.substring(html, lastEnd, matcher.start()));
|
|
||||||
stringBuilder.append(matcher.group(1));
|
|
||||||
stringBuilder.append("\"").append(canonicalizeUrl(matcher.group(2), url)).append("\"");
|
|
||||||
lastEnd = matcher.end();
|
|
||||||
}
|
|
||||||
if (!modified) {
|
|
||||||
return html;
|
|
||||||
}
|
|
||||||
stringBuilder.append(StringUtils.substring(html, lastEnd));
|
|
||||||
return stringBuilder.toString();
|
|
||||||
}
|
|
||||||
|
|
||||||
public static List<Request> convertToRequests(Collection<String> urls) {
|
public static List<Request> convertToRequests(Collection<String> urls) {
|
||||||
List<Request> requestList = new ArrayList<Request>(urls.size());
|
List<Request> requestList = new ArrayList<Request>(urls.size());
|
||||||
for (String url : urls) {
|
for (String url : urls) {
|
||||||
|
|
|
@ -33,25 +33,6 @@ public class UrlUtilsTest {
|
||||||
assertThat(absoluteUrl).isEqualTo("http://www.dianping.com/aa");
|
assertThat(absoluteUrl).isEqualTo("http://www.dianping.com/aa");
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
|
||||||
public void testFixAllRelativeHrefs() {
|
|
||||||
String originHtml = "<a href=\"/start\">";
|
|
||||||
String replacedHtml = UrlUtils.fixAllRelativeHrefs(originHtml, "http://www.dianping.com/");
|
|
||||||
assertThat(replacedHtml).isEqualTo("<a href=\"http://www.dianping.com/start\">");
|
|
||||||
|
|
||||||
originHtml = "<a href=\"/start a\">";
|
|
||||||
replacedHtml = UrlUtils.fixAllRelativeHrefs(originHtml, "http://www.dianping.com/");
|
|
||||||
assertThat(replacedHtml).isEqualTo("<a href=\"http://www.dianping.com/start%20a\">");
|
|
||||||
|
|
||||||
originHtml = "<a href='/start a'>";
|
|
||||||
replacedHtml = UrlUtils.fixAllRelativeHrefs(originHtml, "http://www.dianping.com/");
|
|
||||||
assertThat(replacedHtml).isEqualTo("<a href=\"http://www.dianping.com/start%20a\">");
|
|
||||||
|
|
||||||
originHtml = "<a href=/start tag>";
|
|
||||||
replacedHtml = UrlUtils.fixAllRelativeHrefs(originHtml, "http://www.dianping.com/");
|
|
||||||
assertThat(replacedHtml).isEqualTo("<a href=\"http://www.dianping.com/start\" tag>");
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testGetDomain(){
|
public void testGetDomain(){
|
||||||
String url = "http://www.dianping.com/aa/";
|
String url = "http://www.dianping.com/aa/";
|
||||||
|
|
|
@ -5,7 +5,6 @@ import org.openqa.selenium.By;
|
||||||
import org.openqa.selenium.Cookie;
|
import org.openqa.selenium.Cookie;
|
||||||
import org.openqa.selenium.WebDriver;
|
import org.openqa.selenium.WebDriver;
|
||||||
import org.openqa.selenium.WebElement;
|
import org.openqa.selenium.WebElement;
|
||||||
|
|
||||||
import us.codecraft.webmagic.Page;
|
import us.codecraft.webmagic.Page;
|
||||||
import us.codecraft.webmagic.Request;
|
import us.codecraft.webmagic.Request;
|
||||||
import us.codecraft.webmagic.Site;
|
import us.codecraft.webmagic.Site;
|
||||||
|
@ -13,7 +12,6 @@ import us.codecraft.webmagic.Task;
|
||||||
import us.codecraft.webmagic.downloader.Downloader;
|
import us.codecraft.webmagic.downloader.Downloader;
|
||||||
import us.codecraft.webmagic.selector.Html;
|
import us.codecraft.webmagic.selector.Html;
|
||||||
import us.codecraft.webmagic.selector.PlainText;
|
import us.codecraft.webmagic.selector.PlainText;
|
||||||
import us.codecraft.webmagic.utils.UrlUtils;
|
|
||||||
|
|
||||||
import java.io.Closeable;
|
import java.io.Closeable;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
@ -108,8 +106,7 @@ public class SeleniumDownloader implements Downloader, Closeable {
|
||||||
String content = webElement.getAttribute("outerHTML");
|
String content = webElement.getAttribute("outerHTML");
|
||||||
Page page = new Page();
|
Page page = new Page();
|
||||||
page.setRawText(content);
|
page.setRawText(content);
|
||||||
page.setHtml(new Html(UrlUtils.fixAllRelativeHrefs(content,
|
page.setHtml(new Html(content, request.getUrl()));
|
||||||
request.getUrl())));
|
|
||||||
page.setUrl(new PlainText(request.getUrl()));
|
page.setUrl(new PlainText(request.getUrl()));
|
||||||
page.setRequest(request);
|
page.setRequest(request);
|
||||||
webDriverPool.returnToPool(webDriver);
|
webDriverPool.returnToPool(webDriver);
|
||||||
|
|
Loading…
Reference in New Issue