From bc6e81e00f23bf0c89ae88ac3b5c4cedf43380a7 Mon Sep 17 00:00:00 2001 From: GZhY Date: Sun, 9 Apr 2017 20:40:00 +0800 Subject: [PATCH 1/3] =?UTF-8?q?=E4=BF=AE=E5=A4=8DcheckElementAndConvert?= =?UTF-8?q?=E6=96=B9=E6=B3=95=E6=B3=A8=E9=87=8A=E4=B8=AD=E6=B3=A8=E9=87=8A?= =?UTF-8?q?=E9=94=99=E8=AF=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../src/main/java/us/codecraft/webmagic/selector/HtmlNode.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java index 89de5a6..c063b48 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java @@ -90,7 +90,7 @@ public class HtmlNode extends AbstractSelectable { * See: https://github.com/code4craft/webmagic/issues/113 * * @param elementIterator elementIterator - * @param element element + * @return element element */ private Element checkElementAndConvert(ListIterator elementIterator) { Element element = elementIterator.next(); From ce3f0ac23968acee622017220f67d2d0874a477b Mon Sep 17 00:00:00 2001 From: GZhY Date: Sun, 9 Apr 2017 21:01:32 +0800 Subject: [PATCH 2/3] =?UTF-8?q?=E5=88=A0=E9=99=A4=20fixAllRelativeHrefs=20?= =?UTF-8?q?=E5=B9=B6=E4=BF=AE=E5=A4=8D=20SeleniumDownloader=20=E5=AF=B9=20?= =?UTF-8?q?fixAllRelativeHrefs=20=E7=9A=84=E4=BE=9D=E8=B5=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../us/codecraft/webmagic/utils/UrlUtils.java | 35 ------------------- .../webmagic/utils/UrlUtilsTest.java | 19 ---------- .../selenium/SeleniumDownloader.java | 5 +-- 3 files changed, 1 insertion(+), 58 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java index 72a9d3f..6864606 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java @@ -92,41 +92,6 @@ public class UrlUtils { } } - /** - * allow blank space in quote - */ - private static Pattern patternForHrefWithQuote = Pattern.compile("(]*href=)[\"']([^\"'<>]*)[\"']", Pattern.CASE_INSENSITIVE); - - /** - * disallow blank space without quote - */ - private static Pattern patternForHrefWithoutQuote = Pattern.compile("(]*href=)([^\"'<>\\s]+)", Pattern.CASE_INSENSITIVE); - - public static String fixAllRelativeHrefs(String html, String url) { - html = replaceByPattern(html, url, patternForHrefWithQuote); - html = replaceByPattern(html, url, patternForHrefWithoutQuote); - return html; - } - - public static String replaceByPattern(String html, String url, Pattern pattern) { - StringBuilder stringBuilder = new StringBuilder(); - Matcher matcher = pattern.matcher(html); - int lastEnd = 0; - boolean modified = false; - while (matcher.find()) { - modified = true; - stringBuilder.append(StringUtils.substring(html, lastEnd, matcher.start())); - stringBuilder.append(matcher.group(1)); - stringBuilder.append("\"").append(canonicalizeUrl(matcher.group(2), url)).append("\""); - lastEnd = matcher.end(); - } - if (!modified) { - return html; - } - stringBuilder.append(StringUtils.substring(html, lastEnd)); - return stringBuilder.toString(); - } - public static List convertToRequests(Collection urls) { List requestList = new ArrayList(urls.size()); for (String url : urls) { diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java index a90304d..6afdeef 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java @@ -33,25 +33,6 @@ public class UrlUtilsTest { assertThat(absoluteUrl).isEqualTo("http://www.dianping.com/aa"); } - @Test - public void testFixAllRelativeHrefs() { - String originHtml = ""; - String replacedHtml = UrlUtils.fixAllRelativeHrefs(originHtml, "http://www.dianping.com/"); - assertThat(replacedHtml).isEqualTo(""); - - originHtml = ""; - replacedHtml = UrlUtils.fixAllRelativeHrefs(originHtml, "http://www.dianping.com/"); - assertThat(replacedHtml).isEqualTo(""); - - originHtml = ""; - replacedHtml = UrlUtils.fixAllRelativeHrefs(originHtml, "http://www.dianping.com/"); - assertThat(replacedHtml).isEqualTo(""); - - originHtml = ""; - replacedHtml = UrlUtils.fixAllRelativeHrefs(originHtml, "http://www.dianping.com/"); - assertThat(replacedHtml).isEqualTo(""); - } - @Test public void testGetDomain(){ String url = "http://www.dianping.com/aa/"; diff --git a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java index 6e350aa..f45f7e2 100644 --- a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java +++ b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java @@ -5,7 +5,6 @@ import org.openqa.selenium.By; import org.openqa.selenium.Cookie; import org.openqa.selenium.WebDriver; import org.openqa.selenium.WebElement; - import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; @@ -13,7 +12,6 @@ import us.codecraft.webmagic.Task; import us.codecraft.webmagic.downloader.Downloader; import us.codecraft.webmagic.selector.Html; import us.codecraft.webmagic.selector.PlainText; -import us.codecraft.webmagic.utils.UrlUtils; import java.io.Closeable; import java.io.IOException; @@ -108,8 +106,7 @@ public class SeleniumDownloader implements Downloader, Closeable { String content = webElement.getAttribute("outerHTML"); Page page = new Page(); page.setRawText(content); - page.setHtml(new Html(UrlUtils.fixAllRelativeHrefs(content, - request.getUrl()))); + page.setHtml(new Html(content, request.getUrl())); page.setUrl(new PlainText(request.getUrl())); page.setRequest(request); webDriverPool.returnToPool(webDriver); From 5f34adf938972e235716dff87c8d932dc2595af8 Mon Sep 17 00:00:00 2001 From: GZhY Date: Sun, 9 Apr 2017 21:29:01 +0800 Subject: [PATCH 3/3] =?UTF-8?q?=E5=AE=8C=E5=96=84=20LinksSelector.selectLi?= =?UTF-8?q?st=20=E7=9A=84=E6=B5=8B=E8=AF=95=E7=94=A8=E4=BE=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../us/codecraft/webmagic/selector/LinksSelectorTest.java | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/LinksSelectorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/LinksSelectorTest.java index 3fcb71b..75a2913 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/LinksSelectorTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/LinksSelectorTest.java @@ -1,5 +1,6 @@ package us.codecraft.webmagic.selector; +import org.jsoup.Jsoup; import org.junit.Test; import java.util.List; @@ -15,7 +16,12 @@ public class LinksSelectorTest { @Test public void testLinks() throws Exception { - List links = new LinksSelector().selectList(html); + LinksSelector linksSelector = new LinksSelector(); + List links = linksSelector.selectList(html); + System.out.println(links); + + html = "
"; + links = linksSelector.selectList(Jsoup.parse(html, "http://whatever.com/")); System.out.println(links); } }