From 52fd5cfc1c8f4f5385aec7a2af04101074eeda40 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Tue, 30 Jul 2013 15:24:59 +0800 Subject: [PATCH] fix encoding --- pom.xml | 1 + .../webmagic/utils/UrlUtilsTest.java | 5 +- .../downloader/SeleniumDownloaderTest.java | 64 ++++++++++++------- 3 files changed, 44 insertions(+), 26 deletions(-) diff --git a/pom.xml b/pom.xml index 086437d..5974eae 100644 --- a/pom.xml +++ b/pom.xml @@ -69,6 +69,7 @@ 1.6 1.6 + UTF-8 diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java index cd55b2c..7ac7aa0 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java @@ -25,10 +25,7 @@ public class UrlUtilsTest { Assert.assertEquals("http://www.dianping.com/aa", fixrelativeurl); fixrelativeurl = UrlUtils.canonicalizeUrl("..aa", "http://www.dianping.com/sh/ss/com"); Assert.assertEquals("http://www.dianping.com/sh/ss/..aa", fixrelativeurl); -// fixrelativeurl = fixrelativeurl("/aa", "http://www.dianping.com"); -// System.out.println("fix: " + fixrelativeurl); -// fixrelativeurl = fixrelativeurl("/aa", "http://www.dianping.com/"); -// System.out.println("fix: " + fixrelativeurl); + fixrelativeurl = UrlUtils.canonicalizeUrl("../../aa", "http://www.dianping.com/sh/ss/com"); } @Test diff --git a/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloaderTest.java b/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloaderTest.java index 4aa9919..9683083 100644 --- a/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloaderTest.java +++ b/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloaderTest.java @@ -10,32 +10,52 @@ import 
us.codecraft.webmagic.Task; /** * @author yihua.huang@dianping.com
* @date: 13-7-26
- * Time: 下午2:46
+ * Time: 下午2:46
*/ public class SeleniumDownloaderTest { - private String chromeDriverPath = "/Users/yihua/Downloads/chromedriver"; + private String chromeDriverPath = "/Users/yihua/Downloads/chromedriver"; - @Ignore("need chrome driver") - @Test - public void test() { - SeleniumDownloader seleniumDownloader = new SeleniumDownloader(chromeDriverPath); - long time1 = System.currentTimeMillis(); - for (int i = 0; i < 100; i++) { - Page page = seleniumDownloader.download(new Request("http://huaban.com/"), new Task() { - @Override - public String getUUID() { - return "huaban.com"; - } + @Ignore("need chrome driver") + @Test + public void test() { + SeleniumDownloader seleniumDownloader = new SeleniumDownloader(chromeDriverPath); + long time1 = System.currentTimeMillis(); + for (int i = 0; i < 100; i++) { + Page page = seleniumDownloader.download(new Request("http://huaban.com/"), new Task() { + @Override + public String getUUID() { + return "huaban.com"; + } - @Override - public Site getSite() { - return Site.me(); - } - }); - System.out.println(page.getHtml().$("#waterfall").links().regex(".*pins.*").all()); - } - System.out.println(System.currentTimeMillis() - time1); - } + @Override + public Site getSite() { + return Site.me(); + } + }); + System.out.println(page.getHtml().$("#waterfall").links().regex(".*pins.*").all()); + } + System.out.println(System.currentTimeMillis() - time1); + } + + @Ignore + @Test + public void testBaiduWenku() { + SeleniumDownloader seleniumDownloader = new SeleniumDownloader(chromeDriverPath); + seleniumDownloader.setSleepTime(10000); + long time1 = System.currentTimeMillis(); + Page page = seleniumDownloader.download(new Request("http://wenku.baidu.com/view/462933ff04a1b0717fd5ddc2.html"), new Task() { + @Override + public String getUUID() { + return "huaban.com"; + } + + @Override + public Site getSite() { + return Site.me(); + } + }); + System.out.println(page.getHtml().$("div.inner").replace("<[^<>]+>","").replace("&nsbp;","").all()); + } }