fix encoding

master
yihua.huang 2013-07-30 15:24:59 +08:00
parent 78cfb4d554
commit 52fd5cfc1c
3 changed files with 44 additions and 26 deletions

View File

@ -69,6 +69,7 @@
<configuration> <configuration>
<source>1.6</source> <source>1.6</source>
<target>1.6</target> <target>1.6</target>
<encoding>UTF-8</encoding>
</configuration> </configuration>
</plugin> </plugin>
<plugin> <plugin>

View File

@ -25,10 +25,7 @@ public class UrlUtilsTest {
Assert.assertEquals("http://www.dianping.com/aa", fixrelativeurl); Assert.assertEquals("http://www.dianping.com/aa", fixrelativeurl);
fixrelativeurl = UrlUtils.canonicalizeUrl("..aa", "http://www.dianping.com/sh/ss/com"); fixrelativeurl = UrlUtils.canonicalizeUrl("..aa", "http://www.dianping.com/sh/ss/com");
Assert.assertEquals("http://www.dianping.com/sh/ss/..aa", fixrelativeurl); Assert.assertEquals("http://www.dianping.com/sh/ss/..aa", fixrelativeurl);
// fixrelativeurl = fixrelativeurl("/aa", "http://www.dianping.com"); fixrelativeurl = UrlUtils.canonicalizeUrl("../../aa", "http://www.dianping.com/sh/ss/com");
// System.out.println("fix: " + fixrelativeurl);
// fixrelativeurl = fixrelativeurl("/aa", "http://www.dianping.com/");
// System.out.println("fix: " + fixrelativeurl);
} }
@Test @Test

View File

@ -10,32 +10,52 @@ import us.codecraft.webmagic.Task;
/** /**
* @author yihua.huang@dianping.com <br> * @author yihua.huang@dianping.com <br>
* @date: 13-7-26 <br> * @date: 13-7-26 <br>
* Time: 2:46 <br> * Time: 2:46 <br>
*/ */
public class SeleniumDownloaderTest { public class SeleniumDownloaderTest {
private String chromeDriverPath = "/Users/yihua/Downloads/chromedriver"; private String chromeDriverPath = "/Users/yihua/Downloads/chromedriver";
@Ignore("need chrome driver") @Ignore("need chrome driver")
@Test @Test
public void test() { public void test() {
SeleniumDownloader seleniumDownloader = new SeleniumDownloader(chromeDriverPath); SeleniumDownloader seleniumDownloader = new SeleniumDownloader(chromeDriverPath);
long time1 = System.currentTimeMillis(); long time1 = System.currentTimeMillis();
for (int i = 0; i < 100; i++) { for (int i = 0; i < 100; i++) {
Page page = seleniumDownloader.download(new Request("http://huaban.com/"), new Task() { Page page = seleniumDownloader.download(new Request("http://huaban.com/"), new Task() {
@Override @Override
public String getUUID() { public String getUUID() {
return "huaban.com"; return "huaban.com";
} }
@Override @Override
public Site getSite() { public Site getSite() {
return Site.me(); return Site.me();
} }
}); });
System.out.println(page.getHtml().$("#waterfall").links().regex(".*pins.*").all()); System.out.println(page.getHtml().$("#waterfall").links().regex(".*pins.*").all());
} }
System.out.println(System.currentTimeMillis() - time1); System.out.println(System.currentTimeMillis() - time1);
} }
/**
 * Manual smoke test: downloads one Baidu Wenku document page through
 * {@link SeleniumDownloader} and prints the content of every {@code div.inner}
 * element after stripping markup tags and {@code &nbsp;} entities.
 * <p>
 * Ignored by default because it drives a real browser via the driver binary
 * located at {@code chromeDriverPath}.
 */
@Ignore("need chrome driver")
@Test
public void testBaiduWenku() {
    SeleniumDownloader seleniumDownloader = new SeleniumDownloader(chromeDriverPath);
    // NOTE(review): presumably a wait in milliseconds so the page can finish rendering -- confirm against SeleniumDownloader.
    seleniumDownloader.setSleepTime(10000);
    Page page = seleniumDownloader.download(new Request("http://wenku.baidu.com/view/462933ff04a1b0717fd5ddc2.html"), new Task() {
        @Override
        public String getUUID() {
            // NOTE(review): this id matches the huaban.com test; confirm whether it should name this site instead.
            return "huaban.com";
        }

        @Override
        public Site getSite() {
            return Site.me();
        }
    });
    // First replace drops anything shaped like a tag; second drops the non-breaking-space entity.
    // The entity was previously misspelled "&nsbp;", so it never matched anything.
    System.out.println(page.getHtml().$("div.inner").replace("<[^<>]+>", "").replace("&nbsp;", "").all());
}
} }