fix encoding
parent
78cfb4d554
commit
52fd5cfc1c
1
pom.xml
1
pom.xml
|
@ -69,6 +69,7 @@
|
||||||
<configuration>
|
<configuration>
|
||||||
<source>1.6</source>
|
<source>1.6</source>
|
||||||
<target>1.6</target>
|
<target>1.6</target>
|
||||||
|
<encoding>UTF-8</encoding>
|
||||||
</configuration>
|
</configuration>
|
||||||
</plugin>
|
</plugin>
|
||||||
<plugin>
|
<plugin>
|
||||||
|
|
|
@ -25,10 +25,7 @@ public class UrlUtilsTest {
|
||||||
Assert.assertEquals("http://www.dianping.com/aa", fixrelativeurl);
|
Assert.assertEquals("http://www.dianping.com/aa", fixrelativeurl);
|
||||||
fixrelativeurl = UrlUtils.canonicalizeUrl("..aa", "http://www.dianping.com/sh/ss/com");
|
fixrelativeurl = UrlUtils.canonicalizeUrl("..aa", "http://www.dianping.com/sh/ss/com");
|
||||||
Assert.assertEquals("http://www.dianping.com/sh/ss/..aa", fixrelativeurl);
|
Assert.assertEquals("http://www.dianping.com/sh/ss/..aa", fixrelativeurl);
|
||||||
// fixrelativeurl = fixrelativeurl("/aa", "http://www.dianping.com");
|
fixrelativeurl = UrlUtils.canonicalizeUrl("../../aa", "http://www.dianping.com/sh/ss/com");
|
||||||
// System.out.println("fix: " + fixrelativeurl);
|
|
||||||
// fixrelativeurl = fixrelativeurl("/aa", "http://www.dianping.com/");
|
|
||||||
// System.out.println("fix: " + fixrelativeurl);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
|
|
@ -10,32 +10,52 @@ import us.codecraft.webmagic.Task;
|
||||||
/**
|
/**
|
||||||
* @author yihua.huang@dianping.com <br>
|
* @author yihua.huang@dianping.com <br>
|
||||||
* @date: 13-7-26 <br>
|
* @date: 13-7-26 <br>
|
||||||
* Time: 下午2:46 <br>
|
* Time: 下午2:46 <br>
|
||||||
*/
|
*/
|
||||||
public class SeleniumDownloaderTest {
|
public class SeleniumDownloaderTest {
|
||||||
|
|
||||||
private String chromeDriverPath = "/Users/yihua/Downloads/chromedriver";
|
private String chromeDriverPath = "/Users/yihua/Downloads/chromedriver";
|
||||||
|
|
||||||
@Ignore("need chrome driver")
|
@Ignore("need chrome driver")
|
||||||
@Test
|
@Test
|
||||||
public void test() {
|
public void test() {
|
||||||
SeleniumDownloader seleniumDownloader = new SeleniumDownloader(chromeDriverPath);
|
SeleniumDownloader seleniumDownloader = new SeleniumDownloader(chromeDriverPath);
|
||||||
long time1 = System.currentTimeMillis();
|
long time1 = System.currentTimeMillis();
|
||||||
for (int i = 0; i < 100; i++) {
|
for (int i = 0; i < 100; i++) {
|
||||||
Page page = seleniumDownloader.download(new Request("http://huaban.com/"), new Task() {
|
Page page = seleniumDownloader.download(new Request("http://huaban.com/"), new Task() {
|
||||||
@Override
|
@Override
|
||||||
public String getUUID() {
|
public String getUUID() {
|
||||||
return "huaban.com";
|
return "huaban.com";
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Site getSite() {
|
public Site getSite() {
|
||||||
return Site.me();
|
return Site.me();
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
System.out.println(page.getHtml().$("#waterfall").links().regex(".*pins.*").all());
|
System.out.println(page.getHtml().$("#waterfall").links().regex(".*pins.*").all());
|
||||||
}
|
}
|
||||||
System.out.println(System.currentTimeMillis() - time1);
|
System.out.println(System.currentTimeMillis() - time1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Ignore
|
||||||
|
@Test
|
||||||
|
public void testBaiduWenku() {
|
||||||
|
SeleniumDownloader seleniumDownloader = new SeleniumDownloader(chromeDriverPath);
|
||||||
|
seleniumDownloader.setSleepTime(10000);
|
||||||
|
long time1 = System.currentTimeMillis();
|
||||||
|
Page page = seleniumDownloader.download(new Request("http://wenku.baidu.com/view/462933ff04a1b0717fd5ddc2.html"), new Task() {
|
||||||
|
@Override
|
||||||
|
public String getUUID() {
|
||||||
|
return "huaban.com";
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Site getSite() {
|
||||||
|
return Site.me();
|
||||||
|
}
|
||||||
|
});
|
||||||
|
System.out.println(page.getHtml().$("div.inner").replace("<[^<>]+>","").replace("&nsbp;","").all());
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue