complete selenium
parent
86a20eabd9
commit
644a90c2d8
|
@ -2,16 +2,20 @@ package us.codecraft.webmagic.selenium.downloader;
|
||||||
|
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
import org.openqa.selenium.By;
|
import org.openqa.selenium.By;
|
||||||
|
import org.openqa.selenium.Cookie;
|
||||||
import org.openqa.selenium.WebDriver;
|
import org.openqa.selenium.WebDriver;
|
||||||
import org.openqa.selenium.WebElement;
|
import org.openqa.selenium.WebElement;
|
||||||
import us.codecraft.webmagic.Page;
|
import us.codecraft.webmagic.Page;
|
||||||
import us.codecraft.webmagic.Request;
|
import us.codecraft.webmagic.Request;
|
||||||
|
import us.codecraft.webmagic.Site;
|
||||||
import us.codecraft.webmagic.Task;
|
import us.codecraft.webmagic.Task;
|
||||||
import us.codecraft.webmagic.downloader.Downloader;
|
import us.codecraft.webmagic.downloader.Downloader;
|
||||||
import us.codecraft.webmagic.selector.Html;
|
import us.codecraft.webmagic.selector.Html;
|
||||||
import us.codecraft.webmagic.selector.PlainText;
|
import us.codecraft.webmagic.selector.PlainText;
|
||||||
import us.codecraft.webmagic.utils.UrlUtils;
|
import us.codecraft.webmagic.utils.UrlUtils;
|
||||||
|
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @author yihua.huang@dianping.com <br>
|
* @author yihua.huang@dianping.com <br>
|
||||||
* @date: 13-7-26 <br>
|
* @date: 13-7-26 <br>
|
||||||
|
@ -25,24 +29,40 @@ public class SeleniumDownloader implements Downloader {
|
||||||
|
|
||||||
public SeleniumDownloader(String chromeDriverPath) {
|
public SeleniumDownloader(String chromeDriverPath) {
|
||||||
System.getProperties().setProperty("webdriver.chrome.driver", chromeDriverPath);
|
System.getProperties().setProperty("webdriver.chrome.driver", chromeDriverPath);
|
||||||
|
webDriverPool = new WebDriverPool();
|
||||||
|
}
|
||||||
|
|
||||||
|
public SeleniumDownloader(String chromeDriverPath, int poolSize) {
|
||||||
|
System.getProperties().setProperty("webdriver.chrome.driver", chromeDriverPath);
|
||||||
|
webDriverPool = new WebDriverPool(poolSize);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Page download(Request request, Task task) {
|
public Page download(Request request, Task task) {
|
||||||
WebDriver webDriver = null;
|
WebDriver webDriver;
|
||||||
try {
|
try {
|
||||||
webDriver = webDriverPool.get();
|
webDriver = webDriverPool.get();
|
||||||
} catch (InterruptedException e) {
|
} catch (InterruptedException e) {
|
||||||
logger.warn("interrupted",e);
|
logger.warn("interrupted", e);
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
webDriver.get(request.getUrl());
|
webDriver.get(request.getUrl());
|
||||||
|
WebDriver.Options manage = webDriver.manage();
|
||||||
|
Site site = task.getSite();
|
||||||
|
if (site.getCookies() != null) {
|
||||||
|
for (Map.Entry<String, String> cookieEntry : site.getCookies().entrySet()) {
|
||||||
|
Cookie cookie = new Cookie(cookieEntry.getKey(), cookieEntry.getValue());
|
||||||
|
manage.addCookie(cookie);
|
||||||
|
}
|
||||||
|
}
|
||||||
WebElement webElement = webDriver.findElement(By.xpath("/html"));
|
WebElement webElement = webDriver.findElement(By.xpath("/html"));
|
||||||
String content = webElement.getAttribute("outerHTML");
|
String content = webElement.getAttribute("outerHTML");
|
||||||
Page page = new Page();
|
Page page = new Page();
|
||||||
page.setHtml(new Html(UrlUtils.fixAllRelativeHrefs(content, request.getUrl())));
|
page.setHtml(new Html(UrlUtils.fixAllRelativeHrefs(content, request.getUrl())));
|
||||||
page.setUrl(new PlainText(request.getUrl()));
|
page.setUrl(new PlainText(request.getUrl()));
|
||||||
page.setRequest(request);
|
page.setRequest(request);
|
||||||
|
webDriverPool.returnToPool(webDriver);
|
||||||
return page;
|
return page;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,36 @@
|
||||||
|
package us.codecraft.webmagic.selenium.downloader;
|
||||||
|
|
||||||
|
import org.junit.Ignore;
|
||||||
|
import org.junit.Test;
|
||||||
|
import us.codecraft.webmagic.Page;
|
||||||
|
import us.codecraft.webmagic.Request;
|
||||||
|
import us.codecraft.webmagic.Site;
|
||||||
|
import us.codecraft.webmagic.Task;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author yihua.huang@dianping.com <br>
|
||||||
|
* @date: 13-7-26 <br>
|
||||||
|
* Time: 下午2:46 <br>
|
||||||
|
*/
|
||||||
|
public class SeleniumDownloaderTest {
|
||||||
|
|
||||||
|
private String chromeDriverPath = "/Users/yihua/Downloads/chromedriver";
|
||||||
|
|
||||||
|
@Ignore("need chrome driver")
|
||||||
|
@Test
|
||||||
|
public void test(){
|
||||||
|
SeleniumDownloader seleniumDownloader = new SeleniumDownloader(chromeDriverPath);
|
||||||
|
Page page = seleniumDownloader.download(new Request("http://huaban.com/"), new Task() {
|
||||||
|
@Override
|
||||||
|
public String getUUID() {
|
||||||
|
return "huaban.com";
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Site getSite() {
|
||||||
|
return Site.me();
|
||||||
|
}
|
||||||
|
});
|
||||||
|
System.out.println(page.getHtml().$("#waterfall").links().regex(".*pins.*").all());
|
||||||
|
}
|
||||||
|
}
|
|
@ -10,12 +10,13 @@ import org.openqa.selenium.WebDriver;
|
||||||
*/
|
*/
|
||||||
public class WebDriverPoolTest {
|
public class WebDriverPoolTest {
|
||||||
|
|
||||||
|
private String chromeDriverPath = "/Users/yihua/Downloads/chromedriver";
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void test(){
|
public void test() {
|
||||||
String chromeDriverPath = "/Users/yihua/Downloads/chromedriver";
|
|
||||||
System.getProperties().setProperty("webdriver.chrome.driver", chromeDriverPath);
|
System.getProperties().setProperty("webdriver.chrome.driver", chromeDriverPath);
|
||||||
WebDriverPool webDriverPool =new WebDriverPool(5);
|
WebDriverPool webDriverPool = new WebDriverPool(5);
|
||||||
for (int i=0;i<5;i++){
|
for (int i = 0; i < 5; i++) {
|
||||||
try {
|
try {
|
||||||
WebDriver webDriver = webDriverPool.get();
|
WebDriver webDriver = webDriverPool.get();
|
||||||
System.out.println(i);
|
System.out.println(i);
|
||||||
|
|
Loading…
Reference in New Issue