Add PhantomJS support for Selenium

The configuration file is config.ini
The dependencies are updated in pom.xml.
Update SeleniumDownloader and WebDriverPool to support PhantomJS. 
NOTE: The versions of GhostDriver, Selenium, and PhantomJS are stable
and validated.

A Google Play example is in the samples package: GooglePlayProcessor.java
master
bingoko 2015-07-11 15:34:21 +01:00
parent b30ca6ce1e
commit d3bbece202
5 changed files with 400 additions and 166 deletions

View File

@ -0,0 +1,12 @@
# What WebDriver to use for the tests
driver=phantomjs
#driver=firefox
#driver=chrome
#driver=http://localhost:8910
#driver=http://localhost:4444/wd/hub
# PhantomJS specific config (change according to your installation)
#phantomjs_exec_path=/Users/Bingo/bin/phantomjs-qt5
phantomjs_exec_path=/Users/Bingo/Downloads/phantomjs-1.9.8-macosx/bin/phantomjs
#phantomjs_driver_path=/Users/Bingo/Documents/workspace/webmagic/webmagic-selenium/src/main.js
phantomjs_driver_loglevel=DEBUG

View File

@ -1,40 +1,50 @@
<?xml version="1.0" encoding="UTF-8"?> <?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
<parent> xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<artifactId>webmagic-parent</artifactId> <parent>
<groupId>us.codecraft</groupId> <artifactId>webmagic-parent</artifactId>
<version>0.5.2</version> <groupId>us.codecraft</groupId>
</parent> <version>0.5.2</version>
<modelVersion>4.0.0</modelVersion> </parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>webmagic-selenium</artifactId> <artifactId>webmagic-selenium</artifactId>
<dependencies> <dependencies>
<dependency> <dependency>
<groupId>org.seleniumhq.selenium</groupId> <groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-java</artifactId> <artifactId>selenium-java</artifactId>
<version>2.33.0</version> <version>2.34.0</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>us.codecraft</groupId> <groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId> <artifactId>webmagic-core</artifactId>
<version>${project.version}</version> <version>${project.version}</version>
</dependency> </dependency>
<dependency> <!-- <dependency> <groupId>com.github.detro</groupId> <artifactId>phantomjsdriver</artifactId>
<groupId>junit</groupId> <version>1.2.0</version> </dependency> -->
<artifactId>junit</artifactId> <dependency>
</dependency> <groupId>com.github.detro.ghostdriver</groupId>
</dependencies> <artifactId>phantomjsdriver</artifactId>
<version>1.1.0</version>
</dependency>
<build>
<plugins> <dependency>
<plugin> <groupId>junit</groupId>
<artifactId>maven-deploy-plugin</artifactId> <artifactId>junit</artifactId>
<configuration> </dependency>
<skip>true</skip> </dependencies>
</configuration>
</plugin> <build>
</plugins> <plugins>
</build> <plugin>
<artifactId>maven-deploy-plugin</artifactId>
<configuration>
<skip>true</skip>
</configuration>
</plugin>
</plugins>
</build>
</project> </project>

View File

@ -5,6 +5,7 @@ import org.openqa.selenium.By;
import org.openqa.selenium.Cookie; import org.openqa.selenium.Cookie;
import org.openqa.selenium.WebDriver; import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement; import org.openqa.selenium.WebElement;
import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Site;
@ -23,90 +24,113 @@ import java.util.Map;
* Selenium driver<br> * Selenium driver<br>
* *
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* Date: 13-7-26 <br> * Date: 13-7-26 <br>
* Time: 1:37 <br> * Time: 1:37 <br>
*/ */
public class SeleniumDownloader implements Downloader, Closeable { public class SeleniumDownloader implements Downloader, Closeable {
private volatile WebDriverPool webDriverPool; private volatile WebDriverPool webDriverPool;
private Logger logger = Logger.getLogger(getClass()); private Logger logger = Logger.getLogger(getClass());
private int sleepTime = 0; private int sleepTime = 0;
private int poolSize = 1; private int poolSize = 1;
/** private static final String DRIVER_PHANTOMJS = "phantomjs";
*
*
* @param chromeDriverPath
*/
public SeleniumDownloader(String chromeDriverPath) {
System.getProperties().setProperty("webdriver.chrome.driver", chromeDriverPath);
}
/** /**
* set sleep time to wait until load success *
* *
* @param sleepTime * @param chromeDriverPath
* @return this */
*/ public SeleniumDownloader(String chromeDriverPath) {
public SeleniumDownloader setSleepTime(int sleepTime) { System.getProperties().setProperty("webdriver.chrome.driver",
this.sleepTime = sleepTime; chromeDriverPath);
return this; }
}
@Override /**
public Page download(Request request, Task task) { * Constructor without any filed. Construct PhantomJS browser
checkInit(); *
WebDriver webDriver; * @author bob.li.0718@gmail.com
try { */
webDriver = webDriverPool.get(); public SeleniumDownloader() {
} catch (InterruptedException e) { // System.setProperty("phantomjs.binary.path",
logger.warn("interrupted", e); // "/Users/Bingo/Downloads/phantomjs-1.9.7-macosx/bin/phantomjs");
return null; }
}
logger.info("downloading page " + request.getUrl());
webDriver.get(request.getUrl());
try {
Thread.sleep(sleepTime);
} catch (InterruptedException e) {
e.printStackTrace();
}
WebDriver.Options manage = webDriver.manage();
Site site = task.getSite();
if (site.getCookies() != null) {
for (Map.Entry<String, String> cookieEntry : site.getCookies().entrySet()) {
Cookie cookie = new Cookie(cookieEntry.getKey(), cookieEntry.getValue());
manage.addCookie(cookie);
}
}
WebElement webElement = webDriver.findElement(By.xpath("/html"));
String content = webElement.getAttribute("outerHTML");
Page page = new Page();
page.setRawText(content);
page.setHtml(new Html(UrlUtils.fixAllRelativeHrefs(content, request.getUrl())));
page.setUrl(new PlainText(request.getUrl()));
page.setRequest(request);
webDriverPool.returnToPool(webDriver);
return page;
}
private void checkInit() { /**
if (webDriverPool == null) { * set sleep time to wait until load success
synchronized (this){ *
webDriverPool = new WebDriverPool(poolSize); * @param sleepTime
} * @return this
} */
} public SeleniumDownloader setSleepTime(int sleepTime) {
this.sleepTime = sleepTime;
return this;
}
@Override @Override
public void setThread(int thread) { public Page download(Request request, Task task) {
this.poolSize = thread; checkInit();
} WebDriver webDriver;
try {
webDriver = webDriverPool.get();
} catch (InterruptedException e) {
logger.warn("interrupted", e);
return null;
}
logger.info("downloading page " + request.getUrl());
webDriver.get(request.getUrl());
try {
Thread.sleep(sleepTime);
} catch (InterruptedException e) {
e.printStackTrace();
}
WebDriver.Options manage = webDriver.manage();
Site site = task.getSite();
if (site.getCookies() != null) {
for (Map.Entry<String, String> cookieEntry : site.getCookies()
.entrySet()) {
Cookie cookie = new Cookie(cookieEntry.getKey(),
cookieEntry.getValue());
manage.addCookie(cookie);
}
}
@Override /*
public void close() throws IOException { * TODO You can add mouse event or other processes
webDriverPool.closeAll(); *
} * @author: bob.li.0718@gmail.com
*/
WebElement webElement = webDriver.findElement(By.xpath("/html"));
String content = webElement.getAttribute("outerHTML");
Page page = new Page();
page.setRawText(content);
page.setHtml(new Html(UrlUtils.fixAllRelativeHrefs(content,
request.getUrl())));
page.setUrl(new PlainText(request.getUrl()));
page.setRequest(request);
webDriverPool.returnToPool(webDriver);
return page;
}
/**
 * Lazily creates the {@link WebDriverPool} the first time a page is
 * downloaded, sized by the current {@code poolSize}.
 *
 * The pool reference is re-checked after the monitor is acquired so that
 * two threads racing through the first (unsynchronized) null check cannot
 * both construct a pool; without the second check the loser would replace
 * the winner's pool and the replaced pool's drivers would never be quit.
 */
private void checkInit() {
    if (webDriverPool == null) {
        synchronized (this) {
            // Second check under the lock: another thread may have
            // initialised the pool while this one was waiting.
            if (webDriverPool == null) {
                webDriverPool = new WebDriverPool(poolSize);
            }
        }
    }
}
/**
 * Records the requested thread count as the size of the WebDriver pool.
 *
 * The value is only read when the pool is created lazily in
 * {@code checkInit()}, so calling this after the pool exists does not
 * resize it.
 *
 * @param thread number of WebDriver instances the pool may create
 */
@Override
public void setThread(int thread) {
this.poolSize = thread;
}
/**
 * Quits every WebDriver created by this downloader.
 *
 * The pool is created lazily on the first download, so it may still be
 * {@code null} here; in that case there is nothing to release and the
 * call is a no-op instead of failing with a NullPointerException.
 *
 * @throws IOException declared by {@link Closeable}; not thrown by the
 *         visible code
 */
@Override
public void close() throws IOException {
    // Read the volatile field once so the null check and the call see
    // the same reference.
    WebDriverPool pool = webDriverPool;
    if (pool != null) {
        pool.closeAll();
    }
}
} }

View File

@ -3,89 +3,231 @@ package us.codecraft.webmagic.downloader.selenium;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
import org.openqa.selenium.WebDriver; import org.openqa.selenium.WebDriver;
import org.openqa.selenium.chrome.ChromeDriver; import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.firefox.FirefoxDriver;
import org.openqa.selenium.phantomjs.PhantomJSDriver;
import org.openqa.selenium.phantomjs.PhantomJSDriverService;
import org.openqa.selenium.remote.DesiredCapabilities;
import org.openqa.selenium.remote.RemoteWebDriver;
import java.io.FileReader;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collections; import java.util.Collections;
import java.util.List; import java.util.List;
import java.util.Properties;
import java.util.concurrent.BlockingDeque; import java.util.concurrent.BlockingDeque;
import java.util.concurrent.LinkedBlockingDeque; import java.util.concurrent.LinkedBlockingDeque;
import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicInteger;
/** /**
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* Date: 13-7-26 <br> * Date: 13-7-26 <br>
* Time: 1:41 <br> * Time: 1:41 <br>
*/ */
class WebDriverPool { class WebDriverPool {
private Logger logger = Logger.getLogger(getClass()); private Logger logger = Logger.getLogger(getClass());
private final static int DEFAULT_CAPACITY = 5; private final static int DEFAULT_CAPACITY = 5;
private final int capacity; private final int capacity;
private final static int STAT_RUNNING = 1; private final static int STAT_RUNNING = 1;
private final static int STAT_CLODED = 2; private final static int STAT_CLODED = 2;
private AtomicInteger stat = new AtomicInteger(STAT_RUNNING); private AtomicInteger stat = new AtomicInteger(STAT_RUNNING);
/** /*
* store webDrivers created * new fields for configuring phantomJS
*/ */
private List<WebDriver> webDriverList = Collections.synchronizedList(new ArrayList<WebDriver>()); private WebDriver mDriver = null;
private boolean mAutoQuitDriver = true;
/** private static final String CONFIG_FILE = "/Users/Bingo/Documents/workspace/webmagic/webmagic-selenium/config.ini";
* store webDrivers available private static final String DRIVER_FIREFOX = "firefox";
*/ private static final String DRIVER_CHROME = "chrome";
private BlockingDeque<WebDriver> innerQueue = new LinkedBlockingDeque<WebDriver>(); private static final String DRIVER_PHANTOMJS = "phantomjs";
public WebDriverPool(int capacity) { protected static Properties sConfig;
this.capacity = capacity; protected static DesiredCapabilities sCaps;
}
public WebDriverPool() { /**
this(DEFAULT_CAPACITY); * Configure the GhostDriver, and initialize a WebDriver instance. This part
} * of code comes from GhostDriver.
* https://github.com/detro/ghostdriver/tree/master/test/java/src/test/java/ghostdriver
*
* @author bob.li.0718@gmail.com
* @throws IOException
*/
public void configure() throws IOException {
// Read config file
sConfig = new Properties();
sConfig.load(new FileReader(CONFIG_FILE));
public WebDriver get() throws InterruptedException { // Prepare capabilities
checkRunning(); sCaps = new DesiredCapabilities();
WebDriver poll = innerQueue.poll(); sCaps.setJavascriptEnabled(true);
if (poll != null) { sCaps.setCapability("takesScreenshot", false);
return poll;
}
if (webDriverList.size() < capacity) {
synchronized (webDriverList) {
if (webDriverList.size() < capacity) {
ChromeDriver e = new ChromeDriver();
innerQueue.add(e);
webDriverList.add(e);
}
}
} String driver = sConfig.getProperty("driver", DRIVER_PHANTOMJS);
return innerQueue.take();
}
public void returnToPool(WebDriver webDriver) { // Fetch PhantomJS-specific configuration parameters
checkRunning(); if (driver.equals(DRIVER_PHANTOMJS)) {
innerQueue.add(webDriver); // "phantomjs_exec_path"
} if (sConfig.getProperty("phantomjs_exec_path") != null) {
sCaps.setCapability(
PhantomJSDriverService.PHANTOMJS_EXECUTABLE_PATH_PROPERTY,
sConfig.getProperty("phantomjs_exec_path"));
} else {
throw new IOException(
String.format(
"Property '%s' not set!",
PhantomJSDriverService.PHANTOMJS_EXECUTABLE_PATH_PROPERTY));
}
// "phantomjs_driver_path"
if (sConfig.getProperty("phantomjs_driver_path") != null) {
System.out.println("Test will use an external GhostDriver");
sCaps.setCapability(
PhantomJSDriverService.PHANTOMJS_GHOSTDRIVER_PATH_PROPERTY,
sConfig.getProperty("phantomjs_driver_path"));
} else {
System.out
.println("Test will use PhantomJS internal GhostDriver");
}
}
protected void checkRunning() { // Disable "web-security", enable all possible "ssl-protocols" and
if (!stat.compareAndSet(STAT_RUNNING, STAT_RUNNING)) { // "ignore-ssl-errors" for PhantomJSDriver
throw new IllegalStateException("Already closed!"); // sCaps.setCapability(PhantomJSDriverService.PHANTOMJS_CLI_ARGS, new
} // String[] {
} // "--web-security=false",
// "--ssl-protocol=any",
// "--ignore-ssl-errors=true"
// });
public void closeAll() { ArrayList<String> cliArgsCap = new ArrayList<String>();
boolean b = stat.compareAndSet(STAT_RUNNING, STAT_CLODED); cliArgsCap.add("--web-security=false");
if (!b) { cliArgsCap.add("--ssl-protocol=any");
throw new IllegalStateException("Already closed!"); cliArgsCap.add("--ignore-ssl-errors=true");
} sCaps.setCapability(PhantomJSDriverService.PHANTOMJS_CLI_ARGS,
for (WebDriver webDriver : webDriverList) { cliArgsCap);
logger.info("Quit webDriver" + webDriver);
webDriver.quit(); // Control LogLevel for GhostDriver, via CLI arguments
} sCaps.setCapability(
} PhantomJSDriverService.PHANTOMJS_GHOSTDRIVER_CLI_ARGS,
new String[] { "--logLevel="
+ (sConfig.getProperty("phantomjs_driver_loglevel") != null ? sConfig
.getProperty("phantomjs_driver_loglevel")
: "INFO") });
// String driver = sConfig.getProperty("driver", DRIVER_PHANTOMJS);
// Start appropriate Driver
if (isUrl(driver)) {
sCaps.setBrowserName("phantomjs");
mDriver = new RemoteWebDriver(new URL(driver), sCaps);
} else if (driver.equals(DRIVER_FIREFOX)) {
mDriver = new FirefoxDriver(sCaps);
} else if (driver.equals(DRIVER_CHROME)) {
mDriver = new ChromeDriver(sCaps);
} else if (driver.equals(DRIVER_PHANTOMJS)) {
mDriver = new PhantomJSDriver(sCaps);
}
}
/**
 * Tells whether the given string can be parsed as a {@link URL}.
 *
 * Used to distinguish a remote WebDriver endpoint (for example
 * {@code http://localhost:8910}) from a plain driver name such as
 * {@code phantomjs}.
 *
 * @author bob.li.0718@gmail.com
 * @param urlString
 *            candidate string to test
 * @return {@code true} if {@code new URL(urlString)} succeeds,
 *         {@code false} if it throws {@link MalformedURLException}
 */
private boolean isUrl(String urlString) {
    boolean parsable;
    try {
        new URL(urlString);
        parsable = true;
    } catch (MalformedURLException notAUrl) {
        parsable = false;
    }
    return parsable;
}
/**
 * Every WebDriver this pool has created, whether currently borrowed or
 * idle. get() adds to it and closeAll() walks it to quit the drivers.
 * Wrapped with Collections.synchronizedList; get() additionally locks on
 * it while deciding whether to create another driver.
 */
private List<WebDriver> webDriverList = Collections
.synchronizedList(new ArrayList<WebDriver>());
/**
 * WebDrivers that are currently idle and can be handed out. get() takes
 * from it and returnToPool() puts drivers back.
 */
private BlockingDeque<WebDriver> innerQueue = new LinkedBlockingDeque<WebDriver>();
/**
 * Creates a pool that will create at most {@code capacity} WebDrivers.
 *
 * @param capacity upper bound on the number of drivers created by get()
 */
public WebDriverPool(int capacity) {
this.capacity = capacity;
}
/**
 * Creates a pool with the default capacity (DEFAULT_CAPACITY).
 */
public WebDriverPool() {
this(DEFAULT_CAPACITY);
}
/**
 * Borrows a WebDriver from the pool.
 *
 * An idle driver is returned immediately when one is available. Otherwise,
 * while fewer than {@code capacity} drivers have been created, a new one is
 * configured and added to the pool. Finally the call blocks until a driver
 * becomes available.
 *
 * If a new driver cannot be created but other drivers already exist, the
 * failure is logged and the call waits for one of those to be returned. If
 * no driver exists at all, waiting could never succeed, so the call fails
 * fast instead of blocking forever.
 *
 * @return a WebDriver that must be handed back via returnToPool
 * @throws InterruptedException
 *             if the thread is interrupted while waiting for a driver
 * @throws IllegalStateException
 *             if the pool is closed, or if the pool is empty and no driver
 *             could be created from the configuration
 */
public WebDriver get() throws InterruptedException {
    checkRunning();
    WebDriver poll = innerQueue.poll();
    if (poll != null) {
        return poll;
    }
    if (webDriverList.size() < capacity) {
        synchronized (webDriverList) {
            if (webDriverList.size() < capacity) {
                WebDriver created = null;
                IOException failure = null;
                try {
                    // Reset first: configure() leaves mDriver untouched
                    // when the configured driver name is not recognised,
                    // which would otherwise re-add a previous driver
                    // (or enqueue null).
                    mDriver = null;
                    configure();
                    created = mDriver;
                } catch (IOException e) {
                    failure = e;
                    logger.warn("Failed to configure a new WebDriver", e);
                }
                if (created != null) {
                    innerQueue.add(created);
                    webDriverList.add(created);
                } else if (webDriverList.isEmpty()) {
                    // Nothing to wait for: take() below would block forever.
                    throw new IllegalStateException(
                            "No WebDriver could be created; check the driver configuration",
                            failure);
                }
            }
        }
    }
    return innerQueue.take();
}
/**
 * Hands a borrowed WebDriver back so that another caller of get() can
 * use it.
 *
 * @param webDriver the driver previously obtained from get()
 * @throws IllegalStateException if the pool has already been closed
 */
public void returnToPool(WebDriver webDriver) {
// Refuse to re-queue drivers once closeAll() has run.
checkRunning();
innerQueue.add(webDriver);
}
/**
 * Verifies that the pool has not been closed.
 *
 * @throws IllegalStateException if the pool state is no longer STAT_RUNNING
 */
protected void checkRunning() {
    // compareAndSet(RUNNING, RUNNING) leaves the state unchanged and
    // succeeds only while the pool is still running.
    boolean stillRunning = stat.compareAndSet(STAT_RUNNING, STAT_RUNNING);
    if (stillRunning) {
        return;
    }
    throw new IllegalStateException("Already closed!");
}
/**
 * Marks the pool as closed and quits every WebDriver it has created.
 *
 * The state flips from running to closed exactly once; a second call fails.
 * The driver list is copied while holding its lock because a list from
 * {@code Collections.synchronizedList} must be locked manually during
 * iteration, and get() may still be adding a driver on another thread.
 * The drivers are then quit outside the lock so a slow browser shutdown
 * does not block other threads on the list.
 *
 * @throws IllegalStateException if the pool was already closed
 */
public void closeAll() {
    boolean b = stat.compareAndSet(STAT_RUNNING, STAT_CLODED);
    if (!b) {
        throw new IllegalStateException("Already closed!");
    }
    List<WebDriver> created;
    synchronized (webDriverList) {
        created = new ArrayList<WebDriver>(webDriverList);
    }
    for (WebDriver webDriver : created) {
        logger.info("Quit webDriver" + webDriver);
        webDriver.quit();
    }
}
} }

View File

@ -0,0 +1,46 @@
package us.codecraft.webmagic.samples;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.downloader.selenium.SeleniumDownloader;
import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.processor.PageProcessor;
/**
 * Sample processor that uses Selenium with PhantomJS to fetch a web page
 * whose content is rendered by JavaScript.<br>
 *
 * The whole rendered HTML of each page is stored under the
 * {@code whole-html} field and written out by a {@link FilePipeline}.
 *
 * @author bob.li.0718@gmail.com <br>
 * Date: 15-7-11 <br>
 */
public class GooglePlayProcessor implements PageProcessor {

    /** Output directory used when none is given on the command line. */
    private static final String DEFAULT_OUTPUT_DIR =
            "/Users/Bingo/Documents/workspace/webmagic/webmagic-selenium/data/";

    /**
     * Site settings, created eagerly so that the spider's worker threads
     * all see the same fully built instance.
     */
    private final Site site = Site.me().setDomain("play.google.com").setSleepTime(300);

    /**
     * Stores the complete page HTML in the result under {@code whole-html}.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        page.putField("whole-html", page.getHtml().toString());
    }

    /**
     * @return the site settings for play.google.com
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Crawls one Google Play details page with five threads.
     *
     * @param args optional; {@code args[0]} overrides the output directory
     */
    public static void main(String[] args) {
        String outputDir = args.length > 0 ? args[0] : DEFAULT_OUTPUT_DIR;
        Spider.create(new GooglePlayProcessor())
                .thread(5)
                .addPipeline(new FilePipeline(outputDir))
                .setDownloader(new SeleniumDownloader())
                .addUrl("https://play.google.com/store/apps/details?id=com.tencent.mm")
                .runAsync();
    }
}