diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java index c27292d..2f9b112 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java @@ -26,7 +26,7 @@ public abstract class AbstractDownloader implements Downloader { /** * A simple method to download a url. * - * @param url url + * @param url url * @param charset charset * @return html */ @@ -38,7 +38,7 @@ public abstract class AbstractDownloader implements Downloader { protected void onSuccess(Request request) { } - protected void onError(Request request) { + protected void onError(Request request, Throwable e) { } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 49217e1..89b6038 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -87,7 +87,7 @@ public class HttpClientDownloader extends AbstractDownloader { return page; } catch (IOException e) { logger.warn("download page {} error", request.getUrl(), e); - onError(request); + onError(request, e); return page; } finally { if (httpResponse != null) { @@ -110,7 +110,7 @@ public class HttpClientDownloader extends AbstractDownloader { String contentType = httpResponse.getEntity().getContentType() == null ? 
"" : httpResponse.getEntity().getContentType().getValue(); Page page = new Page(); page.setBytes(bytes); - if (!request.isBinaryContent()){ + if (!request.isBinaryContent()) { if (charset == null) { charset = getHtmlCharset(contentType, bytes); } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java index 6055bdb..88b8237 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java @@ -16,73 +16,70 @@ import java.io.*; * @version 0.5.3 */ public class PhantomJSDownloader extends AbstractDownloader { - - private static Logger logger = LoggerFactory.getLogger(PhantomJSDownloader.class); + private static final Logger logger = LoggerFactory.getLogger(PhantomJSDownloader.class); private static String crawlJsPath; private static String phantomJsCommand = "phantomjs"; // default - private int retryNum; - private int threadNum; - public PhantomJSDownloader() { this.initPhantomjsCrawlPath(); } - + /** * 添加新的构造函数,支持phantomjs自定义命令 - * - * example: - * phantomjs.exe 支持windows环境 - * phantomjs --ignore-ssl-errors=yes 忽略抓取地址是https时的一些错误 - * /usr/local/bin/phantomjs 命令的绝对路径,避免因系统环境变量引起的IOException - * + *
+ * example: + * phantomjs.exe 支持windows环境 + * phantomjs --ignore-ssl-errors=yes 忽略抓取地址是https时的一些错误 + * /usr/local/bin/phantomjs 命令的绝对路径,避免因系统环境变量引起的IOException + * * @param phantomJsCommand phantomJsCommand */ public PhantomJSDownloader(String phantomJsCommand) { this.initPhantomjsCrawlPath(); PhantomJSDownloader.phantomJsCommand = phantomJsCommand; } - + /** * 新增构造函数,支持crawl.js路径自定义,因为当其他项目依赖此jar包时,runtime.exec()执行phantomjs命令时无使用法jar包中的crawl.js *
* crawl.js start -- - * + * * var system = require('system'); * var url = system.args[1]; - * + * * var page = require('webpage').create(); * page.settings.loadImages = false; * page.settings.resourceTimeout = 5000; - * + * * page.open(url, function (status) { * if (status != 'success') { * console.log("HTTP request failed!"); * } else { * console.log(page.content); * } - * + * * page.close(); * phantom.exit(); * }); - * + * * -- crawl.js end ** 具体项目时可以将以上js代码复制下来使用 - * + *
* example:
- * new PhantomJSDownloader("/your/path/phantomjs", "/your/path/crawl.js");
- *
+ * new PhantomJSDownloader("/your/path/phantomjs", "/your/path/crawl.js");
+ *
* @param phantomJsCommand phantomJsCommand
- * @param crawlJsPath crawlJsPath
+ * @param crawlJsPath crawlJsPath
*/
public PhantomJSDownloader(String phantomJsCommand, String crawlJsPath) {
- PhantomJSDownloader.phantomJsCommand = phantomJsCommand;
- PhantomJSDownloader.crawlJsPath = crawlJsPath;
+ PhantomJSDownloader.phantomJsCommand = phantomJsCommand;
+ PhantomJSDownloader.crawlJsPath = crawlJsPath;
}
-
+
private void initPhantomjsCrawlPath() {
- PhantomJSDownloader.crawlJsPath = new File(this.getClass().getResource("/").getPath()).getPath() + System.getProperty("file.separator") + "crawl.js ";
+ PhantomJSDownloader.crawlJsPath = new File(this.getClass().getResource("/").getPath()).getPath()
+ + System.getProperty("file.separator") + "crawl.js ";
}
@Override
@@ -90,61 +87,41 @@ public class PhantomJSDownloader extends AbstractDownloader {
if (logger.isInfoEnabled()) {
logger.info("downloading page: " + request.getUrl());
}
- String content = getPage(request);
- if (content.contains("HTTP request failed")) {
- for (int i = 1; i <= getRetryNum(); i++) {
- content = getPage(request);
- if (!content.contains("HTTP request failed")) {
- break;
- }
- }
- if (content.contains("HTTP request failed")) {
- //when failed
- Page page = new Page();
- page.setRequest(request);
- return page;
- }
- }
- Page page = new Page();
- page.setRawText(content);
- page.setUrl(new PlainText(request.getUrl()));
- page.setRequest(request);
- page.setStatusCode(200);
+ Page page = Page.fail();
+ try {
+ String content = getPage(request);
+ if (!content.contains("HTTP request failed")) {
+ page.setDownloadSuccess(true);
+ page.setRawText(content);
+ page.setUrl(new PlainText(request.getUrl()));
+ page.setRequest(request);
+ page.setStatusCode(200);
+ }
+ onSuccess(request);
+ } catch (Exception e) {
+ onError(request, e);
+ logger.warn("download page {} error", request.getUrl(), e);
+ }
return page;
}
@Override
public void setThread(int threadNum) {
- this.threadNum = threadNum;
+ // intentionally a no-op: this downloader no longer keeps a thread count
}
- protected String getPage(Request request) {
- try {
- String url = request.getUrl();
- Runtime runtime = Runtime.getRuntime();
- Process process = runtime.exec(phantomJsCommand + " " + crawlJsPath + " " + url);
- InputStream is = process.getInputStream();
- BufferedReader br = new BufferedReader(new InputStreamReader(is));
- StringBuffer stringBuffer = new StringBuffer();
- String line;
- while ((line = br.readLine()) != null) {
- stringBuffer.append(line).append("\n");
- }
- return stringBuffer.toString();
- } catch (IOException e) {
- e.printStackTrace();
+ protected String getPage(Request request) throws Exception {
+ String url = request.getUrl();
+ Runtime runtime = Runtime.getRuntime();
+ Process process = runtime.exec(phantomJsCommand + " " + crawlJsPath + " " + url);
+ InputStream is = process.getInputStream();
+ BufferedReader br = new BufferedReader(new InputStreamReader(is));
+ StringBuilder builder = new StringBuilder();
+ String line;
+ while ((line = br.readLine()) != null) {
+ builder.append(line).append("\n");
}
-
- return null;
- }
-
- public int getRetryNum() {
- return retryNum;
- }
-
- public PhantomJSDownloader setRetryNum(int retryNum) {
- this.retryNum = retryNum;
- return this;
+ return builder.toString();
}
}
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/PhantomJSPageProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/PhantomJSPageProcessor.java
index 99d5fa8..ab53140 100644
--- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/PhantomJSPageProcessor.java
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/PhantomJSPageProcessor.java
@@ -36,7 +36,7 @@ public class PhantomJSPageProcessor implements PageProcessor {
}
public static void main(String[] args) throws Exception {
- PhantomJSDownloader phantomDownloader = new PhantomJSDownloader().setRetryNum(3);
+ PhantomJSDownloader phantomDownloader = new PhantomJSDownloader();
CollectorPipeline
*
* @author code4crafter@gmail.com
- * Date: 13-7-26
- * Time: 下午1:37
+ * Date: 13-7-26
+ * Time: 下午1:37
*/
-public class SeleniumDownloader implements Downloader, Closeable {
+public class SeleniumDownloader extends AbstractDownloader implements Closeable {
- private volatile WebDriverPool webDriverPool;
+ private volatile WebDriverPool webDriverPool;
- private Logger logger = LoggerFactory.getLogger(getClass());
+ private Logger logger = LoggerFactory.getLogger(getClass());
- private int sleepTime = 0;
+ private int sleepTime = 0;
- private int poolSize = 1;
+ private int poolSize = 1;
- private static final String DRIVER_PHANTOMJS = "phantomjs";
+ private static final String DRIVER_PHANTOMJS = "phantomjs";
- /**
- * 新建
- *
- * @param chromeDriverPath chromeDriverPath
- */
- public SeleniumDownloader(String chromeDriverPath) {
- System.getProperties().setProperty("webdriver.chrome.driver",
- chromeDriverPath);
- }
+ /**
+ * 新建
+ *
+ * @param chromeDriverPath chromeDriverPath
+ */
+ public SeleniumDownloader(String chromeDriverPath) {
+ System.getProperties().setProperty("webdriver.chrome.driver",
+ chromeDriverPath);
+ }
- /**
- * Constructor without any filed. Construct PhantomJS browser
- *
- * @author bob.li.0718@gmail.com
- */
- public SeleniumDownloader() {
- // System.setProperty("phantomjs.binary.path",
- // "/Users/Bingo/Downloads/phantomjs-1.9.7-macosx/bin/phantomjs");
- }
+ /**
+ * Constructor without any field. Construct PhantomJS browser
+ *
+ * @author bob.li.0718@gmail.com
+ */
+ public SeleniumDownloader() {
+ // System.setProperty("phantomjs.binary.path",
+ // "/Users/Bingo/Downloads/phantomjs-1.9.7-macosx/bin/phantomjs");
+ }
- /**
- * set sleep time to wait until load success
- *
- * @param sleepTime sleepTime
- * @return this
- */
- public SeleniumDownloader setSleepTime(int sleepTime) {
- this.sleepTime = sleepTime;
- return this;
- }
+ /**
+ * set sleep time to wait until load success
+ *
+ * @param sleepTime sleepTime
+ * @return this
+ */
+ public SeleniumDownloader setSleepTime(int sleepTime) {
+ this.sleepTime = sleepTime;
+ return this;
+ }
- @Override
- public Page download(Request request, Task task) {
- checkInit();
- WebDriver webDriver;
- try {
- webDriver = webDriverPool.get();
- } catch (InterruptedException e) {
- logger.warn("interrupted", e);
- return null;
- }
- logger.info("downloading page " + request.getUrl());
- webDriver.get(request.getUrl());
- try {
- Thread.sleep(sleepTime);
- } catch (InterruptedException e) {
- e.printStackTrace();
- }
- WebDriver.Options manage = webDriver.manage();
- Site site = task.getSite();
- if (site.getCookies() != null) {
- for (Map.Entry