From d1f2e65e5d798936e535ceb809671133185c25d9 Mon Sep 17 00:00:00 2001 From: Jsbd Date: Thu, 8 Dec 2016 14:28:48 +0800 Subject: [PATCH 1/2] =?UTF-8?q?=E6=96=B0=E5=A2=9EPhantomJSDownloader?= =?UTF-8?q?=E6=9E=84=E9=80=A0=E5=87=BD=E6=95=B0=EF=BC=8C=E6=94=AF=E6=8C=81?= =?UTF-8?q?crawl.js=E8=B7=AF=E5=BE=84=E8=87=AA=E5=AE=9A=E4=B9=89=EF=BC=8C?= =?UTF-8?q?=E5=9B=A0=E4=B8=BA=E5=BD=93=E5=85=B6=E4=BB=96=E9=A1=B9=E7=9B=AE?= =?UTF-8?q?=E4=BE=9D=E8=B5=96=E6=AD=A4jar=E5=8C=85=E6=97=B6=EF=BC=8Cruntim?= =?UTF-8?q?e.exec()=E6=89=A7=E8=A1=8Cphantomjs=E5=91=BD=E4=BB=A4=E6=97=B6?= =?UTF-8?q?=E6=97=A0=E4=BD=BF=E7=94=A8=E6=B3=95jar=E5=8C=85=E4=B8=AD?= =?UTF-8?q?=E7=9A=84crawl.js?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../codecraft/webmagic/downloader/PhantomJSDownloader.java | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java index 2292788..bea44fd 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java @@ -44,6 +44,11 @@ public class PhantomJSDownloader extends AbstractDownloader { PhantomJSDownloader.phantomJsCommand = phantomJsCommand; } + public PhantomJSDownloader(String phantomJsCommand, String crawlJsPath) { + PhantomJSDownloader.phantomJsCommand = phantomJsCommand; + PhantomJSDownloader.crawlJsPath = crawlJsPath; + } + private void initPhantomjsCrawlPath() { PhantomJSDownloader.crawlJsPath = new File(this.getClass().getResource("/").getPath()).getPath() + System.getProperty("file.separator") + "crawl.js "; } @@ -86,7 +91,7 @@ public class PhantomJSDownloader extends AbstractDownloader { try { String url = request.getUrl(); Runtime runtime = Runtime.getRuntime(); - Process process 
= runtime.exec(phantomJsCommand + " " + crawlJsPath + url); + Process process = runtime.exec(phantomJsCommand + " " + crawlJsPath + " " + url); InputStream is = process.getInputStream(); BufferedReader br = new BufferedReader(new InputStreamReader(is)); StringBuffer stringBuffer = new StringBuffer(); From 1b886d48a2de1f621305a802faca5d94f72f7f86 Mon Sep 17 00:00:00 2001 From: Jsbd Date: Thu, 8 Dec 2016 14:29:42 +0800 Subject: [PATCH 2/2] =?UTF-8?q?=E6=96=B0=E5=A2=9EPhantomJSDownloader?= =?UTF-8?q?=E6=9E=84=E9=80=A0=E5=87=BD=E6=95=B0=EF=BC=8C=E6=94=AF=E6=8C=81?= =?UTF-8?q?crawl.js=E8=B7=AF=E5=BE=84=E8=87=AA=E5=AE=9A=E4=B9=89=EF=BC=8C?= =?UTF-8?q?=E5=9B=A0=E4=B8=BA=E5=BD=93=E5=85=B6=E4=BB=96=E9=A1=B9=E7=9B=AE?= =?UTF-8?q?=E4=BE=9D=E8=B5=96=E6=AD=A4jar=E5=8C=85=E6=97=B6=EF=BC=8Cruntim?= =?UTF-8?q?e.exec()=E6=89=A7=E8=A1=8Cphantomjs=E5=91=BD=E4=BB=A4=E6=97=B6?= =?UTF-8?q?=E6=97=A0=E4=BD=BF=E7=94=A8=E6=B3=95jar=E5=8C=85=E4=B8=AD?= =?UTF-8?q?=E7=9A=84crawl.js?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../downloader/PhantomJSDownloader.java | 51 +++++++++++++++---- 1 file changed, 42 insertions(+), 9 deletions(-) diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java index bea44fd..a955e73 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java @@ -29,21 +29,54 @@ public class PhantomJSDownloader extends AbstractDownloader { public PhantomJSDownloader() { this.initPhantomjsCrawlPath(); } + /** - * 添加新的构造函数,支持phantomjs自定义命令 - * - * example: - * phantomjs.exe 支持windows环境 - * phantomjs --ignore-ssl-errors=yes 忽略抓取地址是https时的一些错误 - * /usr/local/bin/phantomjs 命令的绝对路径,避免因系统环境变量引起的IOException - * - * @param phantomJsCommand - */ + * 
添加新的构造函数,支持phantomjs自定义命令 + * + * example: + * phantomjs.exe 支持windows环境 + * phantomjs --ignore-ssl-errors=yes 忽略抓取地址是https时的一些错误 + * /usr/local/bin/phantomjs 命令的绝对路径,避免因系统环境变量引起的IOException + * + * @param phantomJsCommand + */ public PhantomJSDownloader(String phantomJsCommand) { this.initPhantomjsCrawlPath(); PhantomJSDownloader.phantomJsCommand = phantomJsCommand; } + /** + * 新增构造函数,支持crawl.js路径自定义,因为当其他项目依赖此jar包时,runtime.exec()执行phantomjs命令时无法使用jar包中的crawl.js + * + * crawl.js start -->> + * + * var system = require('system'); + * var url = system.args[1]; + * + * var page = require('webpage').create(); + * page.settings.loadImages = false; + * page.settings.resourceTimeout = 5000; + * + * page.open(url, function (status) { + * if (status != 'success') { + * console.log("HTTP request failed!"); + * } else { + * console.log(page.content); + * } + * + * page.close(); + * phantom.exit(); + * }); + * + * <<-- crawl.js end + * 具体项目时可以将以上js代码复制下来使用 + * + * example: + * new PhantomJSDownloader("/your/path/phantomjs", "/your/path/crawl.js"); + * + * @param phantomJsCommand + * @param crawlJsPath + */ public PhantomJSDownloader(String phantomJsCommand, String crawlJsPath) { PhantomJSDownloader.phantomJsCommand = phantomJsCommand; PhantomJSDownloader.crawlJsPath = crawlJsPath;