From 0d9b148fc29fb83a573e2cb1ba8efb0fc735fd1f Mon Sep 17 00:00:00 2001
From: dolphineor <dolphineor@gmail.com>
Date: Sun, 23 Nov 2014 23:07:22 +0800
Subject: [PATCH] add PhantomJSDownloader

---
 .../downloader/PhantomJSDownloader.java       | 200 ++++++++++++++++++
 webmagic-core/src/main/resources/crawl.js     |  20 ++
 .../samples/PhantomJSPageProcessor.java       |  68 ++++++
 webmagic-samples/src/main/resources/crawl.js  |  20 ++
 4 files changed, 308 insertions(+)
 create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java
 create mode 100644 webmagic-core/src/main/resources/crawl.js
 create mode 100644 webmagic-samples/src/main/java/us/codecraft/webmagic/samples/PhantomJSPageProcessor.java
 create mode 100644 webmagic-samples/src/main/resources/crawl.js

diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java
new file mode 100644
--- /dev/null
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java
@@ -0,0 +1,200 @@
+package us.codecraft.webmagic.downloader;
+
+import org.apache.http.annotation.ThreadSafe;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import us.codecraft.webmagic.Page;
+import us.codecraft.webmagic.Request;
+import us.codecraft.webmagic.Task;
+import us.codecraft.webmagic.selector.PlainText;
+
+import java.io.BufferedReader;
+import java.io.Closeable;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.OutputStream;
+
+/**
+ * Downloader for pages whose content only exists after JavaScript has run.
+ * Each download starts an external {@code phantomjs} process (it must be on
+ * the PATH) with the bundled {@code crawl.js} script and captures what the
+ * script prints.
+ *
+ * @author dolphineor@gmail.com
+ * @version 0.5.3
+ */
+@ThreadSafe
+public class PhantomJSDownloader extends AbstractDownloader {
+
+    private static final Logger logger = LoggerFactory.getLogger(PhantomJSDownloader.class);
+
+    /** Text crawl.js prints when PhantomJS could not load the page. */
+    private static final String FAILURE_MARKER = "HTTP request failed";
+
+    /** Classpath location of the PhantomJS driver script. */
+    private static final String SCRIPT_RESOURCE = "/crawl.js";
+
+    /** PhantomJS writes console output as UTF-8 unless reconfigured. */
+    private static final String OUTPUT_ENCODING = "UTF-8";
+
+    /** Path of the extracted driver script, shared by all instances. */
+    private static String phantomJSPath;
+
+    private int retryNum;
+    private int threadNum;
+
+    /**
+     * Creates a downloader and makes sure the driver script is available on
+     * disk.
+     *
+     * @throws IllegalStateException if crawl.js is missing from the classpath
+     *                               or cannot be copied to a temporary file
+     */
+    public PhantomJSDownloader() {
+        resolveScriptPath();
+    }
+
+    /**
+     * Renders the requested URL with PhantomJS: one attempt plus up to
+     * {@link #getRetryNum()} retries.
+     *
+     * @param request the request whose URL is rendered
+     * @param task    the task the request belongs to (unused)
+     * @return a page with the rendered markup and status code 200, or a page
+     *         holding only the request when every attempt failed
+     */
+    @Override
+    public Page download(Request request, Task task) {
+        logger.info("downloading page: {}", request.getUrl());
+
+        String content = getPage(request);
+        for (int i = 0; i < getRetryNum() && isFailed(content); i++) {
+            content = getPage(request);
+        }
+
+        Page page = new Page();
+        page.setRequest(request);
+        if (isFailed(content)) {
+            // No raw text is set, so processors can recognise the failure.
+            return page;
+        }
+        page.setRawText(content);
+        page.setUrl(new PlainText(request.getUrl()));
+        page.setStatusCode(200);
+        return page;
+    }
+
+    /**
+     * Records the thread count chosen by the spider. Every download runs in
+     * its own process, so the value is only stored.
+     *
+     * @param threadNum number of crawler threads
+     */
+    @Override
+    public void setThread(int threadNum) {
+        this.threadNum = threadNum;
+    }
+
+    /**
+     * Runs PhantomJS once for the request and collects its standard output.
+     *
+     * @param request the request whose URL is rendered
+     * @return everything the script printed, one "\n" after each line, or
+     *         {@code null} when the process could not be started or read
+     */
+    protected String getPage(Request request) {
+        Process process = null;
+        BufferedReader reader = null;
+        try {
+            // Separate arguments keep paths and URLs containing whitespace
+            // intact; Runtime.exec(String) would split them.
+            process = new ProcessBuilder("phantomjs", resolveScriptPath(), request.getUrl()).start();
+            reader = new BufferedReader(new InputStreamReader(process.getInputStream(), OUTPUT_ENCODING));
+            StringBuilder content = new StringBuilder();
+            String line;
+            while ((line = reader.readLine()) != null) {
+                content.append(line).append("\n");
+            }
+            return content.toString();
+        } catch (IOException e) {
+            logger.warn("phantomjs failed to render " + request.getUrl(), e);
+            return null;
+        } finally {
+            closeQuietly(reader);
+            if (process != null) {
+                process.destroy();
+            }
+        }
+    }
+
+    /**
+     * @return number of additional attempts made after a failed download
+     */
+    public int getRetryNum() {
+        return retryNum;
+    }
+
+    /**
+     * @param retryNum number of additional attempts after a failed download
+     * @return this downloader, for call chaining
+     */
+    public PhantomJSDownloader setRetryNum(int retryNum) {
+        this.retryNum = retryNum;
+        return this;
+    }
+
+    /** Tells whether an attempt produced no output or the failure marker. */
+    private static boolean isFailed(String content) {
+        return content == null || content.contains(FAILURE_MARKER);
+    }
+
+    /** Returns the on-disk script path, extracting the script on first use. */
+    private static synchronized String resolveScriptPath() {
+        if (phantomJSPath == null) {
+            phantomJSPath = extractScript();
+        }
+        return phantomJSPath;
+    }
+
+    /**
+     * Copies crawl.js from the classpath to a temporary file, so it can be
+     * handed to phantomjs even when the classes are packaged in a jar.
+     */
+    private static String extractScript() {
+        InputStream in = PhantomJSDownloader.class.getResourceAsStream(SCRIPT_RESOURCE);
+        if (in == null) {
+            throw new IllegalStateException(SCRIPT_RESOURCE + " not found on the classpath");
+        }
+        OutputStream out = null;
+        try {
+            File script = File.createTempFile("webmagic-crawl", ".js");
+            script.deleteOnExit();
+            out = new FileOutputStream(script);
+            byte[] buffer = new byte[4096];
+            int read;
+            while ((read = in.read(buffer)) != -1) {
+                out.write(buffer, 0, read);
+            }
+            return script.getAbsolutePath();
+        } catch (IOException e) {
+            throw new IllegalStateException("unable to extract " + SCRIPT_RESOURCE, e);
+        } finally {
+            closeQuietly(out);
+            closeQuietly(in);
+        }
+    }
+
+    /** Closes a resource, ignoring failures that cannot be acted upon. */
+    private static void closeQuietly(Closeable closeable) {
+        if (closeable != null) {
+            try {
+                closeable.close();
+            } catch (IOException ignored) {
+                // nothing useful can be done when closing fails
+            }
+        }
+    }
+}
diff --git a/webmagic-core/src/main/resources/crawl.js b/webmagic-core/src/main/resources/crawl.js
new file mode 100644
--- /dev/null
+++ b/webmagic-core/src/main/resources/crawl.js
@@ -0,0 +1,20 @@
+// PhantomJS driver: renders the URL given as the first argument and prints
+// the resulting markup, or a failure marker, to standard output.
+var system = require('system');
+var url = system.args[1];
+
+var page = require('webpage').create();
+page.settings.loadImages = false;       // skip image downloads
+page.settings.resourceTimeout = 5000;   // milliseconds per resource
+
+page.open(url, function (status) {
+    if (status !== 'success') {
+        // PhantomJSDownloader looks for this text to detect a failure.
+        console.log("HTTP request failed!");
+    } else {
+        console.log(page.content);
+    }
+
+    page.close();
+    phantom.exit();
+});
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/PhantomJSPageProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/PhantomJSPageProcessor.java
new file mode 100644
--- /dev/null
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/PhantomJSPageProcessor.java
@@ -0,0 +1,68 @@
+package us.codecraft.webmagic.samples;
+
+import us.codecraft.webmagic.Page;
+import us.codecraft.webmagic.ResultItems;
+import us.codecraft.webmagic.Site;
+import us.codecraft.webmagic.Spider;
+import us.codecraft.webmagic.downloader.PhantomJSDownloader;
+import us.codecraft.webmagic.pipeline.CollectorPipeline;
+import us.codecraft.webmagic.pipeline.ResultItemsCollectorPipeline;
+import us.codecraft.webmagic.processor.PageProcessor;
+
+import java.util.List;
+
+/**
+ * Created by dolphineor on 2014-11-21.
+ *
+ * Uses Taobao as an example: fetches the search results for winter clothing.
+ */
+public class PhantomJSPageProcessor implements PageProcessor {
+
+    private final Site site = Site.me()
+            .setDomain("s.taobao.com")
+            .setCharset("GBK")
+            .addHeader("Referer", "http://www.taobao.com/")
+            .setRetryTimes(3).setSleepTime(1000);
+
+    /**
+     * Stores the rendered markup in the "html" field when the download
+     * produced any.
+     */
+    @Override
+    public void process(Page page) {
+        if (page.getRawText() != null) {
+            page.putField("html", page.getRawText());
+        }
+    }
+
+    @Override
+    public Site getSite() {
+        return site;
+    }
+
+    public static void main(String[] args) throws Exception {
+        PhantomJSDownloader phantomDownloader = new PhantomJSDownloader().setRetryNum(3);
+
+        CollectorPipeline<ResultItems> collectorPipeline = new ResultItemsCollectorPipeline();
+
+        // Keep at least one thread: the formula yields 0 on a single-core machine.
+        int threads = Math.max(1, (Runtime.getRuntime().availableProcessors() - 1) << 1);
+
+        Spider.create(new PhantomJSPageProcessor())
+                // %B6%AC%D7%B0 is the GBK encoding of the Chinese word for winter clothing
+                .addUrl("http://s.taobao.com/search?q=%B6%AC%D7%B0&sort=sale-desc")
+                .setDownloader(phantomDownloader)
+                .addPipeline(collectorPipeline)
+                .thread(threads)
+                .run();
+
+        List<ResultItems> resultItemsList = collectorPipeline.getCollected();
+        Object html = null;
+        if (!resultItemsList.isEmpty()) {
+            html = resultItemsList.get(0).get("html");
+        }
+        // A failed download leaves no "html" field, so do not dereference it blindly.
+        System.out.println(html != null ? html.toString() : "page could not be downloaded");
+    }
+
+}
diff --git a/webmagic-samples/src/main/resources/crawl.js b/webmagic-samples/src/main/resources/crawl.js
new file mode 100644
--- /dev/null
+++ b/webmagic-samples/src/main/resources/crawl.js
@@ -0,0 +1,20 @@
+// PhantomJS driver: renders the URL given as the first argument and prints
+// the resulting markup, or a failure marker, to standard output.
+var system = require('system');
+var url = system.args[1];
+
+var page = require('webpage').create();
+page.settings.loadImages = false;       // skip image downloads
+page.settings.resourceTimeout = 5000;   // milliseconds per resource
+
+page.open(url, function (status) {
+    if (status !== 'success') {
+        // PhantomJSDownloader looks for this text to detect a failure.
+        console.log("HTTP request failed!");
+    } else {
+        console.log(page.content);
+    }
+
+    page.close();
+    phantom.exit();
+});