Merge pull request #1085 from vioao/common-downloader-error-process
Common downloader error process
master
commit
db9c92edf5
|
@ -26,7 +26,7 @@ public abstract class AbstractDownloader implements Downloader {
|
||||||
/**
|
/**
|
||||||
* A simple method to download a url.
|
* A simple method to download a url.
|
||||||
*
|
*
|
||||||
* @param url url
|
* @param url url
|
||||||
* @param charset charset
|
* @param charset charset
|
||||||
* @return html
|
* @return html
|
||||||
*/
|
*/
|
||||||
|
@ -38,7 +38,7 @@ public abstract class AbstractDownloader implements Downloader {
|
||||||
protected void onSuccess(Request request) {
|
protected void onSuccess(Request request) {
|
||||||
}
|
}
|
||||||
|
|
||||||
protected void onError(Request request) {
|
protected void onError(Request request, Throwable e) {
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -87,7 +87,7 @@ public class HttpClientDownloader extends AbstractDownloader {
|
||||||
return page;
|
return page;
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
logger.warn("download page {} error", request.getUrl(), e);
|
logger.warn("download page {} error", request.getUrl(), e);
|
||||||
onError(request);
|
onError(request, e);
|
||||||
return page;
|
return page;
|
||||||
} finally {
|
} finally {
|
||||||
if (httpResponse != null) {
|
if (httpResponse != null) {
|
||||||
|
@ -110,7 +110,7 @@ public class HttpClientDownloader extends AbstractDownloader {
|
||||||
String contentType = httpResponse.getEntity().getContentType() == null ? "" : httpResponse.getEntity().getContentType().getValue();
|
String contentType = httpResponse.getEntity().getContentType() == null ? "" : httpResponse.getEntity().getContentType().getValue();
|
||||||
Page page = new Page();
|
Page page = new Page();
|
||||||
page.setBytes(bytes);
|
page.setBytes(bytes);
|
||||||
if (!request.isBinaryContent()){
|
if (!request.isBinaryContent()) {
|
||||||
if (charset == null) {
|
if (charset == null) {
|
||||||
charset = getHtmlCharset(contentType, bytes);
|
charset = getHtmlCharset(contentType, bytes);
|
||||||
}
|
}
|
||||||
|
|
|
@ -16,73 +16,70 @@ import java.io.*;
|
||||||
* @version 0.5.3
|
* @version 0.5.3
|
||||||
*/
|
*/
|
||||||
public class PhantomJSDownloader extends AbstractDownloader {
|
public class PhantomJSDownloader extends AbstractDownloader {
|
||||||
|
private static final Logger logger = LoggerFactory.getLogger(PhantomJSDownloader.class);
|
||||||
private static Logger logger = LoggerFactory.getLogger(PhantomJSDownloader.class);
|
|
||||||
private static String crawlJsPath;
|
private static String crawlJsPath;
|
||||||
private static String phantomJsCommand = "phantomjs"; // default
|
private static String phantomJsCommand = "phantomjs"; // default
|
||||||
|
|
||||||
private int retryNum;
|
|
||||||
private int threadNum;
|
|
||||||
|
|
||||||
public PhantomJSDownloader() {
|
public PhantomJSDownloader() {
|
||||||
this.initPhantomjsCrawlPath();
|
this.initPhantomjsCrawlPath();
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 添加新的构造函数,支持phantomjs自定义命令
|
* 添加新的构造函数,支持phantomjs自定义命令
|
||||||
*
|
* <p>
|
||||||
* example:
|
* example:
|
||||||
* phantomjs.exe 支持windows环境
|
* phantomjs.exe 支持windows环境
|
||||||
* phantomjs --ignore-ssl-errors=yes 忽略抓取地址是https时的一些错误
|
* phantomjs --ignore-ssl-errors=yes 忽略抓取地址是https时的一些错误
|
||||||
* /usr/local/bin/phantomjs 命令的绝对路径,避免因系统环境变量引起的IOException
|
* /usr/local/bin/phantomjs 命令的绝对路径,避免因系统环境变量引起的IOException
|
||||||
*
|
*
|
||||||
* @param phantomJsCommand phantomJsCommand
|
* @param phantomJsCommand phantomJsCommand
|
||||||
*/
|
*/
|
||||||
public PhantomJSDownloader(String phantomJsCommand) {
|
public PhantomJSDownloader(String phantomJsCommand) {
|
||||||
this.initPhantomjsCrawlPath();
|
this.initPhantomjsCrawlPath();
|
||||||
PhantomJSDownloader.phantomJsCommand = phantomJsCommand;
|
PhantomJSDownloader.phantomJsCommand = phantomJsCommand;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 新增构造函数,支持crawl.js路径自定义,因为当其他项目依赖此jar包时,runtime.exec()执行phantomjs命令时无法使用jar包中的crawl.js
|
* 新增构造函数,支持crawl.js路径自定义,因为当其他项目依赖此jar包时,runtime.exec()执行phantomjs命令时无法使用jar包中的crawl.js
|
||||||
* <pre>
|
* <pre>
|
||||||
* crawl.js start --
|
* crawl.js start --
|
||||||
*
|
*
|
||||||
* var system = require('system');
|
* var system = require('system');
|
||||||
* var url = system.args[1];
|
* var url = system.args[1];
|
||||||
*
|
*
|
||||||
* var page = require('webpage').create();
|
* var page = require('webpage').create();
|
||||||
* page.settings.loadImages = false;
|
* page.settings.loadImages = false;
|
||||||
* page.settings.resourceTimeout = 5000;
|
* page.settings.resourceTimeout = 5000;
|
||||||
*
|
*
|
||||||
* page.open(url, function (status) {
|
* page.open(url, function (status) {
|
||||||
* if (status != 'success') {
|
* if (status != 'success') {
|
||||||
* console.log("HTTP request failed!");
|
* console.log("HTTP request failed!");
|
||||||
* } else {
|
* } else {
|
||||||
* console.log(page.content);
|
* console.log(page.content);
|
||||||
* }
|
* }
|
||||||
*
|
*
|
||||||
* page.close();
|
* page.close();
|
||||||
* phantom.exit();
|
* phantom.exit();
|
||||||
* });
|
* });
|
||||||
*
|
*
|
||||||
* -- crawl.js end
|
* -- crawl.js end
|
||||||
* </pre>
|
* </pre>
|
||||||
* 具体项目时可以将以上js代码复制下来使用
|
* 具体项目时可以将以上js代码复制下来使用
|
||||||
*
|
* <p>
|
||||||
* example:
|
* example:
|
||||||
* new PhantomJSDownloader("/your/path/phantomjs", "/your/path/crawl.js");
|
* new PhantomJSDownloader("/your/path/phantomjs", "/your/path/crawl.js");
|
||||||
*
|
*
|
||||||
* @param phantomJsCommand phantomJsCommand
|
* @param phantomJsCommand phantomJsCommand
|
||||||
* @param crawlJsPath crawlJsPath
|
* @param crawlJsPath crawlJsPath
|
||||||
*/
|
*/
|
||||||
public PhantomJSDownloader(String phantomJsCommand, String crawlJsPath) {
|
public PhantomJSDownloader(String phantomJsCommand, String crawlJsPath) {
|
||||||
PhantomJSDownloader.phantomJsCommand = phantomJsCommand;
|
PhantomJSDownloader.phantomJsCommand = phantomJsCommand;
|
||||||
PhantomJSDownloader.crawlJsPath = crawlJsPath;
|
PhantomJSDownloader.crawlJsPath = crawlJsPath;
|
||||||
}
|
}
|
||||||
|
|
||||||
private void initPhantomjsCrawlPath() {
|
private void initPhantomjsCrawlPath() {
|
||||||
PhantomJSDownloader.crawlJsPath = new File(this.getClass().getResource("/").getPath()).getPath() + System.getProperty("file.separator") + "crawl.js ";
|
PhantomJSDownloader.crawlJsPath = new File(this.getClass().getResource("/").getPath()).getPath()
|
||||||
|
+ System.getProperty("file.separator") + "crawl.js ";
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -90,61 +87,41 @@ public class PhantomJSDownloader extends AbstractDownloader {
|
||||||
if (logger.isInfoEnabled()) {
|
if (logger.isInfoEnabled()) {
|
||||||
logger.info("downloading page: " + request.getUrl());
|
logger.info("downloading page: " + request.getUrl());
|
||||||
}
|
}
|
||||||
String content = getPage(request);
|
|
||||||
if (content.contains("HTTP request failed")) {
|
|
||||||
for (int i = 1; i <= getRetryNum(); i++) {
|
|
||||||
content = getPage(request);
|
|
||||||
if (!content.contains("HTTP request failed")) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (content.contains("HTTP request failed")) {
|
|
||||||
//when failed
|
|
||||||
Page page = new Page();
|
|
||||||
page.setRequest(request);
|
|
||||||
return page;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Page page = new Page();
|
Page page = Page.fail();
|
||||||
page.setRawText(content);
|
try {
|
||||||
page.setUrl(new PlainText(request.getUrl()));
|
String content = getPage(request);
|
||||||
page.setRequest(request);
|
if (!content.contains("HTTP request failed")) {
|
||||||
page.setStatusCode(200);
|
page.setDownloadSuccess(true);
|
||||||
|
page.setRawText(content);
|
||||||
|
page.setUrl(new PlainText(request.getUrl()));
|
||||||
|
page.setRequest(request);
|
||||||
|
page.setStatusCode(200);
|
||||||
|
}
|
||||||
|
onSuccess(request);
|
||||||
|
} catch (Exception e) {
|
||||||
|
onError(request, e);
|
||||||
|
logger.warn("download page {} error", request.getUrl(), e);
|
||||||
|
}
|
||||||
return page;
|
return page;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void setThread(int threadNum) {
|
public void setThread(int threadNum) {
|
||||||
this.threadNum = threadNum;
|
// ignore
|
||||||
}
|
}
|
||||||
|
|
||||||
protected String getPage(Request request) {
|
protected String getPage(Request request) throws Exception {
|
||||||
try {
|
String url = request.getUrl();
|
||||||
String url = request.getUrl();
|
Runtime runtime = Runtime.getRuntime();
|
||||||
Runtime runtime = Runtime.getRuntime();
|
Process process = runtime.exec(phantomJsCommand + " " + crawlJsPath + " " + url);
|
||||||
Process process = runtime.exec(phantomJsCommand + " " + crawlJsPath + " " + url);
|
InputStream is = process.getInputStream();
|
||||||
InputStream is = process.getInputStream();
|
BufferedReader br = new BufferedReader(new InputStreamReader(is));
|
||||||
BufferedReader br = new BufferedReader(new InputStreamReader(is));
|
StringBuilder builder = new StringBuilder();
|
||||||
StringBuffer stringBuffer = new StringBuffer();
|
String line;
|
||||||
String line;
|
while ((line = br.readLine()) != null) {
|
||||||
while ((line = br.readLine()) != null) {
|
builder.append(line).append("\n");
|
||||||
stringBuffer.append(line).append("\n");
|
|
||||||
}
|
|
||||||
return stringBuffer.toString();
|
|
||||||
} catch (IOException e) {
|
|
||||||
e.printStackTrace();
|
|
||||||
}
|
}
|
||||||
|
return builder.toString();
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
public int getRetryNum() {
|
|
||||||
return retryNum;
|
|
||||||
}
|
|
||||||
|
|
||||||
public PhantomJSDownloader setRetryNum(int retryNum) {
|
|
||||||
this.retryNum = retryNum;
|
|
||||||
return this;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -36,7 +36,7 @@ public class PhantomJSPageProcessor implements PageProcessor {
|
||||||
}
|
}
|
||||||
|
|
||||||
public static void main(String[] args) throws Exception {
|
public static void main(String[] args) throws Exception {
|
||||||
PhantomJSDownloader phantomDownloader = new PhantomJSDownloader().setRetryNum(3);
|
PhantomJSDownloader phantomDownloader = new PhantomJSDownloader();
|
||||||
|
|
||||||
CollectorPipeline<ResultItems> collectorPipeline = new ResultItemsCollectorPipeline();
|
CollectorPipeline<ResultItems> collectorPipeline = new ResultItemsCollectorPipeline();
|
||||||
|
|
||||||
|
|
|
@ -11,7 +11,7 @@ import us.codecraft.webmagic.Page;
|
||||||
import us.codecraft.webmagic.Request;
|
import us.codecraft.webmagic.Request;
|
||||||
import us.codecraft.webmagic.Site;
|
import us.codecraft.webmagic.Site;
|
||||||
import us.codecraft.webmagic.Task;
|
import us.codecraft.webmagic.Task;
|
||||||
import us.codecraft.webmagic.downloader.Downloader;
|
import us.codecraft.webmagic.downloader.AbstractDownloader;
|
||||||
import us.codecraft.webmagic.selector.Html;
|
import us.codecraft.webmagic.selector.Html;
|
||||||
import us.codecraft.webmagic.selector.PlainText;
|
import us.codecraft.webmagic.selector.PlainText;
|
||||||
|
|
||||||
|
@ -24,112 +24,120 @@ import java.util.Map;
|
||||||
* 需要下载Selenium driver支持。<br>
|
* 需要下载Selenium driver支持。<br>
|
||||||
*
|
*
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
* Date: 13-7-26 <br>
|
* Date: 13-7-26 <br>
|
||||||
* Time: 下午1:37 <br>
|
* Time: 下午1:37 <br>
|
||||||
*/
|
*/
|
||||||
public class SeleniumDownloader implements Downloader, Closeable {
|
public class SeleniumDownloader extends AbstractDownloader implements Closeable {
|
||||||
|
|
||||||
private volatile WebDriverPool webDriverPool;
|
private volatile WebDriverPool webDriverPool;
|
||||||
|
|
||||||
private Logger logger = LoggerFactory.getLogger(getClass());
|
private Logger logger = LoggerFactory.getLogger(getClass());
|
||||||
|
|
||||||
private int sleepTime = 0;
|
private int sleepTime = 0;
|
||||||
|
|
||||||
private int poolSize = 1;
|
private int poolSize = 1;
|
||||||
|
|
||||||
private static final String DRIVER_PHANTOMJS = "phantomjs";
|
private static final String DRIVER_PHANTOMJS = "phantomjs";
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 新建
|
* 新建
|
||||||
*
|
*
|
||||||
* @param chromeDriverPath chromeDriverPath
|
* @param chromeDriverPath chromeDriverPath
|
||||||
*/
|
*/
|
||||||
public SeleniumDownloader(String chromeDriverPath) {
|
public SeleniumDownloader(String chromeDriverPath) {
|
||||||
System.getProperties().setProperty("webdriver.chrome.driver",
|
System.getProperties().setProperty("webdriver.chrome.driver",
|
||||||
chromeDriverPath);
|
chromeDriverPath);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Constructor without any field. Constructs a PhantomJS browser
|
* Constructor without any field. Constructs a PhantomJS browser
|
||||||
*
|
*
|
||||||
* @author bob.li.0718@gmail.com
|
* @author bob.li.0718@gmail.com
|
||||||
*/
|
*/
|
||||||
public SeleniumDownloader() {
|
public SeleniumDownloader() {
|
||||||
// System.setProperty("phantomjs.binary.path",
|
// System.setProperty("phantomjs.binary.path",
|
||||||
// "/Users/Bingo/Downloads/phantomjs-1.9.7-macosx/bin/phantomjs");
|
// "/Users/Bingo/Downloads/phantomjs-1.9.7-macosx/bin/phantomjs");
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* set sleep time to wait until load success
|
* set sleep time to wait until load success
|
||||||
*
|
*
|
||||||
* @param sleepTime sleepTime
|
* @param sleepTime sleepTime
|
||||||
* @return this
|
* @return this
|
||||||
*/
|
*/
|
||||||
public SeleniumDownloader setSleepTime(int sleepTime) {
|
public SeleniumDownloader setSleepTime(int sleepTime) {
|
||||||
this.sleepTime = sleepTime;
|
this.sleepTime = sleepTime;
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Page download(Request request, Task task) {
|
public Page download(Request request, Task task) {
|
||||||
checkInit();
|
checkInit();
|
||||||
WebDriver webDriver;
|
WebDriver webDriver = null;
|
||||||
try {
|
Page page = Page.fail();
|
||||||
webDriver = webDriverPool.get();
|
try {
|
||||||
} catch (InterruptedException e) {
|
webDriver = webDriverPool.get();
|
||||||
logger.warn("interrupted", e);
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
logger.info("downloading page " + request.getUrl());
|
|
||||||
webDriver.get(request.getUrl());
|
|
||||||
try {
|
|
||||||
Thread.sleep(sleepTime);
|
|
||||||
} catch (InterruptedException e) {
|
|
||||||
e.printStackTrace();
|
|
||||||
}
|
|
||||||
WebDriver.Options manage = webDriver.manage();
|
|
||||||
Site site = task.getSite();
|
|
||||||
if (site.getCookies() != null) {
|
|
||||||
for (Map.Entry<String, String> cookieEntry : site.getCookies()
|
|
||||||
.entrySet()) {
|
|
||||||
Cookie cookie = new Cookie(cookieEntry.getKey(),
|
|
||||||
cookieEntry.getValue());
|
|
||||||
manage.addCookie(cookie);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
logger.info("downloading page " + request.getUrl());
|
||||||
* TODO You can add mouse event or other processes
|
webDriver.get(request.getUrl());
|
||||||
*
|
try {
|
||||||
* @author: bob.li.0718@gmail.com
|
if (sleepTime > 0) {
|
||||||
*/
|
Thread.sleep(sleepTime);
|
||||||
|
}
|
||||||
|
} catch (InterruptedException e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
}
|
||||||
|
WebDriver.Options manage = webDriver.manage();
|
||||||
|
Site site = task.getSite();
|
||||||
|
if (site.getCookies() != null) {
|
||||||
|
for (Map.Entry<String, String> cookieEntry : site.getCookies()
|
||||||
|
.entrySet()) {
|
||||||
|
Cookie cookie = new Cookie(cookieEntry.getKey(),
|
||||||
|
cookieEntry.getValue());
|
||||||
|
manage.addCookie(cookie);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
WebElement webElement = webDriver.findElement(By.xpath("/html"));
|
/*
|
||||||
String content = webElement.getAttribute("outerHTML");
|
* TODO You can add mouse event or other processes
|
||||||
Page page = new Page();
|
*
|
||||||
page.setRawText(content);
|
* @author: bob.li.0718@gmail.com
|
||||||
page.setHtml(new Html(content, request.getUrl()));
|
*/
|
||||||
page.setUrl(new PlainText(request.getUrl()));
|
|
||||||
page.setRequest(request);
|
|
||||||
webDriverPool.returnToPool(webDriver);
|
|
||||||
return page;
|
|
||||||
}
|
|
||||||
|
|
||||||
private void checkInit() {
|
WebElement webElement = webDriver.findElement(By.xpath("/html"));
|
||||||
if (webDriverPool == null) {
|
String content = webElement.getAttribute("outerHTML");
|
||||||
synchronized (this) {
|
page.setDownloadSuccess(true);
|
||||||
webDriverPool = new WebDriverPool(poolSize);
|
page.setRawText(content);
|
||||||
}
|
page.setHtml(new Html(content, request.getUrl()));
|
||||||
}
|
page.setUrl(new PlainText(request.getUrl()));
|
||||||
}
|
page.setRequest(request);
|
||||||
|
onSuccess(request);
|
||||||
|
} catch (Exception e) {
|
||||||
|
logger.warn("download page {} error", request.getUrl(), e);
|
||||||
|
onError(request, e);
|
||||||
|
} finally {
|
||||||
|
if (webDriver != null) {
|
||||||
|
webDriverPool.returnToPool(webDriver);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return page;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
private void checkInit() {
|
||||||
public void setThread(int thread) {
|
if (webDriverPool == null) {
|
||||||
this.poolSize = thread;
|
synchronized (this) {
|
||||||
}
|
webDriverPool = new WebDriverPool(poolSize);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void close() throws IOException {
|
public void setThread(int thread) {
|
||||||
webDriverPool.closeAll();
|
this.poolSize = thread;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void close() throws IOException {
|
||||||
|
webDriverPool.closeAll();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue