Revert "Common the downloader status process and pass error information when …"
parent
ee5a0585d7
commit
acfbd7b883
|
@ -38,7 +38,7 @@ public abstract class AbstractDownloader implements Downloader {
|
|||
protected void onSuccess(Request request) {
|
||||
}
|
||||
|
||||
protected void onError(Request request, Throwable e) {
|
||||
protected void onError(Request request) {
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -87,7 +87,7 @@ public class HttpClientDownloader extends AbstractDownloader {
|
|||
return page;
|
||||
} catch (IOException e) {
|
||||
logger.warn("download page {} error", request.getUrl(), e);
|
||||
onError(request, e);
|
||||
onError(request);
|
||||
return page;
|
||||
} finally {
|
||||
if (httpResponse != null) {
|
||||
|
|
|
@ -16,17 +16,21 @@ import java.io.*;
|
|||
* @version 0.5.3
|
||||
*/
|
||||
public class PhantomJSDownloader extends AbstractDownloader {
|
||||
private static final Logger logger = LoggerFactory.getLogger(PhantomJSDownloader.class);
|
||||
|
||||
private static Logger logger = LoggerFactory.getLogger(PhantomJSDownloader.class);
|
||||
private static String crawlJsPath;
|
||||
private static String phantomJsCommand = "phantomjs"; // default
|
||||
|
||||
private int retryNum;
|
||||
private int threadNum;
|
||||
|
||||
public PhantomJSDownloader() {
|
||||
this.initPhantomjsCrawlPath();
|
||||
}
|
||||
|
||||
/**
|
||||
* 添加新的构造函数,支持phantomjs自定义命令
|
||||
* <p>
|
||||
*
|
||||
* example:
|
||||
* phantomjs.exe 支持windows环境
|
||||
* phantomjs --ignore-ssl-errors=yes 忽略抓取地址是https时的一些错误
|
||||
|
@ -65,7 +69,7 @@ public class PhantomJSDownloader extends AbstractDownloader {
|
|||
* -- crawl.js end
|
||||
* </pre>
|
||||
* 具体项目时可以将以上js代码复制下来使用
|
||||
* <p>
|
||||
*
|
||||
* example:
|
||||
* new PhantomJSDownloader("/your/path/phantomjs", "/your/path/crawl.js");
|
||||
*
|
||||
|
@ -78,8 +82,7 @@ public class PhantomJSDownloader extends AbstractDownloader {
|
|||
}
|
||||
|
||||
private void initPhantomjsCrawlPath() {
|
||||
PhantomJSDownloader.crawlJsPath = new File(this.getClass().getResource("/").getPath()).getPath()
|
||||
+ System.getProperty("file.separator") + "crawl.js ";
|
||||
PhantomJSDownloader.crawlJsPath = new File(this.getClass().getResource("/").getPath()).getPath() + System.getProperty("file.separator") + "crawl.js ";
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -87,41 +90,61 @@ public class PhantomJSDownloader extends AbstractDownloader {
|
|||
if (logger.isInfoEnabled()) {
|
||||
logger.info("downloading page: " + request.getUrl());
|
||||
}
|
||||
|
||||
Page page = Page.fail();
|
||||
try {
|
||||
String content = getPage(request);
|
||||
if (content.contains("HTTP request failed")) {
|
||||
for (int i = 1; i <= getRetryNum(); i++) {
|
||||
content = getPage(request);
|
||||
if (!content.contains("HTTP request failed")) {
|
||||
page.setDownloadSuccess(true);
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (content.contains("HTTP request failed")) {
|
||||
//when failed
|
||||
Page page = new Page();
|
||||
page.setRequest(request);
|
||||
return page;
|
||||
}
|
||||
}
|
||||
|
||||
Page page = new Page();
|
||||
page.setRawText(content);
|
||||
page.setUrl(new PlainText(request.getUrl()));
|
||||
page.setRequest(request);
|
||||
page.setStatusCode(200);
|
||||
}
|
||||
onSuccess(request);
|
||||
} catch (Exception e) {
|
||||
onError(request, e);
|
||||
logger.warn("download page {} error", request.getUrl(), e);
|
||||
}
|
||||
return page;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setThread(int threadNum) {
|
||||
// ignore
|
||||
this.threadNum = threadNum;
|
||||
}
|
||||
|
||||
protected String getPage(Request request) throws Exception {
|
||||
protected String getPage(Request request) {
|
||||
try {
|
||||
String url = request.getUrl();
|
||||
Runtime runtime = Runtime.getRuntime();
|
||||
Process process = runtime.exec(phantomJsCommand + " " + crawlJsPath + " " + url);
|
||||
InputStream is = process.getInputStream();
|
||||
BufferedReader br = new BufferedReader(new InputStreamReader(is));
|
||||
StringBuilder builder = new StringBuilder();
|
||||
StringBuffer stringBuffer = new StringBuffer();
|
||||
String line;
|
||||
while ((line = br.readLine()) != null) {
|
||||
builder.append(line).append("\n");
|
||||
stringBuffer.append(line).append("\n");
|
||||
}
|
||||
return builder.toString();
|
||||
return stringBuffer.toString();
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
public int getRetryNum() {
|
||||
return retryNum;
|
||||
}
|
||||
|
||||
public PhantomJSDownloader setRetryNum(int retryNum) {
|
||||
this.retryNum = retryNum;
|
||||
return this;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -11,7 +11,7 @@ import us.codecraft.webmagic.Page;
|
|||
import us.codecraft.webmagic.Request;
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.Task;
|
||||
import us.codecraft.webmagic.downloader.AbstractDownloader;
|
||||
import us.codecraft.webmagic.downloader.Downloader;
|
||||
import us.codecraft.webmagic.selector.Html;
|
||||
import us.codecraft.webmagic.selector.PlainText;
|
||||
|
||||
|
@ -27,7 +27,7 @@ import java.util.Map;
|
|||
* Date: 13-7-26 <br>
|
||||
* Time: 下午1:37 <br>
|
||||
*/
|
||||
public class SeleniumDownloader extends AbstractDownloader implements Closeable {
|
||||
public class SeleniumDownloader implements Downloader, Closeable {
|
||||
|
||||
private volatile WebDriverPool webDriverPool;
|
||||
|
||||
|
@ -73,17 +73,17 @@ public class SeleniumDownloader extends AbstractDownloader implements Closeable
|
|||
@Override
|
||||
public Page download(Request request, Task task) {
|
||||
checkInit();
|
||||
WebDriver webDriver = null;
|
||||
Page page = Page.fail();
|
||||
WebDriver webDriver;
|
||||
try {
|
||||
webDriver = webDriverPool.get();
|
||||
|
||||
} catch (InterruptedException e) {
|
||||
logger.warn("interrupted", e);
|
||||
return null;
|
||||
}
|
||||
logger.info("downloading page " + request.getUrl());
|
||||
webDriver.get(request.getUrl());
|
||||
try {
|
||||
if (sleepTime > 0) {
|
||||
Thread.sleep(sleepTime);
|
||||
}
|
||||
} catch (InterruptedException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
|
@ -106,20 +106,12 @@ public class SeleniumDownloader extends AbstractDownloader implements Closeable
|
|||
|
||||
WebElement webElement = webDriver.findElement(By.xpath("/html"));
|
||||
String content = webElement.getAttribute("outerHTML");
|
||||
page.setDownloadSuccess(true);
|
||||
Page page = new Page();
|
||||
page.setRawText(content);
|
||||
page.setHtml(new Html(content, request.getUrl()));
|
||||
page.setUrl(new PlainText(request.getUrl()));
|
||||
page.setRequest(request);
|
||||
onSuccess(request);
|
||||
} catch (Exception e) {
|
||||
logger.warn("download page {} error", request.getUrl(), e);
|
||||
onError(request, e);
|
||||
} finally {
|
||||
if (webDriver != null) {
|
||||
webDriverPool.returnToPool(webDriver);
|
||||
}
|
||||
}
|
||||
return page;
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue