From 2183ba9b61a766f94d23ee62d6ee07b219a75f9d Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 22 Jul 2017 12:11:00 +0800 Subject: [PATCH] #571 add getBytes to Page --- .../main/java/us/codecraft/webmagic/Page.java | 10 ++++++++ .../java/us/codecraft/webmagic/Request.java | 14 +++++++++++ .../downloader/HttpClientDownloader.java | 23 +++++++++++-------- .../downloader/HttpClientDownloaderTest.java | 18 +++++++++++++++ 4 files changed, 55 insertions(+), 10 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java index a945607..758e4c6 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java @@ -46,6 +46,8 @@ public class Page { private boolean downloadSuccess = true; + private byte[] bytes; + private List targetRequests = new ArrayList(); public Page() { @@ -228,6 +230,14 @@ public class Page { this.downloadSuccess = downloadSuccess; } + public byte[] getBytes() { + return bytes; + } + + public void setBytes(byte[] bytes) { + this.bytes = bytes; + } + @Override public String toString() { return "Page{" + diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java index f29ccb3..a41de90 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java @@ -45,6 +45,12 @@ public class Request implements Serializable { */ private long priority; + /** + * When set to true, the downloader will not try to decode the response body as text. 
+ * + */ + private boolean binarayContent = false; + public Request() { } @@ -162,6 +168,14 @@ public class Request implements Serializable { this.requestBody = requestBody; } + public boolean isBinarayContent() { + return binarayContent; + } + + public void setBinarayContent(boolean binarayContent) { + this.binarayContent = binarayContent; + } + @Override public String toString() { return "Request{" + diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 6d4442a..5d0b033 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -108,9 +108,13 @@ public class HttpClientDownloader extends AbstractDownloader { } protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException { - String content = getResponseContent(charset, httpResponse); + byte[] bytes = IOUtils.toByteArray(httpResponse.getEntity().getContent()); + String contentType = httpResponse.getEntity().getContentType() == null ? 
"" : httpResponse.getEntity().getContentType().getValue(); Page page = new Page(); - page.setRawText(content); + page.setBytes(bytes); + if (!request.isBinarayContent()){ + page.setRawText(getResponseContent(charset, contentType, bytes)); + } page.setUrl(new PlainText(request.getUrl())); page.setRequest(request); page.setStatusCode(httpResponse.getStatusLine().getStatusCode()); @@ -121,22 +125,21 @@ public class HttpClientDownloader extends AbstractDownloader { return page; } - private String getResponseContent(String charset, HttpResponse httpResponse) throws IOException { + private String getResponseContent(String charset, String contentType, byte[] bytes) throws IOException { if (charset == null) { - byte[] contentBytes = IOUtils.toByteArray(httpResponse.getEntity().getContent()); - String htmlCharset = getHtmlCharset(httpResponse, contentBytes); + String htmlCharset = getHtmlCharset(contentType, bytes); if (htmlCharset != null) { - return new String(contentBytes, htmlCharset); + return new String(bytes, htmlCharset); } else { logger.warn("Charset autodetect failed, use {} as charset. Please specify charset in Site.setCharset()", Charset.defaultCharset()); - return new String(contentBytes); + return new String(bytes); } } else { - return IOUtils.toString(httpResponse.getEntity().getContent(), charset); + return new String(bytes, charset); } } - private String getHtmlCharset(HttpResponse httpResponse, byte[] contentBytes) throws IOException { - return CharsetUtils.detectCharset(httpResponse.getEntity().getContentType() == null ? 
"" : httpResponse.getEntity().getContentType().getValue(), contentBytes); + private String getHtmlCharset(String contentType, byte[] contentBytes) throws IOException { + return CharsetUtils.detectCharset(contentType, contentBytes); } } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java index 1c8efc5..cbb7abc 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java @@ -271,4 +271,22 @@ public class HttpClientDownloaderTest { }); } + @Test + public void test_download_binary_content() throws Exception { + HttpServer server = httpServer(13423); + server.response("binary"); + Runner.running(server, new Runnable() { + @Override + public void run() throws Exception { + final HttpClientDownloader httpClientDownloader = new HttpClientDownloader(); + Request request = new Request(); + request.setBinarayContent(true); + request.setUrl("http://127.0.0.1:13423/"); + Page page = httpClientDownloader.download(request, Site.me().toTask()); + assertThat(page.getRawText()).isNull(); + assertThat(page.getBytes()).isEqualTo("binary".getBytes()); + } + }); + } + }