From 32f1f2cf44e76a3b1dea048dd37f5cedcfeae80b Mon Sep 17 00:00:00 2001
From: "yihua.huang"
Date: Sat, 29 Jul 2017 11:16:09 +0800
Subject: [PATCH] #613 add charset to page

---
 .../downloader/HttpClientDownloader.java | 27 ++++++++-----------
 1 file changed, 11 insertions(+), 16 deletions(-)

diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
index 13175fc..4e19e7c 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
@@ -113,7 +113,11 @@ public class HttpClientDownloader extends AbstractDownloader {
         Page page = new Page();
         page.setBytes(bytes);
         if (!request.isBinaryContent()){
-            page.setRawText(getResponseContent(charset, contentType, bytes));
+            if (charset == null) {
+                charset = getHtmlCharset(contentType, bytes);
+            }
+            page.setCharset(charset);
+            page.setRawText(new String(bytes, charset));
         }
         page.setUrl(new PlainText(request.getUrl()));
         page.setRequest(request);
@@ -125,21 +129,12 @@ public class HttpClientDownloader extends AbstractDownloader {
         return page;
     }
 
-    private String getResponseContent(String charset, String contentType, byte[] bytes) throws IOException {
-        if (charset == null) {
-            String htmlCharset = getHtmlCharset(contentType, bytes);
-            if (htmlCharset != null) {
-                return new String(bytes, htmlCharset);
-            } else {
-                logger.warn("Charset autodetect failed, use {} as charset. Please specify charset in Site.setCharset()", Charset.defaultCharset());
-                return new String(bytes);
-            }
-        } else {
-            return new String(bytes, charset);
-        }
-    }
-
     private String getHtmlCharset(String contentType, byte[] contentBytes) throws IOException {
-        return CharsetUtils.detectCharset(contentType, contentBytes);
+        String charset = CharsetUtils.detectCharset(contentType, contentBytes);
+        if (charset == null) {
+            charset = Charset.defaultCharset().name();
+            logger.warn("Charset autodetect failed, use {} as charset. Please specify charset in Site.setCharset()", Charset.defaultCharset());
+        }
+        return charset;
     }
 }