From 03d26c169b63e8462940d1d181862f1eeed640fb Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Mon, 26 May 2014 17:45:30 +0800 Subject: [PATCH] Enhance auto charset detect #126 1. Only read from content once to fix stream closed exception 2. invite moco as server test --- pom.xml | 12 +++ webmagic-core/pom.xml | 5 + .../downloader/HttpClientDownloader.java | 93 +++++++++++-------- .../downloader/HttpClientDownloaderTest.java | 65 ++++++++++--- 4 files changed, 123 insertions(+), 52 deletions(-) diff --git a/pom.xml b/pom.xml index 9d80771..de5cf91 100644 --- a/pom.xml +++ b/pom.xml @@ -95,6 +95,18 @@ fastjson 1.1.37 + + com.github.dreamhead + moco-core + 0.9.1 + test + + + org.slf4j + slf4j-simple + + + log4j log4j diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 9ce9a97..dedd898 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -35,6 +35,11 @@ xsoup + + com.github.dreamhead + moco-core + + org.slf4j slf4j-api diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 5d2af73..cb8ba1b 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -28,6 +28,7 @@ import us.codecraft.webmagic.utils.HttpConstant; import us.codecraft.webmagic.utils.UrlUtils; import java.io.IOException; +import java.nio.charset.Charset; import java.util.HashMap; import java.util.Map; import java.util.Set; @@ -89,11 +90,6 @@ public class HttpClientDownloader extends AbstractDownloader { httpResponse = getHttpClient(site).execute(httpUriRequest); int statusCode = httpResponse.getStatusLine().getStatusCode(); if (statusAccept(acceptStatCode, statusCode)) { - //charset - if (charset == null) { - charset = getHtmlCharset(httpResponse); - logger.debug("Auto get charset: " + charset); - } Page page = handleResponse(request, charset, httpResponse, task); onSuccess(request); return page; @@ -120,38 +116,6 @@ public class HttpClientDownloader extends AbstractDownloader { } } - protected String getHtmlCharset(CloseableHttpResponse httpResponse) throws IOException { - // 1、encoding in http header Content-Type - String value = httpResponse.getEntity().getContentType().getValue(); - String charset = UrlUtils.getCharset(value); - - if (StringUtils.isEmpty(charset)) { - // 2、charset in meta - String content = IOUtils.toString(httpResponse.getEntity().getContent()); - if (StringUtils.isNotEmpty(content)) { - Document document = Jsoup.parse(content); - Elements links = document.select("meta"); - for (Element link : links) { - // 2.1、 - String metaContent = link.attr("content"); - String metaCharset = link.attr("charset"); - if (metaContent.indexOf("charset") != -1) { - metaContent = metaContent.substring(metaContent.indexOf("charset"), metaContent.length()); - charset = metaContent.split("=")[1]; - break; - } - // 2.2、 - else if (StringUtils.isNotEmpty(metaCharset)) { - charset = metaCharset; - break; - } - } - // 3、todo use tools as cpdetector for content decode - } - } - return charset; - } - @Override public void setThread(int thread) { httpClientGenerator.setPoolSize(thread); @@ -205,7 +169,7 @@ public class HttpClientDownloader extends AbstractDownloader { } protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException { - String content = IOUtils.toString(httpResponse.getEntity().getContent(), charset); + String content = getContent(charset, httpResponse); Page page = new Page(); page.setRawText(content); page.setUrl(new PlainText(request.getUrl())); @@ -213,4 +177,57 @@ public class HttpClientDownloader extends AbstractDownloader { page.setStatusCode(httpResponse.getStatusLine().getStatusCode()); return page; } + + protected String getContent(String charset, HttpResponse httpResponse) throws IOException { + if (charset == null) { + byte[] contentBytes = IOUtils.toByteArray(httpResponse.getEntity().getContent()); + String htmlCharset = getHtmlCharset(httpResponse, contentBytes); + if (htmlCharset != null) { + return new String(contentBytes, htmlCharset); + } else { + logger.warn("Charset autodetect failed, use {} as charset. Please specify charset in Site.setCharset()", Charset.defaultCharset()); + return new String(contentBytes); + } + } else { + return IOUtils.toString(httpResponse.getEntity().getContent(), charset); + } + } + + protected String getHtmlCharset(HttpResponse httpResponse, byte[] contentBytes) throws IOException { + String charset; + // charset + // 1、encoding in http header Content-Type + String value = httpResponse.getEntity().getContentType().getValue(); + charset = UrlUtils.getCharset(value); + if (StringUtils.isNotBlank(charset)) { + logger.debug("Auto get charset: {}", charset); + return charset; + } + // use default charset to decode first time + Charset defaultCharset = Charset.defaultCharset(); + String content = new String(contentBytes, defaultCharset.name()); + // 2、charset in meta + if (StringUtils.isNotEmpty(content)) { + Document document = Jsoup.parse(content); + Elements links = document.select("meta"); + for (Element link : links) { + // 2.1、html4.01 + String metaContent = link.attr("content"); + String metaCharset = link.attr("charset"); + if (metaContent.indexOf("charset") != -1) { + metaContent = metaContent.substring(metaContent.indexOf("charset"), metaContent.length()); + charset = metaContent.split("=")[1]; + break; + } + // 2.2、html5 + else if (StringUtils.isNotEmpty(metaCharset)) { + charset = metaCharset; + break; + } + } + } + logger.debug("Auto get charset: {}", charset); + // 3、todo use tools as cpdetector for content decode + return charset; + } } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java index 09855a0..084a110 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java @@ -1,5 +1,8 @@ package us.codecraft.webmagic.downloader; +import com.github.dreamhead.moco.*; +import com.github.dreamhead.moco.Runnable; +import org.apache.commons.io.IOUtils; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.impl.client.CloseableHttpClient; import org.junit.Ignore; @@ -13,6 +16,7 @@ import us.codecraft.webmagic.selector.Html; import java.io.IOException; import java.io.UnsupportedEncodingException; +import static com.github.dreamhead.moco.Moco.*; import static org.assertj.core.api.Assertions.assertThat; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; @@ -57,20 +61,53 @@ public class HttpClientDownloaderTest { } @Test - public void testGetHtmlCharset() throws IOException { - HttpClientDownloader downloader = new HttpClientDownloader(); - Site site = Site.me(); - CloseableHttpClient httpClient = new HttpClientGenerator().getClient(site); - // encoding in http header Content-Type - Request requestGBK = new Request("http://sports.163.com/14/0514/13/9S7986F300051CA1.html#p=9RGQDGGH0AI90005"); - CloseableHttpResponse httpResponse = httpClient.execute(downloader.getHttpUriRequest(requestGBK, site, null)); - String charset = downloader.getHtmlCharset(httpResponse); - assertEquals(charset, "GBK"); + public void testGetHtmlCharset() throws Exception { + HttpServer server = httpserver(12306); + server.get(by(uri("/header"))).response(header("Content-Type", "text/html; charset=gbk")); + server.get(by(uri("/meta4"))).response(with(text("\n" + + " \n" + + " \n" + + " \n" + + " \n" + + "")),header("Content-Type","")); + server.get(by(uri("/meta5"))).response(with(text("\n" + + " \n" + + " \n" + + " \n" + + " \n" + + "")),header("Content-Type","")); + Runner.running(server, new Runnable() { + @Override + public void run() { + String charset = getCharsetByUrl("http://127.0.0.1:12306/header"); + assertEquals(charset, "gbk"); + charset = getCharsetByUrl("http://127.0.0.1:12306/meta4"); + assertEquals(charset, "gbk"); + charset = getCharsetByUrl("http://127.0.0.1:12306/meta5"); + assertEquals(charset, "gbk"); + } - // encoding in meta - Request requestUTF_8 = new Request("http://preshing.com/"); - httpResponse = httpClient.execute(downloader.getHttpUriRequest(requestUTF_8, site, null)); - charset = downloader.getHtmlCharset(httpResponse); - assertEquals(charset, "utf-8"); + private String getCharsetByUrl(String url) { + HttpClientDownloader downloader = new HttpClientDownloader(); + Site site = Site.me(); + CloseableHttpClient httpClient = new HttpClientGenerator().getClient(site); + // encoding in http header Content-Type + Request requestGBK = new Request(url); + CloseableHttpResponse httpResponse = null; + try { + httpResponse = httpClient.execute(downloader.getHttpUriRequest(requestGBK, site, null)); + } catch (IOException e) { + e.printStackTrace(); + } + String charset = null; + try { + byte[] contentBytes = IOUtils.toByteArray(httpResponse.getEntity().getContent()); + charset = downloader.getHtmlCharset(httpResponse,contentBytes); + } catch (IOException e) { + e.printStackTrace(); + } + return charset; + } + }); } }