diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index f5b55f9..11ba1c5 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -1,6 +1,7 @@ package us.codecraft.webmagic.downloader; import com.google.common.collect.Sets; +import info.monitorenter.cpdetector.io.CodepageDetectorProxy; import org.apache.commons.io.IOUtils; import org.apache.commons.lang.StringUtils; import org.apache.http.HttpResponse; @@ -28,6 +29,7 @@ import us.codecraft.webmagic.selector.PlainText; import us.codecraft.webmagic.utils.UrlUtils; import java.io.IOException; +import java.net.URL; import java.util.HashMap; import java.util.Map; import java.util.Set; @@ -92,6 +94,7 @@ public class HttpClientDownloader extends AbstractDownloader { //charset if (charset == null) { charset = getHtmlCharset(httpResponse); + logger.debug("Auto get charset: " + charset); } Page page = handleResponse(request, charset, httpResponse, task); onSuccess(request); @@ -119,6 +122,7 @@ public class HttpClientDownloader extends AbstractDownloader { } } + private static CodepageDetectorProxy detector = CodepageDetectorProxy.getInstance(); protected String getHtmlCharset(CloseableHttpResponse httpResponse) throws IOException { // 1、head头部包含编码集 String value = httpResponse.getEntity().getContentType().getValue(); @@ -133,23 +137,28 @@ public class HttpClientDownloader extends AbstractDownloader { for(Element link : links) { // 2.1、处理场景: String metaContent = link.attr("content"); + String metaCharset = link.attr("charset"); if(metaContent.indexOf("charset") != -1) { metaContent = metaContent.substring(metaContent.indexOf("charset"), metaContent.length()); charset = metaContent.split("=")[1]; break; } - // 2.2、处理场景: - String metaCharset = link.attr("charset"); - if(StringUtils.isNotEmpty(metaCharset)) { - charset = metaCharset.split("=")[1]; + else if(StringUtils.isNotEmpty(metaCharset)) { + charset = metaCharset; break; } } // 3、以上两种都不包含的场景 if(StringUtils.isEmpty(charset)) { - // TODO http://cpdetector.sourceforge.net/usage.shtml + java.nio.charset.Charset nioCharset = null; + try { + nioCharset = detector.detectCodepage(httpResponse.getEntity().getContent(), content.length()); + charset = nioCharset.name(); + } catch (IOException e) { + // ignore + } } } } diff --git a/webmagic-core/src/main/lib/antlr-2.7.4.jar b/webmagic-core/src/main/lib/antlr-2.7.4.jar new file mode 100644 index 0000000..45e45b5 Binary files /dev/null and b/webmagic-core/src/main/lib/antlr-2.7.4.jar differ diff --git a/webmagic-core/src/main/lib/cpdetector_1.0.10.jar b/webmagic-core/src/main/lib/cpdetector_1.0.10.jar new file mode 100644 index 0000000..47329f2 Binary files /dev/null and b/webmagic-core/src/main/lib/cpdetector_1.0.10.jar differ diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java index ab84665..072de13 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java @@ -1,5 +1,8 @@ package us.codecraft.webmagic.downloader; +import org.apache.http.client.methods.CloseableHttpResponse; +import org.apache.http.client.methods.HttpUriRequest; +import org.apache.http.impl.client.CloseableHttpClient; import org.junit.Ignore; import org.junit.Test; import us.codecraft.webmagic.Page; @@ -11,6 +14,7 @@ import us.codecraft.webmagic.selector.Html; import java.io.UnsupportedEncodingException; import static org.assertj.core.api.Assertions.assertThat; +import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; /** @@ -52,4 +56,30 @@ public class HttpClientDownloaderTest { assertThat((Integer) page.getTargetRequests().get(0).getExtra(Request.CYCLE_TRIED_TIMES)).isEqualTo(2); } + @Test + public void testGetHtmlCharset() { + HttpClientDownloader downloader = new HttpClientDownloader(); + Site site = Site.me(); + CloseableHttpClient httpClient = new HttpClientGenerator().getClient(site); + try { + // 头部包含编码 + Request requestGBK = new Request("http://sports.163.com/14/0514/13/9S7986F300051CA1.html#p=9RGQDGGH0AI90005"); + CloseableHttpResponse httpResponse = httpClient.execute(downloader.getHttpUriRequest(requestGBK, site, null)); + String charset = downloader.getHtmlCharset(httpResponse); + assertEquals(charset, "GBK"); + + // meta包含编码 + Request requestUTF_8 = new Request("http://preshing.com/"); + httpResponse = httpClient.execute(downloader.getHttpUriRequest(requestUTF_8, site, null)); + charset = downloader.getHtmlCharset(httpResponse); + assertEquals(charset, "utf-8"); + +// Request request = new Request("http://sports.163.com/14/0514/13/9S7986F300051CA1.html#p=9RGQDGGH0AI90005"); +// httpResponse = httpClient.execute(downloader.getHttpUriRequest(request, site, null)); +// charset = downloader.getHtmlCharset(httpResponse); +// assertEquals(charset, "GBK"); + } catch (Exception e) { + e.printStackTrace(); + } + } }