diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
index f5b55f9..11ba1c5 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
@@ -1,6 +1,7 @@
package us.codecraft.webmagic.downloader;
import com.google.common.collect.Sets;
+import info.monitorenter.cpdetector.io.CodepageDetectorProxy;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.http.HttpResponse;
@@ -28,6 +29,7 @@ import us.codecraft.webmagic.selector.PlainText;
import us.codecraft.webmagic.utils.UrlUtils;
import java.io.IOException;
+import java.net.URL;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
@@ -92,6 +94,7 @@ public class HttpClientDownloader extends AbstractDownloader {
//charset
if (charset == null) {
charset = getHtmlCharset(httpResponse);
+ logger.debug("Auto get charset: " + charset);
}
Page page = handleResponse(request, charset, httpResponse, task);
onSuccess(request);
@@ -119,6 +122,7 @@ public class HttpClientDownloader extends AbstractDownloader {
}
}
+ private static CodepageDetectorProxy detector = CodepageDetectorProxy.getInstance();
protected String getHtmlCharset(CloseableHttpResponse httpResponse) throws IOException {
// 1、head头部包含编码集
String value = httpResponse.getEntity().getContentType().getValue();
@@ -133,23 +137,28 @@ public class HttpClientDownloader extends AbstractDownloader {
for(Element link : links) {
// 2.1、处理场景:
String metaContent = link.attr("content");
+ String metaCharset = link.attr("charset");
if(metaContent.indexOf("charset") != -1) {
metaContent = metaContent.substring(metaContent.indexOf("charset"), metaContent.length());
charset = metaContent.split("=")[1];
break;
}
-
// 2.2、处理场景:
- String metaCharset = link.attr("charset");
- if(StringUtils.isNotEmpty(metaCharset)) {
- charset = metaCharset.split("=")[1];
+ else if(StringUtils.isNotEmpty(metaCharset)) {
+ charset = metaCharset;
break;
}
}
// 3、以上两种都不包含的场景
if(StringUtils.isEmpty(charset)) {
- // TODO http://cpdetector.sourceforge.net/usage.shtml
+ java.nio.charset.Charset nioCharset = null;
+ try {
+ nioCharset = detector.detectCodepage(httpResponse.getEntity().getContent(), content.length());
+ charset = nioCharset.name();
+ } catch (IOException e) {
+ // ignore
+ }
}
}
}
diff --git a/webmagic-core/src/main/lib/antlr-2.7.4.jar b/webmagic-core/src/main/lib/antlr-2.7.4.jar
new file mode 100644
index 0000000..45e45b5
Binary files /dev/null and b/webmagic-core/src/main/lib/antlr-2.7.4.jar differ
diff --git a/webmagic-core/src/main/lib/cpdetector_1.0.10.jar b/webmagic-core/src/main/lib/cpdetector_1.0.10.jar
new file mode 100644
index 0000000..47329f2
Binary files /dev/null and b/webmagic-core/src/main/lib/cpdetector_1.0.10.jar differ
diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java
index ab84665..072de13 100644
--- a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java
+++ b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java
@@ -1,5 +1,8 @@
package us.codecraft.webmagic.downloader;
+import org.apache.http.client.methods.CloseableHttpResponse;
+import org.apache.http.client.methods.HttpUriRequest;
+import org.apache.http.impl.client.CloseableHttpClient;
import org.junit.Ignore;
import org.junit.Test;
import us.codecraft.webmagic.Page;
@@ -11,6 +14,7 @@ import us.codecraft.webmagic.selector.Html;
import java.io.UnsupportedEncodingException;
import static org.assertj.core.api.Assertions.assertThat;
+import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
/**
@@ -52,4 +56,30 @@ public class HttpClientDownloaderTest {
assertThat((Integer) page.getTargetRequests().get(0).getExtra(Request.CYCLE_TRIED_TIMES)).isEqualTo(2);
}
+    /**
+     * Live-network check: the charset must be resolved when the response header carries
+     * it (first URL) and when only a meta tag carries it (second URL).
+     */
+    @Test
+    public void testGetHtmlCharset() throws Exception {
+        HttpClientDownloader downloader = new HttpClientDownloader();
+        Site site = Site.me();
+        CloseableHttpClient httpClient = new HttpClientGenerator().getClient(site);
+        // Case 1: the encoding is declared in the response header.
+        Request requestGBK = new Request("http://sports.163.com/14/0514/13/9S7986F300051CA1.html#p=9RGQDGGH0AI90005");
+        CloseableHttpResponse httpResponse = httpClient.execute(downloader.getHttpUriRequest(requestGBK, site, null));
+        try {
+            assertEquals("GBK", downloader.getHtmlCharset(httpResponse));
+        } finally {
+            httpResponse.close(); // release the connection even when the assertion fails
+        }
+        // Case 2: the encoding is declared in a meta tag of the page.
+        Request requestUTF_8 = new Request("http://preshing.com/");
+        httpResponse = httpClient.execute(downloader.getHttpUriRequest(requestUTF_8, site, null));
+        try {
+            assertEquals("utf-8", downloader.getHtmlCharset(httpResponse));
+        } finally {
+            httpResponse.close(); // release the connection even when the assertion fails
+        }
+    }
}