diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
index eeae70e..f5b55f9 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
@@ -2,6 +2,7 @@ package us.codecraft.webmagic.downloader;
import com.google.common.collect.Sets;
import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang.StringUtils;
import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
import org.apache.http.annotation.ThreadSafe;
@@ -12,6 +13,10 @@ import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.client.methods.RequestBuilder;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.util.EntityUtils;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Page;
@@ -86,8 +91,7 @@ public class HttpClientDownloader extends AbstractDownloader {
if (statusAccept(acceptStatCode, statusCode)) {
//charset
if (charset == null) {
- String value = httpResponse.getEntity().getContentType().getValue();
- charset = UrlUtils.getCharset(value);
+ charset = getHtmlCharset(httpResponse);
}
Page page = handleResponse(request, charset, httpResponse, task);
onSuccess(request);
@@ -115,6 +119,43 @@ public class HttpClientDownloader extends AbstractDownloader {
}
}
+ /**
+  * Determines the character set of an HTTP response.
+  * <p>
+  * Lookup order:
+  * <ol>
+  * <li>the {@code charset} parameter of the {@code Content-Type} response header;</li>
+  * <li>a {@code charset} declared in a {@code <meta>} element of the body, either in its
+  * {@code content} attribute or in its {@code charset} attribute.</li>
+  * </ol>
+  * When the body has to be inspected, the response entity is first replaced by an
+  * in-memory, repeatable copy so that the body can still be read after this method returns.
+  *
+  * @param httpResponse the response to inspect; its entity may be replaced by a buffered copy
+  * @return the detected charset name, or {@code null}/empty when none could be determined
+  * @throws IOException if the response body cannot be read
+  */
+ protected String getHtmlCharset(CloseableHttpResponse httpResponse) throws IOException {
+     org.apache.http.HttpEntity entity = httpResponse.getEntity();
+     if (entity == null) {
+         // No body at all, so there is nothing to detect a charset from.
+         return null;
+     }
+
+     // 1. Charset declared in the Content-Type response header (the header may be absent).
+     String charset = null;
+     if (entity.getContentType() != null && entity.getContentType().getValue() != null) {
+         charset = UrlUtils.getCharset(entity.getContentType().getValue());
+     }
+     if (StringUtils.isNotEmpty(charset)) {
+         return charset;
+     }
+
+     // The response stream can only be consumed once. Swap in a buffered, repeatable entity
+     // before sniffing the body so that later readers of the response still see the content.
+     org.apache.http.HttpEntity buffered = new org.apache.http.entity.BufferedHttpEntity(entity);
+     httpResponse.setEntity(buffered);
+
+     // 2. Charset declared in a <meta> element of the body.
+     // ISO-8859-1 maps every byte to one char, so the ASCII-only meta declaration is readable
+     // whatever the real encoding is, independent of the platform default charset.
+     String content = IOUtils.toString(buffered.getContent(), "ISO-8859-1");
+     if (StringUtils.isEmpty(content)) {
+         return charset;
+     }
+
+     for (Element meta : Jsoup.parse(content).select("meta")) {
+         // 2.1 Form: <meta http-equiv="Content-Type" content="text/html; charset=xxx">
+         // The content attribute uses the same syntax as the Content-Type header,
+         // so it is parsed with the same helper.
+         String metaContent = meta.attr("content");
+         if (metaContent.contains("charset")) {
+             String declared = UrlUtils.getCharset(metaContent);
+             if (StringUtils.isNotEmpty(declared)) {
+                 return declared;
+             }
+         }
+
+         // 2.2 Form: <meta charset="xxx"> - the attribute value is the charset name itself.
+         String metaCharset = meta.attr("charset").trim();
+         if (StringUtils.isNotEmpty(metaCharset)) {
+             return metaCharset;
+         }
+     }
+
+     // 3. Neither the header nor a meta element declares a charset.
+     // TODO detect the encoding from the bytes, see http://cpdetector.sourceforge.net/usage.shtml
+     return charset;
+ }
+
@Override
public void setThread(int thread) {
httpClientGenerator.setPoolSize(thread);