diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index eeae70e..f5b55f9 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -2,6 +2,7 @@ package us.codecraft.webmagic.downloader; import com.google.common.collect.Sets; import org.apache.commons.io.IOUtils; +import org.apache.commons.lang.StringUtils; import org.apache.http.HttpResponse; import org.apache.http.NameValuePair; import org.apache.http.annotation.ThreadSafe; @@ -12,6 +13,10 @@ import org.apache.http.client.methods.HttpUriRequest; import org.apache.http.client.methods.RequestBuilder; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.util.EntityUtils; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import us.codecraft.webmagic.Page; @@ -86,8 +91,7 @@ public class HttpClientDownloader extends AbstractDownloader { if (statusAccept(acceptStatCode, statusCode)) { //charset if (charset == null) { - String value = httpResponse.getEntity().getContentType().getValue(); - charset = UrlUtils.getCharset(value); + charset = getHtmlCharset(httpResponse); } Page page = handleResponse(request, charset, httpResponse, task); onSuccess(request); @@ -115,6 +119,43 @@ public class HttpClientDownloader extends AbstractDownloader { } } + protected String getHtmlCharset(CloseableHttpResponse httpResponse) throws IOException { + // 1、head头部包含编码集 + String value = httpResponse.getEntity().getContentType().getValue(); + String charset = UrlUtils.getCharset(value); + + if(StringUtils.isEmpty(charset)) { + // 2、meta元素中包含编码集 + String content = IOUtils.toString(httpResponse.getEntity().getContent()); + if(StringUtils.isNotEmpty(content)) { + Document document = Jsoup.parse(content); + Elements links = document.select("meta"); + for(Element link : links) { + // 2.1、处理场景: + String metaContent = link.attr("content"); + if(metaContent.indexOf("charset") != -1) { + metaContent = metaContent.substring(metaContent.indexOf("charset"), metaContent.length()); + charset = metaContent.split("=")[1]; + break; + } + + // 2.2、处理场景: + String metaCharset = link.attr("charset"); + if(StringUtils.isNotEmpty(metaCharset)) { + charset = metaCharset.split("=")[1]; + break; + } + } + + // 3、以上两种都不包含的场景 + if(StringUtils.isEmpty(charset)) { + // TODO http://cpdetector.sourceforge.net/usage.shtml + } + } + } + return charset; + } + @Override public void setThread(int thread) { httpClientGenerator.setPoolSize(thread);