From 95494d3c4d9e0165299726b27019d138844ae474 Mon Sep 17 00:00:00 2001
From: fengwuze
Date: Wed, 14 May 2014 14:45:39 +0800
Subject: [PATCH] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E5=A4=84=E7=90=86meta?=
 =?UTF-8?q?=E7=9A=84=E9=80=BB=E8=BE=91=E3=80=82=20=E9=81=97=E7=95=99?=
 =?UTF-8?q?=EF=BC=9A=203=E3=80=81=E7=BD=91=E9=A1=B5=E6=B2=A1=E6=9C=89?=
 =?UTF-8?q?=E6=8C=87=E5=AE=9A=E7=BC=96=E7=A0=81=E7=9A=84=E6=83=85=E5=86=B5?=
 =?UTF-8?q?=E4=B8=8B=EF=BC=8C=E9=9C=80=E8=A6=81=E9=87=87=E7=94=A8cpdetecto?=
 =?UTF-8?q?r=EF=BC=8C=E4=BD=86=E7=9B=AE=E5=89=8Dcpdetector=E8=BF=99?=
 =?UTF-8?q?=E4=B8=AA=E5=9C=A8Maven=E7=9A=84=E4=B8=AD=E5=A4=AE=E5=BA=93?=
 =?UTF-8?q?=E9=87=8C=E9=9D=A2=E6=B2=A1=E6=9C=89=EF=BC=8C=E4=B8=8D=E6=B8=85?=
 =?UTF-8?q?=E6=A5=9A=E5=A6=82=E4=BD=95=E8=A7=A3=E5=86=B3=E3=80=82?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../downloader/HttpClientDownloader.java      | 82 ++++++++++++++++++-
 1 file changed, 80 insertions(+), 2 deletions(-)

diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
index eeae70e..f5b55f9 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
@@ -2,6 +2,7 @@ package us.codecraft.webmagic.downloader;
 
 import com.google.common.collect.Sets;
 import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang.StringUtils;
 import org.apache.http.HttpResponse;
 import org.apache.http.NameValuePair;
 import org.apache.http.annotation.ThreadSafe;
@@ -12,6 +13,10 @@ import org.apache.http.client.methods.HttpUriRequest;
 import org.apache.http.client.methods.RequestBuilder;
 import org.apache.http.impl.client.CloseableHttpClient;
 import org.apache.http.util.EntityUtils;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import us.codecraft.webmagic.Page;
@@ -86,8 +91,7 @@ public class HttpClientDownloader extends AbstractDownloader {
             if (statusAccept(acceptStatCode, statusCode)) {
                 //charset
                 if (charset == null) {
-                    String value = httpResponse.getEntity().getContentType().getValue();
-                    charset = UrlUtils.getCharset(value);
+                    charset = getHtmlCharset(httpResponse);
                 }
                 Page page = handleResponse(request, charset, httpResponse, task);
                 onSuccess(request);
@@ -115,6 +119,80 @@ public class HttpClientDownloader extends AbstractDownloader {
         }
     }
 
+    /** Matches a {@code charset=NAME} declaration inside a meta {@code content} attribute. */
+    private static final java.util.regex.Pattern META_CONTENT_CHARSET =
+            java.util.regex.Pattern.compile("charset\\s*=\\s*[\"']?([^\\s;\"']+)",
+                    java.util.regex.Pattern.CASE_INSENSITIVE);
+
+    /**
+     * Determines the charset of the response body.
+     * <p>
+     * The charset declared in the Content-Type response header is preferred. When the
+     * header is missing or names no charset, the body is scanned for an HTML
+     * {@code <meta>} declaration. Before the body is read the entity is replaced by a
+     * buffered, repeatable copy, because the caller passes the same response on to
+     * {@code handleResponse} afterwards.
+     *
+     * @param httpResponse response to inspect; its entity may be replaced by a buffered copy
+     * @return the declared charset, or {@code null} when the response has no entity or
+     *         declares no charset
+     * @throws IOException if the response body cannot be read
+     */
+    protected String getHtmlCharset(CloseableHttpResponse httpResponse) throws IOException {
+        org.apache.http.HttpEntity entity = httpResponse.getEntity();
+        if (entity == null) {
+            return null;
+        }
+
+        // 1. Charset declared in the Content-Type header (the header may be absent).
+        String charset = null;
+        if (entity.getContentType() != null) {
+            charset = UrlUtils.getCharset(entity.getContentType().getValue());
+        }
+        if (StringUtils.isNotEmpty(charset)) {
+            return charset;
+        }
+
+        // 2. Charset declared in a <meta> element. Buffer the entity first so that
+        //    reading it here does not use up the stream the caller still needs.
+        entity = new org.apache.http.entity.BufferedHttpEntity(entity);
+        httpResponse.setEntity(entity);
+        byte[] body = EntityUtils.toByteArray(entity);
+        if (body != null && body.length > 0) {
+            // ISO-8859-1 maps every byte to one char, so the ASCII markup stays
+            // readable whatever the real encoding is.
+            String content = new String(body, "ISO-8859-1");
+            Document document = Jsoup.parse(content);
+            Elements metas = document.select("meta");
+            for (Element meta : metas) {
+                String declared = extractMetaCharset(meta);
+                if (StringUtils.isNotEmpty(declared)) {
+                    return declared;
+                }
+            }
+        }
+
+        // 3. Neither the header nor the markup declares a charset.
+        // TODO http://cpdetector.sourceforge.net/usage.shtml
+        return charset;
+    }
+
+    /**
+     * Extracts the charset declared by a single {@code <meta>} element.
+     *
+     * @param meta the element to inspect
+     * @return the charset name, or {@code null} when the element declares none
+     */
+    private static String extractMetaCharset(Element meta) {
+        // 2.1 <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
+        java.util.regex.Matcher matcher = META_CONTENT_CHARSET.matcher(meta.attr("content"));
+        if (matcher.find()) {
+            return matcher.group(1);
+        }
+        // 2.2 <meta charset="UTF-8">: the attribute value is the charset name itself.
+        return StringUtils.trimToNull(meta.attr("charset"));
+    }
+
     @Override
     public void setThread(int thread) {
         httpClientGenerator.setPoolSize(thread);