diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
index e57d5cd..ca35867 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
@@ -1,7 +1,6 @@
package us.codecraft.webmagic.downloader;
import org.apache.commons.io.IOUtils;
-import org.apache.commons.lang3.StringUtils;
import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
@@ -13,10 +12,6 @@ import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.client.methods.RequestBuilder;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.util.EntityUtils;
-import org.jsoup.Jsoup;
-import org.jsoup.nodes.Document;
-import org.jsoup.nodes.Element;
-import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Page;
@@ -25,8 +20,8 @@ import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.proxy.Proxy;
import us.codecraft.webmagic.selector.PlainText;
+import us.codecraft.webmagic.utils.CharsetUtils;
import us.codecraft.webmagic.utils.HttpConstant;
-import us.codecraft.webmagic.utils.UrlUtils;
import us.codecraft.webmagic.utils.WMCollections;
import java.io.IOException;
@@ -213,40 +208,22 @@ public class HttpClientDownloader extends AbstractDownloader {
}
+    /**
+     * Determines the charset used to decode a downloaded page.
+     * <p>
+     * Delegates to {@link CharsetUtils#detectCharset(String, byte[])}, passing the value of the
+     * entity's Content-Type header together with the raw body bytes.
+     *
+     * @param httpResponse response whose entity supplies the Content-Type header
+     * @param contentBytes raw response body
+     * @return whatever {@code CharsetUtils.detectCharset} returns for these inputs
+     * @throws IOException declared by {@code CharsetUtils.detectCharset}
+     */
protected String getHtmlCharset(HttpResponse httpResponse, byte[] contentBytes) throws IOException {
- String charset;
- // charset
- // 1、encoding in http header Content-Type
- String value = httpResponse.getEntity().getContentType().getValue();
- charset = UrlUtils.getCharset(value);
- if (StringUtils.isNotBlank(charset)) {
- logger.debug("Auto get charset: {}", charset);
- return charset;
- }
- // use default charset to decode first time
- Charset defaultCharset = Charset.defaultCharset();
- String content = new String(contentBytes, defaultCharset.name());
- // 2、charset in meta
- if (StringUtils.isNotEmpty(content)) {
- Document document = Jsoup.parse(content);
- Elements links = document.select("meta");
- for (Element link : links) {
- // 2.1、html4.01
- String metaContent = link.attr("content");
- String metaCharset = link.attr("charset");
- if (metaContent.indexOf("charset") != -1) {
- metaContent = metaContent.substring(metaContent.indexOf("charset"), metaContent.length());
- charset = metaContent.split("=")[1];
- break;
- }
- // 2.2、html5
- else if (StringUtils.isNotEmpty(metaCharset)) {
- charset = metaCharset;
- break;
- }
- }
- }
- logger.debug("Auto get charset: {}", charset);
- // 3、todo use tools as cpdetector for content decode
- return charset;
+        // HttpEntity#getContentType() returns null when the response carries no Content-Type
+        // header; hand over an empty string instead of dereferencing the missing header.
+        String contentType = httpResponse.getEntity().getContentType() == null
+                ? ""
+                : httpResponse.getEntity().getContentType().getValue();
+        return CharsetUtils.detectCharset(contentType, contentBytes);
}
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/CharsetUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/CharsetUtils.java
new file mode 100644
index 0000000..50b4f1b
--- /dev/null
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/CharsetUtils.java
@@ -0,0 +1,92 @@
+package us.codecraft.webmagic.utils;
+
+import org.apache.commons.lang3.StringUtils;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.nio.charset.Charset;
+
+/**
+ * Helpers for detecting the character encoding of downloaded content.
+ *
+ * @author code4crafter@gmail.com
+ * Date: 17/3/11
+ * Time: 10:36
+ * @since 0.6.2
+ */
+public abstract class CharsetUtils {
+
+    private static final Logger logger = LoggerFactory.getLogger(CharsetUtils.class);
+
+    /**
+     * Detects the charset of downloaded content.
+     * <p>
+     * Lookup order:
+     * <ol>
+     * <li>the charset extracted from {@code contentType} by {@link UrlUtils#getCharset(String)};</li>
+     * <li>the first {@code <meta>} tag that declares a charset, either inside its {@code content}
+     * attribute (HTML 4.01) or in its {@code charset} attribute (HTML5).</li>
+     * </ol>
+     *
+     * @param contentType value of the HTTP Content-Type header; {@code null} is treated as "no header"
+     * @param contentBytes raw content bytes; {@code null} or empty skips the meta scan
+     * @return the detected charset name; when nothing is detected, the (blank or {@code null})
+     *         result of the header lookup
+     * @throws IOException declared for compatibility with existing callers
+     */
+    public static String detectCharset(String contentType, byte[] contentBytes) throws IOException {
+        // 1. encoding in http header Content-Type
+        String charset = contentType == null ? null : UrlUtils.getCharset(contentType);
+        // Test the extracted charset, not the header itself: a header such as "text/html"
+        // carries no charset and must fall through to the meta scan.
+        if (StringUtils.isNotBlank(charset)) {
+            logger.debug("Auto get charset: {}", charset);
+            return charset;
+        }
+        // 2. charset in meta
+        String metaCharset = detectCharsetFromMeta(contentBytes);
+        if (metaCharset != null) {
+            charset = metaCharset;
+        }
+        logger.debug("Auto get charset: {}", charset);
+        // 3. todo use tools as cpdetector for content decode
+        return charset;
+    }
+
+    /**
+     * Returns the charset declared by the first {@code <meta>} tag that names one, or {@code null}.
+     */
+    private static String detectCharsetFromMeta(byte[] contentBytes) {
+        if (contentBytes == null || contentBytes.length == 0) {
+            return null;
+        }
+        // use default charset to decode first time
+        String content = new String(contentBytes, Charset.defaultCharset());
+        Document document = Jsoup.parse(content);
+        Elements metas = document.select("meta");
+        for (Element meta : metas) {
+            // 2.1 html4.01: charset embedded in the content attribute
+            String metaContent = meta.attr("content");
+            int charsetIndex = metaContent.indexOf("charset");
+            if (charsetIndex != -1) {
+                String[] parts = metaContent.substring(charsetIndex).split("=");
+                // "charset" without "=value" is malformed; skip it instead of indexing past the array
+                if (parts.length > 1 && StringUtils.isNotBlank(parts[1])) {
+                    return parts[1].trim();
+                }
+            }
+            // 2.2 html5: dedicated charset attribute
+            String metaCharset = meta.attr("charset");
+            if (StringUtils.isNotEmpty(metaCharset)) {
+                return metaCharset;
+            }
+        }
+        return null;
+    }
+
+}