diff --git a/pom.xml b/pom.xml
index 9d80771..de5cf91 100644
--- a/pom.xml
+++ b/pom.xml
@@ -95,6 +95,18 @@
fastjson
1.1.37
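+            <dependency>
+                <groupId>com.github.dreamhead</groupId>
+                <artifactId>moco-core</artifactId>
+                <version>0.9.1</version>
+                <scope>test</scope>
+                <exclusions>
+                    <exclusion>
+                        <groupId>org.slf4j</groupId>
+                        <artifactId>slf4j-simple</artifactId>
+                    </exclusion>
+                </exclusions>
+            </dependency>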
log4j
log4j
diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml
index 9ce9a97..dedd898 100644
--- a/webmagic-core/pom.xml
+++ b/webmagic-core/pom.xml
@@ -35,6 +35,11 @@
xsoup
+
+ com.github.dreamhead
+ moco-core
+
+
org.slf4j
slf4j-api
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
index 5d2af73..cb8ba1b 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
@@ -28,6 +28,7 @@ import us.codecraft.webmagic.utils.HttpConstant;
import us.codecraft.webmagic.utils.UrlUtils;
import java.io.IOException;
+import java.nio.charset.Charset;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
@@ -89,11 +90,6 @@ public class HttpClientDownloader extends AbstractDownloader {
httpResponse = getHttpClient(site).execute(httpUriRequest);
int statusCode = httpResponse.getStatusLine().getStatusCode();
if (statusAccept(acceptStatCode, statusCode)) {
- //charset
- if (charset == null) {
- charset = getHtmlCharset(httpResponse);
- logger.debug("Auto get charset: " + charset);
- }
Page page = handleResponse(request, charset, httpResponse, task);
onSuccess(request);
return page;
@@ -120,38 +116,6 @@ public class HttpClientDownloader extends AbstractDownloader {
}
}
- protected String getHtmlCharset(CloseableHttpResponse httpResponse) throws IOException {
- // 1、encoding in http header Content-Type
- String value = httpResponse.getEntity().getContentType().getValue();
- String charset = UrlUtils.getCharset(value);
-
- if (StringUtils.isEmpty(charset)) {
- // 2、charset in meta
- String content = IOUtils.toString(httpResponse.getEntity().getContent());
- if (StringUtils.isNotEmpty(content)) {
- Document document = Jsoup.parse(content);
- Elements links = document.select("meta");
- for (Element link : links) {
- // 2.1、
- String metaContent = link.attr("content");
- String metaCharset = link.attr("charset");
- if (metaContent.indexOf("charset") != -1) {
- metaContent = metaContent.substring(metaContent.indexOf("charset"), metaContent.length());
- charset = metaContent.split("=")[1];
- break;
- }
- // 2.2、
- else if (StringUtils.isNotEmpty(metaCharset)) {
- charset = metaCharset;
- break;
- }
- }
- // 3、todo use tools as cpdetector for content decode
- }
- }
- return charset;
- }
-
@Override
public void setThread(int thread) {
httpClientGenerator.setPoolSize(thread);
@@ -205,7 +169,7 @@ public class HttpClientDownloader extends AbstractDownloader {
}
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
- String content = IOUtils.toString(httpResponse.getEntity().getContent(), charset);
+ String content = getContent(charset, httpResponse);
Page page = new Page();
page.setRawText(content);
page.setUrl(new PlainText(request.getUrl()));
@@ -213,4 +177,57 @@ public class HttpClientDownloader extends AbstractDownloader {
page.setStatusCode(httpResponse.getStatusLine().getStatusCode());
return page;
}
+
+    /**
+     * Decodes the body of {@code httpResponse} into a string.
+     * <p>
+     * When {@code charset} is given, the entity stream is decoded with it directly.
+     * When it is {@code null}, the body is buffered as bytes, the charset is looked up
+     * via {@link #getHtmlCharset(HttpResponse, byte[])}, and the bytes are decoded with
+     * the detected charset, or with the platform default (after a warning) if none was found.
+     *
+     * @param charset      charset to decode with, or {@code null} to auto-detect
+     * @param httpResponse response whose entity content is read
+     * @return the decoded response body
+     * @throws IOException if reading the entity fails or the charset name is not supported
+     */
+    protected String getContent(String charset, HttpResponse httpResponse) throws IOException {
+        // An explicit charset wins: decode straight from the stream.
+        if (charset != null) {
+            return IOUtils.toString(httpResponse.getEntity().getContent(), charset);
+        }
+        // No charset given: keep the raw bytes so they can be inspected first and decoded afterwards.
+        byte[] rawBody = IOUtils.toByteArray(httpResponse.getEntity().getContent());
+        String detectedCharset = getHtmlCharset(httpResponse, rawBody);
+        if (detectedCharset == null) {
+            logger.warn("Charset autodetect failed, use {} as charset. Please specify charset in Site.setCharset()", Charset.defaultCharset());
+            return new String(rawBody);
+        }
+        return new String(rawBody, detectedCharset);
+    }
+
+    /**
+     * Tries to determine the charset of a response.
+     * <p>
+     * Lookup order:
+     * <ol>
+     *   <li>the charset parsed from the HTTP {@code Content-Type} header by
+     *       {@code UrlUtils.getCharset};</li>
+     *   <li>a {@code <meta>} tag in the body: either a {@code content} attribute containing
+     *       {@code charset=...} (HTML 4.01 style) or a {@code charset} attribute (HTML5 style).</li>
+     * </ol>
+     * The body is decoded with the platform default charset for the meta-tag scan only.
+     *
+     * @param httpResponse response whose {@code Content-Type} header is inspected
+     * @param contentBytes raw response body; may be {@code null} or empty
+     * @return the detected charset name, or {@code null} when nothing usable was found
+     * @throws IOException declared for compatibility with existing callers and overriders
+     */
+    protected String getHtmlCharset(HttpResponse httpResponse, byte[] contentBytes) throws IOException {
+        String charset = null;
+        // 1. encoding in http header Content-Type
+        // getContentType() is null when the response carries no Content-Type header,
+        // so it must be checked before dereferencing.
+        if (httpResponse.getEntity() != null && httpResponse.getEntity().getContentType() != null) {
+            String value = httpResponse.getEntity().getContentType().getValue();
+            if (value != null) {
+                charset = UrlUtils.getCharset(value);
+            }
+        }
+        if (StringUtils.isNotBlank(charset)) {
+            logger.debug("Auto get charset: {}", charset);
+            return charset;
+        }
+        // Never hand a blank header-derived value back to the caller as a "detected" charset.
+        charset = null;
+        if (contentBytes == null || contentBytes.length == 0) {
+            logger.debug("Auto get charset: {}", charset);
+            return charset;
+        }
+        // use default charset to decode first time
+        String content = new String(contentBytes, Charset.defaultCharset());
+        // 2. charset in meta
+        if (StringUtils.isNotEmpty(content)) {
+            Document document = Jsoup.parse(content);
+            Elements metas = document.select("meta");
+            for (Element meta : metas) {
+                // 2.1 html4.01: content="...; charset=xxx"
+                String metaContent = meta.attr("content");
+                int charsetIndex = metaContent.indexOf("charset");
+                if (charsetIndex != -1) {
+                    String[] parts = metaContent.substring(charsetIndex).split("=");
+                    // split() drops trailing empty strings, so "charset" or "charset=" yields
+                    // a single element; skip such malformed values instead of failing.
+                    if (parts.length > 1 && StringUtils.isNotBlank(parts[1])) {
+                        charset = parts[1].trim();
+                        break;
+                    }
+                }
+                // 2.2 html5: charset="xxx"
+                String metaCharset = meta.attr("charset");
+                if (StringUtils.isNotBlank(metaCharset)) {
+                    charset = metaCharset.trim();
+                    break;
+                }
+            }
+        }
+        logger.debug("Auto get charset: {}", charset);
+        // 3. todo use tools as cpdetector for content decode
+        return charset;
+    }
}
diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java
index 09855a0..084a110 100644
--- a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java
+++ b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java
@@ -1,5 +1,8 @@
package us.codecraft.webmagic.downloader;
+import com.github.dreamhead.moco.*;
+import com.github.dreamhead.moco.Runnable;
+import org.apache.commons.io.IOUtils;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.impl.client.CloseableHttpClient;
import org.junit.Ignore;
@@ -13,6 +16,7 @@ import us.codecraft.webmagic.selector.Html;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
+import static com.github.dreamhead.moco.Moco.*;
import static org.assertj.core.api.Assertions.assertThat;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
@@ -57,20 +61,53 @@ public class HttpClientDownloaderTest {
}
@Test
- public void testGetHtmlCharset() throws IOException {
- HttpClientDownloader downloader = new HttpClientDownloader();
- Site site = Site.me();
- CloseableHttpClient httpClient = new HttpClientGenerator().getClient(site);
- // encoding in http header Content-Type
- Request requestGBK = new Request("http://sports.163.com/14/0514/13/9S7986F300051CA1.html#p=9RGQDGGH0AI90005");
- CloseableHttpResponse httpResponse = httpClient.execute(downloader.getHttpUriRequest(requestGBK, site, null));
- String charset = downloader.getHtmlCharset(httpResponse);
- assertEquals(charset, "GBK");
+    /**
+     * Checks charset detection against a local stub server for three cases:
+     * charset in the Content-Type header ("/header"), and two body-based cases
+     * ("/meta4", "/meta5") served with an empty Content-Type header.
+     */
+    public void testGetHtmlCharset() throws Exception {
+        HttpServer server = httpserver(12306);
+        server.get(by(uri("/header"))).response(header("Content-Type", "text/html; charset=gbk"));
+        server.get(by(uri("/meta4"))).response(with(text("\n" +
+                " \n" +
+                " \n" +
+                " \n" +
+                " \n" +
+                "")),header("Content-Type",""));
+        server.get(by(uri("/meta5"))).response(with(text("\n" +
+                " \n" +
+                " \n" +
+                " \n" +
+                " \n" +
+                "")),header("Content-Type",""));
+        Runner.running(server, new Runnable() {
+            @Override
+            public void run() {
+                // JUnit's assertEquals takes (expected, actual); keeping that order
+                // makes failure messages read correctly.
+                assertEquals("gbk", getCharsetByUrl("http://127.0.0.1:12306/header"));
+                assertEquals("gbk", getCharsetByUrl("http://127.0.0.1:12306/meta4"));
+                assertEquals("gbk", getCharsetByUrl("http://127.0.0.1:12306/meta5"));
+            }
- // encoding in meta
- Request requestUTF_8 = new Request("http://preshing.com/");
- httpResponse = httpClient.execute(downloader.getHttpUriRequest(requestUTF_8, site, null));
- charset = downloader.getHtmlCharset(httpResponse);
- assertEquals(charset, "utf-8");
+            /**
+             * Fetches {@code url} and returns the charset the downloader detects for it.
+             * An I/O failure fails the test with the original cause attached, instead of
+             * being printed and followed by a NullPointerException.
+             */
+            private String getCharsetByUrl(String url) {
+                HttpClientDownloader downloader = new HttpClientDownloader();
+                Site site = Site.me();
+                CloseableHttpClient httpClient = new HttpClientGenerator().getClient(site);
+                CloseableHttpResponse httpResponse = null;
+                try {
+                    httpResponse = httpClient.execute(downloader.getHttpUriRequest(new Request(url), site, null));
+                    byte[] contentBytes = IOUtils.toByteArray(httpResponse.getEntity().getContent());
+                    return downloader.getHtmlCharset(httpResponse, contentBytes);
+                } catch (IOException e) {
+                    throw new RuntimeException("Failed to detect charset for " + url, e);
+                } finally {
+                    // Release the connection even when the request or detection fails.
+                    if (httpResponse != null) {
+                        try {
+                            httpResponse.close();
+                        } catch (IOException ignored) {
+                            // closing is best-effort; the test outcome is already decided
+                        }
+                    }
+                }
+            }
+        });
}
}