Enhance automatic charset detection #126

1. Read the response content only once to fix the "stream closed" exception
2. Introduce moco as a stub HTTP server for tests
master
yihua.huang 2014-05-26 17:45:30 +08:00
parent 21982d3460
commit 03d26c169b
4 changed files with 123 additions and 52 deletions

12
pom.xml
View File

@ -95,6 +95,18 @@
<artifactId>fastjson</artifactId>
<version>1.1.37</version>
</dependency>
<!-- moco-core: provides the local HTTP server used by the downloader tests; declared with test scope only. -->
<dependency>
<groupId>com.github.dreamhead</groupId>
<artifactId>moco-core</artifactId>
<version>0.9.1</version>
<scope>test</scope>
<exclusions>
<!-- NOTE(review): presumably excluded so that moco's slf4j-simple binding does not clash with the project's own SLF4J binding (log4j is declared below) - confirm. -->
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-simple</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>

View File

@ -35,6 +35,11 @@
<artifactId>xsoup</artifactId>
</dependency>
<!-- moco-core is declared here without version or scope. NOTE(review): presumably both are inherited from a managed dependency in the parent pom - confirm. -->
<dependency>
<groupId>com.github.dreamhead</groupId>
<artifactId>moco-core</artifactId>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>

View File

@ -28,6 +28,7 @@ import us.codecraft.webmagic.utils.HttpConstant;
import us.codecraft.webmagic.utils.UrlUtils;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
@ -89,11 +90,6 @@ public class HttpClientDownloader extends AbstractDownloader {
httpResponse = getHttpClient(site).execute(httpUriRequest);
int statusCode = httpResponse.getStatusLine().getStatusCode();
if (statusAccept(acceptStatCode, statusCode)) {
//charset
if (charset == null) {
charset = getHtmlCharset(httpResponse);
logger.debug("Auto get charset: " + charset);
}
Page page = handleResponse(request, charset, httpResponse, task);
onSuccess(request);
return page;
@ -120,38 +116,6 @@ public class HttpClientDownloader extends AbstractDownloader {
}
}
/**
 * Detects the charset of an HTML response.
 *
 * <p>Lookup order:
 * <ol>
 *   <li>the charset parameter of the HTTP {@code Content-Type} header;</li>
 *   <li>a {@code <meta>} element in the document, either
 *       {@code <meta http-equiv="Content-Type" content="text/html; charset=...">} or
 *       {@code <meta charset="...">}.</li>
 * </ol>
 *
 * <p>Step 2 reads the entity's content stream, so a caller that needs the body afterwards
 * can only read it again if the entity is repeatable.
 *
 * @param httpResponse the response to inspect
 * @return the detected charset name, or whatever {@code UrlUtils.getCharset} produced
 *         (possibly {@code null} or empty) when no meta declaration is found
 * @throws IOException if the response body cannot be read
 */
protected String getHtmlCharset(CloseableHttpResponse httpResponse) throws IOException {
    String charset = null;
    // 1. Encoding declared in the HTTP Content-Type header. The header may be absent,
    //    in which case getContentType() returns null and must not be dereferenced.
    if (httpResponse.getEntity().getContentType() != null) {
        charset = UrlUtils.getCharset(httpResponse.getEntity().getContentType().getValue());
    }
    if (StringUtils.isEmpty(charset)) {
        // 2. Encoding declared in a meta element. The body is decoded with the platform
        //    default charset here because the real charset is not known yet.
        String content = IOUtils.toString(httpResponse.getEntity().getContent());
        if (StringUtils.isNotEmpty(content)) {
            Document document = Jsoup.parse(content);
            Elements metas = document.select("meta");
            for (Element meta : metas) {
                String metaContent = meta.attr("content");
                String metaCharset = meta.attr("charset");
                // 2.1 <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
                int charsetIndex = metaContent.indexOf("charset");
                if (charsetIndex != -1) {
                    // Guard against content that merely mentions "charset" without "=value".
                    String[] pair = metaContent.substring(charsetIndex).split("=");
                    if (pair.length > 1 && StringUtils.isNotEmpty(pair[1].trim())) {
                        charset = pair[1].trim();
                        break;
                    }
                }
                // 2.2 <meta charset="UTF-8" />
                if (StringUtils.isNotEmpty(metaCharset)) {
                    charset = metaCharset.trim();
                    break;
                }
            }
            // 3. todo: use a tool such as cpdetector to detect the encoding from the content
        }
    }
    return charset;
}
@Override
public void setThread(int thread) {
httpClientGenerator.setPoolSize(thread);
@ -205,7 +169,7 @@ public class HttpClientDownloader extends AbstractDownloader {
}
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
String content = IOUtils.toString(httpResponse.getEntity().getContent(), charset);
String content = getContent(charset, httpResponse);
Page page = new Page();
page.setRawText(content);
page.setUrl(new PlainText(request.getUrl()));
@ -213,4 +177,57 @@ public class HttpClientDownloader extends AbstractDownloader {
page.setStatusCode(httpResponse.getStatusLine().getStatusCode());
return page;
}
/**
 * Reads the response body as text.
 *
 * <p>When {@code charset} is given it is used directly. Otherwise the body is read into
 * memory exactly once, the charset is detected from the headers and the bytes via
 * {@link #getHtmlCharset(HttpResponse, byte[])}, and the same bytes are then decoded, so
 * the content stream is never read twice.
 *
 * <p>If detection yields nothing, or yields a name that is blank, illegal or not supported
 * by this JVM, the platform default charset is used and a warning is logged instead of
 * failing the whole download.
 *
 * @param charset      the charset configured by the caller, or {@code null} to auto-detect
 * @param httpResponse the response whose entity is read
 * @return the decoded response body
 * @throws IOException if the response body cannot be read
 */
protected String getContent(String charset, HttpResponse httpResponse) throws IOException {
    if (charset != null) {
        return IOUtils.toString(httpResponse.getEntity().getContent(), charset);
    }
    byte[] contentBytes = IOUtils.toByteArray(httpResponse.getEntity().getContent());
    String htmlCharset = getHtmlCharset(httpResponse, contentBytes);
    if (htmlCharset == null) {
        logger.warn("Charset autodetect failed, use {} as charset. Please specify charset in Site.setCharset()", Charset.defaultCharset());
        return new String(contentBytes);
    }
    String charsetName = htmlCharset.trim();
    boolean supported;
    try {
        supported = Charset.isSupported(charsetName);
    } catch (IllegalArgumentException e) {
        // Blank or syntactically illegal charset name (IllegalCharsetNameException).
        supported = false;
    }
    if (!supported) {
        logger.warn("Detected charset '{}' is not supported, use {} as charset. Please specify charset in Site.setCharset()", htmlCharset, Charset.defaultCharset());
        return new String(contentBytes);
    }
    return new String(contentBytes, charsetName);
}
/**
 * Detects the charset of an HTML response from its headers and its already-read body.
 *
 * <p>Lookup order:
 * <ol>
 *   <li>the charset parameter of the HTTP {@code Content-Type} header;</li>
 *   <li>a {@code <meta>} element in the document: the HTML 4.01 form
 *       {@code <meta http-equiv="Content-Type" content="text/html; charset=...">} or the
 *       HTML5 form {@code <meta charset="...">}.</li>
 * </ol>
 *
 * @param httpResponse the response whose {@code Content-Type} header is inspected
 * @param contentBytes the raw response body, read once by the caller
 * @return the detected charset name, or {@code null} if none was found
 * @throws IOException declared for callers and overriders; not thrown by this implementation
 */
protected String getHtmlCharset(HttpResponse httpResponse, byte[] contentBytes) throws IOException {
    String charset = null;
    // 1. Encoding declared in the HTTP Content-Type header. The header may be absent,
    //    in which case getContentType() returns null and must not be dereferenced.
    if (httpResponse.getEntity().getContentType() != null) {
        charset = UrlUtils.getCharset(httpResponse.getEntity().getContentType().getValue());
    }
    if (StringUtils.isNotBlank(charset)) {
        logger.debug("Auto get charset: {}", charset);
        return charset;
    }
    // A blank header value is "not detected"; do not hand an empty name back to the caller.
    charset = null;
    // Decode with the platform default charset for a first pass; the real charset is
    // not known yet and this text is only used to look for meta declarations.
    String content = new String(contentBytes, Charset.defaultCharset());
    // 2. Encoding declared in a meta element.
    if (StringUtils.isNotEmpty(content)) {
        Document document = Jsoup.parse(content);
        Elements metas = document.select("meta");
        for (Element meta : metas) {
            String metaContent = meta.attr("content");
            String metaCharset = meta.attr("charset");
            // 2.1 HTML 4.01: <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
            int charsetIndex = metaContent.indexOf("charset");
            if (charsetIndex != -1) {
                // Guard against content that merely mentions "charset" without "=value".
                String[] pair = metaContent.substring(charsetIndex).split("=");
                if (pair.length > 1 && StringUtils.isNotBlank(pair[1])) {
                    charset = pair[1].trim();
                    break;
                }
            }
            // 2.2 HTML5: <meta charset="UTF-8" />
            if (StringUtils.isNotBlank(metaCharset)) {
                charset = metaCharset.trim();
                break;
            }
        }
    }
    logger.debug("Auto get charset: {}", charset);
    // 3. todo: use a tool such as cpdetector to detect the encoding from the content
    return charset;
}
}

View File

@ -1,5 +1,8 @@
package us.codecraft.webmagic.downloader;
import com.github.dreamhead.moco.*;
import com.github.dreamhead.moco.Runnable;
import org.apache.commons.io.IOUtils;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.impl.client.CloseableHttpClient;
import org.junit.Ignore;
@ -13,6 +16,7 @@ import us.codecraft.webmagic.selector.Html;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import static com.github.dreamhead.moco.Moco.*;
import static org.assertj.core.api.Assertions.assertThat;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
@ -57,20 +61,53 @@ public class HttpClientDownloaderTest {
}
@Test
public void testGetHtmlCharset() throws IOException {
HttpClientDownloader downloader = new HttpClientDownloader();
Site site = Site.me();
CloseableHttpClient httpClient = new HttpClientGenerator().getClient(site);
// encoding in http header Content-Type
Request requestGBK = new Request("http://sports.163.com/14/0514/13/9S7986F300051CA1.html#p=9RGQDGGH0AI90005");
CloseableHttpResponse httpResponse = httpClient.execute(downloader.getHttpUriRequest(requestGBK, site, null));
String charset = downloader.getHtmlCharset(httpResponse);
assertEquals(charset, "GBK");
public void testGetHtmlCharset() throws Exception {
HttpServer server = httpserver(12306);
server.get(by(uri("/header"))).response(header("Content-Type", "text/html; charset=gbk"));
server.get(by(uri("/meta4"))).response(with(text("<html>\n" +
" <head>\n" +
" <meta charset='gbk'/>\n" +
" </head>\n" +
" <body></body>\n" +
"</html>")),header("Content-Type",""));
server.get(by(uri("/meta5"))).response(with(text("<html>\n" +
" <head>\n" +
" <meta http-equiv=\"Content-Type\" content=\"text/html; charset=gbk\" />\n" +
" </head>\n" +
" <body></body>\n" +
"</html>")),header("Content-Type",""));
Runner.running(server, new Runnable() {
@Override
public void run() {
String charset = getCharsetByUrl("http://127.0.0.1:12306/header");
assertEquals(charset, "gbk");
charset = getCharsetByUrl("http://127.0.0.1:12306/meta4");
assertEquals(charset, "gbk");
charset = getCharsetByUrl("http://127.0.0.1:12306/meta5");
assertEquals(charset, "gbk");
}
// encoding in meta
Request requestUTF_8 = new Request("http://preshing.com/");
httpResponse = httpClient.execute(downloader.getHttpUriRequest(requestUTF_8, site, null));
charset = downloader.getHtmlCharset(httpResponse);
assertEquals(charset, "utf-8");
/**
 * Fetches {@code url} with a fresh downloader and returns the charset that
 * {@code HttpClientDownloader.getHtmlCharset} detects for the response.
 *
 * <p>I/O failures are rethrown as unchecked exceptions so the test fails with the real
 * cause instead of a later NullPointerException on a missing response.
 *
 * @param url the URL to request
 * @return the detected charset name, possibly {@code null}
 */
private String getCharsetByUrl(String url) {
    HttpClientDownloader downloader = new HttpClientDownloader();
    Site site = Site.me();
    CloseableHttpClient httpClient = new HttpClientGenerator().getClient(site);
    Request request = new Request(url);
    try {
        CloseableHttpResponse httpResponse = httpClient.execute(downloader.getHttpUriRequest(request, site, null));
        try {
            // Read the body once and hand the bytes to the detector.
            byte[] contentBytes = IOUtils.toByteArray(httpResponse.getEntity().getContent());
            return downloader.getHtmlCharset(httpResponse, contentBytes);
        } finally {
            // Release the connection even when reading or detection fails.
            httpResponse.close();
        }
    } catch (IOException e) {
        throw new IllegalStateException("Failed to fetch " + url + " for charset detection", e);
    }
}
});
}
}