Enhance auto charset detect #126
1. Read the response content only once to fix the "stream closed" exception. 2. Introduce Moco as the mock HTTP server for tests.
parent
21982d3460
commit
03d26c169b
12
pom.xml
12
pom.xml
|
@ -95,6 +95,18 @@
|
||||||
<artifactId>fastjson</artifactId>
|
<artifactId>fastjson</artifactId>
|
||||||
<version>1.1.37</version>
|
<version>1.1.37</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>com.github.dreamhead</groupId>
|
||||||
|
<artifactId>moco-core</artifactId>
|
||||||
|
<version>0.9.1</version>
|
||||||
|
<scope>test</scope>
|
||||||
|
<exclusions>
|
||||||
|
<exclusion>
|
||||||
|
<groupId>org.slf4j</groupId>
|
||||||
|
<artifactId>slf4j-simple</artifactId>
|
||||||
|
</exclusion>
|
||||||
|
</exclusions>
|
||||||
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>log4j</groupId>
|
<groupId>log4j</groupId>
|
||||||
<artifactId>log4j</artifactId>
|
<artifactId>log4j</artifactId>
|
||||||
|
|
|
@ -35,6 +35,11 @@
|
||||||
<artifactId>xsoup</artifactId>
|
<artifactId>xsoup</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>com.github.dreamhead</groupId>
|
||||||
|
<artifactId>moco-core</artifactId>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.slf4j</groupId>
|
<groupId>org.slf4j</groupId>
|
||||||
<artifactId>slf4j-api</artifactId>
|
<artifactId>slf4j-api</artifactId>
|
||||||
|
|
|
@ -28,6 +28,7 @@ import us.codecraft.webmagic.utils.HttpConstant;
|
||||||
import us.codecraft.webmagic.utils.UrlUtils;
|
import us.codecraft.webmagic.utils.UrlUtils;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.nio.charset.Charset;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
@ -89,11 +90,6 @@ public class HttpClientDownloader extends AbstractDownloader {
|
||||||
httpResponse = getHttpClient(site).execute(httpUriRequest);
|
httpResponse = getHttpClient(site).execute(httpUriRequest);
|
||||||
int statusCode = httpResponse.getStatusLine().getStatusCode();
|
int statusCode = httpResponse.getStatusLine().getStatusCode();
|
||||||
if (statusAccept(acceptStatCode, statusCode)) {
|
if (statusAccept(acceptStatCode, statusCode)) {
|
||||||
//charset
|
|
||||||
if (charset == null) {
|
|
||||||
charset = getHtmlCharset(httpResponse);
|
|
||||||
logger.debug("Auto get charset: " + charset);
|
|
||||||
}
|
|
||||||
Page page = handleResponse(request, charset, httpResponse, task);
|
Page page = handleResponse(request, charset, httpResponse, task);
|
||||||
onSuccess(request);
|
onSuccess(request);
|
||||||
return page;
|
return page;
|
||||||
|
@ -120,38 +116,6 @@ public class HttpClientDownloader extends AbstractDownloader {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
protected String getHtmlCharset(CloseableHttpResponse httpResponse) throws IOException {
|
|
||||||
// 1、encoding in http header Content-Type
|
|
||||||
String value = httpResponse.getEntity().getContentType().getValue();
|
|
||||||
String charset = UrlUtils.getCharset(value);
|
|
||||||
|
|
||||||
if (StringUtils.isEmpty(charset)) {
|
|
||||||
// 2、charset in meta
|
|
||||||
String content = IOUtils.toString(httpResponse.getEntity().getContent());
|
|
||||||
if (StringUtils.isNotEmpty(content)) {
|
|
||||||
Document document = Jsoup.parse(content);
|
|
||||||
Elements links = document.select("meta");
|
|
||||||
for (Element link : links) {
|
|
||||||
// 2.1、 <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
|
|
||||||
String metaContent = link.attr("content");
|
|
||||||
String metaCharset = link.attr("charset");
|
|
||||||
if (metaContent.indexOf("charset") != -1) {
|
|
||||||
metaContent = metaContent.substring(metaContent.indexOf("charset"), metaContent.length());
|
|
||||||
charset = metaContent.split("=")[1];
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
// 2.2、 <meta charset="UTF-8" />
|
|
||||||
else if (StringUtils.isNotEmpty(metaCharset)) {
|
|
||||||
charset = metaCharset;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// 3、todo use tools as cpdetector for content decode
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return charset;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void setThread(int thread) {
|
public void setThread(int thread) {
|
||||||
httpClientGenerator.setPoolSize(thread);
|
httpClientGenerator.setPoolSize(thread);
|
||||||
|
@ -205,7 +169,7 @@ public class HttpClientDownloader extends AbstractDownloader {
|
||||||
}
|
}
|
||||||
|
|
||||||
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
|
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
|
||||||
String content = IOUtils.toString(httpResponse.getEntity().getContent(), charset);
|
String content = getContent(charset, httpResponse);
|
||||||
Page page = new Page();
|
Page page = new Page();
|
||||||
page.setRawText(content);
|
page.setRawText(content);
|
||||||
page.setUrl(new PlainText(request.getUrl()));
|
page.setUrl(new PlainText(request.getUrl()));
|
||||||
|
@ -213,4 +177,57 @@ public class HttpClientDownloader extends AbstractDownloader {
|
||||||
page.setStatusCode(httpResponse.getStatusLine().getStatusCode());
|
page.setStatusCode(httpResponse.getStatusLine().getStatusCode());
|
||||||
return page;
|
return page;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
protected String getContent(String charset, HttpResponse httpResponse) throws IOException {
|
||||||
|
if (charset == null) {
|
||||||
|
byte[] contentBytes = IOUtils.toByteArray(httpResponse.getEntity().getContent());
|
||||||
|
String htmlCharset = getHtmlCharset(httpResponse, contentBytes);
|
||||||
|
if (htmlCharset != null) {
|
||||||
|
return new String(contentBytes, htmlCharset);
|
||||||
|
} else {
|
||||||
|
logger.warn("Charset autodetect failed, use {} as charset. Please specify charset in Site.setCharset()", Charset.defaultCharset());
|
||||||
|
return new String(contentBytes);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
return IOUtils.toString(httpResponse.getEntity().getContent(), charset);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
protected String getHtmlCharset(HttpResponse httpResponse, byte[] contentBytes) throws IOException {
|
||||||
|
String charset;
|
||||||
|
// charset
|
||||||
|
// 1、encoding in http header Content-Type
|
||||||
|
String value = httpResponse.getEntity().getContentType().getValue();
|
||||||
|
charset = UrlUtils.getCharset(value);
|
||||||
|
if (StringUtils.isNotBlank(charset)) {
|
||||||
|
logger.debug("Auto get charset: {}", charset);
|
||||||
|
return charset;
|
||||||
|
}
|
||||||
|
// use default charset to decode first time
|
||||||
|
Charset defaultCharset = Charset.defaultCharset();
|
||||||
|
String content = new String(contentBytes, defaultCharset.name());
|
||||||
|
// 2、charset in meta
|
||||||
|
if (StringUtils.isNotEmpty(content)) {
|
||||||
|
Document document = Jsoup.parse(content);
|
||||||
|
Elements links = document.select("meta");
|
||||||
|
for (Element link : links) {
|
||||||
|
// 2.1、html4.01 <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
|
||||||
|
String metaContent = link.attr("content");
|
||||||
|
String metaCharset = link.attr("charset");
|
||||||
|
if (metaContent.indexOf("charset") != -1) {
|
||||||
|
metaContent = metaContent.substring(metaContent.indexOf("charset"), metaContent.length());
|
||||||
|
charset = metaContent.split("=")[1];
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
// 2.2、html5 <meta charset="UTF-8" />
|
||||||
|
else if (StringUtils.isNotEmpty(metaCharset)) {
|
||||||
|
charset = metaCharset;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
logger.debug("Auto get charset: {}", charset);
|
||||||
|
// 3、todo use tools as cpdetector for content decode
|
||||||
|
return charset;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,5 +1,8 @@
|
||||||
package us.codecraft.webmagic.downloader;
|
package us.codecraft.webmagic.downloader;
|
||||||
|
|
||||||
|
import com.github.dreamhead.moco.*;
|
||||||
|
import com.github.dreamhead.moco.Runnable;
|
||||||
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.http.client.methods.CloseableHttpResponse;
|
import org.apache.http.client.methods.CloseableHttpResponse;
|
||||||
import org.apache.http.impl.client.CloseableHttpClient;
|
import org.apache.http.impl.client.CloseableHttpClient;
|
||||||
import org.junit.Ignore;
|
import org.junit.Ignore;
|
||||||
|
@ -13,6 +16,7 @@ import us.codecraft.webmagic.selector.Html;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.UnsupportedEncodingException;
|
import java.io.UnsupportedEncodingException;
|
||||||
|
|
||||||
|
import static com.github.dreamhead.moco.Moco.*;
|
||||||
import static org.assertj.core.api.Assertions.assertThat;
|
import static org.assertj.core.api.Assertions.assertThat;
|
||||||
import static org.junit.Assert.assertEquals;
|
import static org.junit.Assert.assertEquals;
|
||||||
import static org.junit.Assert.assertTrue;
|
import static org.junit.Assert.assertTrue;
|
||||||
|
@ -57,20 +61,53 @@ public class HttpClientDownloaderTest {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testGetHtmlCharset() throws IOException {
|
public void testGetHtmlCharset() throws Exception {
|
||||||
HttpClientDownloader downloader = new HttpClientDownloader();
|
HttpServer server = httpserver(12306);
|
||||||
Site site = Site.me();
|
server.get(by(uri("/header"))).response(header("Content-Type", "text/html; charset=gbk"));
|
||||||
CloseableHttpClient httpClient = new HttpClientGenerator().getClient(site);
|
server.get(by(uri("/meta4"))).response(with(text("<html>\n" +
|
||||||
// encoding in http header Content-Type
|
" <head>\n" +
|
||||||
Request requestGBK = new Request("http://sports.163.com/14/0514/13/9S7986F300051CA1.html#p=9RGQDGGH0AI90005");
|
" <meta charset='gbk'/>\n" +
|
||||||
CloseableHttpResponse httpResponse = httpClient.execute(downloader.getHttpUriRequest(requestGBK, site, null));
|
" </head>\n" +
|
||||||
String charset = downloader.getHtmlCharset(httpResponse);
|
" <body></body>\n" +
|
||||||
assertEquals(charset, "GBK");
|
"</html>")),header("Content-Type",""));
|
||||||
|
server.get(by(uri("/meta5"))).response(with(text("<html>\n" +
|
||||||
|
" <head>\n" +
|
||||||
|
" <meta http-equiv=\"Content-Type\" content=\"text/html; charset=gbk\" />\n" +
|
||||||
|
" </head>\n" +
|
||||||
|
" <body></body>\n" +
|
||||||
|
"</html>")),header("Content-Type",""));
|
||||||
|
Runner.running(server, new Runnable() {
|
||||||
|
@Override
|
||||||
|
public void run() {
|
||||||
|
String charset = getCharsetByUrl("http://127.0.0.1:12306/header");
|
||||||
|
assertEquals(charset, "gbk");
|
||||||
|
charset = getCharsetByUrl("http://127.0.0.1:12306/meta4");
|
||||||
|
assertEquals(charset, "gbk");
|
||||||
|
charset = getCharsetByUrl("http://127.0.0.1:12306/meta5");
|
||||||
|
assertEquals(charset, "gbk");
|
||||||
|
}
|
||||||
|
|
||||||
// encoding in meta
|
private String getCharsetByUrl(String url) {
|
||||||
Request requestUTF_8 = new Request("http://preshing.com/");
|
HttpClientDownloader downloader = new HttpClientDownloader();
|
||||||
httpResponse = httpClient.execute(downloader.getHttpUriRequest(requestUTF_8, site, null));
|
Site site = Site.me();
|
||||||
charset = downloader.getHtmlCharset(httpResponse);
|
CloseableHttpClient httpClient = new HttpClientGenerator().getClient(site);
|
||||||
assertEquals(charset, "utf-8");
|
// encoding in http header Content-Type
|
||||||
|
Request requestGBK = new Request(url);
|
||||||
|
CloseableHttpResponse httpResponse = null;
|
||||||
|
try {
|
||||||
|
httpResponse = httpClient.execute(downloader.getHttpUriRequest(requestGBK, site, null));
|
||||||
|
} catch (IOException e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
}
|
||||||
|
String charset = null;
|
||||||
|
try {
|
||||||
|
byte[] contentBytes = IOUtils.toByteArray(httpResponse.getEntity().getContent());
|
||||||
|
charset = downloader.getHtmlCharset(httpResponse,contentBytes);
|
||||||
|
} catch (IOException e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
}
|
||||||
|
return charset;
|
||||||
|
}
|
||||||
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue