修改自动从网页中获取字符的代码块,抽取出来成为单独的方法。
parent
95494d3c4d
commit
fcbfb75608
|
@ -1,6 +1,7 @@
|
||||||
package us.codecraft.webmagic.downloader;
|
package us.codecraft.webmagic.downloader;
|
||||||
|
|
||||||
import com.google.common.collect.Sets;
|
import com.google.common.collect.Sets;
|
||||||
|
import info.monitorenter.cpdetector.io.CodepageDetectorProxy;
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.commons.lang.StringUtils;
|
import org.apache.commons.lang.StringUtils;
|
||||||
import org.apache.http.HttpResponse;
|
import org.apache.http.HttpResponse;
|
||||||
|
@ -28,6 +29,7 @@ import us.codecraft.webmagic.selector.PlainText;
|
||||||
import us.codecraft.webmagic.utils.UrlUtils;
|
import us.codecraft.webmagic.utils.UrlUtils;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.net.URL;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
@ -92,6 +94,7 @@ public class HttpClientDownloader extends AbstractDownloader {
|
||||||
//charset
|
//charset
|
||||||
if (charset == null) {
|
if (charset == null) {
|
||||||
charset = getHtmlCharset(httpResponse);
|
charset = getHtmlCharset(httpResponse);
|
||||||
|
logger.debug("Auto get charset: " + charset);
|
||||||
}
|
}
|
||||||
Page page = handleResponse(request, charset, httpResponse, task);
|
Page page = handleResponse(request, charset, httpResponse, task);
|
||||||
onSuccess(request);
|
onSuccess(request);
|
||||||
|
@ -119,6 +122,7 @@ public class HttpClientDownloader extends AbstractDownloader {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static CodepageDetectorProxy detector = CodepageDetectorProxy.getInstance();
|
||||||
protected String getHtmlCharset(CloseableHttpResponse httpResponse) throws IOException {
|
protected String getHtmlCharset(CloseableHttpResponse httpResponse) throws IOException {
|
||||||
// 1、head头部包含编码集
|
// 1、head头部包含编码集
|
||||||
String value = httpResponse.getEntity().getContentType().getValue();
|
String value = httpResponse.getEntity().getContentType().getValue();
|
||||||
|
@ -133,23 +137,28 @@ public class HttpClientDownloader extends AbstractDownloader {
|
||||||
for(Element link : links) {
|
for(Element link : links) {
|
||||||
// 2.1、处理场景: <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
|
// 2.1、处理场景: <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
|
||||||
String metaContent = link.attr("content");
|
String metaContent = link.attr("content");
|
||||||
|
String metaCharset = link.attr("charset");
|
||||||
if(metaContent.indexOf("charset") != -1) {
|
if(metaContent.indexOf("charset") != -1) {
|
||||||
metaContent = metaContent.substring(metaContent.indexOf("charset"), metaContent.length());
|
metaContent = metaContent.substring(metaContent.indexOf("charset"), metaContent.length());
|
||||||
charset = metaContent.split("=")[1];
|
charset = metaContent.split("=")[1];
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
// 2.2、处理场景: <meta charset="UTF-8" />
|
// 2.2、处理场景: <meta charset="UTF-8" />
|
||||||
String metaCharset = link.attr("charset");
|
else if(StringUtils.isNotEmpty(metaCharset)) {
|
||||||
if(StringUtils.isNotEmpty(metaCharset)) {
|
charset = metaCharset;
|
||||||
charset = metaCharset.split("=")[1];
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// 3、以上两种都不包含的场景
|
// 3、以上两种都不包含的场景
|
||||||
if(StringUtils.isEmpty(charset)) {
|
if(StringUtils.isEmpty(charset)) {
|
||||||
// TODO http://cpdetector.sourceforge.net/usage.shtml
|
java.nio.charset.Charset nioCharset = null;
|
||||||
|
try {
|
||||||
|
nioCharset = detector.detectCodepage(httpResponse.getEntity().getContent(), content.length());
|
||||||
|
charset = nioCharset.name();
|
||||||
|
} catch (IOException e) {
|
||||||
|
// ignore
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Binary file not shown.
Binary file not shown.
|
@ -1,5 +1,8 @@
|
||||||
package us.codecraft.webmagic.downloader;
|
package us.codecraft.webmagic.downloader;
|
||||||
|
|
||||||
|
import org.apache.http.client.methods.CloseableHttpResponse;
|
||||||
|
import org.apache.http.client.methods.HttpUriRequest;
|
||||||
|
import org.apache.http.impl.client.CloseableHttpClient;
|
||||||
import org.junit.Ignore;
|
import org.junit.Ignore;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
import us.codecraft.webmagic.Page;
|
import us.codecraft.webmagic.Page;
|
||||||
|
@ -11,6 +14,7 @@ import us.codecraft.webmagic.selector.Html;
|
||||||
import java.io.UnsupportedEncodingException;
|
import java.io.UnsupportedEncodingException;
|
||||||
|
|
||||||
import static org.assertj.core.api.Assertions.assertThat;
|
import static org.assertj.core.api.Assertions.assertThat;
|
||||||
|
import static org.junit.Assert.assertEquals;
|
||||||
import static org.junit.Assert.assertTrue;
|
import static org.junit.Assert.assertTrue;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -52,4 +56,30 @@ public class HttpClientDownloaderTest {
|
||||||
assertThat((Integer) page.getTargetRequests().get(0).getExtra(Request.CYCLE_TRIED_TIMES)).isEqualTo(2);
|
assertThat((Integer) page.getTargetRequests().get(0).getExtra(Request.CYCLE_TRIED_TIMES)).isEqualTo(2);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testGetHtmlCharset() {
|
||||||
|
HttpClientDownloader downloader = new HttpClientDownloader();
|
||||||
|
Site site = Site.me();
|
||||||
|
CloseableHttpClient httpClient = new HttpClientGenerator().getClient(site);
|
||||||
|
try {
|
||||||
|
// 头部包含编码
|
||||||
|
Request requestGBK = new Request("http://sports.163.com/14/0514/13/9S7986F300051CA1.html#p=9RGQDGGH0AI90005");
|
||||||
|
CloseableHttpResponse httpResponse = httpClient.execute(downloader.getHttpUriRequest(requestGBK, site, null));
|
||||||
|
String charset = downloader.getHtmlCharset(httpResponse);
|
||||||
|
assertEquals(charset, "GBK");
|
||||||
|
|
||||||
|
// meta包含编码
|
||||||
|
Request requestUTF_8 = new Request("http://preshing.com/");
|
||||||
|
httpResponse = httpClient.execute(downloader.getHttpUriRequest(requestUTF_8, site, null));
|
||||||
|
charset = downloader.getHtmlCharset(httpResponse);
|
||||||
|
assertEquals(charset, "utf-8");
|
||||||
|
|
||||||
|
// Request request = new Request("http://sports.163.com/14/0514/13/9S7986F300051CA1.html#p=9RGQDGGH0AI90005");
|
||||||
|
// httpResponse = httpClient.execute(downloader.getHttpUriRequest(request, site, null));
|
||||||
|
// charset = downloader.getHtmlCharset(httpResponse);
|
||||||
|
// assertEquals(charset, "GBK");
|
||||||
|
} catch (Exception e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue