增加处理meta的逻辑。
遗留问题:(3)网页没有指定编码的情况下,需要采用 cpdetector 来探测编码,但目前 cpdetector 不在 Maven 中央库中,尚不清楚如何解决。
master
parent
dde2d89bbe
commit
95494d3c4d
|
@ -2,6 +2,7 @@ package us.codecraft.webmagic.downloader;
|
||||||
|
|
||||||
import com.google.common.collect.Sets;
|
import com.google.common.collect.Sets;
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.apache.commons.lang.StringUtils;
|
||||||
import org.apache.http.HttpResponse;
|
import org.apache.http.HttpResponse;
|
||||||
import org.apache.http.NameValuePair;
|
import org.apache.http.NameValuePair;
|
||||||
import org.apache.http.annotation.ThreadSafe;
|
import org.apache.http.annotation.ThreadSafe;
|
||||||
|
@ -12,6 +13,10 @@ import org.apache.http.client.methods.HttpUriRequest;
|
||||||
import org.apache.http.client.methods.RequestBuilder;
|
import org.apache.http.client.methods.RequestBuilder;
|
||||||
import org.apache.http.impl.client.CloseableHttpClient;
|
import org.apache.http.impl.client.CloseableHttpClient;
|
||||||
import org.apache.http.util.EntityUtils;
|
import org.apache.http.util.EntityUtils;
|
||||||
|
import org.jsoup.Jsoup;
|
||||||
|
import org.jsoup.nodes.Document;
|
||||||
|
import org.jsoup.nodes.Element;
|
||||||
|
import org.jsoup.select.Elements;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
import us.codecraft.webmagic.Page;
|
import us.codecraft.webmagic.Page;
|
||||||
|
@ -86,8 +91,7 @@ public class HttpClientDownloader extends AbstractDownloader {
|
||||||
if (statusAccept(acceptStatCode, statusCode)) {
|
if (statusAccept(acceptStatCode, statusCode)) {
|
||||||
//charset
|
//charset
|
||||||
if (charset == null) {
|
if (charset == null) {
|
||||||
String value = httpResponse.getEntity().getContentType().getValue();
|
charset = getHtmlCharset(httpResponse);
|
||||||
charset = UrlUtils.getCharset(value);
|
|
||||||
}
|
}
|
||||||
Page page = handleResponse(request, charset, httpResponse, task);
|
Page page = handleResponse(request, charset, httpResponse, task);
|
||||||
onSuccess(request);
|
onSuccess(request);
|
||||||
|
@ -115,6 +119,43 @@ public class HttpClientDownloader extends AbstractDownloader {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
protected String getHtmlCharset(CloseableHttpResponse httpResponse) throws IOException {
|
||||||
|
// 1、head头部包含编码集
|
||||||
|
String value = httpResponse.getEntity().getContentType().getValue();
|
||||||
|
String charset = UrlUtils.getCharset(value);
|
||||||
|
|
||||||
|
if(StringUtils.isEmpty(charset)) {
|
||||||
|
// 2、meta元素中包含编码集
|
||||||
|
String content = IOUtils.toString(httpResponse.getEntity().getContent());
|
||||||
|
if(StringUtils.isNotEmpty(content)) {
|
||||||
|
Document document = Jsoup.parse(content);
|
||||||
|
Elements links = document.select("meta");
|
||||||
|
for(Element link : links) {
|
||||||
|
// 2.1、处理场景: <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
|
||||||
|
String metaContent = link.attr("content");
|
||||||
|
if(metaContent.indexOf("charset") != -1) {
|
||||||
|
metaContent = metaContent.substring(metaContent.indexOf("charset"), metaContent.length());
|
||||||
|
charset = metaContent.split("=")[1];
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
// 2.2、处理场景: <meta charset="UTF-8" />
|
||||||
|
String metaCharset = link.attr("charset");
|
||||||
|
if(StringUtils.isNotEmpty(metaCharset)) {
|
||||||
|
charset = metaCharset.split("=")[1];
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// 3、以上两种都不包含的场景
|
||||||
|
if(StringUtils.isEmpty(charset)) {
|
||||||
|
// TODO http://cpdetector.sourceforge.net/usage.shtml
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return charset;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void setThread(int thread) {
|
public void setThread(int thread) {
|
||||||
httpClientGenerator.setPoolSize(thread);
|
httpClientGenerator.setPoolSize(thread);
|
||||||
|
|
Loading…
Reference in New Issue