add gzip support
parent
adeed3bcaf
commit
5a6a68a318
|
@ -1,8 +1,11 @@
|
||||||
package us.codecraft.webmagic.downloader;
|
package us.codecraft.webmagic.downloader;
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.apache.http.Header;
|
||||||
|
import org.apache.http.HeaderElement;
|
||||||
import org.apache.http.HttpResponse;
|
import org.apache.http.HttpResponse;
|
||||||
import org.apache.http.client.HttpClient;
|
import org.apache.http.client.HttpClient;
|
||||||
|
import org.apache.http.client.entity.GzipDecompressingEntity;
|
||||||
import org.apache.http.client.methods.HttpGet;
|
import org.apache.http.client.methods.HttpGet;
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
import us.codecraft.webmagic.Page;
|
import us.codecraft.webmagic.Page;
|
||||||
|
@ -26,15 +29,19 @@ public class HttpClientDownloader implements Downloader {
|
||||||
public Page download(Request request, Site site) {
|
public Page download(Request request, Site site) {
|
||||||
logger.info("downloading page " + request.getUrl());
|
logger.info("downloading page " + request.getUrl());
|
||||||
HttpClient httpClient = HttpClientPool.getInstance().getClient(site);
|
HttpClient httpClient = HttpClientPool.getInstance().getClient(site);
|
||||||
|
String encoding = site.getEncoding();
|
||||||
try {
|
try {
|
||||||
HttpGet httpGet = new HttpGet(request.getUrl());
|
HttpGet httpGet = new HttpGet(request.getUrl());
|
||||||
HttpResponse httpResponse = httpClient.execute(httpGet);
|
HttpResponse httpResponse = httpClient.execute(httpGet);
|
||||||
int statusCode = httpResponse.getStatusLine().getStatusCode();
|
int statusCode = httpResponse.getStatusLine().getStatusCode();
|
||||||
if (site.getAcceptStatCode().contains(statusCode)) {
|
if (site.getAcceptStatCode().contains(statusCode)) {
|
||||||
if (site.getEncoding() == null){
|
//charset
|
||||||
|
if (encoding == null){
|
||||||
String value = httpResponse.getEntity().getContentType().getValue();
|
String value = httpResponse.getEntity().getContentType().getValue();
|
||||||
site.setEncoding(new PlainText(value).regex("charset=([^\\s]+)").toString());
|
site.setEncoding(new PlainText(value).regex("charset=([^\\s]+)").toString());
|
||||||
}
|
}
|
||||||
|
// gzip: decompress the response entity if it is gzip-encoded
|
||||||
|
handleGzip(httpResponse);
|
||||||
String content = IOUtils.toString(httpResponse.getEntity().getContent(),
|
String content = IOUtils.toString(httpResponse.getEntity().getContent(),
|
||||||
site.getEncoding());
|
site.getEncoding());
|
||||||
Page page = new Page();
|
Page page = new Page();
|
||||||
|
@ -50,4 +57,17 @@ public class HttpClientDownloader implements Downloader {
|
||||||
}
|
}
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private void handleGzip(HttpResponse httpResponse) {
|
||||||
|
Header ceheader = httpResponse.getEntity().getContentEncoding();
|
||||||
|
if (ceheader != null) {
|
||||||
|
HeaderElement[] codecs = ceheader.getElements();
|
||||||
|
for (int i = 0; i < codecs.length; i++) {
|
||||||
|
if (codecs[i].getName().equalsIgnoreCase("gzip")) {
|
||||||
|
httpResponse.setEntity(
|
||||||
|
new GzipDecompressingEntity(httpResponse.getEntity()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue