diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
index c1ecff3..1b628cd 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
@@ -1,8 +1,11 @@
 package us.codecraft.webmagic.downloader;
 
 import org.apache.commons.io.IOUtils;
+import org.apache.http.Header;
+import org.apache.http.HeaderElement;
 import org.apache.http.HttpResponse;
 import org.apache.http.client.HttpClient;
+import org.apache.http.client.entity.GzipDecompressingEntity;
 import org.apache.http.client.methods.HttpGet;
 import org.apache.log4j.Logger;
 import us.codecraft.webmagic.Page;
@@ -26,15 +29,19 @@ public class HttpClientDownloader implements Downloader {
     public Page download(Request request, Site site) {
         logger.info("downloading page " + request.getUrl());
         HttpClient httpClient = HttpClientPool.getInstance().getClient(site);
+        String encoding = site.getEncoding();
         try {
             HttpGet httpGet = new HttpGet(request.getUrl());
             HttpResponse httpResponse = httpClient.execute(httpGet);
             int statusCode = httpResponse.getStatusLine().getStatusCode();
             if (site.getAcceptStatCode().contains(statusCode)) {
-                if (site.getEncoding() == null){
+                // No encoding configured on the site: take the charset from the Content-Type header.
+                if (encoding == null){
                     String value = httpResponse.getEntity().getContentType().getValue();
                     site.setEncoding(new PlainText(value).regex("charset=([^\\s]+)").toString());
                 }
+                // Swap in a decompressing entity before the body is read if it is gzip-encoded.
+                handleGzip(httpResponse);
                 String content = IOUtils.toString(httpResponse.getEntity().getContent(),
                         site.getEncoding());
                 Page page = new Page();
@@ -50,4 +57,31 @@ public class HttpClientDownloader implements Downloader {
         }
         return null;
     }
+
+    /**
+     * Replaces the response entity with a gzip-decompressing wrapper when the
+     * Content-Encoding header lists {@code gzip}, so that later reads of the
+     * entity content see the decompressed body.
+     *
+     * @param httpResponse the response whose entity is inspected and, if
+     *                     gzip-encoded, replaced in place
+     */
+    private void handleGzip(HttpResponse httpResponse) {
+        // getEntity() may return null when the response has no body; nothing to do then.
+        if (httpResponse.getEntity() == null) {
+            return;
+        }
+        Header ceheader = httpResponse.getEntity().getContentEncoding();
+        if (ceheader == null) {
+            return;
+        }
+        for (HeaderElement codec : ceheader.getElements()) {
+            if ("gzip".equalsIgnoreCase(codec.getName())) {
+                httpResponse.setEntity(
+                        new GzipDecompressingEntity(httpResponse.getEntity()));
+                // Stop after the first match so the entity is never wrapped twice.
+                return;
+            }
+        }
+    }
 }