From 00b0a751b4ab877b7f87bd55193e4b6565306f2e Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 6 Nov 2013 06:57:58 +0800 Subject: [PATCH] #33 ignore 'content-encoding' when redirect --- .../downloader/HttpClientGenerator.java | 48 ++++++++----------- .../example/BaiduBaikePageProcesser.java | 1 - 2 files changed, 20 insertions(+), 29 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java index a3319a0..92ba6f8 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java @@ -1,9 +1,8 @@ package us.codecraft.webmagic.downloader; -import org.apache.http.HttpException; -import org.apache.http.HttpRequest; -import org.apache.http.HttpRequestInterceptor; +import org.apache.http.*; import org.apache.http.client.CookieStore; +import org.apache.http.client.protocol.ResponseContentEncoding; import org.apache.http.config.Registry; import org.apache.http.config.RegistryBuilder; import org.apache.http.conn.socket.ConnectionSocketFactory; @@ -60,31 +59,24 @@ public class HttpClientGenerator { } }); } -// httpClientBuilder.disableContentCompression().addInterceptorFirst(new HttpResponseInterceptor() { -// -// public void process( -// final HttpResponse response, -// final HttpContext context) throws HttpException, IOException { -// if (response.getStatusLine().getStatusCode() != 200) { -// return; -// } -// HttpEntity entity = response.getEntity(); -// if (entity != null) { -// Header ceheader = entity.getContentEncoding(); -// if (ceheader != null) { -// HeaderElement[] codecs = ceheader.getElements(); -// for (int i = 0; i < codecs.length; i++) { -// if (codecs[i].getName().equalsIgnoreCase("gzip")) { -// response.setEntity( -// new GzipDecompressingEntity(response.getEntity())); -// return; -// } -// 
} -// } -// } -// } -// -// }); + // HttpClient mishandles compressed entities on redirect responses, + // so automatic decompression is disabled and applied manually here; see + // https://issues.apache.org/jira/browse/HTTPCLIENT-1432 + httpClientBuilder.disableContentCompression(); + httpClientBuilder.addInterceptorFirst(new HttpResponseInterceptor() { + + private ResponseContentEncoding contentEncoding = new ResponseContentEncoding(); + + public void process( + final HttpResponse response, + final HttpContext context) throws HttpException, IOException { + if (response.getStatusLine().getStatusCode() == 301 || response.getStatusLine().getStatusCode() == 302) { + return; + } + contentEncoding.process(response, context); + } + + }); if (site != null) { httpClientBuilder.setRetryHandler(new DefaultHttpRequestRetryHandler(site.getRetryTimes(), true)); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/BaiduBaikePageProcesser.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/BaiduBaikePageProcesser.java index b3e7d78..071b7e6 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/BaiduBaikePageProcesser.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/BaiduBaikePageProcesser.java @@ -37,7 +37,6 @@ public class BaiduBaikePageProcesser implements PageProcessor { list.add(String.format(urlTemplate,"风力发电")); list.add(String.format(urlTemplate,"太阳能")); list.add(String.format(urlTemplate,"地热发电")); - list.add(String.format(urlTemplate,"众数")); list.add(String.format(urlTemplate,"地热发电")); List resultItemses = spider.getAll(list); for (ResultItems resultItemse : resultItemses) {