From c23627bf6322982e0cf60465f2e45caf283f0e71 Mon Sep 17 00:00:00 2001
From: "xbynet@outlook.com"
Date: Tue, 17 Jan 2017 00:07:01 +0800
Subject: [PATCH] =?UTF-8?q?=E8=A7=A3=E5=86=B3post/redirect/post=20302?=
 =?UTF-8?q?=E8=B7=B3=E8=BD=AC=E9=97=AE=E9=A2=98?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../downloader/CustomRedirectStrategy.java    | 61 +++++++++++++++++++
 .../downloader/HttpClientGenerator.java       |  5 ++-
 2 files changed, 64 insertions(+), 2 deletions(-)
 create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/downloader/CustomRedirectStrategy.java

diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/CustomRedirectStrategy.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/CustomRedirectStrategy.java
new file mode 100644
index 0000000..7c32dbc
--- /dev/null
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/CustomRedirectStrategy.java
@@ -0,0 +1,61 @@
+package us.codecraft.webmagic.downloader;
+
+import java.net.URI;
+
+import org.apache.http.HttpRequest;
+import org.apache.http.HttpResponse;
+import org.apache.http.ProtocolException;
+import org.apache.http.client.methods.HttpGet;
+import org.apache.http.client.methods.HttpPost;
+import org.apache.http.client.methods.HttpRequestWrapper;
+import org.apache.http.client.methods.HttpUriRequest;
+import org.apache.http.impl.client.LaxRedirectStrategy;
+import org.apache.http.protocol.HttpContext;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Redirect strategy that re-sends a POST request to the redirect target (post/redirect/post).
+ *
+ * <p>With the stock setup, {@code httpClientBuilder.setRedirectStrategy(new LaxRedirectStrategy())},
+ * the data of the original request is not passed along when a POST is redirected. This class
+ * therefore reuses the original request object for the follow-up request. The approach is adapted
+ * from the redirect strategy of the SeimiCrawler project:
+ * https://github.com/zhegexiaohuozi/SeimiCrawler/blob/master/project/src/main/java/cn/wanghaomiao/seimi/http/hc/SeimiRedirectStrategy.java
+ */
+public class CustomRedirectStrategy extends LaxRedirectStrategy {
+    private final Logger logger = LoggerFactory.getLogger(getClass());
+
+    /**
+     * Builds the request that follows a redirect response.
+     *
+     * <p>A POST is re-sent as a POST: when the request is an {@link HttpRequestWrapper} it is
+     * retargeted at the redirect location and returned, otherwise a new, empty {@link HttpPost}
+     * is created. Every other method is followed with a plain {@link HttpGet}.
+     *
+     * @param request  the request that received the redirect response
+     * @param response the redirect response
+     * @param context  the HTTP execution context
+     * @return the request to send to the redirect location
+     * @throws ProtocolException propagated from {@code getLocationURI} when the location cannot be resolved
+     */
+    @Override
+    public HttpUriRequest getRedirect(HttpRequest request, HttpResponse response, HttpContext context) throws ProtocolException {
+        URI uri = getLocationURI(request, response, context);
+        String method = request.getRequestLine().getMethod();
+        if (!"post".equalsIgnoreCase(method)) {
+            return new HttpGet(uri);
+        }
+        // Check the type up front instead of casting and catching the resulting exception.
+        if (request instanceof HttpRequestWrapper) {
+            HttpRequestWrapper httpRequestWrapper = (HttpRequestWrapper) request;
+            httpRequestWrapper.setURI(uri);
+            // Presumably removed so the header is regenerated when the request is re-executed - TODO confirm.
+            httpRequestWrapper.removeHeaders("Content-Length");
+            return httpRequestWrapper;
+        }
+        // Fallback keeps the POST method but cannot carry the original headers or entity.
+        logger.error("强转为HttpRequestWrapper出错: {}", request.getClass().getName());
+        return new HttpPost(uri);
+    }
+}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java
index 80a7e29..ef98a47 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java
@@ -89,8 +89,9 @@ public class HttpClientGenerator {
                 }
             });
         }
-
-
+        // Fix the post/redirect/post 302 redirect problem.
+        httpClientBuilder.setRedirectStrategy(new CustomRedirectStrategy());
+
         SocketConfig socketConfig = SocketConfig.custom().setSoTimeout(site.getTimeOut()).setSoKeepAlive(true).setTcpNoDelay(true).build();
         httpClientBuilder.setDefaultSocketConfig(socketConfig);
         connectionManager.setDefaultSocketConfig(socketConfig);