From 16e12e3bc936382a6503823fe21169120d9978a0 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 11 Oct 2013 08:37:21 +0800 Subject: [PATCH] #27 customize http header for downloader --- .../main/java/us/codecraft/webmagic/Site.java | 32 +++++++++++++++++-- .../downloader/HttpClientDownloader.java | 8 +++++ .../webmagic/downloader/HttpClientPool.java | 5 ++- 3 files changed, 39 insertions(+), 6 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java index 0817335..a84ba48 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java @@ -8,8 +8,8 @@ import java.util.*; * Object contains setting for crawler.
* * @author code4crafter@gmail.com
- * @since 0.1.0 * @see us.codecraft.webmagic.processor.PageProcessor + * @since 0.1.0 */ public class Site { @@ -38,6 +38,14 @@ public class Site { private Set acceptStatCode = DEFAULT_STATUS_CODE_SET; + private Map headers = new HashMap(); + + public static interface HeaderConst { + + public static final String REFERER = "Referer"; + } + + static { DEFAULT_STATUS_CODE_SET.add(200); } @@ -139,10 +147,12 @@ public class Site { /** * set timeout for downloader in ms + * * @param timeOut */ - public void setTimeOut(int timeOut) { + public Site setTimeOut(int timeOut) { this.timeOut = timeOut; + return this; } /** @@ -216,7 +226,7 @@ public class Site { } /** - * Get retry times when download fail immediately, 0 by default.
+ * Get the number of times to retry immediately when a download fails, 0 by default.
* * @return retry times when download fail */ @@ -224,6 +234,22 @@ public class Site { return retryTimes; } + public Map getHeaders() { + return headers; + } + + /** + * Put an Http header for downloader.
+ * Use {@link #addCookie(String, String)} for cookies and {@link #setUserAgent(String)} for the user-agent.
+ * @param key key of http header, there are some keys constant in {@link HeaderConst} + * @param value value of header + * @return + */ + public Site addHeader(String key, String value){ + headers.put(key,value); + return this; + } + /** * Set retry times when download fail, 0 by default.
* diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 82a4a9a..b6f0034 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -19,6 +19,7 @@ import us.codecraft.webmagic.utils.UrlUtils; import java.io.IOException; import java.util.HashSet; +import java.util.Map; import java.util.Set; @@ -66,10 +67,12 @@ public class HttpClientDownloader implements Downloader { int retryTimes = 0; Set acceptStatCode; String charset = null; + Map headers = null; if (site != null) { retryTimes = site.getRetryTimes(); acceptStatCode = site.getAcceptStatCode(); charset = site.getCharset(); + headers = site.getHeaders(); } else { acceptStatCode = new HashSet(); acceptStatCode.add(200); @@ -78,6 +81,11 @@ public class HttpClientDownloader implements Downloader { HttpClient httpClient = HttpClientPool.getInstance(poolSize).getClient(site); try { HttpGet httpGet = new HttpGet(request.getUrl()); + if (headers!=null){ + for (Map.Entry headerEntry : headers.entrySet()) { + httpGet.addHeader(headerEntry.getKey(),headerEntry.getValue()); + } + } HttpResponse httpResponse = null; int tried = 0; boolean retry; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java index 52e2f99..c256ac4 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java @@ -54,7 +54,7 @@ public class HttpClientPool { } params.setIntParameter(CoreConnectionPNames.SO_TIMEOUT, site.getTimeOut()); params.setIntParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, site.getTimeOut()); - + 
params.setParameter(ClientPNames.COOKIE_POLICY, CookiePolicy.BEST_MATCH); HttpProtocolParamBean paramsBean = new HttpProtocolParamBean(params); paramsBean.setVersion(HttpVersion.HTTP_1_1); if (site != null && site.getCharset() != null) { @@ -73,8 +73,7 @@ public class HttpClientPool { if (site != null) { generateCookie(httpClient, site); } - httpClient.getParams().setIntParameter("http.socket.timeout", 60000); - httpClient.getParams().setParameter(ClientPNames.COOKIE_POLICY, CookiePolicy.BEST_MATCH); + return httpClient; }