#27 customize http header for downloader
parent
1a2c84ea78
commit
16e12e3bc9
|
@ -8,8 +8,8 @@ import java.util.*;
|
|||
* Object containing the settings for a crawler.<br>
|
||||
*
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* @since 0.1.0
|
||||
* @see us.codecraft.webmagic.processor.PageProcessor
|
||||
* @since 0.1.0
|
||||
*/
|
||||
public class Site {
|
||||
|
||||
|
@ -38,6 +38,14 @@ public class Site {
|
|||
|
||||
private Set<Integer> acceptStatCode = DEFAULT_STATUS_CODE_SET;
|
||||
|
||||
private Map<String,String> headers = new HashMap<String, String>();
|
||||
|
||||
public static interface HeaderConst {
|
||||
|
||||
public static final String REFERER = "Referer";
|
||||
}
|
||||
|
||||
|
||||
static {
|
||||
DEFAULT_STATUS_CODE_SET.add(200);
|
||||
}
|
||||
|
@ -139,10 +147,12 @@ public class Site {
|
|||
|
||||
/**
|
||||
* Set the timeout for the downloader, in milliseconds.
|
||||
*
|
||||
* @param timeOut timeout in milliseconds
|
||||
*/
|
||||
public void setTimeOut(int timeOut) {
|
||||
public Site setTimeOut(int timeOut) {
|
||||
this.timeOut = timeOut;
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -216,7 +226,7 @@ public class Site {
|
|||
}
|
||||
|
||||
/**
|
||||
* Get retry times when download fail immediately, 0 by default.<br>
|
||||
* Get the number of immediate retries when a download fails, 0 by default.<br>
|
||||
*
|
||||
* @return number of retries when a download fails
|
||||
*/
|
||||
|
@ -224,6 +234,22 @@ public class Site {
|
|||
return retryTimes;
|
||||
}
|
||||
|
||||
public Map<String, String> getHeaders() {
|
||||
return headers;
|
||||
}
|
||||
|
||||
/**
|
||||
* Add an HTTP header to be sent by the downloader. <br/>
|
||||
* Use {@link #addCookie(String, String)} for cookie and {@link #setUserAgent(String)} for user-agent. <br/>
|
||||
* @param key name of the HTTP header; some common header names are defined as constants in {@link HeaderConst}
|
||||
* @param value value of header
|
||||
* @return this {@code Site}, to allow call chaining
|
||||
*/
|
||||
public Site addHeader(String key, String value){
|
||||
headers.put(key,value);
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the number of retries when a download fails, 0 by default.<br>
|
||||
*
|
||||
|
|
|
@ -19,6 +19,7 @@ import us.codecraft.webmagic.utils.UrlUtils;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.util.HashSet;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
|
||||
|
@ -66,10 +67,12 @@ public class HttpClientDownloader implements Downloader {
|
|||
int retryTimes = 0;
|
||||
Set<Integer> acceptStatCode;
|
||||
String charset = null;
|
||||
Map<String,String> headers = null;
|
||||
if (site != null) {
|
||||
retryTimes = site.getRetryTimes();
|
||||
acceptStatCode = site.getAcceptStatCode();
|
||||
charset = site.getCharset();
|
||||
headers = site.getHeaders();
|
||||
} else {
|
||||
acceptStatCode = new HashSet<Integer>();
|
||||
acceptStatCode.add(200);
|
||||
|
@ -78,6 +81,11 @@ public class HttpClientDownloader implements Downloader {
|
|||
HttpClient httpClient = HttpClientPool.getInstance(poolSize).getClient(site);
|
||||
try {
|
||||
HttpGet httpGet = new HttpGet(request.getUrl());
|
||||
if (headers!=null){
|
||||
for (Map.Entry<String, String> headerEntry : headers.entrySet()) {
|
||||
httpGet.addHeader(headerEntry.getKey(),headerEntry.getValue());
|
||||
}
|
||||
}
|
||||
HttpResponse httpResponse = null;
|
||||
int tried = 0;
|
||||
boolean retry;
|
||||
|
|
|
@ -54,7 +54,7 @@ public class HttpClientPool {
|
|||
}
|
||||
params.setIntParameter(CoreConnectionPNames.SO_TIMEOUT, site.getTimeOut());
|
||||
params.setIntParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, site.getTimeOut());
|
||||
|
||||
params.setParameter(ClientPNames.COOKIE_POLICY, CookiePolicy.BEST_MATCH);
|
||||
HttpProtocolParamBean paramsBean = new HttpProtocolParamBean(params);
|
||||
paramsBean.setVersion(HttpVersion.HTTP_1_1);
|
||||
if (site != null && site.getCharset() != null) {
|
||||
|
@ -73,8 +73,7 @@ public class HttpClientPool {
|
|||
if (site != null) {
|
||||
generateCookie(httpClient, site);
|
||||
}
|
||||
httpClient.getParams().setIntParameter("http.socket.timeout", 60000);
|
||||
httpClient.getParams().setParameter(ClientPNames.COOKIE_POLICY, CookiePolicy.BEST_MATCH);
|
||||
|
||||
return httpClient;
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue