#27 customize http header for downloader
parent
1a2c84ea78
commit
16e12e3bc9
|
@ -8,8 +8,8 @@ import java.util.*;
|
||||||
* Object contains setting for crawler.<br>
|
* Object contains setting for crawler.<br>
|
||||||
*
|
*
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
* @since 0.1.0
|
|
||||||
* @see us.codecraft.webmagic.processor.PageProcessor
|
* @see us.codecraft.webmagic.processor.PageProcessor
|
||||||
|
* @since 0.1.0
|
||||||
*/
|
*/
|
||||||
public class Site {
|
public class Site {
|
||||||
|
|
||||||
|
@ -38,6 +38,14 @@ public class Site {
|
||||||
|
|
||||||
private Set<Integer> acceptStatCode = DEFAULT_STATUS_CODE_SET;
|
private Set<Integer> acceptStatCode = DEFAULT_STATUS_CODE_SET;
|
||||||
|
|
||||||
|
private Map<String,String> headers = new HashMap<String, String>();
|
||||||
|
|
||||||
|
public static interface HeaderConst {
|
||||||
|
|
||||||
|
public static final String REFERER = "Referer";
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
static {
|
static {
|
||||||
DEFAULT_STATUS_CODE_SET.add(200);
|
DEFAULT_STATUS_CODE_SET.add(200);
|
||||||
}
|
}
|
||||||
|
@ -139,10 +147,12 @@ public class Site {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* set timeout for downloader in ms
|
* set timeout for downloader in ms
|
||||||
|
*
|
||||||
* @param timeOut
|
* @param timeOut
|
||||||
*/
|
*/
|
||||||
public void setTimeOut(int timeOut) {
|
public Site setTimeOut(int timeOut) {
|
||||||
this.timeOut = timeOut;
|
this.timeOut = timeOut;
|
||||||
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -216,7 +226,7 @@ public class Site {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Get retry times when download fail immediately, 0 by default.<br>
|
* Get the number of immediate retries when a download fails, 0 by default.<br>
|
||||||
*
|
*
|
||||||
* @return retry times when download fail
|
* @return retry times when download fail
|
||||||
*/
|
*/
|
||||||
|
@ -224,6 +234,22 @@ public class Site {
|
||||||
return retryTimes;
|
return retryTimes;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public Map<String, String> getHeaders() {
|
||||||
|
return headers;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Put an HTTP header for the downloader; a later call with the same key replaces the earlier value. <br/>
|
||||||
|
* Use {@link #addCookie(String, String)} for cookies and {@link #setUserAgent(String)} for the user agent. <br/>
|
||||||
|
* @param key key of the HTTP header; some common keys are defined as constants in {@link HeaderConst}
|
||||||
|
* @param value value of header
|
||||||
|
* @return this
|
||||||
|
*/
|
||||||
|
public Site addHeader(String key, String value){
|
||||||
|
headers.put(key,value);
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Set retry times when download fail, 0 by default.<br>
|
* Set retry times when download fail, 0 by default.<br>
|
||||||
*
|
*
|
||||||
|
|
|
@ -19,6 +19,7 @@ import us.codecraft.webmagic.utils.UrlUtils;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
|
import java.util.Map;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
|
||||||
|
|
||||||
|
@ -66,10 +67,12 @@ public class HttpClientDownloader implements Downloader {
|
||||||
int retryTimes = 0;
|
int retryTimes = 0;
|
||||||
Set<Integer> acceptStatCode;
|
Set<Integer> acceptStatCode;
|
||||||
String charset = null;
|
String charset = null;
|
||||||
|
Map<String,String> headers = null;
|
||||||
if (site != null) {
|
if (site != null) {
|
||||||
retryTimes = site.getRetryTimes();
|
retryTimes = site.getRetryTimes();
|
||||||
acceptStatCode = site.getAcceptStatCode();
|
acceptStatCode = site.getAcceptStatCode();
|
||||||
charset = site.getCharset();
|
charset = site.getCharset();
|
||||||
|
headers = site.getHeaders();
|
||||||
} else {
|
} else {
|
||||||
acceptStatCode = new HashSet<Integer>();
|
acceptStatCode = new HashSet<Integer>();
|
||||||
acceptStatCode.add(200);
|
acceptStatCode.add(200);
|
||||||
|
@ -78,6 +81,11 @@ public class HttpClientDownloader implements Downloader {
|
||||||
HttpClient httpClient = HttpClientPool.getInstance(poolSize).getClient(site);
|
HttpClient httpClient = HttpClientPool.getInstance(poolSize).getClient(site);
|
||||||
try {
|
try {
|
||||||
HttpGet httpGet = new HttpGet(request.getUrl());
|
HttpGet httpGet = new HttpGet(request.getUrl());
|
||||||
|
if (headers!=null){
|
||||||
|
for (Map.Entry<String, String> headerEntry : headers.entrySet()) {
|
||||||
|
httpGet.addHeader(headerEntry.getKey(),headerEntry.getValue());
|
||||||
|
}
|
||||||
|
}
|
||||||
HttpResponse httpResponse = null;
|
HttpResponse httpResponse = null;
|
||||||
int tried = 0;
|
int tried = 0;
|
||||||
boolean retry;
|
boolean retry;
|
||||||
|
|
|
@ -54,7 +54,7 @@ public class HttpClientPool {
|
||||||
}
|
}
|
||||||
params.setIntParameter(CoreConnectionPNames.SO_TIMEOUT, site.getTimeOut());
|
params.setIntParameter(CoreConnectionPNames.SO_TIMEOUT, site.getTimeOut());
|
||||||
params.setIntParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, site.getTimeOut());
|
params.setIntParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, site.getTimeOut());
|
||||||
|
params.setParameter(ClientPNames.COOKIE_POLICY, CookiePolicy.BEST_MATCH);
|
||||||
HttpProtocolParamBean paramsBean = new HttpProtocolParamBean(params);
|
HttpProtocolParamBean paramsBean = new HttpProtocolParamBean(params);
|
||||||
paramsBean.setVersion(HttpVersion.HTTP_1_1);
|
paramsBean.setVersion(HttpVersion.HTTP_1_1);
|
||||||
if (site != null && site.getCharset() != null) {
|
if (site != null && site.getCharset() != null) {
|
||||||
|
@ -73,8 +73,7 @@ public class HttpClientPool {
|
||||||
if (site != null) {
|
if (site != null) {
|
||||||
generateCookie(httpClient, site);
|
generateCookie(httpClient, site);
|
||||||
}
|
}
|
||||||
httpClient.getParams().setIntParameter("http.socket.timeout", 60000);
|
|
||||||
httpClient.getParams().setParameter(ClientPNames.COOKIE_POLICY, CookiePolicy.BEST_MATCH);
|
|
||||||
return httpClient;
|
return httpClient;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue