Merge pull request #513 from xbynet/master
Request支持设置header与cookie、新增POST请求时,XML、JSON参数支持、Page支持获取响应headermaster
commit
25df6650d9
|
@ -1,14 +1,16 @@
|
|||
package us.codecraft.webmagic;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.http.Header;
|
||||
|
||||
import us.codecraft.webmagic.selector.Html;
|
||||
import us.codecraft.webmagic.selector.Json;
|
||||
import us.codecraft.webmagic.selector.Selectable;
|
||||
import us.codecraft.webmagic.utils.UrlUtils;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Object storing extracted result and urls to fetch.<br>
|
||||
* Not thread safe.<br>
|
||||
|
@ -43,6 +45,11 @@ public class Page {
|
|||
private boolean needCycleRetry;
|
||||
|
||||
private List<Request> targetRequests = new ArrayList<Request>();
|
||||
|
||||
/**
|
||||
* HTTP response headers.
|
||||
*/
|
||||
private Header[] headers=null;
|
||||
|
||||
public Page() {
|
||||
}
|
||||
|
@ -210,6 +217,14 @@ public class Page {
|
|||
return this;
|
||||
}
|
||||
|
||||
public Header[] getHeaders() {
|
||||
return headers;
|
||||
}
|
||||
|
||||
public void setHeaders(Header[] headers) {
|
||||
this.headers = headers;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "Page{" +
|
||||
|
@ -219,6 +234,11 @@ public class Page {
|
|||
", url=" + url +
|
||||
", statusCode=" + statusCode +
|
||||
", targetRequests=" + targetRequests +
|
||||
", headers=" + headers+
|
||||
'}';
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
|
|
@ -1,11 +1,21 @@
|
|||
package us.codecraft.webmagic;
|
||||
|
||||
import us.codecraft.webmagic.utils.Experimental;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.http.Header;
|
||||
import org.apache.http.HttpEntity;
|
||||
import org.apache.http.cookie.Cookie;
|
||||
import org.apache.http.entity.StringEntity;
|
||||
import org.apache.http.impl.cookie.BasicClientCookie;
|
||||
import org.apache.http.message.BasicHeader;
|
||||
|
||||
import us.codecraft.webmagic.utils.Experimental;
|
||||
import us.codecraft.webmagic.utils.UrlUtils;
|
||||
|
||||
/**
|
||||
* Object contains url to crawl.<br>
|
||||
* It contains some additional information.<br>
|
||||
|
@ -33,6 +43,18 @@ public class Request implements Serializable {
|
|||
* POST/GET param set
|
||||
* */
|
||||
private Map<String,String> params=new HashMap<String, String>();
|
||||
|
||||
/**
|
||||
* Raw request body, supporting JSON, XML or other payloads. When set on a POST request, it takes precedence: the params map and the "nameValuePair" extra are ignored.
|
||||
*/
|
||||
private HttpEntity entity;
|
||||
|
||||
/**
|
||||
* cookies for current url, if not set use Site's cookies
|
||||
*/
|
||||
private List<Cookie> cookies=new ArrayList<Cookie>();
|
||||
|
||||
private List<Header> headers=new ArrayList<Header>();
|
||||
|
||||
/**
|
||||
* Priority of the request.<br>
|
||||
|
@ -145,12 +167,59 @@ public class Request implements Serializable {
|
|||
if (method != null ? !method.equals(request.method) : request.method != null) return false;
|
||||
return params != null ? params.equals(request.params) : request.params == null;
|
||||
}
|
||||
public void addHeader(String name,String value){
|
||||
Header header=new BasicHeader(name,value);
|
||||
headers.add(header);
|
||||
}
|
||||
public List<Header> getHeaders(){
|
||||
return headers;
|
||||
}
|
||||
public void addCookie(String key,String value){
|
||||
BasicClientCookie c=new BasicClientCookie(key, value);
|
||||
c.setDomain(UrlUtils.getDomain(url));
|
||||
cookies.add(c);
|
||||
}
|
||||
public List<Cookie> getCookies() {
|
||||
return cookies;
|
||||
}
|
||||
|
||||
public void setCookies(List<Cookie> cookies) {
|
||||
this.cookies = cookies;
|
||||
}
|
||||
/**
|
||||
* 设置json参数
|
||||
*/
|
||||
public void setJsonParam(String jsonStr,String encoding){
|
||||
StringEntity e=new StringEntity(jsonStr,encoding==null?"UTF-8":encoding);
|
||||
e.setContentEncoding(encoding==null?"UTF-8":encoding);
|
||||
e.setContentType("application/json");
|
||||
entity=e;
|
||||
}
|
||||
/**
|
||||
* 设置xml参数
|
||||
*/
|
||||
public void setXmlParam(String xmlStr,String encoding){
|
||||
StringEntity e=new StringEntity(xmlStr,encoding==null?"UTF-8":encoding);
|
||||
e.setContentEncoding(encoding==null?"UTF-8":encoding);
|
||||
e.setContentType("text/xml");
|
||||
entity=e;
|
||||
}
|
||||
public HttpEntity getEntity() {
|
||||
return entity;
|
||||
}
|
||||
|
||||
public void setEntity(HttpEntity entity) {
|
||||
this.entity = entity;
|
||||
}
|
||||
@Override
|
||||
public int hashCode() {
|
||||
int result = url != null ? url.hashCode() : 0;
|
||||
result = 31 * result + (method != null ? method.hashCode() : 0);
|
||||
result = 31 * result + (params != null ? params.hashCode() : 0);
|
||||
result = 31 * result + (headers != null ? headers.hashCode() : 0);
|
||||
result = 31 * result + (entity != null ? entity.hashCode() : 0);
|
||||
result = 31 * result + (cookies != null ? cookies.hashCode() : 0);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
|
@ -162,6 +231,10 @@ public class Request implements Serializable {
|
|||
", extras=" + extras +
|
||||
", params=" + params +
|
||||
", priority=" + priority +
|
||||
", headers=" + headers +
|
||||
", entity=" + entity +
|
||||
", cookies="+ cookies+
|
||||
'}';
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,21 +1,37 @@
|
|||
package us.codecraft.webmagic.downloader;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.charset.Charset;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.commons.collections.CollectionUtils;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.http.Header;
|
||||
import org.apache.http.HttpHost;
|
||||
import org.apache.http.HttpResponse;
|
||||
import org.apache.http.NameValuePair;
|
||||
import org.apache.http.annotation.ThreadSafe;
|
||||
import org.apache.http.client.CookieStore;
|
||||
import org.apache.http.client.config.CookieSpecs;
|
||||
import org.apache.http.client.config.RequestConfig;
|
||||
import org.apache.http.client.entity.UrlEncodedFormEntity;
|
||||
import org.apache.http.client.methods.CloseableHttpResponse;
|
||||
import org.apache.http.client.methods.HttpUriRequest;
|
||||
import org.apache.http.client.methods.RequestBuilder;
|
||||
import org.apache.http.client.protocol.HttpClientContext;
|
||||
import org.apache.http.cookie.Cookie;
|
||||
import org.apache.http.impl.client.BasicCookieStore;
|
||||
import org.apache.http.impl.client.CloseableHttpClient;
|
||||
import org.apache.http.message.BasicNameValuePair;
|
||||
import org.apache.http.util.EntityUtils;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.Request;
|
||||
import us.codecraft.webmagic.Site;
|
||||
|
@ -26,10 +42,6 @@ import us.codecraft.webmagic.utils.CharsetUtils;
|
|||
import us.codecraft.webmagic.utils.HttpConstant;
|
||||
import us.codecraft.webmagic.utils.WMCollections;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.charset.Charset;
|
||||
import java.util.*;
|
||||
|
||||
|
||||
/**
|
||||
* The http downloader based on HttpClient.
|
||||
|
@ -94,11 +106,26 @@ public class HttpClientDownloader extends AbstractDownloader {
|
|||
}
|
||||
|
||||
HttpUriRequest httpUriRequest = getHttpUriRequest(request, site, headers, proxyHost);
|
||||
httpResponse = getHttpClient(site, proxy).execute(httpUriRequest);
|
||||
HttpClientContext context=null;
|
||||
if(request.getCookies()!=null && CollectionUtils.isNotEmpty(request.getCookies())){
|
||||
context=new HttpClientContext();
|
||||
CookieStore cookieStore=new BasicCookieStore();
|
||||
for(Cookie c:request.getCookies()){
|
||||
cookieStore.addCookie(c);
|
||||
}
|
||||
context.setCookieStore(cookieStore);
|
||||
}
|
||||
if(request.getHeaders()!=null && CollectionUtils.isNotEmpty(request.getHeaders())){
|
||||
for(Header h:request.getHeaders()){
|
||||
httpUriRequest.setHeader(h);
|
||||
}
|
||||
}
|
||||
httpResponse = getHttpClient(site, proxy).execute(httpUriRequest,context);
|
||||
statusCode = httpResponse.getStatusLine().getStatusCode();
|
||||
request.putExtra(Request.STATUS_CODE, statusCode);
|
||||
if (statusAccept(acceptStatCode, statusCode)) {
|
||||
Page page = handleResponse(request, charset, httpResponse, task);
|
||||
page.setHeaders(httpResponse.getAllHeaders());
|
||||
onSuccess(request);
|
||||
return page;
|
||||
} else {
|
||||
|
@ -164,7 +191,11 @@ public class HttpClientDownloader extends AbstractDownloader {
|
|||
//default get
|
||||
return addQueryParams(RequestBuilder.get(),request.getParams());
|
||||
} else if (method.equalsIgnoreCase(HttpConstant.Method.POST)) {
|
||||
return addFormParams(RequestBuilder.post(), (NameValuePair[]) request.getExtra("nameValuePair"), request.getParams());
|
||||
if(request.getEntity()!=null){
|
||||
return RequestBuilder.post().setEntity(request.getEntity());
|
||||
}else{
|
||||
return addFormParams(RequestBuilder.post(), (NameValuePair[]) request.getExtra("nameValuePair"), request.getParams());
|
||||
}
|
||||
} else if (method.equalsIgnoreCase(HttpConstant.Method.HEAD)) {
|
||||
return addQueryParams(RequestBuilder.head(),request.getParams());
|
||||
} else if (method.equalsIgnoreCase(HttpConstant.Method.PUT)) {
|
||||
|
|
|
@ -26,7 +26,7 @@ public abstract class CharsetUtils {
|
|||
// charset
|
||||
// 1、encoding in http header Content-Type
|
||||
charset = UrlUtils.getCharset(contentType);
|
||||
if (StringUtils.isNotBlank(contentType)) {
|
||||
if (StringUtils.isNotBlank(contentType) && StringUtils.isNotBlank(charset)) {
|
||||
logger.debug("Auto get charset: {}", charset);
|
||||
return charset;
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue