diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java
index 7c0064d..1a6527d 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java
@@ -1,14 +1,16 @@
package us.codecraft.webmagic;
+import java.util.ArrayList;
+import java.util.List;
+
import org.apache.commons.lang3.StringUtils;
+import org.apache.http.Header;
+
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Json;
import us.codecraft.webmagic.selector.Selectable;
import us.codecraft.webmagic.utils.UrlUtils;
-import java.util.ArrayList;
-import java.util.List;
-
/**
* Object storing extracted result and urls to fetch.
* Not thread safe.
@@ -43,6 +45,11 @@ public class Page {
private boolean needCycleRetry;
private List targetRequests = new ArrayList();
+
+ /**
+ * HTTP response headers.
+ */
+ private Header[] headers=null;
public Page() {
}
@@ -210,6 +217,14 @@ public class Page {
return this;
}
+ /**
+ * Returns the HTTP response headers stored on this page.
+ *
+ * @return the stored header array (not copied); null if none has been set
+ */
+ public Header[] getHeaders() {
+ return headers;
+ }
+
+ /**
+ * Stores the HTTP response headers for this page.
+ *
+ * @param headers the response headers; the array is kept as-is, not copied
+ */
+ public void setHeaders(Header[] headers) {
+ this.headers = headers;
+ }
+
@Override
public String toString() {
return "Page{" +
@@ -219,6 +234,11 @@ public class Page {
", url=" + url +
", statusCode=" + statusCode +
", targetRequests=" + targetRequests +
+ ", headers=" + java.util.Arrays.toString(headers) + // print the header values; a bare array would print as its type and identity hash
'}';
}
+
+
+
+
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java
index c8c5978..d44f61f 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java
@@ -1,11 +1,21 @@
package us.codecraft.webmagic;
-import us.codecraft.webmagic.utils.Experimental;
-
import java.io.Serializable;
+import java.util.ArrayList;
import java.util.HashMap;
+import java.util.List;
import java.util.Map;
+import org.apache.http.Header;
+import org.apache.http.HttpEntity;
+import org.apache.http.cookie.Cookie;
+import org.apache.http.entity.StringEntity;
+import org.apache.http.impl.cookie.BasicClientCookie;
+import org.apache.http.message.BasicHeader;
+
+import us.codecraft.webmagic.utils.Experimental;
+import us.codecraft.webmagic.utils.UrlUtils;
+
/**
* Object contains url to crawl.
* It contains some additional information.
@@ -33,6 +43,18 @@ public class Request implements Serializable {
* POST/GET param set
* */
private Map params=new HashMap();
+
+ /**
+ * Request body for POST requests (JSON, XML or any other content).
+ * When set, params and the "nameValuePair" extra are ignored for POST.
+ * NOTE(review): Request is Serializable but HttpEntity is not declared Serializable;
+ * confirm that Java serialization of requests carrying an entity is not relied on.
+ */
+ private HttpEntity entity;
+
+ /**
+ * Cookies for the current url; if none are set the Site's cookies are used.
+ */
+ private List<Cookie> cookies = new ArrayList<Cookie>();
+
+ /**
+ * Extra HTTP headers sent with this request.
+ */
+ private List<Header> headers = new ArrayList<Header>();
/**
* Priority of the request.
@@ -145,12 +167,59 @@ public class Request implements Serializable {
if (method != null ? !method.equals(request.method) : request.method != null) return false;
return params != null ? params.equals(request.params) : request.params == null;
}
+ /**
+ * Appends an HTTP header to this request.
+ *
+ * @param name header name
+ * @param value header value
+ */
+ public void addHeader(String name,String value){
+ headers.add(new BasicHeader(name,value));
+ }
+ /**
+ * Returns the extra headers added to this request.
+ *
+ * @return the internal header list itself (not a copy)
+ */
+ public List<Header> getHeaders(){
+ return headers;
+ }
+ /**
+ * Adds a cookie to this request. The cookie domain is taken from
+ * UrlUtils.getDomain applied to this request's url.
+ *
+ * @param key cookie name
+ * @param value cookie value
+ */
+ public void addCookie(String key,String value){
+ final BasicClientCookie cookie=new BasicClientCookie(key, value);
+ final String domain=UrlUtils.getDomain(url);
+ cookie.setDomain(domain);
+ cookies.add(cookie);
+ }
+ /**
+ * Returns the cookies set on this request.
+ *
+ * @return the internal cookie list itself (not a copy)
+ */
+ public List<Cookie> getCookies() {
+ return cookies;
+ }
+ /**
+ * Replaces the cookies of this request.
+ *
+ * @param cookies the cookies to send; the list is stored as-is, not copied
+ */
+ public void setCookies(List<Cookie> cookies) {
+ this.cookies = cookies;
+ }
+ /**
+ * Sets a JSON request body.
+ * <p>
+ * The charset is sent as a Content-Type parameter. Content-Encoding is not set:
+ * that header names content codings such as gzip, not character sets.
+ *
+ * @param jsonStr the JSON document to send
+ * @param encoding charset name used to encode the body; UTF-8 when null
+ */
+ public void setJsonParam(String jsonStr,String encoding){
+ String charset=encoding==null?"UTF-8":encoding;
+ StringEntity e=new StringEntity(jsonStr,charset);
+ e.setContentType("application/json; charset="+charset);
+ entity=e;
+ }
+ /**
+ * Sets an XML request body.
+ * <p>
+ * The charset is sent as a Content-Type parameter. Content-Encoding is not set:
+ * that header names content codings such as gzip, not character sets.
+ *
+ * @param xmlStr the XML document to send
+ * @param encoding charset name used to encode the body; UTF-8 when null
+ */
+ public void setXmlParam(String xmlStr,String encoding){
+ String charset=encoding==null?"UTF-8":encoding;
+ StringEntity e=new StringEntity(xmlStr,charset);
+ e.setContentType("text/xml; charset="+charset);
+ entity=e;
+ }
+ /**
+ * Returns the request body set on this request.
+ *
+ * @return the entity, or null if none has been set
+ */
+ public HttpEntity getEntity() {
+ return entity;
+ }
+
+ /**
+ * Sets the request body directly.
+ *
+ * @param entity the entity to send; stored as-is
+ */
+ public void setEntity(HttpEntity entity) {
+ this.entity = entity;
+ }
@Override
public int hashCode() {
+ // headers, entity and cookies are left out on purpose: equals() does not compare them,
+ // and hashing them would let two equal requests produce different hash codes.
int result = url != null ? url.hashCode() : 0;
result = 31 * result + (method != null ? method.hashCode() : 0);
result = 31 * result + (params != null ? params.hashCode() : 0);
return result;
}
@@ -162,6 +231,10 @@ public class Request implements Serializable {
", extras=" + extras +
", params=" + params +
", priority=" + priority +
+ ", headers=" + headers +
+ ", entity=" + entity +
+ ", cookies="+ cookies+
'}';
}
+
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
index fa907a1..669ba37 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
@@ -1,21 +1,37 @@
package us.codecraft.webmagic.downloader;
+import java.io.IOException;
+import java.nio.charset.Charset;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.io.IOUtils;
+import org.apache.http.Header;
import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
import org.apache.http.annotation.ThreadSafe;
+import org.apache.http.client.CookieStore;
import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.client.methods.RequestBuilder;
+import org.apache.http.client.protocol.HttpClientContext;
+import org.apache.http.cookie.Cookie;
+import org.apache.http.impl.client.BasicCookieStore;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
@@ -26,10 +42,6 @@ import us.codecraft.webmagic.utils.CharsetUtils;
import us.codecraft.webmagic.utils.HttpConstant;
import us.codecraft.webmagic.utils.WMCollections;
-import java.io.IOException;
-import java.nio.charset.Charset;
-import java.util.*;
-
/**
* The http downloader based on HttpClient.
@@ -94,11 +106,26 @@ public class HttpClientDownloader extends AbstractDownloader {
}
HttpUriRequest httpUriRequest = getHttpUriRequest(request, site, headers, proxyHost);
- httpResponse = getHttpClient(site, proxy).execute(httpUriRequest);
+ HttpClientContext context=null;
+ if(request.getCookies()!=null && CollectionUtils.isNotEmpty(request.getCookies())){
+ context=new HttpClientContext();
+ CookieStore cookieStore=new BasicCookieStore();
+ for(Cookie c:request.getCookies()){
+ cookieStore.addCookie(c);
+ }
+ context.setCookieStore(cookieStore);
+ }
+ if(request.getHeaders()!=null && CollectionUtils.isNotEmpty(request.getHeaders())){
+ for(Header h:request.getHeaders()){
+ httpUriRequest.setHeader(h);
+ }
+ }
+ httpResponse = getHttpClient(site, proxy).execute(httpUriRequest,context);
statusCode = httpResponse.getStatusLine().getStatusCode();
request.putExtra(Request.STATUS_CODE, statusCode);
if (statusAccept(acceptStatCode, statusCode)) {
Page page = handleResponse(request, charset, httpResponse, task);
+ page.setHeaders(httpResponse.getAllHeaders());
onSuccess(request);
return page;
} else {
@@ -164,7 +191,11 @@ public class HttpClientDownloader extends AbstractDownloader {
//default get
return addQueryParams(RequestBuilder.get(),request.getParams());
} else if (method.equalsIgnoreCase(HttpConstant.Method.POST)) {
- return addFormParams(RequestBuilder.post(), (NameValuePair[]) request.getExtra("nameValuePair"), request.getParams());
+ if(request.getEntity()!=null){
+ return RequestBuilder.post().setEntity(request.getEntity());
+ }else{
+ return addFormParams(RequestBuilder.post(), (NameValuePair[]) request.getExtra("nameValuePair"), request.getParams());
+ }
} else if (method.equalsIgnoreCase(HttpConstant.Method.HEAD)) {
return addQueryParams(RequestBuilder.head(),request.getParams());
} else if (method.equalsIgnoreCase(HttpConstant.Method.PUT)) {
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/CharsetUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/CharsetUtils.java
index 50b4f1b..ccf00a4 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/CharsetUtils.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/CharsetUtils.java
@@ -26,7 +26,7 @@ public abstract class CharsetUtils {
// charset
// 1、encoding in http header Content-Type
charset = UrlUtils.getCharset(contentType);
- if (StringUtils.isNotBlank(contentType)) {
+ if (StringUtils.isNotBlank(contentType) && StringUtils.isNotBlank(charset)) {
logger.debug("Auto get charset: {}", charset);
return charset;
}