diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java index 7c0064d..1a6527d 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java @@ -1,14 +1,16 @@ package us.codecraft.webmagic; +import java.util.ArrayList; +import java.util.List; + import org.apache.commons.lang3.StringUtils; +import org.apache.http.Header; + import us.codecraft.webmagic.selector.Html; import us.codecraft.webmagic.selector.Json; import us.codecraft.webmagic.selector.Selectable; import us.codecraft.webmagic.utils.UrlUtils; -import java.util.ArrayList; -import java.util.List; - /** * Object storing extracted result and urls to fetch.
* Not thread safe.
@@ -43,6 +45,11 @@ public class Page { private boolean needCycleRetry; private List targetRequests = new ArrayList(); + + /** + * Http响应头 + */ + private Header[] headers=null; public Page() { } @@ -210,6 +217,14 @@ public class Page { return this; } + public Header[] getHeaders() { + return headers; + } + + public void setHeaders(Header[] headers) { + this.headers = headers; + } + @Override public String toString() { return "Page{" + @@ -219,6 +234,11 @@ public class Page { ", url=" + url + ", statusCode=" + statusCode + ", targetRequests=" + targetRequests + + ", headers=" + headers+ '}'; } + + + + } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java index c8c5978..d44f61f 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java @@ -1,11 +1,21 @@ package us.codecraft.webmagic; -import us.codecraft.webmagic.utils.Experimental; - import java.io.Serializable; +import java.util.ArrayList; import java.util.HashMap; +import java.util.List; import java.util.Map; +import org.apache.http.Header; +import org.apache.http.HttpEntity; +import org.apache.http.cookie.Cookie; +import org.apache.http.entity.StringEntity; +import org.apache.http.impl.cookie.BasicClientCookie; +import org.apache.http.message.BasicHeader; + +import us.codecraft.webmagic.utils.Experimental; +import us.codecraft.webmagic.utils.UrlUtils; + /** * Object contains url to crawl.
* It contains some additional information.
@@ -33,6 +43,18 @@ public class Request implements Serializable { * POST/GET param set * */ private Map params=new HashMap(); + + /** + * support for json,xml or more,在post时,设置此选项会使params参数和nameValuePair extra失效。 + */ + private HttpEntity entity; + + /** + * cookies for current url, if not set use Site's cookies + */ + private List<Cookie> cookies=new ArrayList<Cookie>(); + + private List<Header> headers=new ArrayList<Header>(); /** * Priority of the request.
@@ -145,12 +167,59 @@ public class Request implements Serializable { if (method != null ? !method.equals(request.method) : request.method != null) return false; return params != null ? params.equals(request.params) : request.params == null; } + public void addHeader(String name,String value){ + Header header=new BasicHeader(name,value); + headers.add(header); + } + public List<Header> getHeaders(){ + return headers; + } + public void addCookie(String key,String value){ + BasicClientCookie c=new BasicClientCookie(key, value); + c.setDomain(UrlUtils.getDomain(url)); + cookies.add(c); + } + public List<Cookie> getCookies() { + return cookies; + } + public void setCookies(List<Cookie> cookies) { + this.cookies = cookies; + } + /** + * 设置json参数 + */ + public void setJsonParam(String jsonStr,String encoding){ + StringEntity e=new StringEntity(jsonStr,encoding==null?"UTF-8":encoding); + e.setContentEncoding(encoding==null?"UTF-8":encoding); + e.setContentType("application/json"); + entity=e; + } + /** + * 设置xml参数 + */ + public void setXmlParam(String xmlStr,String encoding){ + StringEntity e=new StringEntity(xmlStr,encoding==null?"UTF-8":encoding); + e.setContentEncoding(encoding==null?"UTF-8":encoding); + e.setContentType("text/xml"); + entity=e; + } + public HttpEntity getEntity() { + return entity; + } + + public void setEntity(HttpEntity entity) { + this.entity = entity; + } @Override public int hashCode() { int result = url != null ? url.hashCode() : 0; result = 31 * result + (method != null ? method.hashCode() : 0); result = 31 * result + (params != null ? params.hashCode() : 0); + result = 31 * result + (headers != null ? headers.hashCode() : 0); + result = 31 * result + (entity != null ? entity.hashCode() : 0); + result = 31 * result + (cookies != null ?
cookies.hashCode() : 0); + return result; } @@ -162,6 +231,10 @@ public class Request implements Serializable { ", extras=" + extras + ", params=" + params + ", priority=" + priority + + ", headers=" + headers + + ", entity=" + entity + + ", cookies="+ cookies+ '}'; } + } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index fa907a1..669ba37 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -1,21 +1,37 @@ package us.codecraft.webmagic.downloader; +import java.io.IOException; +import java.nio.charset.Charset; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.apache.commons.collections.CollectionUtils; import org.apache.commons.io.IOUtils; +import org.apache.http.Header; import org.apache.http.HttpHost; import org.apache.http.HttpResponse; import org.apache.http.NameValuePair; import org.apache.http.annotation.ThreadSafe; +import org.apache.http.client.CookieStore; import org.apache.http.client.config.CookieSpecs; import org.apache.http.client.config.RequestConfig; import org.apache.http.client.entity.UrlEncodedFormEntity; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpUriRequest; import org.apache.http.client.methods.RequestBuilder; +import org.apache.http.client.protocol.HttpClientContext; +import org.apache.http.cookie.Cookie; +import org.apache.http.impl.client.BasicCookieStore; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.message.BasicNameValuePair; import org.apache.http.util.EntityUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; + import 
us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; @@ -26,10 +42,6 @@ import us.codecraft.webmagic.utils.CharsetUtils; import us.codecraft.webmagic.utils.HttpConstant; import us.codecraft.webmagic.utils.WMCollections; -import java.io.IOException; -import java.nio.charset.Charset; -import java.util.*; - /** * The http downloader based on HttpClient. @@ -94,11 +106,26 @@ public class HttpClientDownloader extends AbstractDownloader { } HttpUriRequest httpUriRequest = getHttpUriRequest(request, site, headers, proxyHost); - httpResponse = getHttpClient(site, proxy).execute(httpUriRequest); + HttpClientContext context=null; + if(request.getCookies()!=null && CollectionUtils.isNotEmpty(request.getCookies())){ + context=new HttpClientContext(); + CookieStore cookieStore=new BasicCookieStore(); + for(Cookie c:request.getCookies()){ + cookieStore.addCookie(c); + } + context.setCookieStore(cookieStore); + } + if(request.getHeaders()!=null && CollectionUtils.isNotEmpty(request.getHeaders())){ + for(Header h:request.getHeaders()){ + httpUriRequest.setHeader(h); + } + } + httpResponse = getHttpClient(site, proxy).execute(httpUriRequest,context); statusCode = httpResponse.getStatusLine().getStatusCode(); request.putExtra(Request.STATUS_CODE, statusCode); if (statusAccept(acceptStatCode, statusCode)) { Page page = handleResponse(request, charset, httpResponse, task); + page.setHeaders(httpResponse.getAllHeaders()); onSuccess(request); return page; } else { @@ -164,7 +191,11 @@ public class HttpClientDownloader extends AbstractDownloader { //default get return addQueryParams(RequestBuilder.get(),request.getParams()); } else if (method.equalsIgnoreCase(HttpConstant.Method.POST)) { - return addFormParams(RequestBuilder.post(), (NameValuePair[]) request.getExtra("nameValuePair"), request.getParams()); + if(request.getEntity()!=null){ + return RequestBuilder.post().setEntity(request.getEntity()); + }else{ + return 
addFormParams(RequestBuilder.post(), (NameValuePair[]) request.getExtra("nameValuePair"), request.getParams()); + } } else if (method.equalsIgnoreCase(HttpConstant.Method.HEAD)) { return addQueryParams(RequestBuilder.head(),request.getParams()); } else if (method.equalsIgnoreCase(HttpConstant.Method.PUT)) {