diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java index f9495a4..d24ceba 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java @@ -1,13 +1,16 @@ package us.codecraft.webmagic; +import java.util.ArrayList; +import java.util.List; + import org.apache.commons.lang3.StringUtils; +import org.apache.http.Header; + import us.codecraft.webmagic.selector.Html; import us.codecraft.webmagic.selector.Json; import us.codecraft.webmagic.selector.Selectable; import us.codecraft.webmagic.utils.UrlUtils; -import java.util.ArrayList; -import java.util.List; import java.util.Map; /** @@ -46,7 +49,7 @@ public class Page { private boolean needCycleRetry; private List targetRequests = new ArrayList(); - + public Page() { } @@ -232,6 +235,11 @@ public class Page { ", statusCode=" + statusCode + ", needCycleRetry=" + needCycleRetry + ", targetRequests=" + targetRequests + + ", headers=" + headers+ '}'; } + + + + } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java index 21cd72e..9a63c8c 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java @@ -1,9 +1,14 @@ package us.codecraft.webmagic; +import org.apache.http.Header; +import org.apache.http.cookie.Cookie; +import us.codecraft.webmagic.model.HttpRequestBody; import us.codecraft.webmagic.utils.Experimental; import java.io.Serializable; +import java.util.ArrayList; import java.util.HashMap; +import java.util.List; import java.util.Map; /** @@ -23,14 +28,19 @@ public class Request implements Serializable { private String method; + private HttpRequestBody requestBody; + /** * Store additional information in extras. */ private Map extras; + /** - * POST/GET param set - * */ - private Map params=new HashMap(); + * cookies for current url, if not set use Site's cookies + */ + private List cookies=new ArrayList(); + + private List
headers=new ArrayList
(); /** * Priority of the request.
@@ -109,57 +119,38 @@ public class Request implements Serializable { this.method = method; } - public Map getParams() { - return params; - } - /** - * set params for request - *
- * DO NOT set this for request already has params, like 'https://github.com/search?q=webmagic' - * @param params params - * */ - public void setParams(Map params) { - this.params = params; - } - /** - * set params for request - *
- * DO NOT set this for request already has params, like 'https://github.com/search?q=webmagic' - * @param key key - * @param value value - * */ - public void putParams(String key,String value) { - params.put(key,value); - } - - @Override - public boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - - Request request = (Request) o; - - if (url != null ? !url.equals(request.url) : request.url != null) return false; - if (method != null ? !method.equals(request.method) : request.method != null) return false; - return params != null ? params.equals(request.params) : request.params == null; - } - @Override public int hashCode() { int result = url != null ? url.hashCode() : 0; result = 31 * result + (method != null ? method.hashCode() : 0); - result = 31 * result + (params != null ? params.hashCode() : 0); + result = 31 * result + (headers != null ? headers.hashCode() : 0); + result = 31 * result + (cookies != null ? cookies.hashCode() : 0); + return result; } + public List getCookies() { + return cookies; + } + + public List
getHeaders() { + return headers; + } + + public HttpRequestBody getRequestBody() { + return requestBody; + } + @Override public String toString() { return "Request{" + "url='" + url + '\'' + ", method='" + method + '\'' + ", extras=" + extras + - ", params=" + params + ", priority=" + priority + + ", headers=" + headers + + ", cookies="+ cookies+ '}'; } + } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index e6523ec..b26bcf9 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -1,15 +1,26 @@ package us.codecraft.webmagic.downloader; +import org.apache.commons.collections.CollectionUtils; import org.apache.commons.io.IOUtils; +import org.apache.http.Header; +import org.apache.http.HttpHost; import org.apache.http.HttpResponse; import org.apache.http.annotation.ThreadSafe; import org.apache.http.auth.AuthState; import org.apache.http.auth.UsernamePasswordCredentials; +import org.apache.http.client.CookieStore; +import org.apache.http.client.config.CookieSpecs; +import org.apache.http.client.config.RequestConfig; +import org.apache.http.client.entity.UrlEncodedFormEntity; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpUriRequest; +import org.apache.http.client.methods.RequestBuilder; import org.apache.http.client.protocol.HttpClientContext; +import org.apache.http.cookie.Cookie; import org.apache.http.impl.auth.BasicScheme; +import org.apache.http.impl.client.BasicCookieStore; import org.apache.http.impl.client.CloseableHttpClient; +import org.apache.http.message.BasicNameValuePair; import org.apache.http.protocol.BasicHttpContext; import org.apache.http.protocol.HttpContext; import org.apache.http.util.EntityUtils; @@ -24,11 +35,11 @@ import us.codecraft.webmagic.proxy.ProxyProvider; import us.codecraft.webmagic.selector.PlainText; import us.codecraft.webmagic.utils.CharsetUtils; import us.codecraft.webmagic.utils.HttpClientUtils; +import us.codecraft.webmagic.utils.HttpConstant; import java.io.IOException; import java.nio.charset.Charset; -import java.util.HashMap; -import java.util.Map; +import java.util.*; /** @@ -88,7 +99,7 @@ public class HttpClientDownloader extends AbstractDownloader { int statusCode = 0; Site site = task.getSite(); Proxy proxy = null; - HttpContext httpContext = new BasicHttpContext(); + HttpClientContext httpContext = new HttpClientContext(); if (proxyProvider != null) { proxy = proxyProvider.getProxy(task); AuthState authState = new AuthState(); @@ -97,6 +108,18 @@ public class HttpClientDownloader extends AbstractDownloader { } CloseableHttpClient httpClient = getHttpClient(site); HttpUriRequest httpUriRequest = httpUriRequestConverter.convert(request, site, proxy); + if (request.getCookies() != null && CollectionUtils.isNotEmpty(request.getCookies())) { + CookieStore cookieStore = new BasicCookieStore(); + for (Cookie c : request.getCookies()) { + cookieStore.addCookie(c); + } + httpContext.setCookieStore(cookieStore); + } + if (request.getHeaders() != null && CollectionUtils.isNotEmpty(request.getHeaders())) { + for (Header h : request.getHeaders()) { + httpUriRequest.setHeader(h); + } + } try { httpResponse = httpClient.execute(httpUriRequest, httpContext); statusCode = httpResponse.getStatusLine().getStatusCode(); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java index db131d0..22aa31d 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java @@ -1,22 +1,16 @@ package us.codecraft.webmagic.downloader; import org.apache.http.HttpHost; -import org.apache.http.NameValuePair; import org.apache.http.client.config.CookieSpecs; import org.apache.http.client.config.RequestConfig; -import org.apache.http.client.entity.UrlEncodedFormEntity; import org.apache.http.client.methods.HttpUriRequest; import org.apache.http.client.methods.RequestBuilder; -import org.apache.http.message.BasicNameValuePair; +import org.apache.http.entity.ByteArrayEntity; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.proxy.Proxy; import us.codecraft.webmagic.utils.HttpConstant; -import java.nio.charset.Charset; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; import java.util.Map; /** @@ -53,32 +47,27 @@ public class HttpUriRequestConverter { String method = request.getMethod(); if (method == null || method.equalsIgnoreCase(HttpConstant.Method.GET)) { //default get - return addQueryParams(RequestBuilder.get(),request.getParams()); + return RequestBuilder.get(); } else if (method.equalsIgnoreCase(HttpConstant.Method.POST)) { - return addFormParams(RequestBuilder.post(), (NameValuePair[]) request.getExtra("nameValuePair"), request.getParams()); + return addFormParams(RequestBuilder.post(),request); } else if (method.equalsIgnoreCase(HttpConstant.Method.HEAD)) { - return addQueryParams(RequestBuilder.head(),request.getParams()); + return RequestBuilder.head(); } else if (method.equalsIgnoreCase(HttpConstant.Method.PUT)) { - return addFormParams(RequestBuilder.put(), (NameValuePair[]) request.getExtra("nameValuePair"), request.getParams()); + return addFormParams(RequestBuilder.put(), request); } else if (method.equalsIgnoreCase(HttpConstant.Method.DELETE)) { - return addQueryParams(RequestBuilder.delete(),request.getParams()); + return RequestBuilder.delete(); } else if (method.equalsIgnoreCase(HttpConstant.Method.TRACE)) { - return addQueryParams(RequestBuilder.trace(),request.getParams()); + return RequestBuilder.trace(); } throw new IllegalArgumentException("Illegal HTTP Method " + method); } - private RequestBuilder addFormParams(RequestBuilder requestBuilder, NameValuePair[] nameValuePair, Map params) { - List allNameValuePair=new ArrayList(); - if (nameValuePair != null && nameValuePair.length > 0) { - allNameValuePair= Arrays.asList(nameValuePair); + private RequestBuilder addFormParams(RequestBuilder requestBuilder, Request request) { + if (request.getRequestBody() != null) { + ByteArrayEntity entity = new ByteArrayEntity(request.getRequestBody().getBody()); + entity.setContentType(request.getRequestBody().getContentType()); + requestBuilder.setEntity(entity); } - if (params != null) { - for (String key : params.keySet()) { - allNameValuePair.add(new BasicNameValuePair(key, params.get(key))); - } - } - requestBuilder.setEntity(new UrlEncodedFormEntity(allNameValuePair, Charset.forName("utf8"))); return requestBuilder; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/model/HttpRequestBody.java b/webmagic-core/src/main/java/us/codecraft/webmagic/model/HttpRequestBody.java new file mode 100644 index 0000000..fc318ea --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/model/HttpRequestBody.java @@ -0,0 +1,72 @@ +package us.codecraft.webmagic.model; + +import org.apache.http.NameValuePair; +import org.apache.http.client.utils.URLEncodedUtils; +import org.apache.http.message.BasicNameValuePair; + +import java.io.UnsupportedEncodingException; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; + +/** + * @author code4crafter@gmail.com + * Date: 17/4/8 + */ +public class HttpRequestBody { + + public static abstract class ContentType { + + public static final String JSON = "application/json"; + + public static final String XML = "text/xml"; + + public static final String FORM = "application/x-www-form-urlencoded"; + + public static final String MULTIPART = "multipart/form-data"; + } + + private final byte[] body; + + private final String contentType; + + private final String encoding; + + public HttpRequestBody(byte[] body, String contentType, String encoding) { + this.body = body; + this.contentType = contentType; + this.encoding = encoding; + } + + public String getContentType() { + return contentType; + } + + public String getEncoding() { + return encoding; + } + + public static HttpRequestBody json(String json, String encoding) throws UnsupportedEncodingException { + return new HttpRequestBody(json.getBytes(encoding), ContentType.JSON, encoding); + } + + public static HttpRequestBody xml(String xml, String encoding) throws UnsupportedEncodingException { + return new HttpRequestBody(xml.getBytes(encoding), ContentType.XML, encoding); + } + + public static HttpRequestBody custom(byte[] body, String contentType, String encoding) throws UnsupportedEncodingException { + return new HttpRequestBody(body, contentType, encoding); + } + + public static HttpRequestBody form(Map params, String encoding) throws UnsupportedEncodingException { + List nameValuePairs = new ArrayList(params.size()); + for (Map.Entry entry : params.entrySet()) { + nameValuePairs.add(new BasicNameValuePair(entry.getKey(), String.valueOf(entry.getValue()))); + } + return new HttpRequestBody(URLEncodedUtils.format(nameValuePairs, encoding).getBytes(encoding), ContentType.FORM, encoding); + } + + public byte[] getBody() { + return body; + } +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/CharsetUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/CharsetUtils.java index 50b4f1b..ccf00a4 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/CharsetUtils.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/CharsetUtils.java @@ -26,7 +26,7 @@ public abstract class CharsetUtils { // charset // 1、encoding in http header Content-Type charset = UrlUtils.getCharset(contentType); - if (StringUtils.isNotBlank(contentType)) { + if (StringUtils.isNotBlank(contentType) && StringUtils.isNotBlank(charset)) { logger.debug("Auto get charset: {}", charset); return charset; }