From 1c24baa8d12e27e311527a09c569f9b425d8b5f0 Mon Sep 17 00:00:00 2001 From: "xbynet@outlook.com" Date: Wed, 29 Mar 2017 02:27:06 +0800 Subject: [PATCH 1/3] =?UTF-8?q?Request=E6=94=AF=E6=8C=81=E8=AE=BE=E7=BD=AE?= =?UTF-8?q?header=E4=B8=8Ecookie=20=E6=96=B0=E5=A2=9EPOST=E8=AF=B7?= =?UTF-8?q?=E6=B1=82=E6=97=B6,XML=E3=80=81JSON=E5=8F=82=E6=95=B0=E6=94=AF?= =?UTF-8?q?=E6=8C=81=20Page=E6=94=AF=E6=8C=81=E8=8E=B7=E5=8F=96=E5=93=8D?= =?UTF-8?q?=E5=BA=94header?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../main/java/us/codecraft/webmagic/Page.java | 26 ++++++- .../java/us/codecraft/webmagic/Request.java | 77 ++++++++++++++++++- .../downloader/HttpClientDownloader.java | 43 +++++++++-- 3 files changed, 135 insertions(+), 11 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java index 7c0064d..1a6527d 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java @@ -1,14 +1,16 @@ package us.codecraft.webmagic; +import java.util.ArrayList; +import java.util.List; + import org.apache.commons.lang3.StringUtils; +import org.apache.http.Header; + import us.codecraft.webmagic.selector.Html; import us.codecraft.webmagic.selector.Json; import us.codecraft.webmagic.selector.Selectable; import us.codecraft.webmagic.utils.UrlUtils; -import java.util.ArrayList; -import java.util.List; - /** * Object storing extracted result and urls to fetch.
* Not thread safe.
@@ -43,6 +45,11 @@ public class Page { private boolean needCycleRetry; private List targetRequests = new ArrayList(); + + /** + * Http响应头 + */ + private Header[] headers=null; public Page() { } @@ -210,6 +217,14 @@ public class Page { return this; } + public Header[] getHeaders() { + return headers; + } + + public void setHeaders(Header[] headers) { + this.headers = headers; + } + @Override public String toString() { return "Page{" + @@ -219,6 +234,11 @@ public class Page { ", url=" + url + ", statusCode=" + statusCode + ", targetRequests=" + targetRequests + + ", headers=" + headers+ '}'; } + + + + } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java index c8c5978..d44f61f 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java @@ -1,11 +1,21 @@ package us.codecraft.webmagic; -import us.codecraft.webmagic.utils.Experimental; - import java.io.Serializable; +import java.util.ArrayList; import java.util.HashMap; +import java.util.List; import java.util.Map; +import org.apache.http.Header; +import org.apache.http.HttpEntity; +import org.apache.http.cookie.Cookie; +import org.apache.http.entity.StringEntity; +import org.apache.http.impl.cookie.BasicClientCookie; +import org.apache.http.message.BasicHeader; + +import us.codecraft.webmagic.utils.Experimental; +import us.codecraft.webmagic.utils.UrlUtils; + /** * Object contains url to crawl.
* It contains some additional information.
@@ -33,6 +43,18 @@ public class Request implements Serializable { * POST/GET param set * */ private Map params=new HashMap(); + + /** + * support for json,xml or more,在post时,设置此选项会使params参数和nameValuePair extra失效。 + */ + private HttpEntity entity; + + /** + * cookies for current url, if not set use Site's cookies + */ + private List cookies=new ArrayList(); + + private List
headers=new ArrayList
(); /** * Priority of the request.
@@ -145,12 +167,59 @@ public class Request implements Serializable { if (method != null ? !method.equals(request.method) : request.method != null) return false; return params != null ? params.equals(request.params) : request.params == null; } + public void addHeader(String name,String value){ + Header header=new BasicHeader(name,value); + headers.add(header); + } + public List
getHeaders(){ + return headers; + } + public void addCookie(String key,String value){ + BasicClientCookie c=new BasicClientCookie(key, value); + c.setDomain(UrlUtils.getDomain(url)); + cookies.add(c); + } + public List getCookies() { + return cookies; + } + public void setCookies(List cookies) { + this.cookies = cookies; + } + /** + * 设置json参数 + */ + public void setJsonParam(String jsonStr,String encoding){ + StringEntity e=new StringEntity(jsonStr,encoding==null?"UTF-8":encoding); + e.setContentEncoding(encoding==null?"UTF-8":encoding); + e.setContentType("application/json"); + entity=e; + } + /** + * 设置xml参数 + */ + public void setXmlParam(String xmlStr,String encoding){ + StringEntity e=new StringEntity(xmlStr,encoding==null?"UTF-8":encoding); + e.setContentEncoding(encoding==null?"UTF-8":encoding); + e.setContentType("text/xml"); + entity=e; + } + public HttpEntity getEntity() { + return entity; + } + + public void setEntity(HttpEntity entity) { + this.entity = entity; + } @Override public int hashCode() { int result = url != null ? url.hashCode() : 0; result = 31 * result + (method != null ? method.hashCode() : 0); result = 31 * result + (params != null ? params.hashCode() : 0); + result = 31 * result + (headers != null ? headers.hashCode() : 0); + result = 31 * result + (entity != null ? entity.hashCode() : 0); + result = 31 * result + (cookies != null ? cookies.hashCode() : 0); + return result; } @@ -162,6 +231,10 @@ public class Request implements Serializable { ", extras=" + extras + ", params=" + params + ", priority=" + priority + + ", headers=" + headers + + ", entity=" + entity + + ", cookies="+ cookies+ '}'; } + } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index fa907a1..669ba37 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -1,21 +1,37 @@ package us.codecraft.webmagic.downloader; +import java.io.IOException; +import java.nio.charset.Charset; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.apache.commons.collections.CollectionUtils; import org.apache.commons.io.IOUtils; +import org.apache.http.Header; import org.apache.http.HttpHost; import org.apache.http.HttpResponse; import org.apache.http.NameValuePair; import org.apache.http.annotation.ThreadSafe; +import org.apache.http.client.CookieStore; import org.apache.http.client.config.CookieSpecs; import org.apache.http.client.config.RequestConfig; import org.apache.http.client.entity.UrlEncodedFormEntity; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpUriRequest; import org.apache.http.client.methods.RequestBuilder; +import org.apache.http.client.protocol.HttpClientContext; +import org.apache.http.cookie.Cookie; +import org.apache.http.impl.client.BasicCookieStore; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.message.BasicNameValuePair; import org.apache.http.util.EntityUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; + import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; @@ -26,10 +42,6 @@ import us.codecraft.webmagic.utils.CharsetUtils; import us.codecraft.webmagic.utils.HttpConstant; import us.codecraft.webmagic.utils.WMCollections; -import java.io.IOException; -import java.nio.charset.Charset; -import java.util.*; - /** * The http downloader based on HttpClient. @@ -94,11 +106,26 @@ public class HttpClientDownloader extends AbstractDownloader { } HttpUriRequest httpUriRequest = getHttpUriRequest(request, site, headers, proxyHost); - httpResponse = getHttpClient(site, proxy).execute(httpUriRequest); + HttpClientContext context=null; + if(request.getCookies()!=null && CollectionUtils.isNotEmpty(request.getCookies())){ + context=new HttpClientContext(); + CookieStore cookieStore=new BasicCookieStore(); + for(Cookie c:request.getCookies()){ + cookieStore.addCookie(c); + } + context.setCookieStore(cookieStore); + } + if(request.getHeaders()!=null && CollectionUtils.isNotEmpty(request.getHeaders())){ + for(Header h:request.getHeaders()){ + httpUriRequest.setHeader(h); + } + } + httpResponse = getHttpClient(site, proxy).execute(httpUriRequest,context); statusCode = httpResponse.getStatusLine().getStatusCode(); request.putExtra(Request.STATUS_CODE, statusCode); if (statusAccept(acceptStatCode, statusCode)) { Page page = handleResponse(request, charset, httpResponse, task); + page.setHeaders(httpResponse.getAllHeaders()); onSuccess(request); return page; } else { @@ -164,7 +191,11 @@ public class HttpClientDownloader extends AbstractDownloader { //default get return addQueryParams(RequestBuilder.get(),request.getParams()); } else if (method.equalsIgnoreCase(HttpConstant.Method.POST)) { - return addFormParams(RequestBuilder.post(), (NameValuePair[]) request.getExtra("nameValuePair"), request.getParams()); + if(request.getEntity()!=null){ + return RequestBuilder.post().setEntity(request.getEntity()); + }else{ + return addFormParams(RequestBuilder.post(), (NameValuePair[]) request.getExtra("nameValuePair"), request.getParams()); + } } else if (method.equalsIgnoreCase(HttpConstant.Method.HEAD)) { return addQueryParams(RequestBuilder.head(),request.getParams()); } else if (method.equalsIgnoreCase(HttpConstant.Method.PUT)) { From c93a8a27227defa82788783ca012e60dbd2a5014 Mon Sep 17 00:00:00 2001 From: xbynet Date: Fri, 31 Mar 2017 18:27:18 +0800 Subject: [PATCH 2/3] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E5=AD=97=E7=AC=A6?= =?UTF-8?q?=E7=BC=96=E7=A0=81=E6=A3=80=E6=B5=8BBUG?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../src/main/java/us/codecraft/webmagic/utils/CharsetUtils.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/CharsetUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/CharsetUtils.java index 50b4f1b..ccf00a4 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/CharsetUtils.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/CharsetUtils.java @@ -26,7 +26,7 @@ public abstract class CharsetUtils { // charset // 1、encoding in http header Content-Type charset = UrlUtils.getCharset(contentType); - if (StringUtils.isNotBlank(contentType)) { + if (StringUtils.isNotBlank(contentType) && StringUtils.isNotBlank(charset)) { logger.debug("Auto get charset: {}", charset); return charset; } From 395396c68ec257d1e982e696dab53653cb4bccfe Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 8 Apr 2017 11:59:52 +0800 Subject: [PATCH 3/3] =?UTF-8?q?=E5=A2=9E=E5=8A=A0HttpRequestBody?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../webmagic/model/HttpRequestBody.java | 69 +++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/model/HttpRequestBody.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/model/HttpRequestBody.java b/webmagic-core/src/main/java/us/codecraft/webmagic/model/HttpRequestBody.java new file mode 100644 index 0000000..39a92f7 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/model/HttpRequestBody.java @@ -0,0 +1,69 @@ +package us.codecraft.webmagic.model; + +import org.apache.http.NameValuePair; +import org.apache.http.client.utils.URLEncodedUtils; +import org.apache.http.message.BasicNameValuePair; + +import java.io.UnsupportedEncodingException; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; + +/** + * @author code4crafter@gmail.com + * Date: 17/4/8 + */ +public class HttpRequestBody { + + public static abstract class ContentType { + + public static final String JSON = "application/json"; + + public static final String XML = "text/xml"; + + public static final String FORM = "application/x-www-form-urlencoded"; + + public static final String MULTIPART = "multipart/form-data"; + } + + private final byte[] body; + + private final String contentType; + + private final String encoding; + + public HttpRequestBody(byte[] body, String contentType, String encoding) { + this.body = body; + this.contentType = contentType; + this.encoding = encoding; + } + + public String getContentType() { + return contentType; + } + + public String getEncoding() { + return encoding; + } + + public static HttpRequestBody json(String json, String encoding) throws UnsupportedEncodingException { + return new HttpRequestBody(json.getBytes(encoding), ContentType.JSON, encoding); + } + + public static HttpRequestBody xml(String xml, String encoding) throws UnsupportedEncodingException { + return new HttpRequestBody(xml.getBytes(encoding), ContentType.XML, encoding); + } + + public static HttpRequestBody custom(byte[] body, String contentType, String encoding) throws UnsupportedEncodingException { + return new HttpRequestBody(body, contentType, encoding); + } + + public static HttpRequestBody form(Map params, String encoding) throws UnsupportedEncodingException { + List nameValuePairs = new ArrayList(params.size()); + for (Map.Entry entry : params.entrySet()) { + nameValuePairs.add(new BasicNameValuePair(entry.getKey(), String.valueOf(entry.getValue()))); + } + return new HttpRequestBody(URLEncodedUtils.format(nameValuePairs, encoding).getBytes(encoding), ContentType.FORM, encoding); + } + +}