From 8ba2da146cc21e460efcf92c1c62af5da750122d Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Thu, 24 Apr 2014 10:51:37 +0800 Subject: [PATCH] request method #108 and more cookie #109 config --- .../java/us/codecraft/webmagic/Request.java | 17 ++++++ .../main/java/us/codecraft/webmagic/Site.java | 60 ++++++++++++++++--- .../webmagic/constant/HttpConstant.java | 35 +++++++++++ .../downloader/HttpClientDownloader.java | 23 ++++++- .../downloader/HttpClientGenerator.java | 13 ++-- .../LocalDuplicatedRemovedScheduler.java | 6 +- 6 files changed, 138 insertions(+), 16 deletions(-) create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/constant/HttpConstant.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java index 142a20c..aeca08f 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java @@ -21,6 +21,8 @@ public class Request implements Serializable { private String url; + private String method; + /** * Store additional information in extras. */ @@ -106,10 +108,25 @@ public class Request implements Serializable { this.url = url; } + /** + * The http method of the request. Get for default. 
+ * @return httpMethod + * @see us.codecraft.webmagic.constant.HttpConstant.Method + * @since 0.5.0 + */ + public String getMethod() { + return method; + } + + public void setMethod(String method) { + this.method = method; + } + @Override public String toString() { return "Request{" + "url='" + url + '\'' + + ", method='" + method + '\'' + ", extras=" + extras + ", priority=" + priority + '}'; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java index 48b43f0..3a5dd33 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java @@ -1,5 +1,7 @@ package us.codecraft.webmagic; +import com.google.common.collect.HashBasedTable; +import com.google.common.collect.Table; import org.apache.http.HttpHost; import us.codecraft.webmagic.utils.UrlUtils; @@ -18,7 +20,9 @@ public class Site { private String userAgent; - private Map cookies = new LinkedHashMap(); + private Map defaultCookies = new LinkedHashMap(); + + private Table cookies = HashBasedTable.create(); private String charset; @@ -45,6 +49,10 @@ public class Site { private boolean useGzip = true; + /** + * @see us.codecraft.webmagic.constant.HttpConstant.Header + * @deprecated + */ public static interface HeaderConst { public static final String REFERER = "Referer"; @@ -72,7 +80,20 @@ public class Site { * @return this */ public Site addCookie(String name, String value) { - cookies.put(name, value); + defaultCookies.put(name, value); + return this; + } + + /** + * Add a cookie with specific domain. 
+ * + * @param domain domain the cookie is scoped to + * @param name cookie name + * @param value cookie value + * @return this + */ + public Site addCookie(String domain, String name, String value) { + cookies.put(domain, name, value); return this; } @@ -93,6 +114,25 @@ public class Site { * @return get cookies */ public Map getCookies() { + return defaultCookies; + } + + /** + * Get the domain-specific cookies, keyed by domain and then by cookie name. + * + * @return map of domain to (cookie name to cookie value) + */ + public Map<String, Map<String, String>> getAllCookies() { + return cookies.rowMap(); + } + + /** + * Get the backing table of domain-specific cookies (row: domain, column: name). + * + * @return the cookie table itself, not a copy + */ + public Table getaCookies() { + /* returns the live table; mutations are visible to this Site */ return cookies; } @@ -203,10 +243,10 @@ public class Site { * Add a url to start url.
* Because urls are more a Spider's property than Site, move it to {@link Spider#addUrl(String...)}} * - * @deprecated - * @see Spider#addUrl(String...) * @param startUrl * @return this + * @see Spider#addUrl(String...) + * @deprecated */ public Site addStartUrl(String startUrl) { return addStartRequest(new Request(startUrl)); @@ -216,10 +256,10 @@ public class Site { * Add a url to start url.
* Because urls are more a Spider's property than Site, move it to {@link Spider#addRequest(Request...)}} * - * @deprecated - * @see Spider#addRequest(Request...) * @param startRequest * @return this + * @see Spider#addRequest(Request...) + * @deprecated */ public Site addStartRequest(Request startRequest) { this.startRequests.add(startRequest); @@ -312,6 +352,7 @@ public class Site { /** * set up httpProxy for this site + * * @param httpProxy * @return */ @@ -364,7 +405,8 @@ public class Site { if (acceptStatCode != null ? !acceptStatCode.equals(site.acceptStatCode) : site.acceptStatCode != null) return false; if (charset != null ? !charset.equals(site.charset) : site.charset != null) return false; - if (cookies != null ? !cookies.equals(site.cookies) : site.cookies != null) return false; + if (defaultCookies != null ? !defaultCookies.equals(site.defaultCookies) : site.defaultCookies != null) + return false; if (domain != null ? !domain.equals(site.domain) : site.domain != null) return false; if (headers != null ? !headers.equals(site.headers) : site.headers != null) return false; if (startRequests != null ? !startRequests.equals(site.startRequests) : site.startRequests != null) @@ -378,7 +420,7 @@ public class Site { public int hashCode() { int result = domain != null ? domain.hashCode() : 0; result = 31 * result + (userAgent != null ? userAgent.hashCode() : 0); - result = 31 * result + (cookies != null ? cookies.hashCode() : 0); + result = 31 * result + (defaultCookies != null ? defaultCookies.hashCode() : 0); result = 31 * result + (charset != null ? charset.hashCode() : 0); result = 31 * result + (startRequests != null ? 
startRequests.hashCode() : 0); result = 31 * result + sleepTime; @@ -395,7 +437,7 @@ public class Site { return "Site{" + "domain='" + domain + '\'' + ", userAgent='" + userAgent + '\'' + - ", cookies=" + cookies + + ", cookies=" + defaultCookies + ", charset='" + charset + '\'' + ", startRequests=" + startRequests + ", sleepTime=" + sleepTime + diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/constant/HttpConstant.java b/webmagic-core/src/main/java/us/codecraft/webmagic/constant/HttpConstant.java new file mode 100644 index 0000000..52f7ecb --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/constant/HttpConstant.java @@ -0,0 +1,35 @@ +package us.codecraft.webmagic.constant; + +/** + * Some constants of Http protocol. + * @author code4crafer@gmail.com + * @since 0.5.0 + */ +public abstract class HttpConstant { + + public static abstract class Method { + + public static final String GET = "GET"; + + public static final String HEAD = "HEAD"; + + public static final String POST = "POST"; + + public static final String PUT = "PUT"; + + public static final String DELETE = "DELETE"; + + public static final String TRACE = "TRACE"; + + public static final String CONNECT = "CONNECT"; + + } + + public static abstract class Header { + + public static final String REFERER = "Referer"; + + public static final String USER_AGENT = "User-Agent"; + } + +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 13e220f..4fecf32 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -17,6 +17,7 @@ import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Task; +import 
us.codecraft.webmagic.constant.HttpConstant; import us.codecraft.webmagic.selector.PlainText; import us.codecraft.webmagic.utils.UrlUtils; @@ -75,7 +76,7 @@ public class HttpClientDownloader extends AbstractDownloader { } else { acceptStatCode = Sets.newHashSet(200); } - logger.info("downloading page {}" , request.getUrl()); + logger.info("downloading page {}", request.getUrl()); CloseableHttpResponse httpResponse = null; try { HttpUriRequest httpUriRequest = getHttpUriRequest(request, site, headers); @@ -123,7 +124,7 @@ public class HttpClientDownloader extends AbstractDownloader { } protected HttpUriRequest getHttpUriRequest(Request request, Site site, Map headers) { - RequestBuilder requestBuilder = RequestBuilder.get().setUri(request.getUrl()); + RequestBuilder requestBuilder = selectRequestMethod(request.getMethod()).setUri(request.getUrl()); if (headers != null) { for (Map.Entry headerEntry : headers.entrySet()) { requestBuilder.addHeader(headerEntry.getKey(), headerEntry.getValue()); @@ -141,6 +142,24 @@ public class HttpClientDownloader extends AbstractDownloader { return requestBuilder.build(); } + protected RequestBuilder selectRequestMethod(String method) { + if (method == null || method.equalsIgnoreCase(HttpConstant.Method.GET)) { + //default get + return RequestBuilder.get(); + } else if (method.equalsIgnoreCase(HttpConstant.Method.POST)) { + return RequestBuilder.post(); + } else if (method.equalsIgnoreCase(HttpConstant.Method.HEAD)) { + return RequestBuilder.head(); + } else if (method.equalsIgnoreCase(HttpConstant.Method.PUT)) { + return RequestBuilder.put(); + } else if (method.equalsIgnoreCase(HttpConstant.Method.DELETE)) { + return RequestBuilder.delete(); + } else if (method.equalsIgnoreCase(HttpConstant.Method.TRACE)) { + return RequestBuilder.trace(); + } + throw new IllegalArgumentException("Illegal HTTP Method " + method); + } + protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws 
IOException { String content = IOUtils.toString(httpResponse.getEntity().getContent(), charset); Page page = new Page(); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java index edb3a49..136d9c5 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java @@ -36,7 +36,7 @@ public class HttpClientGenerator { connectionManager.setDefaultMaxPerRoute(100); } - public HttpClientGenerator setPoolSize(int poolSize){ + public HttpClientGenerator setPoolSize(int poolSize) { connectionManager.setMaxTotal(poolSize); return this; } @@ -76,10 +76,15 @@ public class HttpClientGenerator { private void generateCookie(HttpClientBuilder httpClientBuilder, Site site) { CookieStore cookieStore = new BasicCookieStore(); - if (site.getCookies() != null) { - for (Map.Entry cookieEntry : site.getCookies().entrySet()) { + for (Map.Entry cookieEntry : site.getCookies().entrySet()) { + BasicClientCookie cookie = new BasicClientCookie(cookieEntry.getKey(), cookieEntry.getValue()); + cookie.setDomain(site.getDomain()); + cookieStore.addCookie(cookie); + } + for (Map.Entry> domainEntry : site.getAllCookies().entrySet()) { + for (Map.Entry cookieEntry : domainEntry.getValue().entrySet()) { BasicClientCookie cookie = new BasicClientCookie(cookieEntry.getKey(), cookieEntry.getValue()); - cookie.setDomain(site.getDomain()); + cookie.setDomain(domainEntry.getKey()); cookieStore.addCookie(cookie); } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/LocalDuplicatedRemovedScheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/LocalDuplicatedRemovedScheduler.java index 2807e0f..015aa47 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/LocalDuplicatedRemovedScheduler.java 
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/LocalDuplicatedRemovedScheduler.java @@ -25,12 +25,16 @@ public abstract class LocalDuplicatedRemovedScheduler implements MonitorableSche @Override public void push(Request request, Task task) { logger.trace("get a candidate url {}", request.getUrl()); - if (urls.add(request.getUrl()) || shouldReserved(request)) { + if (!isDuplicate(request) || shouldReserved(request)) { logger.debug("push to queue {}", request.getUrl()); pushWhenNoDuplicate(request, task); } } + protected boolean isDuplicate(Request request) { /* true when the url was already recorded; records it otherwise */ + return !urls.add(request.getUrl()); /* add() is expected to return false for an already-recorded url */ + } + protected boolean shouldReserved(Request request) { return request.getExtra(Request.CYCLE_TRIED_TIMES) != null; }