parent
b06aa489fb
commit
8ba2da146c
|
@ -21,6 +21,8 @@ public class Request implements Serializable {
|
||||||
|
|
||||||
private String url;
|
private String url;
|
||||||
|
|
||||||
|
private String method;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Store additional information in extras.
|
* Store additional information in extras.
|
||||||
*/
|
*/
|
||||||
|
@ -106,10 +108,25 @@ public class Request implements Serializable {
|
||||||
this.url = url;
|
this.url = url;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The HTTP method of the request. GET is used by default.
|
||||||
|
* @return httpMethod
|
||||||
|
* @see us.codecraft.webmagic.constant.HttpConstant.Method
|
||||||
|
* @since 0.5.0
|
||||||
|
*/
|
||||||
|
public String getMethod() {
|
||||||
|
return method;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setMethod(String method) {
|
||||||
|
this.method = method;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String toString() {
|
public String toString() {
|
||||||
return "Request{" +
|
return "Request{" +
|
||||||
"url='" + url + '\'' +
|
"url='" + url + '\'' +
|
||||||
|
", method='" + method + '\'' +
|
||||||
", extras=" + extras +
|
", extras=" + extras +
|
||||||
", priority=" + priority +
|
", priority=" + priority +
|
||||||
'}';
|
'}';
|
||||||
|
|
|
@ -1,5 +1,7 @@
|
||||||
package us.codecraft.webmagic;
|
package us.codecraft.webmagic;
|
||||||
|
|
||||||
|
import com.google.common.collect.HashBasedTable;
|
||||||
|
import com.google.common.collect.Table;
|
||||||
import org.apache.http.HttpHost;
|
import org.apache.http.HttpHost;
|
||||||
import us.codecraft.webmagic.utils.UrlUtils;
|
import us.codecraft.webmagic.utils.UrlUtils;
|
||||||
|
|
||||||
|
@ -18,7 +20,9 @@ public class Site {
|
||||||
|
|
||||||
private String userAgent;
|
private String userAgent;
|
||||||
|
|
||||||
private Map<String, String> cookies = new LinkedHashMap<String, String>();
|
private Map<String, String> defaultCookies = new LinkedHashMap<String, String>();
|
||||||
|
|
||||||
|
private Table<String, String, String> cookies = HashBasedTable.create();
|
||||||
|
|
||||||
private String charset;
|
private String charset;
|
||||||
|
|
||||||
|
@ -45,6 +49,10 @@ public class Site {
|
||||||
|
|
||||||
private boolean useGzip = true;
|
private boolean useGzip = true;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @see us.codecraft.webmagic.constant.HttpConstant.Header
|
||||||
|
* @deprecated
|
||||||
|
*/
|
||||||
public static interface HeaderConst {
|
public static interface HeaderConst {
|
||||||
|
|
||||||
public static final String REFERER = "Referer";
|
public static final String REFERER = "Referer";
|
||||||
|
@ -72,7 +80,20 @@ public class Site {
|
||||||
* @return this
|
* @return this
|
||||||
*/
|
*/
|
||||||
public Site addCookie(String name, String value) {
|
public Site addCookie(String name, String value) {
|
||||||
cookies.put(name, value);
|
defaultCookies.put(name, value);
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Add a cookie with specific domain.
|
||||||
|
*
|
||||||
|
* @param domain
|
||||||
|
* @param name
|
||||||
|
* @param value
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public Site addCookie(String domain, String name, String value) {
|
||||||
|
cookies.put(domain, name, value);
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -93,6 +114,25 @@ public class Site {
|
||||||
* @return get cookies
|
* @return get cookies
|
||||||
*/
|
*/
|
||||||
public Map<String, String> getCookies() {
|
public Map<String, String> getCookies() {
|
||||||
|
return defaultCookies;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* get cookies of all domains
|
||||||
|
*
|
||||||
|
* @return get cookies
|
||||||
|
*/
|
||||||
|
public Map<String,Map<String, String>> getAllCookies() {
|
||||||
|
return cookies.columnMap();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* get cookies
|
||||||
|
*
|
||||||
|
* @return get cookies
|
||||||
|
*/
|
||||||
|
public Table<String,String, String> getaCookies() {
|
||||||
|
cookies.columnMap();
|
||||||
return cookies;
|
return cookies;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -203,10 +243,10 @@ public class Site {
|
||||||
* Add a url to start url.<br>
|
* Add a url to start url.<br>
|
||||||
* Because urls are more a Spider's property than Site, move it to {@link Spider#addUrl(String...)}}
|
* Because urls are more a Spider's property than Site, move it to {@link Spider#addUrl(String...)}}
|
||||||
*
|
*
|
||||||
* @deprecated
|
|
||||||
* @see Spider#addUrl(String...)
|
|
||||||
* @param startUrl
|
* @param startUrl
|
||||||
* @return this
|
* @return this
|
||||||
|
* @see Spider#addUrl(String...)
|
||||||
|
* @deprecated
|
||||||
*/
|
*/
|
||||||
public Site addStartUrl(String startUrl) {
|
public Site addStartUrl(String startUrl) {
|
||||||
return addStartRequest(new Request(startUrl));
|
return addStartRequest(new Request(startUrl));
|
||||||
|
@ -216,10 +256,10 @@ public class Site {
|
||||||
* Add a url to start url.<br>
|
* Add a url to start url.<br>
|
||||||
* Because urls are more a Spider's property than Site, move it to {@link Spider#addRequest(Request...)}}
|
* Because urls are more a Spider's property than Site, move it to {@link Spider#addRequest(Request...)}}
|
||||||
*
|
*
|
||||||
* @deprecated
|
|
||||||
* @see Spider#addRequest(Request...)
|
|
||||||
* @param startRequest
|
* @param startRequest
|
||||||
* @return this
|
* @return this
|
||||||
|
* @see Spider#addRequest(Request...)
|
||||||
|
* @deprecated
|
||||||
*/
|
*/
|
||||||
public Site addStartRequest(Request startRequest) {
|
public Site addStartRequest(Request startRequest) {
|
||||||
this.startRequests.add(startRequest);
|
this.startRequests.add(startRequest);
|
||||||
|
@ -312,6 +352,7 @@ public class Site {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* set up httpProxy for this site
|
* set up httpProxy for this site
|
||||||
|
*
|
||||||
* @param httpProxy
|
* @param httpProxy
|
||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
|
@ -364,7 +405,8 @@ public class Site {
|
||||||
if (acceptStatCode != null ? !acceptStatCode.equals(site.acceptStatCode) : site.acceptStatCode != null)
|
if (acceptStatCode != null ? !acceptStatCode.equals(site.acceptStatCode) : site.acceptStatCode != null)
|
||||||
return false;
|
return false;
|
||||||
if (charset != null ? !charset.equals(site.charset) : site.charset != null) return false;
|
if (charset != null ? !charset.equals(site.charset) : site.charset != null) return false;
|
||||||
if (cookies != null ? !cookies.equals(site.cookies) : site.cookies != null) return false;
|
if (defaultCookies != null ? !defaultCookies.equals(site.defaultCookies) : site.defaultCookies != null)
|
||||||
|
return false;
|
||||||
if (domain != null ? !domain.equals(site.domain) : site.domain != null) return false;
|
if (domain != null ? !domain.equals(site.domain) : site.domain != null) return false;
|
||||||
if (headers != null ? !headers.equals(site.headers) : site.headers != null) return false;
|
if (headers != null ? !headers.equals(site.headers) : site.headers != null) return false;
|
||||||
if (startRequests != null ? !startRequests.equals(site.startRequests) : site.startRequests != null)
|
if (startRequests != null ? !startRequests.equals(site.startRequests) : site.startRequests != null)
|
||||||
|
@ -378,7 +420,7 @@ public class Site {
|
||||||
public int hashCode() {
|
public int hashCode() {
|
||||||
int result = domain != null ? domain.hashCode() : 0;
|
int result = domain != null ? domain.hashCode() : 0;
|
||||||
result = 31 * result + (userAgent != null ? userAgent.hashCode() : 0);
|
result = 31 * result + (userAgent != null ? userAgent.hashCode() : 0);
|
||||||
result = 31 * result + (cookies != null ? cookies.hashCode() : 0);
|
result = 31 * result + (defaultCookies != null ? defaultCookies.hashCode() : 0);
|
||||||
result = 31 * result + (charset != null ? charset.hashCode() : 0);
|
result = 31 * result + (charset != null ? charset.hashCode() : 0);
|
||||||
result = 31 * result + (startRequests != null ? startRequests.hashCode() : 0);
|
result = 31 * result + (startRequests != null ? startRequests.hashCode() : 0);
|
||||||
result = 31 * result + sleepTime;
|
result = 31 * result + sleepTime;
|
||||||
|
@ -395,7 +437,7 @@ public class Site {
|
||||||
return "Site{" +
|
return "Site{" +
|
||||||
"domain='" + domain + '\'' +
|
"domain='" + domain + '\'' +
|
||||||
", userAgent='" + userAgent + '\'' +
|
", userAgent='" + userAgent + '\'' +
|
||||||
", cookies=" + cookies +
|
", cookies=" + defaultCookies +
|
||||||
", charset='" + charset + '\'' +
|
", charset='" + charset + '\'' +
|
||||||
", startRequests=" + startRequests +
|
", startRequests=" + startRequests +
|
||||||
", sleepTime=" + sleepTime +
|
", sleepTime=" + sleepTime +
|
||||||
|
|
|
@ -0,0 +1,35 @@
|
||||||
|
package us.codecraft.webmagic.constant;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Some constants of the HTTP protocol.
|
||||||
|
* @author code4crafer@gmail.com
|
||||||
|
* @since 0.5.0
|
||||||
|
*/
|
||||||
|
public abstract class HttpConstant {
|
||||||
|
|
||||||
|
public static abstract class Method {
|
||||||
|
|
||||||
|
public static final String GET = "GET";
|
||||||
|
|
||||||
|
public static final String HEAD = "HEAD";
|
||||||
|
|
||||||
|
public static final String POST = "POST";
|
||||||
|
|
||||||
|
public static final String PUT = "PUT";
|
||||||
|
|
||||||
|
public static final String DELETE = "DELETE";
|
||||||
|
|
||||||
|
public static final String TRACE = "TRACE";
|
||||||
|
|
||||||
|
public static final String CONNECT = "CONNECT";
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public static abstract class Header {
|
||||||
|
|
||||||
|
public static final String REFERER = "Referer";
|
||||||
|
|
||||||
|
public static final String USER_AGENT = "User-Agent";
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -17,6 +17,7 @@ import us.codecraft.webmagic.Page;
|
||||||
import us.codecraft.webmagic.Request;
|
import us.codecraft.webmagic.Request;
|
||||||
import us.codecraft.webmagic.Site;
|
import us.codecraft.webmagic.Site;
|
||||||
import us.codecraft.webmagic.Task;
|
import us.codecraft.webmagic.Task;
|
||||||
|
import us.codecraft.webmagic.constant.HttpConstant;
|
||||||
import us.codecraft.webmagic.selector.PlainText;
|
import us.codecraft.webmagic.selector.PlainText;
|
||||||
import us.codecraft.webmagic.utils.UrlUtils;
|
import us.codecraft.webmagic.utils.UrlUtils;
|
||||||
|
|
||||||
|
@ -75,7 +76,7 @@ public class HttpClientDownloader extends AbstractDownloader {
|
||||||
} else {
|
} else {
|
||||||
acceptStatCode = Sets.newHashSet(200);
|
acceptStatCode = Sets.newHashSet(200);
|
||||||
}
|
}
|
||||||
logger.info("downloading page {}" , request.getUrl());
|
logger.info("downloading page {}", request.getUrl());
|
||||||
CloseableHttpResponse httpResponse = null;
|
CloseableHttpResponse httpResponse = null;
|
||||||
try {
|
try {
|
||||||
HttpUriRequest httpUriRequest = getHttpUriRequest(request, site, headers);
|
HttpUriRequest httpUriRequest = getHttpUriRequest(request, site, headers);
|
||||||
|
@ -123,7 +124,7 @@ public class HttpClientDownloader extends AbstractDownloader {
|
||||||
}
|
}
|
||||||
|
|
||||||
protected HttpUriRequest getHttpUriRequest(Request request, Site site, Map<String, String> headers) {
|
protected HttpUriRequest getHttpUriRequest(Request request, Site site, Map<String, String> headers) {
|
||||||
RequestBuilder requestBuilder = RequestBuilder.get().setUri(request.getUrl());
|
RequestBuilder requestBuilder = selectRequestMethod(request.getMethod()).setUri(request.getUrl());
|
||||||
if (headers != null) {
|
if (headers != null) {
|
||||||
for (Map.Entry<String, String> headerEntry : headers.entrySet()) {
|
for (Map.Entry<String, String> headerEntry : headers.entrySet()) {
|
||||||
requestBuilder.addHeader(headerEntry.getKey(), headerEntry.getValue());
|
requestBuilder.addHeader(headerEntry.getKey(), headerEntry.getValue());
|
||||||
|
@ -141,6 +142,24 @@ public class HttpClientDownloader extends AbstractDownloader {
|
||||||
return requestBuilder.build();
|
return requestBuilder.build();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
protected RequestBuilder selectRequestMethod(String method) {
|
||||||
|
if (method == null || method.equalsIgnoreCase(HttpConstant.Method.GET)) {
|
||||||
|
//default get
|
||||||
|
return RequestBuilder.get();
|
||||||
|
} else if (method.equalsIgnoreCase(HttpConstant.Method.POST)) {
|
||||||
|
return RequestBuilder.post();
|
||||||
|
} else if (method.equalsIgnoreCase(HttpConstant.Method.HEAD)) {
|
||||||
|
return RequestBuilder.head();
|
||||||
|
} else if (method.equalsIgnoreCase(HttpConstant.Method.PUT)) {
|
||||||
|
return RequestBuilder.put();
|
||||||
|
} else if (method.equalsIgnoreCase(HttpConstant.Method.DELETE)) {
|
||||||
|
return RequestBuilder.delete();
|
||||||
|
} else if (method.equalsIgnoreCase(HttpConstant.Method.TRACE)) {
|
||||||
|
return RequestBuilder.trace();
|
||||||
|
}
|
||||||
|
throw new IllegalArgumentException("Illegal HTTP Method " + method);
|
||||||
|
}
|
||||||
|
|
||||||
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
|
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
|
||||||
String content = IOUtils.toString(httpResponse.getEntity().getContent(), charset);
|
String content = IOUtils.toString(httpResponse.getEntity().getContent(), charset);
|
||||||
Page page = new Page();
|
Page page = new Page();
|
||||||
|
|
|
@ -36,7 +36,7 @@ public class HttpClientGenerator {
|
||||||
connectionManager.setDefaultMaxPerRoute(100);
|
connectionManager.setDefaultMaxPerRoute(100);
|
||||||
}
|
}
|
||||||
|
|
||||||
public HttpClientGenerator setPoolSize(int poolSize){
|
public HttpClientGenerator setPoolSize(int poolSize) {
|
||||||
connectionManager.setMaxTotal(poolSize);
|
connectionManager.setMaxTotal(poolSize);
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
@ -76,12 +76,17 @@ public class HttpClientGenerator {
|
||||||
|
|
||||||
private void generateCookie(HttpClientBuilder httpClientBuilder, Site site) {
|
private void generateCookie(HttpClientBuilder httpClientBuilder, Site site) {
|
||||||
CookieStore cookieStore = new BasicCookieStore();
|
CookieStore cookieStore = new BasicCookieStore();
|
||||||
if (site.getCookies() != null) {
|
|
||||||
for (Map.Entry<String, String> cookieEntry : site.getCookies().entrySet()) {
|
for (Map.Entry<String, String> cookieEntry : site.getCookies().entrySet()) {
|
||||||
BasicClientCookie cookie = new BasicClientCookie(cookieEntry.getKey(), cookieEntry.getValue());
|
BasicClientCookie cookie = new BasicClientCookie(cookieEntry.getKey(), cookieEntry.getValue());
|
||||||
cookie.setDomain(site.getDomain());
|
cookie.setDomain(site.getDomain());
|
||||||
cookieStore.addCookie(cookie);
|
cookieStore.addCookie(cookie);
|
||||||
}
|
}
|
||||||
|
for (Map.Entry<String, Map<String, String>> domainEntry : site.getAllCookies().entrySet()) {
|
||||||
|
for (Map.Entry<String, String> cookieEntry : domainEntry.getValue().entrySet()) {
|
||||||
|
BasicClientCookie cookie = new BasicClientCookie(cookieEntry.getKey(), cookieEntry.getValue());
|
||||||
|
cookie.setDomain(domainEntry.getKey());
|
||||||
|
cookieStore.addCookie(cookie);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
httpClientBuilder.setDefaultCookieStore(cookieStore);
|
httpClientBuilder.setDefaultCookieStore(cookieStore);
|
||||||
}
|
}
|
||||||
|
|
|
@ -25,12 +25,16 @@ public abstract class LocalDuplicatedRemovedScheduler implements MonitorableSche
|
||||||
@Override
|
@Override
|
||||||
public void push(Request request, Task task) {
|
public void push(Request request, Task task) {
|
||||||
logger.trace("get a candidate url {}", request.getUrl());
|
logger.trace("get a candidate url {}", request.getUrl());
|
||||||
if (urls.add(request.getUrl()) || shouldReserved(request)) {
|
if (isDuplicate(request) || shouldReserved(request)) {
|
||||||
logger.debug("push to queue {}", request.getUrl());
|
logger.debug("push to queue {}", request.getUrl());
|
||||||
pushWhenNoDuplicate(request, task);
|
pushWhenNoDuplicate(request, task);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
protected boolean isDuplicate(Request request) {
|
||||||
|
return urls.add(request.getUrl());
|
||||||
|
}
|
||||||
|
|
||||||
protected boolean shouldReserved(Request request) {
|
protected boolean shouldReserved(Request request) {
|
||||||
return request.getExtra(Request.CYCLE_TRIED_TIMES) != null;
|
return request.getExtra(Request.CYCLE_TRIED_TIMES) != null;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue