diff --git a/pom.xml b/pom.xml
index 04b6dec..4279ec7 100644
--- a/pom.xml
+++ b/pom.xml
@@ -6,7 +6,7 @@
7
us.codecraft
- 0.6.2-SNAPSHOT
+ 0.7.0-SNAPSHOT
4.0.0
pom
diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml
index ad96961..7ca5c7b 100644
--- a/webmagic-core/pom.xml
+++ b/webmagic-core/pom.xml
@@ -3,7 +3,7 @@
us.codecraft
webmagic-parent
- 0.6.2-SNAPSHOT
+ 0.7.0-SNAPSHOT
4.0.0
diff --git a/webmagic-core/pom.xml.versionsBackup b/webmagic-core/pom.xml.versionsBackup
deleted file mode 100644
index b530bab..0000000
--- a/webmagic-core/pom.xml.versionsBackup
+++ /dev/null
@@ -1,86 +0,0 @@
-
-
-
- us.codecraft
- webmagic-parent
- 0.5.2
-
- 4.0.0
-
- webmagic-core
-
-
-
- org.apache.httpcomponents
- httpclient
-
-
-
- junit
- junit
-
-
-
- com.google.guava
- guava
-
-
-
- org.apache.commons
- commons-lang3
-
-
-
- us.codecraft
- xsoup
-
-
-
- com.github.dreamhead
- moco-core
-
-
-
- org.slf4j
- slf4j-api
-
-
-
- org.slf4j
- slf4j-log4j12
-
-
-
- commons-collections
- commons-collections
-
-
-
- org.assertj
- assertj-core
-
-
-
- org.jsoup
- jsoup
-
-
-
- org.apache.commons
- commons-io
-
-
-
- com.jayway.jsonpath
- json-path
- 0.8.1
-
-
-
- com.alibaba
- fastjson
-
-
-
-
-
\ No newline at end of file
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java
index 1a6527d..7dd48f8 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java
@@ -1,16 +1,15 @@
package us.codecraft.webmagic;
-import java.util.ArrayList;
-import java.util.List;
-
import org.apache.commons.lang3.StringUtils;
-import org.apache.http.Header;
-
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Json;
import us.codecraft.webmagic.selector.Selectable;
import us.codecraft.webmagic.utils.UrlUtils;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
/**
* Object storing extracted result and urls to fetch.
* Not thread safe.
@@ -40,17 +39,14 @@ public class Page {
private Selectable url;
+ private Map<String, List<String>> headers;
+
private int statusCode;
private boolean needCycleRetry;
private List targetRequests = new ArrayList();
- /**
- * Http响应头
- */
- private Header[] headers=null;
-
public Page() {
}
@@ -77,7 +73,7 @@ public class Page {
*/
public Html getHtml() {
if (html == null) {
- html = new Html(UrlUtils.fixAllRelativeHrefs(rawText, request.getUrl()));
+ html = new Html(rawText, request.getUrl());
}
return html;
}
@@ -217,14 +213,14 @@ public class Page {
return this;
}
- public Header[] getHeaders() {
- return headers;
- }
+ public Map<String, List<String>> getHeaders() {
+ return headers;
+ }
+
+ public void setHeaders(Map<String, List<String>> headers) {
+ this.headers = headers;
+ }
- public void setHeaders(Header[] headers) {
- this.headers = headers;
- }
-
@Override
public String toString() {
return "Page{" +
@@ -232,7 +228,8 @@ public class Page {
", resultItems=" + resultItems +
", rawText='" + rawText + '\'' +
", url=" + url +
", statusCode=" + statusCode +
+ ", needCycleRetry=" + needCycleRetry +
", targetRequests=" + targetRequests +
", headers=" + headers+
'}';
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java
index d44f61f..d2ea247 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java
@@ -1,20 +1,11 @@
package us.codecraft.webmagic;
-import java.io.Serializable;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-
-import org.apache.http.Header;
-import org.apache.http.HttpEntity;
-import org.apache.http.cookie.Cookie;
-import org.apache.http.entity.StringEntity;
-import org.apache.http.impl.cookie.BasicClientCookie;
-import org.apache.http.message.BasicHeader;
-
+import us.codecraft.webmagic.model.HttpRequestBody;
import us.codecraft.webmagic.utils.Experimental;
-import us.codecraft.webmagic.utils.UrlUtils;
+
+import java.io.Serializable;
+import java.util.HashMap;
+import java.util.Map;
/**
* Object contains url to crawl.
@@ -28,33 +19,24 @@ public class Request implements Serializable {
private static final long serialVersionUID = 2062192774891352043L;
public static final String CYCLE_TRIED_TIMES = "_cycle_tried_times";
- public static final String STATUS_CODE = "statusCode";
- public static final String PROXY = "proxy";
private String url;
private String method;
+ private HttpRequestBody requestBody;
+
/**
* Store additional information in extras.
*/
private Map extras;
- /**
- * POST/GET param set
- * */
- private Map params=new HashMap();
-
- /**
- * support for json,xml or more,在post时,设置此选项会使params参数和nameValuePair extra失效。
- */
- private HttpEntity entity;
-
+
/**
* cookies for current url, if not set use Site's cookies
*/
- private List cookies=new ArrayList();
-
- private List headers=new ArrayList();
+ private Map cookies = new HashMap();
+
+ private Map headers = new HashMap();
/**
* Priority of the request.
@@ -133,27 +115,11 @@ public class Request implements Serializable {
this.method = method;
}
- public Map getParams() {
- return params;
- }
- /**
- * set params for request
- *
- * DO NOT set this for request already has params, like 'https://github.com/search?q=webmagic'
- * @param params params
- * */
- public void setParams(Map params) {
- this.params = params;
- }
- /**
- * set params for request
- *
- * DO NOT set this for request already has params, like 'https://github.com/search?q=webmagic'
- * @param key key
- * @param value value
- * */
- public void putParams(String key,String value) {
- params.put(key,value);
+ @Override
+ public int hashCode() {
+ int result = url != null ? url.hashCode() : 0;
+ result = 31 * result + (method != null ? method.hashCode() : 0);
+ return result;
}
@Override
@@ -164,63 +130,33 @@ public class Request implements Serializable {
Request request = (Request) o;
if (url != null ? !url.equals(request.url) : request.url != null) return false;
- if (method != null ? !method.equals(request.method) : request.method != null) return false;
- return params != null ? params.equals(request.params) : request.params == null;
+ return method != null ? method.equals(request.method) : request.method == null;
}
- public void addHeader(String name,String value){
- Header header=new BasicHeader(name,value);
- headers.add(header);
- }
- public List getHeaders(){
- return headers;
- }
- public void addCookie(String key,String value){
- BasicClientCookie c=new BasicClientCookie(key, value);
- c.setDomain(UrlUtils.getDomain(url));
- cookies.add(c);
- }
- public List getCookies() {
- return cookies;
- }
- public void setCookies(List cookies) {
- this.cookies = cookies;
- }
- /**
- * 设置json参数
- */
- public void setJsonParam(String jsonStr,String encoding){
- StringEntity e=new StringEntity(jsonStr,encoding==null?"UTF-8":encoding);
- e.setContentEncoding(encoding==null?"UTF-8":encoding);
- e.setContentType("application/json");
- entity=e;
+ public Request addCookie(String name, String value) {
+ cookies.put(name, value);
+ return this;
}
- /**
- * 设置xml参数
- */
- public void setXmlParam(String xmlStr,String encoding){
- StringEntity e=new StringEntity(xmlStr,encoding==null?"UTF-8":encoding);
- e.setContentEncoding(encoding==null?"UTF-8":encoding);
- e.setContentType("text/xml");
- entity=e;
- }
- public HttpEntity getEntity() {
- return entity;
- }
- public void setEntity(HttpEntity entity) {
- this.entity = entity;
- }
- @Override
- public int hashCode() {
- int result = url != null ? url.hashCode() : 0;
- result = 31 * result + (method != null ? method.hashCode() : 0);
- result = 31 * result + (params != null ? params.hashCode() : 0);
- result = 31 * result + (headers != null ? headers.hashCode() : 0);
- result = 31 * result + (entity != null ? entity.hashCode() : 0);
- result = 31 * result + (cookies != null ? cookies.hashCode() : 0);
-
- return result;
+ public Request addHeader(String name, String value) {
+ headers.put(name, value);
+ return this;
+ }
+
+ public Map getCookies() {
+ return cookies;
+ }
+
+ public Map getHeaders() {
+ return headers;
+ }
+
+ public HttpRequestBody getRequestBody() {
+ return requestBody;
+ }
+
+ public void setRequestBody(HttpRequestBody requestBody) {
+ this.requestBody = requestBody;
}
@Override
@@ -229,10 +165,8 @@ public class Request implements Serializable {
"url='" + url + '\'' +
", method='" + method + '\'' +
", extras=" + extras +
- ", params=" + params +
", priority=" + priority +
", headers=" + headers +
- ", entity=" + entity +
", cookies="+ cookies+
'}';
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java
index ac9f9ce..520902d 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java
@@ -1,12 +1,5 @@
package us.codecraft.webmagic;
-import org.apache.http.HttpHost;
-import org.apache.http.auth.UsernamePasswordCredentials;
-import us.codecraft.webmagic.proxy.Proxy;
-import us.codecraft.webmagic.proxy.ProxyPool;
-import us.codecraft.webmagic.proxy.SimpleProxyPool;
-import us.codecraft.webmagic.utils.UrlUtils;
-
import java.util.*;
/**
@@ -28,11 +21,6 @@ public class Site {
private String charset;
- /**
- * startUrls is the urls the crawler to start with.
- */
- private List startRequests = new ArrayList();
-
private int sleepTime = 5000;
private int retryTimes = 0;
@@ -49,24 +37,8 @@ public class Site {
private Map headers = new HashMap();
- private HttpHost httpProxy;
-
- private UsernamePasswordCredentials usernamePasswordCredentials; //代理用户名密码设置
-
- private ProxyPool httpProxyPool;
-
private boolean useGzip = true;
- /**
- * @see us.codecraft.webmagic.utils.HttpConstant.Header
- * @deprecated
- */
- public static interface HeaderConst {
-
- public static final String REFERER = "Referer";
- }
-
-
static {
DEFAULT_STATUS_CODE_SET.add(200);
}
@@ -225,52 +197,6 @@ public class Site {
return acceptStatCode;
}
- /**
- * get start urls
- *
- * @return start urls
- * @see #getStartRequests
- * @deprecated
- */
- @Deprecated
- public List getStartUrls() {
- return UrlUtils.convertToUrls(startRequests);
- }
-
- public List getStartRequests() {
- return startRequests;
- }
-
- /**
- * Add a url to start url.
- * Because urls are more a Spider's property than Site, move it to {@link Spider#addUrl(String...)}}
- *
- * @param startUrl startUrl
- * @return this
- * @see Spider#addUrl(String...)
- * @deprecated
- */
- public Site addStartUrl(String startUrl) {
- return addStartRequest(new Request(startUrl));
- }
-
- /**
- * Add a url to start url.
- * Because urls are more a Spider's property than Site, move it to {@link Spider#addRequest(Request...)}}
- *
- * @param startRequest startRequest
- * @return this
- * @see Spider#addRequest(Request...)
- * @deprecated
- */
- public Site addStartRequest(Request startRequest) {
- this.startRequests.add(startRequest);
- if (domain == null && startRequest.getUrl() != null) {
- domain = UrlUtils.getDomain(startRequest.getUrl());
- }
- return this;
- }
-
/**
* Set the interval between the processing of two pages.
* Time unit is micro seconds.
@@ -350,21 +276,6 @@ public class Site {
return this;
}
- public HttpHost getHttpProxy() {
- return httpProxy;
- }
-
- /**
- * set up httpProxy for this site
- *
- * @param httpProxy httpProxy
- * @return this
- */
- public Site setHttpProxy(HttpHost httpProxy) {
- this.httpProxy = httpProxy;
- return this;
- }
-
public boolean isUseGzip() {
return useGzip;
}
@@ -400,7 +311,11 @@ public class Site {
return new Task() {
@Override
public String getUUID() {
- return Site.this.getDomain();
+ String uuid = Site.this.getDomain();
+ if (uuid == null) {
+ uuid = UUID.randomUUID().toString();
+ }
+ return uuid;
}
@Override
@@ -428,8 +343,6 @@ public class Site {
return false;
if (domain != null ? !domain.equals(site.domain) : site.domain != null) return false;
if (headers != null ? !headers.equals(site.headers) : site.headers != null) return false;
- if (startRequests != null ? !startRequests.equals(site.startRequests) : site.startRequests != null)
- return false;
if (userAgent != null ? !userAgent.equals(site.userAgent) : site.userAgent != null) return false;
return true;
@@ -441,7 +354,6 @@ public class Site {
result = 31 * result + (userAgent != null ? userAgent.hashCode() : 0);
result = 31 * result + (defaultCookies != null ? defaultCookies.hashCode() : 0);
result = 31 * result + (charset != null ? charset.hashCode() : 0);
- result = 31 * result + (startRequests != null ? startRequests.hashCode() : 0);
result = 31 * result + sleepTime;
result = 31 * result + retryTimes;
result = 31 * result + cycleRetryTimes;
@@ -458,7 +370,6 @@ public class Site {
", userAgent='" + userAgent + '\'' +
", cookies=" + defaultCookies +
", charset='" + charset + '\'' +
- ", startRequests=" + startRequests +
", sleepTime=" + sleepTime +
", retryTimes=" + retryTimes +
", cycleRetryTimes=" + cycleRetryTimes +
@@ -468,53 +379,4 @@ public class Site {
'}';
}
- /**
- * Set httpProxyPool, String[0]:ip, String[1]:port
- *
- * @param proxyPool proxyPool
- * @return this
- */
- public Site setHttpProxyPool(ProxyPool proxyPool) {
- this.httpProxyPool = proxyPool;
- return this;
- }
-
- /**
- * Set httpProxyPool, String[0]:ip, String[1]:port
- *
- * @param httpProxyList httpProxyList
- * @param isUseLastProxy isUseLastProxy
- * @return this
- */
- public Site setHttpProxyPool(List httpProxyList, boolean isUseLastProxy) {
- this.httpProxyPool=new SimpleProxyPool(httpProxyList, isUseLastProxy);
- return this;
- }
-
- public Site enableHttpProxyPool() {
- this.httpProxyPool=new SimpleProxyPool();
- return this;
- }
-
- public UsernamePasswordCredentials getUsernamePasswordCredentials() {
- return usernamePasswordCredentials;
- }
-
- public Site setUsernamePasswordCredentials(UsernamePasswordCredentials usernamePasswordCredentials) {
- this.usernamePasswordCredentials = usernamePasswordCredentials;
- return this;
- }
-
- public ProxyPool getHttpProxyPool() {
- return httpProxyPool;
- }
-
- public Proxy getHttpProxyFromPool() {
- return httpProxyPool.getProxy();
- }
-
- public void returnHttpProxyToPool(HttpHost proxy,int statusCode) {
- httpProxyPool.returnProxy(proxy,statusCode);
- }
-
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
index 49734b7..5e785af 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
@@ -126,7 +126,6 @@ public class Spider implements Runnable, Task {
public Spider(PageProcessor pageProcessor) {
this.pageProcessor = pageProcessor;
this.site = pageProcessor.getSite();
- this.startRequests = pageProcessor.getSite().getStartRequests();
}
/**
@@ -419,8 +418,6 @@ public class Spider implements Runnable, Task {
pipeline.process(page.getResultItems(), this);
}
}
- //for proxy status management
- request.putExtra(Request.STATUS_CODE, page.getStatusCode());
sleep(site.getSleepTime());
}
@@ -482,7 +479,9 @@ public class Spider implements Runnable, Task {
public List getAll(Collection urls) {
destroyWhenExit = false;
spawnUrl = false;
- startRequests.clear();
+ if (startRequests!=null){
+ startRequests.clear();
+ }
for (Request request : UrlUtils.convertToRequests(urls)) {
addRequest(request);
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
index 669ba37..284702d 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
@@ -1,46 +1,26 @@
package us.codecraft.webmagic.downloader;
-import java.io.IOException;
-import java.nio.charset.Charset;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-
-import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.io.IOUtils;
-import org.apache.http.Header;
-import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
-import org.apache.http.NameValuePair;
import org.apache.http.annotation.ThreadSafe;
-import org.apache.http.client.CookieStore;
-import org.apache.http.client.config.CookieSpecs;
-import org.apache.http.client.config.RequestConfig;
-import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
-import org.apache.http.client.methods.HttpUriRequest;
-import org.apache.http.client.methods.RequestBuilder;
-import org.apache.http.client.protocol.HttpClientContext;
-import org.apache.http.cookie.Cookie;
-import org.apache.http.impl.client.BasicCookieStore;
import org.apache.http.impl.client.CloseableHttpClient;
-import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
-import us.codecraft.webmagic.proxy.Proxy;
+import us.codecraft.webmagic.proxy.ProxyProvider;
import us.codecraft.webmagic.selector.PlainText;
import us.codecraft.webmagic.utils.CharsetUtils;
-import us.codecraft.webmagic.utils.HttpConstant;
-import us.codecraft.webmagic.utils.WMCollections;
+import us.codecraft.webmagic.utils.HttpClientUtils;
+
+import java.io.IOException;
+import java.nio.charset.Charset;
+import java.util.HashMap;
+import java.util.Map;
/**
@@ -58,9 +38,23 @@ public class HttpClientDownloader extends AbstractDownloader {
private HttpClientGenerator httpClientGenerator = new HttpClientGenerator();
- private CloseableHttpClient getHttpClient(Site site, Proxy proxy) {
+ private HttpUriRequestConverter httpUriRequestConverter = new HttpUriRequestConverter();
+
+ private ProxyProvider proxyProvider;
+
+ private boolean responseHeader = true;
+
+ public void setHttpUriRequestConverter(HttpUriRequestConverter httpUriRequestConverter) {
+ this.httpUriRequestConverter = httpUriRequestConverter;
+ }
+
+ public void setProxyProvider(ProxyProvider proxyProvider) {
+ this.proxyProvider = proxyProvider;
+ }
+
+ private CloseableHttpClient getHttpClient(Site site) {
if (site == null) {
- return httpClientGenerator.getClient(null, proxy);
+ return httpClientGenerator.getClient(null);
}
String domain = site.getDomain();
CloseableHttpClient httpClient = httpClients.get(domain);
@@ -68,7 +62,7 @@ public class HttpClientDownloader extends AbstractDownloader {
synchronized (this) {
httpClient = httpClients.get(domain);
if (httpClient == null) {
- httpClient = httpClientGenerator.getClient(site, proxy);
+ httpClient = httpClientGenerator.getClient(site);
httpClients.put(domain, httpClient);
}
}
@@ -78,54 +72,19 @@ public class HttpClientDownloader extends AbstractDownloader {
@Override
public Page download(Request request, Task task) {
- Site site = null;
- if (task != null) {
- site = task.getSite();
+ if (task == null || task.getSite() == null) {
+ throw new NullPointerException("task or site can not be null");
}
- Set acceptStatCode;
- String charset = null;
- Map headers = null;
- if (site != null) {
- acceptStatCode = site.getAcceptStatCode();
- charset = site.getCharset();
- headers = site.getHeaders();
- } else {
- acceptStatCode = WMCollections.newHashSet(200);
- }
- logger.info("downloading page {}", request.getUrl());
+ logger.debug("downloading page {}", request.getUrl());
CloseableHttpResponse httpResponse = null;
- int statusCode = 0;
+ Site site = task.getSite();
+ CloseableHttpClient httpClient = getHttpClient(site);
+ HttpClientRequestContext requestContext = httpUriRequestConverter.convert(request, site, proxyProvider != null ? proxyProvider.getProxy(task) : null);
try {
- HttpHost proxyHost = null;
- Proxy proxy = null; //TODO
- if (site != null && site.getHttpProxyPool() != null && site.getHttpProxyPool().isEnable()) {
- proxy = site.getHttpProxyFromPool();
- proxyHost = proxy.getHttpHost();
- } else if (site != null && site.getHttpProxy() != null){
- proxyHost = site.getHttpProxy();
- }
-
- HttpUriRequest httpUriRequest = getHttpUriRequest(request, site, headers, proxyHost);
- HttpClientContext context=null;
- if(request.getCookies()!=null && CollectionUtils.isNotEmpty(request.getCookies())){
- context=new HttpClientContext();
- CookieStore cookieStore=new BasicCookieStore();
- for(Cookie c:request.getCookies()){
- cookieStore.addCookie(c);
- }
- context.setCookieStore(cookieStore);
- }
- if(request.getHeaders()!=null && CollectionUtils.isNotEmpty(request.getHeaders())){
- for(Header h:request.getHeaders()){
- httpUriRequest.setHeader(h);
- }
- }
- httpResponse = getHttpClient(site, proxy).execute(httpUriRequest,context);
- statusCode = httpResponse.getStatusLine().getStatusCode();
- request.putExtra(Request.STATUS_CODE, statusCode);
- if (statusAccept(acceptStatCode, statusCode)) {
- Page page = handleResponse(request, charset, httpResponse, task);
- page.setHeaders(httpResponse.getAllHeaders());
+ httpResponse = httpClient.execute(requestContext.getHttpUriRequest(), requestContext.getHttpClientContext());
+ int statusCode = httpResponse.getStatusLine().getStatusCode();
+ if (site.getAcceptStatCode().contains(statusCode)) {
+ Page page = handleResponse(request, site.getCharset(), httpResponse, task);
onSuccess(request);
return page;
} else {
@@ -144,11 +103,6 @@ public class HttpClientDownloader extends AbstractDownloader {
//ensure the connection is released back to pool
EntityUtils.consumeQuietly(httpResponse.getEntity());
}
- request.putExtra(Request.STATUS_CODE, statusCode);
- if (site != null && site.getHttpProxyPool() != null && site.getHttpProxyPool().isEnable()) {
- site.returnHttpProxyToPool((HttpHost) request.getExtra(Request.PROXY), (Integer) request
- .getExtra(Request.STATUS_CODE));
- }
}
}
@@ -157,91 +111,20 @@ public class HttpClientDownloader extends AbstractDownloader {
httpClientGenerator.setPoolSize(thread);
}
- protected boolean statusAccept(Set acceptStatCode, int statusCode) {
- return acceptStatCode.contains(statusCode);
- }
-
- protected HttpUriRequest getHttpUriRequest(Request request, Site site, Map headers, HttpHost proxy) {
- RequestBuilder requestBuilder = selectRequestMethod(request).setUri(request.getUrl());
- if (headers != null) {
- for (Map.Entry headerEntry : headers.entrySet()) {
- requestBuilder.addHeader(headerEntry.getKey(), headerEntry.getValue());
- }
- }
-
- RequestConfig.Builder requestConfigBuilder = RequestConfig.custom();
- if (site != null) {
- requestConfigBuilder.setConnectionRequestTimeout(site.getTimeOut())
- .setSocketTimeout(site.getTimeOut())
- .setConnectTimeout(site.getTimeOut())
- .setCookieSpec(CookieSpecs.BEST_MATCH);
- }
-
- if (proxy != null) {
- requestConfigBuilder.setProxy(proxy);
- request.putExtra(Request.PROXY, proxy);
- }
- requestBuilder.setConfig(requestConfigBuilder.build());
- return requestBuilder.build();
- }
-
- protected RequestBuilder selectRequestMethod(Request request) {
- String method = request.getMethod();
- if (method == null || method.equalsIgnoreCase(HttpConstant.Method.GET)) {
- //default get
- return addQueryParams(RequestBuilder.get(),request.getParams());
- } else if (method.equalsIgnoreCase(HttpConstant.Method.POST)) {
- if(request.getEntity()!=null){
- return RequestBuilder.post().setEntity(request.getEntity());
- }else{
- return addFormParams(RequestBuilder.post(), (NameValuePair[]) request.getExtra("nameValuePair"), request.getParams());
- }
- } else if (method.equalsIgnoreCase(HttpConstant.Method.HEAD)) {
- return addQueryParams(RequestBuilder.head(),request.getParams());
- } else if (method.equalsIgnoreCase(HttpConstant.Method.PUT)) {
- return addFormParams(RequestBuilder.put(), (NameValuePair[]) request.getExtra("nameValuePair"), request.getParams());
- } else if (method.equalsIgnoreCase(HttpConstant.Method.DELETE)) {
- return addQueryParams(RequestBuilder.delete(),request.getParams());
- } else if (method.equalsIgnoreCase(HttpConstant.Method.TRACE)) {
- return addQueryParams(RequestBuilder.trace(),request.getParams());
- }
- throw new IllegalArgumentException("Illegal HTTP Method " + method);
- }
-
- private RequestBuilder addFormParams(RequestBuilder requestBuilder, NameValuePair[] nameValuePair, Map params) {
- List allNameValuePair=new ArrayList();
- if (nameValuePair != null && nameValuePair.length > 0) {
- allNameValuePair= Arrays.asList(nameValuePair);
- }
- if (params != null) {
- for (String key : params.keySet()) {
- allNameValuePair.add(new BasicNameValuePair(key, params.get(key)));
- }
- }
- requestBuilder.setEntity(new UrlEncodedFormEntity(allNameValuePair, Charset.forName("utf8")));
- return requestBuilder;
- }
-
- private RequestBuilder addQueryParams(RequestBuilder requestBuilder, Map params) {
- if (params != null) {
- for (Map.Entry entry : params.entrySet()) {
- requestBuilder.addParameter(entry.getKey(), entry.getValue());
- }
- }
- return requestBuilder;
- }
-
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
- String content = getContent(charset, httpResponse);
+ String content = getResponseContent(charset, httpResponse);
Page page = new Page();
page.setRawText(content);
page.setUrl(new PlainText(request.getUrl()));
page.setRequest(request);
page.setStatusCode(httpResponse.getStatusLine().getStatusCode());
+ if (responseHeader) {
+ page.setHeaders(HttpClientUtils.convertHeaders(httpResponse.getAllHeaders()));
+ }
return page;
}
- protected String getContent(String charset, HttpResponse httpResponse) throws IOException {
+ private String getResponseContent(String charset, HttpResponse httpResponse) throws IOException {
if (charset == null) {
byte[] contentBytes = IOUtils.toByteArray(httpResponse.getEntity().getContent());
String htmlCharset = getHtmlCharset(httpResponse, contentBytes);
@@ -256,7 +139,7 @@ public class HttpClientDownloader extends AbstractDownloader {
}
}
- protected String getHtmlCharset(HttpResponse httpResponse, byte[] contentBytes) throws IOException {
+ private String getHtmlCharset(HttpResponse httpResponse, byte[] contentBytes) throws IOException {
return CharsetUtils.detectCharset(httpResponse.getEntity().getContentType().getValue(), contentBytes);
}
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java
index aec5309..9e17f60 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java
@@ -1,13 +1,9 @@
package us.codecraft.webmagic.downloader;
-import org.apache.commons.lang3.StringUtils;
import org.apache.http.HttpException;
import org.apache.http.HttpRequest;
import org.apache.http.HttpRequestInterceptor;
-import org.apache.http.auth.AuthScope;
-import org.apache.http.auth.UsernamePasswordCredentials;
import org.apache.http.client.CookieStore;
-import org.apache.http.client.CredentialsProvider;
import org.apache.http.config.Registry;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.config.SocketConfig;
@@ -21,7 +17,6 @@ import org.apache.http.protocol.HttpContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Site;
-import us.codecraft.webmagic.proxy.Proxy;
import javax.net.ssl.SSLContext;
import javax.net.ssl.TrustManager;
@@ -92,38 +87,20 @@ public class HttpClientGenerator {
return this;
}
- public CloseableHttpClient getClient(Site site, Proxy proxy) {
- return generateClient(site, proxy);
+ public CloseableHttpClient getClient(Site site) {
+ return generateClient(site);
}
- private CloseableHttpClient generateClient(Site site, Proxy proxy) {
- CredentialsProvider credsProvider = null;
+ private CloseableHttpClient generateClient(Site site) {
HttpClientBuilder httpClientBuilder = HttpClients.custom();
- if (proxy != null && StringUtils.isNotBlank(proxy.getUser()) && StringUtils.isNotBlank(proxy.getPassword()))
- {
- credsProvider= new BasicCredentialsProvider();
- credsProvider.setCredentials(
- new AuthScope(proxy.getHttpHost().getAddress().getHostAddress(), proxy.getHttpHost().getPort()),
- new UsernamePasswordCredentials(proxy.getUser(), proxy.getPassword()));
- httpClientBuilder.setDefaultCredentialsProvider(credsProvider);
- }
-
- if (site != null && site.getHttpProxy()!= null && site.getUsernamePasswordCredentials() != null){
- credsProvider = new BasicCredentialsProvider();
- credsProvider.setCredentials(
- new AuthScope(site.getHttpProxy()),//可以访问的范围
- site.getUsernamePasswordCredentials());//用户名和密码
- httpClientBuilder.setDefaultCredentialsProvider(credsProvider);
- }
-
httpClientBuilder.setConnectionManager(connectionManager);
- if (site != null && site.getUserAgent() != null) {
+ if (site.getUserAgent() != null) {
httpClientBuilder.setUserAgent(site.getUserAgent());
} else {
httpClientBuilder.setUserAgent("");
}
- if (site == null || site.isUseGzip()) {
+ if (site.isUseGzip()) {
httpClientBuilder.addInterceptorFirst(new HttpRequestInterceptor() {
public void process(
@@ -140,16 +117,12 @@ public class HttpClientGenerator {
SocketConfig.Builder socketConfigBuilder = SocketConfig.custom();
socketConfigBuilder.setSoKeepAlive(true).setTcpNoDelay(true);
- if (site != null) {
- socketConfigBuilder.setSoTimeout(site.getTimeOut());
- }
+ socketConfigBuilder.setSoTimeout(site.getTimeOut());
SocketConfig socketConfig = socketConfigBuilder.build();
httpClientBuilder.setDefaultSocketConfig(socketConfig);
connectionManager.setDefaultSocketConfig(socketConfig);
- if (site != null) {
- httpClientBuilder.setRetryHandler(new DefaultHttpRequestRetryHandler(site.getRetryTimes(), true));
- generateCookie(httpClientBuilder, site);
- }
+ httpClientBuilder.setRetryHandler(new DefaultHttpRequestRetryHandler(site.getRetryTimes(), true));
+ generateCookie(httpClientBuilder, site);
return httpClientBuilder.build();
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientRequestContext.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientRequestContext.java
new file mode 100644
index 0000000..74e6d25
--- /dev/null
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientRequestContext.java
@@ -0,0 +1,34 @@
+package us.codecraft.webmagic.downloader;
+
+import org.apache.http.client.methods.HttpUriRequest;
+import org.apache.http.client.protocol.HttpClientContext;
+
+/**
+ * Holds the HttpUriRequest together with the HttpClientContext it is executed with.
+ *
+ * @author code4crafter@gmail.com (Date: 17/4/8 Time: 19:43)
+ * @since 0.7.0
+ */
+public class HttpClientRequestContext {
+ /** Request handed to the HttpClient for execution. */
+ private HttpUriRequest httpUriRequest;
+ /** Per-request HttpClient execution context passed alongside the request. */
+ private HttpClientContext httpClientContext;
+
+ public HttpUriRequest getHttpUriRequest() {
+ return httpUriRequest;
+ }
+
+ public void setHttpUriRequest(HttpUriRequest httpUriRequest) {
+ this.httpUriRequest = httpUriRequest;
+ }
+
+ public HttpClientContext getHttpClientContext() {
+ return httpClientContext;
+ }
+
+ public void setHttpClientContext(HttpClientContext httpClientContext) {
+ this.httpClientContext = httpClientContext;
+ }
+
+}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java
new file mode 100644
index 0000000..8ca0bf9
--- /dev/null
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java
@@ -0,0 +1,116 @@
+package us.codecraft.webmagic.downloader;
+
+import org.apache.http.HttpHost;
+import org.apache.http.auth.AuthState;
+import org.apache.http.auth.UsernamePasswordCredentials;
+import org.apache.http.client.CookieStore;
+import org.apache.http.client.config.CookieSpecs;
+import org.apache.http.client.config.RequestConfig;
+import org.apache.http.client.methods.HttpUriRequest;
+import org.apache.http.client.methods.RequestBuilder;
+import org.apache.http.client.protocol.HttpClientContext;
+import org.apache.http.entity.ByteArrayEntity;
+import org.apache.http.impl.auth.BasicScheme;
+import org.apache.http.impl.client.BasicCookieStore;
+import org.apache.http.impl.cookie.BasicClientCookie;
+import us.codecraft.webmagic.Request;
+import us.codecraft.webmagic.Site;
+import us.codecraft.webmagic.proxy.Proxy;
+import us.codecraft.webmagic.utils.HttpConstant;
+import us.codecraft.webmagic.utils.UrlUtils;
+
+import java.util.Map;
+
+/**
+ * Converts a WebMagic {@link Request}, the {@link Site} settings and an optional
+ * {@link Proxy} into the Apache HttpClient request/context pair used to execute it.
+ *
+ * @author code4crafter@gmail.com
+ * Date: 17/3/18
+ * Time: 11:28
+ *
+ * @since 0.7.0
+ */
+public class HttpUriRequestConverter {
+
+    /**
+     * Builds both the HttpClient request and its execution context.
+     *
+     * @param request request to convert; must not be {@code null}
+     * @param site    source of default headers and timeouts; may be {@code null}
+     * @param proxy   proxy to route through, or {@code null} for a direct connection
+     * @return a holder with the request and the context populated
+     * @throws IllegalArgumentException if the request's HTTP method is not supported
+     */
+    public HttpClientRequestContext convert(Request request, Site site, Proxy proxy) {
+        HttpClientRequestContext httpClientRequestContext = new HttpClientRequestContext();
+        httpClientRequestContext.setHttpUriRequest(convertHttpUriRequest(request, site, proxy));
+        httpClientRequestContext.setHttpClientContext(convertHttpClientContext(request, site, proxy));
+        return httpClientRequestContext;
+    }
+
+    /**
+     * Creates the execution context: proxy credentials (when the proxy has any) and
+     * the per-request cookies.
+     */
+    private HttpClientContext convertHttpClientContext(Request request, Site site, Proxy proxy) {
+        HttpClientContext httpContext = new HttpClientContext();
+        // Anonymous proxies (no user name) must not get an auth state:
+        // UsernamePasswordCredentials rejects a null user name.
+        if (proxy != null && proxy.getUsername() != null) {
+            AuthState authState = new AuthState();
+            // The scheme has to be marked as a PROXY challenge, otherwise BasicScheme emits an
+            // "Authorization" header instead of "Proxy-Authorization".
+            // NOTE(review): this constructor is deprecated since HttpClient 4.3 but still available in 4.5.
+            authState.update(new BasicScheme(org.apache.http.auth.ChallengeState.PROXY),
+                    new UsernamePasswordCredentials(proxy.getUsername(), proxy.getPassword()));
+            httpContext.setAttribute(HttpClientContext.PROXY_AUTH_STATE, authState);
+        }
+        if (request.getCookies() != null && !request.getCookies().isEmpty()) {
+            CookieStore cookieStore = new BasicCookieStore();
+            // Every cookie of this request is bound to the request's own domain (port removed).
+            String cookieDomain = UrlUtils.removePort(UrlUtils.getDomain(request.getUrl()));
+            for (Map.Entry<String, String> cookieEntry : request.getCookies().entrySet()) {
+                BasicClientCookie cookie1 = new BasicClientCookie(cookieEntry.getKey(), cookieEntry.getValue());
+                cookie1.setDomain(cookieDomain);
+                cookieStore.addCookie(cookie1);
+            }
+            httpContext.setCookieStore(cookieStore);
+        }
+        return httpContext;
+    }
+
+    /**
+     * Creates the HttpClient request: method and URI from the request, default headers
+     * and timeouts from the site, proxy host from the proxy, then request-level headers.
+     */
+    private HttpUriRequest convertHttpUriRequest(Request request, Site site, Proxy proxy) {
+        RequestBuilder requestBuilder = selectRequestMethod(request).setUri(request.getUrl());
+        RequestConfig.Builder requestConfigBuilder = RequestConfig.custom();
+        // All site-derived settings sit behind one null check; previously the headers were
+        // read from site before the check that guarded the timeouts.
+        if (site != null) {
+            if (site.getHeaders() != null) {
+                for (Map.Entry<String, String> headerEntry : site.getHeaders().entrySet()) {
+                    requestBuilder.addHeader(headerEntry.getKey(), headerEntry.getValue());
+                }
+            }
+            requestConfigBuilder.setConnectionRequestTimeout(site.getTimeOut())
+                    .setSocketTimeout(site.getTimeOut())
+                    .setConnectTimeout(site.getTimeOut())
+                    .setCookieSpec(CookieSpecs.STANDARD);
+        }
+
+        if (proxy != null) {
+            requestConfigBuilder.setProxy(new HttpHost(proxy.getHost(), proxy.getPort()));
+        }
+        requestBuilder.setConfig(requestConfigBuilder.build());
+        HttpUriRequest httpUriRequest = requestBuilder.build();
+        // Request-level headers are added after the site-level ones.
+        if (request.getHeaders() != null && !request.getHeaders().isEmpty()) {
+            for (Map.Entry<String, String> header : request.getHeaders().entrySet()) {
+                httpUriRequest.addHeader(header.getKey(), header.getValue());
+            }
+        }
+        return httpUriRequest;
+    }
+
+    /**
+     * Picks the {@link RequestBuilder} matching the request's HTTP method
+     * (case-insensitive; a {@code null} method means GET). POST and PUT also get the
+     * request body attached.
+     *
+     * @throws IllegalArgumentException for any method other than
+     *                                  GET, POST, HEAD, PUT, DELETE or TRACE
+     */
+    private RequestBuilder selectRequestMethod(Request request) {
+        String method = request.getMethod();
+        if (method == null || method.equalsIgnoreCase(HttpConstant.Method.GET)) {
+            //default get
+            return RequestBuilder.get();
+        } else if (method.equalsIgnoreCase(HttpConstant.Method.POST)) {
+            return addFormParams(RequestBuilder.post(), request);
+        } else if (method.equalsIgnoreCase(HttpConstant.Method.HEAD)) {
+            return RequestBuilder.head();
+        } else if (method.equalsIgnoreCase(HttpConstant.Method.PUT)) {
+            return addFormParams(RequestBuilder.put(), request);
+        } else if (method.equalsIgnoreCase(HttpConstant.Method.DELETE)) {
+            return RequestBuilder.delete();
+        } else if (method.equalsIgnoreCase(HttpConstant.Method.TRACE)) {
+            return RequestBuilder.trace();
+        }
+        throw new IllegalArgumentException("Illegal HTTP Method " + method);
+    }
+
+    /**
+     * Attaches the request body, if the request has one, as a byte-array entity with
+     * the body's content type. Despite the name, the body is not limited to form data.
+     */
+    private RequestBuilder addFormParams(RequestBuilder requestBuilder, Request request) {
+        if (request.getRequestBody() != null) {
+            ByteArrayEntity entity = new ByteArrayEntity(request.getRequestBody().getBody());
+            entity.setContentType(request.getRequestBody().getContentType());
+            requestBuilder.setEntity(entity);
+        }
+        return requestBuilder;
+    }
+
+}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/model/HttpRequestBody.java b/webmagic-core/src/main/java/us/codecraft/webmagic/model/HttpRequestBody.java
new file mode 100644
index 0000000..fc318ea
--- /dev/null
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/model/HttpRequestBody.java
@@ -0,0 +1,72 @@
+package us.codecraft.webmagic.model;
+
+import org.apache.http.NameValuePair;
+import org.apache.http.client.utils.URLEncodedUtils;
+import org.apache.http.message.BasicNameValuePair;
+
+import java.io.UnsupportedEncodingException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Body of an HTTP request: the raw bytes, their content type and the name of the
+ * charset they were encoded with. All fields are final; the byte array is stored
+ * and returned without copying.
+ *
+ * @author code4crafter@gmail.com
+ * Date: 17/4/8
+ */
+public class HttpRequestBody {
+
+    /** Content-type values used by the factory methods of {@link HttpRequestBody}. */
+    public static abstract class ContentType {
+
+        public static final String JSON = "application/json";
+
+        public static final String XML = "text/xml";
+
+        public static final String FORM = "application/x-www-form-urlencoded";
+
+        public static final String MULTIPART = "multipart/form-data";
+    }
+
+    private final byte[] body;
+
+    private final String contentType;
+
+    private final String encoding;
+
+    /**
+     * @param body        already-encoded body bytes (kept by reference)
+     * @param contentType content type of the body
+     * @param encoding    name of the charset the bytes were encoded with
+     */
+    public HttpRequestBody(byte[] body, String contentType, String encoding) {
+        this.body = body;
+        this.contentType = contentType;
+        this.encoding = encoding;
+    }
+
+    /**
+     * @return the content type given at construction
+     */
+    public String getContentType() {
+        return contentType;
+    }
+
+    /**
+     * @return the charset name given at construction
+     */
+    public String getEncoding() {
+        return encoding;
+    }
+
+    /**
+     * Creates a body of type {@link ContentType#JSON}.
+     *
+     * @param json     JSON text
+     * @param encoding charset name used to encode {@code json}
+     * @throws UnsupportedEncodingException if {@code encoding} is not a supported charset
+     */
+    public static HttpRequestBody json(String json, String encoding) throws UnsupportedEncodingException {
+        return new HttpRequestBody(json.getBytes(encoding), ContentType.JSON, encoding);
+    }
+
+    /**
+     * Creates a body of type {@link ContentType#XML}.
+     *
+     * @param xml      XML text
+     * @param encoding charset name used to encode {@code xml}
+     * @throws UnsupportedEncodingException if {@code encoding} is not a supported charset
+     */
+    public static HttpRequestBody xml(String xml, String encoding) throws UnsupportedEncodingException {
+        return new HttpRequestBody(xml.getBytes(encoding), ContentType.XML, encoding);
+    }
+
+    /**
+     * Wraps pre-encoded bytes with a caller-chosen content type. The checked exception
+     * is never thrown here; it stays in the signature so existing callers keep compiling.
+     */
+    public static HttpRequestBody custom(byte[] body, String contentType, String encoding) throws UnsupportedEncodingException {
+        return new HttpRequestBody(body, contentType, encoding);
+    }
+
+    /**
+     * Creates a URL-encoded form body of type {@link ContentType#FORM}.
+     *
+     * @param params   form fields; every value is converted with {@link String#valueOf(Object)},
+     *                 so a {@code null} value is sent as the text {@code "null"}
+     * @param encoding charset name used for URL-encoding and for the resulting bytes
+     * @throws UnsupportedEncodingException if {@code encoding} is not a supported charset
+     */
+    public static HttpRequestBody form(Map<String, ?> params, String encoding) throws UnsupportedEncodingException {
+        // Explicit type arguments: the raw Map/List form cannot be iterated as Map.Entry
+        // nor passed to BasicNameValuePair(String, String).
+        List<NameValuePair> nameValuePairs = new ArrayList<NameValuePair>(params.size());
+        for (Map.Entry<String, ?> entry : params.entrySet()) {
+            nameValuePairs.add(new BasicNameValuePair(entry.getKey(), String.valueOf(entry.getValue())));
+        }
+        return new HttpRequestBody(URLEncodedUtils.format(nameValuePairs, encoding).getBytes(encoding), ContentType.FORM, encoding);
+    }
+
+    /**
+     * @return the body bytes (the internal array, not a copy)
+     */
+    public byte[] getBody() {
+        return body;
+    }
+}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java
index a0572a9..842429b 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java
@@ -2,7 +2,6 @@ package us.codecraft.webmagic.processor;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
-import us.codecraft.webmagic.utils.UrlUtils;
import java.util.List;
@@ -18,9 +17,8 @@ public class SimplePageProcessor implements PageProcessor {
private Site site;
- public SimplePageProcessor(String startUrl, String urlPattern) {
- this.site = Site.me().addStartUrl(startUrl).
- setDomain(UrlUtils.getDomain(startUrl));
+ public SimplePageProcessor(String urlPattern) {
+ this.site = Site.me();
//compile "*" expression to regex
this.urlPattern = "(" + urlPattern.replace(".", "\\.").replace("*", "[^\"'#]*") + ")";
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java
index dbe3a18..a38ccaa 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java
@@ -1,199 +1,41 @@
package us.codecraft.webmagic.proxy;
-import org.apache.http.HttpHost;
-
-import java.io.Serializable;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.concurrent.Delayed;
-import java.util.concurrent.TimeUnit;
-
-/**
- * >>>> Proxy lifecycle
-
- +----------+ +-----+
- | last use | | new |
- +-----+----+ +---+-+
- | +------+ |
- +->| init |<--+
- +--+---+
- |
- v
- +--------+
- +--->| borrow |
- | +---+----+
- | |+------------------+
- | v
- | +--------+
- | | in use | Respone Time
- | +---+----+
- | |+------------------+
- | v
- | +--------+
- | | return |
- | +---+----+
- | |+-------------------+
- | v
- | +-------+ reuse interval
- | | delay | (delay time)
- | +---+---+
- | |+-------------------+
- | v
- | +------+
- | | idle | idle time
- | +---+--+
- | |+-------------------+
- +--------+
- */
-
/**
- * Object has these status of lifecycle above.
*
- * @author yxssfxwzy@sina.com
- * @since 0.5.1
- * @see SimpleProxyPool
*/
-public class Proxy implements Delayed, Serializable {
+public class Proxy {
- private static final long serialVersionUID = 228939737383625551L;
- public static final int ERROR_403 = 403;
- public static final int ERROR_404 = 404;
- public static final int ERROR_BANNED = 10000;// banned by website
- public static final int ERROR_Proxy = 10001;// the proxy itself failed
- public static final int SUCCESS = 200;
-
- private final HttpHost httpHost;
- private String user;
+ private String host;
+ private int port;
+ private String username;
private String password;
-
- private int reuseTimeInterval = 1500;// ms
- private Long canReuseTime = 0L;
- private Long lastBorrowTime = System.currentTimeMillis();
- private Long responseTime = 0L;
+ public Proxy(String host, int port) {
+ this.host = host;
+ this.port = port;
+ }
- private int failedNum = 0;
- private int successNum = 0;
- private int borrowNum = 0;
-
- private List failedErrorType = new ArrayList();
-
- public Proxy(HttpHost httpHost, String user, String password) {
- this.httpHost = httpHost;
- this.user = user;
+ public Proxy(String host, int port, String username, String password) {
+ this.host = host;
+ this.port = port;
+ this.username = username;
this.password = password;
- this.canReuseTime = System.nanoTime() + TimeUnit.NANOSECONDS.convert(reuseTimeInterval, TimeUnit.MILLISECONDS);
}
- public Proxy(HttpHost httpHost, int reuseInterval, String user, String password) {
- this.httpHost = httpHost;
- this.user = user;
- this.password = password;
- this.canReuseTime = System.nanoTime() + TimeUnit.NANOSECONDS.convert(reuseInterval, TimeUnit.MILLISECONDS);
+ public String getHost() {
+ return host;
}
- public int getSuccessNum() {
- return successNum;
+ public int getPort() {
+ return port;
}
- public void successNumIncrement(int increment) {
- this.successNum += increment;
+ public String getUsername() {
+ return username;
}
- public Long getLastUseTime() {
- return lastBorrowTime;
- }
-
- public void setLastBorrowTime(Long lastBorrowTime) {
- this.lastBorrowTime = lastBorrowTime;
- }
-
- public void recordResponse() {
- this.responseTime = (System.currentTimeMillis() - lastBorrowTime + responseTime) / 2;
- this.lastBorrowTime = System.currentTimeMillis();
- }
-
- public List getFailedErrorType() {
- return failedErrorType;
- }
-
- public void setFailedErrorType(List failedErrorType) {
- this.failedErrorType = failedErrorType;
- }
-
- public void fail(int failedErrorType) {
- this.failedNum++;
- this.failedErrorType.add(failedErrorType);
- }
-
- public void setFailedNum(int failedNum) {
- this.failedNum = failedNum;
- }
-
- public int getFailedNum() {
- return failedNum;
- }
-
- public String getFailedType() {
- String re = "";
- for (Integer i : this.failedErrorType) {
- re += i + " . ";
- }
- return re;
- }
-
- public HttpHost getHttpHost() {
- return httpHost;
- }
-
- public int getReuseTimeInterval() {
- return reuseTimeInterval;
- }
-
- public void setReuseTimeInterval(int reuseTimeInterval) {
- this.reuseTimeInterval = reuseTimeInterval;
- this.canReuseTime = System.nanoTime() + TimeUnit.NANOSECONDS.convert(reuseTimeInterval, TimeUnit.MILLISECONDS);
-
- }
-
- @Override
- public long getDelay(TimeUnit unit) {
- return unit.convert(canReuseTime - System.nanoTime(), TimeUnit.NANOSECONDS);
- }
-
- @Override
- public int compareTo(Delayed o) {
- Proxy that = (Proxy) o;
- return canReuseTime > that.canReuseTime ? 1 : (canReuseTime < that.canReuseTime ? -1 : 0);
-
- }
-
- @Override
- public String toString() {
-
- String re = String.format("host: %15s >> %5dms >> success: %-3.2f%% >> borrow: %d", httpHost.getAddress().getHostAddress(), responseTime,
- successNum * 100.0 / borrowNum, borrowNum);
- return re;
-
- }
-
- public String getUser()
- {
- return user;
-
- }
- public String getPassword()
- {
+ public String getPassword() {
return password;
-
- }
-
- public void borrowNumIncrement(int increment) {
- this.borrowNum += increment;
- }
-
- public int getBorrowNum() {
- return borrowNum;
}
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyPool.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyPool.java
deleted file mode 100644
index 40b1913..0000000
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyPool.java
+++ /dev/null
@@ -1,12 +0,0 @@
-package us.codecraft.webmagic.proxy;
-
-import org.apache.http.HttpHost;
-
-/**
- * Created by edwardsbean on 15-2-28.
- */
-public interface ProxyPool {
- public void returnProxy(HttpHost host, int statusCode);
- public Proxy getProxy();
- public boolean isEnable();
-}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java
new file mode 100644
index 0000000..4266d78
--- /dev/null
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java
@@ -0,0 +1,14 @@
+package us.codecraft.webmagic.proxy;
+
+import us.codecraft.webmagic.Task;
+
+/**
+ * Source of {@link Proxy} instances for a {@link Task}: a proxy is obtained with
+ * {@link #getProxy(Task)} and handed back with {@link #returnProxy(Proxy, boolean, Task)}.
+ *
+ * Created by edwardsbean on 15-2-28.
+ */
+public interface ProxyProvider {
+
+ /**
+ * Hands a proxy back to the provider.
+ *
+ * @param proxy  the proxy being returned
+ * @param banned whether the proxy was found to be banned
+ *               (presumably decided by a ResponseChecker; verify against the caller)
+ * @param task   the task the proxy was used for
+ */
+ void returnProxy(Proxy proxy, boolean banned, Task task);
+
+ /**
+ * Supplies a proxy for the given task.
+ *
+ * @param task the task that needs a proxy
+ * @return the proxy to use; NOTE(review): implementations in this change may
+ *         return {@code null}, so callers should be prepared for it
+ */
+ Proxy getProxy(Task task);
+
+}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ResponseChecker.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ResponseChecker.java
new file mode 100644
index 0000000..3e68c11
--- /dev/null
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ResponseChecker.java
@@ -0,0 +1,13 @@
+package us.codecraft.webmagic.proxy;
+
+import org.apache.http.HttpResponse;
+
+/**
+ * Strategy that inspects an HTTP response and reports whether it indicates a ban.
+ *
+ * @author code4crafter@gmail.com
+ * Date: 17/3/20
+ * Time: 10:52 PM
+ */
+public interface ResponseChecker {
+
+ /**
+ * @param httpResponse the response to inspect
+ * @return {@code true} if the response indicates a ban
+ */
+ boolean isBanned(HttpResponse httpResponse);
+}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyPool.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyPool.java
deleted file mode 100644
index f7cd049..0000000
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyPool.java
+++ /dev/null
@@ -1,310 +0,0 @@
-package us.codecraft.webmagic.proxy;
-
-import org.apache.http.HttpHost;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import us.codecraft.webmagic.utils.FilePersistentBase;
-import us.codecraft.webmagic.utils.ProxyUtils;
-
-import java.io.*;
-import java.net.InetAddress;
-import java.net.UnknownHostException;
-import java.util.*;
-import java.util.Map.Entry;
-import java.util.concurrent.BlockingQueue;
-import java.util.concurrent.ConcurrentHashMap;
-import java.util.concurrent.DelayQueue;
-
-/**
- * Pooled Proxy Object
- *
- * @author yxssfxwzy@sina.com
- * @see Proxy
- * @since 0.5.1
- */
-public class SimpleProxyPool implements ProxyPool {
-
- private Logger logger = LoggerFactory.getLogger(getClass());
-
- private BlockingQueue proxyQueue = new DelayQueue();
- private Map allProxy = new ConcurrentHashMap();
-
- private int reuseInterval = 1500;// ms
- private int reviveTime = 2 * 60 * 60 * 1000;// ms
- private int saveProxyInterval = 10 * 60 * 1000;// ms
-
- private boolean isEnable = false;
- private boolean validateWhenInit = false;
- // private boolean isUseLastProxy = true;
- private String proxyFilePath = "/data/webmagic/lastUse.proxy";
-
- private FilePersistentBase fBase = new FilePersistentBase();
-
- private Timer timer = new Timer(true);
- private TimerTask saveProxyTask = new TimerTask() {
-
- @Override
- public void run() {
- saveProxyList();
- logger.info(allProxyStatus());
- }
- };
-
- public SimpleProxyPool() {
- this(null, true);
- }
-
- public SimpleProxyPool(List httpProxyList) {
- this(httpProxyList, true);
- }
-
- public SimpleProxyPool(List httpProxyList, boolean isUseLastProxy) {
- if (httpProxyList != null) {
- addProxy(httpProxyList.toArray(new String[httpProxyList.size()][]));
- }
- if (isUseLastProxy) {
- if (!new File(proxyFilePath).exists()) {
- setFilePath();
- }
- readProxyList();
- timer.schedule(saveProxyTask, 0, saveProxyInterval);
- }
- }
-
- private void setFilePath() {
- String tmpDir = System.getProperty("java.io.tmpdir");
- String path = tmpDir + FilePersistentBase.PATH_SEPERATOR + "webmagic" + FilePersistentBase.PATH_SEPERATOR + "lastUse.proxy";
- if (tmpDir != null && new File(tmpDir).isDirectory()) {
- fBase.setPath(tmpDir + FilePersistentBase.PATH_SEPERATOR + "webmagic");
- File f = fBase.getFile(path);
- if (!f.exists()) {
- try {
- f.createNewFile();
-
- } catch (IOException e) {
- logger.error("proxy file create error", e);
- }
- }
-
- } else {
- logger.error("java tmp dir not exists");
- }
- this.proxyFilePath = path;
- }
-
- private void saveProxyList() {
- if (allProxy.size() == 0) {
- return;
- }
- try {
- ObjectOutputStream os = new ObjectOutputStream(new FileOutputStream(fBase.getFile(proxyFilePath)));
- os.writeObject(prepareForSaving());
- os.close();
- logger.info("save proxy");
- } catch (FileNotFoundException e) {
- logger.error("proxy file not found", e);
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
-
- private Map prepareForSaving() {
- Map tmp = new HashMap();
- for (Entry e : allProxy.entrySet()) {
- Proxy p = e.getValue();
- p.setFailedNum(0);
- tmp.put(e.getKey(), p);
- }
- return tmp;
- }
-
- private void readProxyList() {
- try {
- ObjectInputStream is = new ObjectInputStream(new FileInputStream(fBase.getFile(proxyFilePath)));
- addProxy((Map) is.readObject());
- is.close();
- } catch (FileNotFoundException e) {
- logger.info("last use proxy file not found", e);
- } catch (IOException e) {
- // e.printStackTrace();
- } catch (ClassNotFoundException e) {
- // e.printStackTrace();
- }
- }
-
- private void addProxy(Map httpProxyMap) {
- isEnable = true;
- for (Entry entry : httpProxyMap.entrySet()) {
- try {
- if (allProxy.containsKey(entry.getKey())) {
- continue;
- }
- if (!validateWhenInit || ProxyUtils.validateProxy(entry.getValue().getHttpHost())) {
- entry.getValue().setFailedNum(0);
- entry.getValue().setReuseTimeInterval(reuseInterval);
- proxyQueue.add(entry.getValue());
- allProxy.put(entry.getKey(), entry.getValue());
- }
- } catch (NumberFormatException e) {
- logger.error("HttpHost init error:", e);
- }
- }
- logger.info("proxy pool size>>>>" + allProxy.size());
- }
-
- public void addProxy(String[]... httpProxyList) {
- isEnable = true;
- for (String[] s : httpProxyList) {
- try {
- if (allProxy.containsKey(s[2])) {
- continue;
- }
- HttpHost item = new HttpHost(InetAddress.getByName(s[2]), Integer.valueOf(s[3]));
- if (!validateWhenInit || ProxyUtils.validateProxy(item)) {
- Proxy p = new Proxy(item, reuseInterval, s[0], s[1]);
- proxyQueue.add(p);
- allProxy.put(s[2], p);
- }
- } catch (NumberFormatException e) {
- logger.error("HttpHost init error:", e);
- } catch (UnknownHostException e) {
- logger.error("HttpHost init error:", e);
- }
- }
- logger.info("proxy pool size>>>>" + allProxy.size());
- }
-
- public Proxy getProxy() {
- Proxy proxy = null;
- try {
- Long time = System.currentTimeMillis();
- proxy = proxyQueue.take();
- double costTime = (System.currentTimeMillis() - time) / 1000.0;
- if (costTime > reuseInterval) {
- logger.info("get proxy time >>>> " + costTime);
- }
- Proxy p = allProxy.get(proxy.getHttpHost().getAddress().getHostAddress());
- p.setLastBorrowTime(System.currentTimeMillis());
- p.borrowNumIncrement(1);
- } catch (InterruptedException e) {
- logger.error("get proxy error", e);
- }
- if (proxy == null) {
- throw new NoSuchElementException();
- }
- return proxy;
- }
-
- public void returnProxy(HttpHost host, int statusCode) {
- Proxy p = allProxy.get(host.getAddress().getHostAddress());
- if (p == null) {
- return;
- }
- switch (statusCode) {
- case Proxy.SUCCESS:
- p.setReuseTimeInterval(reuseInterval);
- p.setFailedNum(0);
- p.setFailedErrorType(new ArrayList());
- p.recordResponse();
- p.successNumIncrement(1);
- break;
- case Proxy.ERROR_403:
- // banned,try longer interval
- p.fail(Proxy.ERROR_403);
- p.setReuseTimeInterval(reuseInterval * p.getFailedNum());
- logger.info(host + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0);
- break;
- case Proxy.ERROR_BANNED:
- p.fail(Proxy.ERROR_BANNED);
- p.setReuseTimeInterval(10 * 60 * 1000 * p.getFailedNum());
- logger.warn("this proxy is banned >>>> " + p.getHttpHost());
- logger.info(host + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0);
- break;
- case Proxy.ERROR_404:
- // p.fail(Proxy.ERROR_404);
- // p.setReuseTimeInterval(reuseInterval * p.getFailedNum());
- break;
- default:
- p.fail(statusCode);
- break;
- }
- if (p.getFailedNum() > 20) {
- p.setReuseTimeInterval(reviveTime);
- logger.error("remove proxy >>>> " + host + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size());
- return;
- }
- if (p.getFailedNum() > 0 && p.getFailedNum() % 5 == 0) {
- if (!ProxyUtils.validateProxy(host)) {
- p.setReuseTimeInterval(reviveTime);
- logger.error("remove proxy >>>> " + host + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size());
- return;
- }
- }
- try {
- proxyQueue.put(p);
- } catch (InterruptedException e) {
- logger.warn("proxyQueue return proxy error", e);
- }
- }
-
- public String allProxyStatus() {
- String re = "all proxy info >>>> \n";
- for (Entry entry : allProxy.entrySet()) {
- re += entry.getValue().toString() + "\n";
- }
- return re;
- }
-
- public int getIdleNum() {
- return proxyQueue.size();
- }
-
- public int getReuseInterval() {
- return reuseInterval;
- }
-
- public void setReuseInterval(int reuseInterval) {
- this.reuseInterval = reuseInterval;
- }
-
- public void enable(boolean isEnable) {
- this.isEnable = isEnable;
- }
-
- public boolean isEnable() {
- return isEnable;
- }
-
- public int getReviveTime() {
- return reviveTime;
- }
-
- public void setReviveTime(int reviveTime) {
- this.reviveTime = reviveTime;
- }
-
- public boolean isValidateWhenInit() {
- return validateWhenInit;
- }
-
- public void validateWhenInit(boolean validateWhenInit) {
- this.validateWhenInit = validateWhenInit;
- }
-
- public int getSaveProxyInterval() {
- return saveProxyInterval;
- }
-
- public void setSaveProxyInterval(int saveProxyInterval) {
- this.saveProxyInterval = saveProxyInterval;
- }
-
- public String getProxyFilePath() {
- return proxyFilePath;
- }
-
- public void setProxyFilePath(String proxyFilePath) {
- this.proxyFilePath = proxyFilePath;
- }
-
-}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/TimerReuseProxy.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/TimerReuseProxy.java
new file mode 100644
index 0000000..7002df4
--- /dev/null
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/TimerReuseProxy.java
@@ -0,0 +1,159 @@
+package us.codecraft.webmagic.proxy;
+
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.concurrent.Delayed;
+import java.util.concurrent.TimeUnit;
+
+/**
+ * >>>> Proxy lifecycle
+
+ +----------+ +-----+
+ | last use | | new |
+ +-----+----+ +---+-+
+ | +------+ |
+ +->| init |<--+
+ +--+---+
+ |
+ v
+ +--------+
+ +--->| borrow |
+ | +---+----+
+ | |+------------------+
+ | v
+ | +--------+
+ | | in use | Response Time
+ | +---+----+
+ | |+------------------+
+ | v
+ | +--------+
+ | | return |
+ | +---+----+
+ | |+-------------------+
+ | v
+ | +-------+ reuse interval
+ | | delay | (delay time)
+ | +---+---+
+ | |+-------------------+
+ | v
+ | +------+
+ | | idle | idle time
+ | +---+--+
+ | |+-------------------+
+ +--------+
+ */
+
+/**
+ * A {@link Proxy} that follows the lifecycle sketched above: it records borrow,
+ * success and failure statistics and exposes, through {@link Delayed}, how long it
+ * must still wait before it may be reused.
+ *
+ * NOTE(review): the class is declared {@link Serializable}, but the {@link Proxy}
+ * superclass in this change is not serializable and declares no no-arg constructor,
+ * so Java deserialization of instances is expected to fail. Confirm whether
+ * serialization is still needed.
+ *
+ * @author yxssfxwzy@sina.com
+ * @since 0.5.1
+ * @see TimerReuseProxyPool
+ */
+
+public class TimerReuseProxy extends Proxy implements Delayed, Serializable {
+
+    private static final long serialVersionUID = 228939737383625551L;
+    public static final int ERROR_403 = 403;
+    public static final int ERROR_404 = 404;
+    public static final int ERROR_BANNED = 10000;// banned by website
+    public static final int ERROR_Proxy = 10001;// the proxy itself failed
+    public static final int SUCCESS = 200;
+
+    private int reuseTimeInterval = 1500;// ms
+    /** Earliest reuse instant, on the {@link System#nanoTime()} time line. */
+    private Long canReuseTime = 0L;
+    /** Wall-clock millis of the last borrow (or of the last recorded response). */
+    private Long lastBorrowTime = System.currentTimeMillis();
+    /** Smoothed response time in ms, see {@link #recordResponse()}. */
+    private Long responseTime = 0L;
+
+    private int failedNum = 0;
+    private int successNum = 0;
+    private int borrowNum = 0;
+
+    private List<Integer> failedErrorType = new ArrayList<Integer>();
+
+    /**
+     * Creates a proxy that becomes reusable after the default reuse interval.
+     *
+     * @param host     proxy host
+     * @param port     proxy port
+     * @param username proxy user name, may be {@code null}
+     * @param password proxy password, may be {@code null}
+     */
+    public TimerReuseProxy(String host, int port, String username, String password) {
+        super(host, port, username, password);
+        // System.nanoTime() has an arbitrary (possibly negative) origin, so the initial 0L is
+        // not a usable deadline; start the reuse timer as the pre-0.7.0 Proxy constructor did.
+        this.canReuseTime = System.nanoTime() + TimeUnit.NANOSECONDS.convert(reuseTimeInterval, TimeUnit.MILLISECONDS);
+    }
+
+
+    public int getSuccessNum() {
+        return successNum;
+    }
+
+    public void successNumIncrement(int increment) {
+        this.successNum += increment;
+    }
+
+    /**
+     * @return the last borrow time in wall-clock millis (same value {@link #setLastBorrowTime} sets)
+     */
+    public Long getLastUseTime() {
+        return lastBorrowTime;
+    }
+
+    public void setLastBorrowTime(Long lastBorrowTime) {
+        this.lastBorrowTime = lastBorrowTime;
+    }
+
+    /**
+     * Folds the time elapsed since the last borrow into the response time (average of
+     * the new sample and the previous value) and resets the last-borrow timestamp to now.
+     */
+    public void recordResponse() {
+        this.responseTime = (System.currentTimeMillis() - lastBorrowTime + responseTime) / 2;
+        this.lastBorrowTime = System.currentTimeMillis();
+    }
+
+    public List<Integer> getFailedErrorType() {
+        return failedErrorType;
+    }
+
+    public void setFailedErrorType(List<Integer> failedErrorType) {
+        this.failedErrorType = failedErrorType;
+    }
+
+    /**
+     * Registers one failure and remembers its error code.
+     *
+     * @param failedErrorType error code of the failure (one of the ERROR_* constants or a status code)
+     */
+    public void fail(int failedErrorType) {
+        this.failedNum++;
+        this.failedErrorType.add(failedErrorType);
+    }
+
+    public void setFailedNum(int failedNum) {
+        this.failedNum = failedNum;
+    }
+
+    public int getFailedNum() {
+        return failedNum;
+    }
+
+    /**
+     * @return all recorded error codes, each followed by {@code " . "}; empty if none
+     */
+    public String getFailedType() {
+        StringBuilder re = new StringBuilder();
+        for (Integer i : this.failedErrorType) {
+            re.append(i).append(" . ");
+        }
+        return re.toString();
+    }
+
+    public int getReuseTimeInterval() {
+        return reuseTimeInterval;
+    }
+
+    /**
+     * Sets the reuse interval and restarts the reuse timer from now.
+     *
+     * @param reuseTimeInterval interval in milliseconds
+     */
+    public void setReuseTimeInterval(int reuseTimeInterval) {
+        this.reuseTimeInterval = reuseTimeInterval;
+        this.canReuseTime = System.nanoTime() + TimeUnit.NANOSECONDS.convert(reuseTimeInterval, TimeUnit.MILLISECONDS);
+
+    }
+
+    /**
+     * @return the time left until this proxy may be reused, in {@code unit};
+     *         zero or negative once the interval has elapsed
+     */
+    @Override
+    public long getDelay(TimeUnit unit) {
+        return unit.convert(canReuseTime - System.nanoTime(), TimeUnit.NANOSECONDS);
+    }
+
+    /**
+     * Orders by reuse deadline, earliest first. Other {@link Delayed} implementations
+     * are compared by remaining delay instead of failing with a ClassCastException.
+     */
+    @Override
+    public int compareTo(Delayed o) {
+        long diff;
+        if (o instanceof TimerReuseProxy) {
+            // nanoTime stamps must be compared by difference, not with < or >
+            diff = canReuseTime - ((TimerReuseProxy) o).canReuseTime;
+        } else {
+            diff = getDelay(TimeUnit.NANOSECONDS) - o.getDelay(TimeUnit.NANOSECONDS);
+        }
+        return diff > 0 ? 1 : (diff < 0 ? -1 : 0);
+
+    }
+
+    public void borrowNumIncrement(int increment) {
+        this.borrowNum += increment;
+    }
+
+    public int getBorrowNum() {
+        return borrowNum;
+    }
+}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/TimerReuseProxyPool.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/TimerReuseProxyPool.java
new file mode 100644
index 0000000..6dbac5d
--- /dev/null
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/TimerReuseProxyPool.java
@@ -0,0 +1,204 @@
+package us.codecraft.webmagic.proxy;
+
+import us.codecraft.webmagic.Task;
+
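+/**
+ * Pool of reusable proxies. NOTE: currently a stub; the previous
+ * implementation is kept commented out in the class body.
+ *
+ * @author yxssfxwzy@sina.com
+ * @see Proxy
+ * @since 0.5.1
+ */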
+public class TimerReuseProxyPool implements ProxyProvider {
+ /**
+ * Currently a no-op: all three arguments are ignored.
+ * The earlier pool logic is kept commented out further down in this class.
+ */
+ @Override
+ public void returnProxy(Proxy proxy, boolean banned, Task task) {
+
+ }
+
+ /**
+ * Currently a stub: ignores {@code task} and always returns {@code null}.
+ * The earlier pool logic is kept commented out further down in this class.
+ */
+ @Override
+ public Proxy getProxy(Task task) {
+ return null;
+ }
+
+// private Logger logger = LoggerFactory.getLogger(getClass());
+//
+// private BlockingQueue proxyQueue = new DelayQueue();
+// private Map allProxy = new ConcurrentHashMap();
+//
+// private int reuseInterval = 1500;// ms
+// private int reviveTime = 2 * 60 * 60 * 1000;// ms
+// private int saveProxyInterval = 10 * 60 * 1000;// ms
+//
+// private boolean isEnable = false;
+// private boolean validateWhenInit = false;
+// // private boolean isUseLastProxy = true;
+//
+// public TimerReuseProxyPool(List httpProxyList) {
+// this(httpProxyList, true);
+// }
+//
+// private void addProxy(Map httpProxyMap) {
+// isEnable = true;
+// for (Entry entry : httpProxyMap.entrySet()) {
+// try {
+// if (allProxy.containsKey(entry.getKey())) {
+// continue;
+// }
+// if (!validateWhenInit || ProxyUtils.validateProxy(entry.getValue().getHttpHost())) {
+// entry.getValue().setFailedNum(0);
+// entry.getValue().setReuseTimeInterval(reuseInterval);
+// proxyQueue.add(entry.getValue());
+// allProxy.put(entry.getKey(), entry.getValue());
+// }
+// } catch (NumberFormatException e) {
+// logger.error("HttpHost init error:", e);
+// }
+// }
+// logger.info("proxy pool size>>>>" + allProxy.size());
+// }
+//
+// public void addProxy(Proxy... httpProxyList) {
+// isEnable = true;
+// for (Proxy proxy : httpProxyList) {
+// if (!validateWhenInit || ProxyUtils.validateProxy(proxy.getProxyHost())) {
+// TimerReuseProxy p = new TimerReuseProxy(proxy.getProxyHost(), proxy.getUsername(), proxy.getPassword(), reuseInterval);
+// proxyQueue.add(p);
+// allProxy.put(p.getProxyHost().getHost(), p);
+// }
+// }
+// logger.info("proxy pool size>>>>" + allProxy.size());
+// }
+//
+// public TimerReuseProxy getProxy() {
+// TimerReuseProxy proxy = null;
+// try {
+// Long time = System.currentTimeMillis();
+// proxy = proxyQueue.take();
+// double costTime = (System.currentTimeMillis() - time) / 1000.0;
+// if (costTime > reuseInterval) {
+// logger.info("get proxy time >>>> " + costTime);
+// }
+// TimerReuseProxy p = allProxy.get(proxy.getProxyHost().getHost());
+// p.setLastBorrowTime(System.currentTimeMillis());
+// p.borrowNumIncrement(1);
+// } catch (InterruptedException e) {
+// logger.error("get proxy error", e);
+// }
+// if (proxy == null) {
+// throw new NoSuchElementException();
+// }
+// return proxy;
+// }
+//
+// public void returnProxy(Proxy proxy, int statusCode) {
+// TimerReuseProxy p = allProxy.get(proxy.getProxyHost());
+// if (p == null) {
+// return;
+// }
+// switch (statusCode) {
+// case TimerReuseProxy.SUCCESS:
+// p.setReuseTimeInterval(reuseInterval);
+// p.setFailedNum(0);
+// p.setFailedErrorType(new ArrayList());
+// p.recordResponse();
+// p.successNumIncrement(1);
+// break;
+// case TimerReuseProxy.ERROR_403:
+// // banned,try longer interval
+// p.fail(TimerReuseProxy.ERROR_403);
+// p.setReuseTimeInterval(reuseInterval * p.getFailedNum());
+// logger.info(proxy + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0);
+// break;
+// case TimerReuseProxy.ERROR_BANNED:
+// p.fail(TimerReuseProxy.ERROR_BANNED);
+// p.setReuseTimeInterval(10 * 60 * 1000 * p.getFailedNum());
+// logger.info(proxy + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0);
+// break;
+// case TimerReuseProxy.ERROR_404:
+// // p.fail(Proxy.ERROR_404);
+// // p.setReuseTimeInterval(reuseInterval * p.getFailedNum());
+// break;
+// default:
+// p.fail(statusCode);
+// break;
+// }
+// if (p.getFailedNum() > 20) {
+// p.setReuseTimeInterval(reviveTime);
+// logger.error("remove proxy >>>> " + proxy + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size());
+// return;
+// }
+// if (p.getFailedNum() > 0 && p.getFailedNum() % 5 == 0) {
+// if (!ProxyUtils.validateProxy(proxy)) {
+// p.setReuseTimeInterval(reviveTime);
+// logger.error("remove proxy >>>> " + proxy + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size());
+// return;
+// }
+// }
+// try {
+// proxyQueue.put(p);
+// } catch (InterruptedException e) {
+// logger.warn("proxyQueue return proxy error", e);
+// }
+// }
+//
+// public String allProxyStatus() {
+// String re = "all proxy info >>>> \n";
+// for (Entry entry : allProxy.entrySet()) {
+// re += entry.getValue().toString() + "\n";
+// }
+// return re;
+// }
+//
+// public int getIdleNum() {
+// return proxyQueue.size();
+// }
+//
+// public int getReuseInterval() {
+// return reuseInterval;
+// }
+//
+// public void setReuseInterval(int reuseInterval) {
+// this.reuseInterval = reuseInterval;
+// }
+//
+// public void enable(boolean isEnable) {
+// this.isEnable = isEnable;
+// }
+//
+// public boolean isEnable() {
+// return isEnable;
+// }
+//
+// public int getReviveTime() {
+// return reviveTime;
+// }
+//
+// public void setReviveTime(int reviveTime) {
+// this.reviveTime = reviveTime;
+// }
+//
+// public boolean isValidateWhenInit() {
+// return validateWhenInit;
+// }
+//
+// public void validateWhenInit(boolean validateWhenInit) {
+// this.validateWhenInit = validateWhenInit;
+// }
+//
+// public int getSaveProxyInterval() {
+// return saveProxyInterval;
+// }
+//
+// public void setSaveProxyInterval(int saveProxyInterval) {
+// this.saveProxyInterval = saveProxyInterval;
+// }
+//
+// public String getProxyFilePath() {
+// return proxyFilePath;
+// }
+//
+// public void setProxyFilePath(String proxyFilePath) {
+// this.proxyFilePath = proxyFilePath;
+// }
+
+}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java
index d80e8b4..7b22639 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java
@@ -44,6 +44,16 @@ public class Html extends HtmlNode {
*/
private Document document;
+ public Html(String text, String url) { // builds the document from text, handing url to jsoup as the base URI
+ try {
+ disableJsoupHtmlEntityEscape();
+ this.document = Jsoup.parse(text, url); // Jsoup.parse(html, baseUri): baseUri is what relative links resolve against
+ } catch (Exception e) {
+ this.document = null; // a parse failure is logged, not rethrown; the instance is left without a document
+ logger.warn("parse document error ", e);
+ }
+ }
+
public Html(String text) {
try {
disableJsoupHtmlEntityEscape();
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java
index 030522f..89de5a6 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java
@@ -34,7 +34,7 @@ public class HtmlNode extends AbstractSelectable {
@Override
public Selectable links() {
- return xpath("//a/@href");
+ return selectElements(new LinksSelector());
}
@Override
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/LinksSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/LinksSelector.java
new file mode 100644
index 0000000..5296a74
--- /dev/null
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/LinksSelector.java
@@ -0,0 +1,51 @@
+package us.codecraft.webmagic.selector;
+
+import org.jsoup.helper.StringUtil;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Selects the href of every {@code a} element below a node, based on jsoup. The url is
+ * absolute when the node has a base uri, otherwise the attribute is returned as written.
+ * @author code4crafter@gmail.com
+ * @since 0.7.0
+ */
+public class LinksSelector extends BaseElementSelector {
+
+    @Override
+    public String select(Element element) {
+        throw new UnsupportedOperationException();
+    }
+
+    /** Returns one href per {@code a} element found by {@code element.select("a")}. */
+    @Override
+    public List<String> selectList(Element element) {
+        Elements anchors = element.select("a");
+        List<String> links = new ArrayList<String>(anchors.size());
+        for (Element anchor : anchors) {
+            // jsoup resolves "abs:" attributes against the base uri and yields "" when it
+            // cannot, so fall back to the raw href for nodes that carry no base uri.
+            boolean hasBaseUri = !StringUtil.isBlank(anchor.baseUri());
+            links.add(anchor.attr(hasBaseUri ? "abs:href" : "href"));
+        }
+        return links;
+    }
+
+    @Override
+    public Element selectElement(Element element) {
+        throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public List<Element> selectElements(Element element) {
+        throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public boolean hasAttribute() {
+        return true;
+    }
+}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/HttpClientUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/HttpClientUtils.java
new file mode 100644
index 0000000..93f8fe9
--- /dev/null
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/HttpClientUtils.java
@@ -0,0 +1,28 @@
+package us.codecraft.webmagic.utils;
+
+import org.apache.http.Header;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Conversion helpers for Apache HttpClient types. Date: 17/3/27
+ * @author code4crafter@gmail.com
+ */
+public abstract class HttpClientUtils {
+    /** Groups header values by header name; a repeated name keeps all its values in array order. */
+    public static Map<String, List<String>> convertHeaders(Header[] headers){
+        Map<String, List<String>> results = new HashMap<String, List<String>>();
+        for (Header header : headers) {
+            List<String> list = results.get(header.getName());
+            if (list == null) {
+                list = new ArrayList<String>();
+                results.put(header.getName(), list);
+            }
+            list.add(header.getValue());
+        }
+        return results;
+    }
+}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/ProxyUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/ProxyUtils.java
index f44c2ac..9b734c7 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/ProxyUtils.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/ProxyUtils.java
@@ -1,19 +1,12 @@
package us.codecraft.webmagic.utils;
-import java.io.IOException;
-import java.net.Inet6Address;
-import java.net.InetAddress;
-import java.net.InetSocketAddress;
-import java.net.NetworkInterface;
-import java.net.Socket;
-import java.net.SocketException;
-import java.net.UnknownHostException;
-import java.util.Enumeration;
-import java.util.regex.Pattern;
-
-import org.apache.http.HttpHost;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import us.codecraft.webmagic.proxy.Proxy;
+
+import java.io.IOException;
+import java.net.InetSocketAddress;
+import java.net.Socket;
/**
* Pooled Proxy Object
@@ -23,68 +16,19 @@ import org.slf4j.LoggerFactory;
*/
public class ProxyUtils {
- private static InetAddress localAddr;
- private static String networkInterface = "eth7";
private static final Logger logger = LoggerFactory.getLogger(ProxyUtils.class);
- static {
- init();
- }
- private static void init() {
- // first way to get local IP
- try {
- localAddr = InetAddress.getLocalHost();
- logger.info("local IP:" + localAddr.getHostAddress());
- } catch (UnknownHostException e) {
- logger.info("try again\n");
- }
- if (localAddr != null) {
- return;
- }
- // other way to get local IP
- Enumeration localAddrs;
- try {
- // modify your network interface name
- NetworkInterface ni = NetworkInterface.getByName(networkInterface);
- if (ni == null) {
- return;
- }
- localAddrs = ni.getInetAddresses();
- if (localAddrs == null || !localAddrs.hasMoreElements()) {
- logger.error("choose NetworkInterface\n" + getNetworkInterface());
- return;
- }
- while (localAddrs.hasMoreElements()) {
- InetAddress tmp = localAddrs.nextElement();
- if (!tmp.isLoopbackAddress() && !tmp.isLinkLocalAddress() && !(tmp instanceof Inet6Address)) {
- localAddr = tmp;
- logger.info("local IP:" + localAddr.getHostAddress());
- break;
- }
- }
- } catch (Exception e) {
- logger.error("Failure when init ProxyUtil", e);
- logger.error("choose NetworkInterface\n" + getNetworkInterface());
- }
- }
-
- public static boolean validateProxy(HttpHost p) {
- if (localAddr == null) {
- logger.error("cannot get local IP");
- return false;
- }
- boolean isReachable = false;
+ public static boolean validateProxy(Proxy p) {
Socket socket = null;
try {
socket = new Socket();
- socket.bind(new InetSocketAddress(localAddr, 0));
- InetSocketAddress endpointSocketAddr = new InetSocketAddress(p.getAddress().getHostAddress(), p.getPort());
+ InetSocketAddress endpointSocketAddr = new InetSocketAddress(p.getHost(), p.getPort());
socket.connect(endpointSocketAddr, 3000);
- logger.debug("SUCCESS - connection established! Local: " + localAddr.getHostAddress() + " remote: " + p);
- isReachable = true;
+ return true;
} catch (IOException e) {
- logger.warn("FAILRE - CAN not connect! Local: " + localAddr.getHostAddress() + " remote: " + p);
+ logger.warn("FAILRE - CAN not connect! remote: " + p);
+ return false;
} finally {
if (socket != null) {
try {
@@ -94,30 +38,7 @@ public class ProxyUtils {
}
}
}
- return isReachable;
+
}
- private static String getNetworkInterface() {
-
- String networkInterfaceName = ">>>> modify networkInterface in us.codecraft.webmagic.utils.ProxyUtils";
- Enumeration enumeration = null;
- try {
- enumeration = NetworkInterface.getNetworkInterfaces();
- } catch (SocketException e1) {
- e1.printStackTrace();
- }
- while (enumeration.hasMoreElements()) {
- NetworkInterface networkInterface = enumeration.nextElement();
-
- Enumeration addr = networkInterface.getInetAddresses();
- while (addr.hasMoreElements()) {
- String s = addr.nextElement().getHostAddress();
- Pattern IPV4_PATTERN = Pattern.compile("^(25[0-5]|2[0-4]\\d|[0-1]?\\d?\\d)(\\.(25[0-5]|2[0-4]\\d|[0-1]?\\d?\\d)){3}$");
- if (s != null && IPV4_PATTERN.matcher(s).matches()) {
- networkInterfaceName += networkInterface.toString() + "IP:" + s + "\n\n";
- }
- }
- }
- return networkInterfaceName;
- }
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java
index ed7ae8c..72a9d3f 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java
@@ -80,7 +80,7 @@ public class UrlUtils {
if (i > 0) {
domain = StringUtils.substring(domain, 0, i);
}
- return domain;
+ return removePort(domain);
}
public static String removePort(String domain) {
diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/HtmlTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/HtmlTest.java
index 6cf5382..faf249f 100644
--- a/webmagic-core/src/test/java/us/codecraft/webmagic/HtmlTest.java
+++ b/webmagic-core/src/test/java/us/codecraft/webmagic/HtmlTest.java
@@ -48,4 +48,14 @@ public class HtmlTest {
Selectable selectable = html.xpath("//a[1]").nodes().get(0);
assertThat(selectable.xpath("/a/@href").get()).isEqualTo("/xx/xx");
}
+
+    @Test
+    public void testGetHrefsByJsoup(){ // NOTE(review): html literals rebuilt from the assertions below - confirm against upstream
+        Html html = new Html("<html><a href='issues'>issues</a><img src='webmagic.jpg'/></html>","https://github.com/code4craft/webmagic/"); // base uri passed by the caller
+        assertThat(html.xpath("//a[1]/@abs:href").get()).isEqualTo("https://github.com/code4craft/webmagic/issues");
+        assertThat(html.xpath("//img/@abs:src").get()).isEqualTo("https://github.com/code4craft/webmagic/webmagic.jpg");
+        html = new Html("<html><base href='https://github.com/code4craft/webmagic/'><a href='issues'>issues</a><img src='webmagic.jpg'/></html>"); // base uri declared inside the document
+        assertThat(html.xpath("//a[1]/@abs:href").get()).isEqualTo("https://github.com/code4craft/webmagic/issues");
+        assertThat(html.xpath("//img/@abs:src").get()).isEqualTo("https://github.com/code4craft/webmagic/webmagic.jpg");
+    }
}
diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/SpiderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/SpiderTest.java
index ba29387..4f4a280 100644
--- a/webmagic-core/src/test/java/us/codecraft/webmagic/SpiderTest.java
+++ b/webmagic-core/src/test/java/us/codecraft/webmagic/SpiderTest.java
@@ -19,12 +19,12 @@ public class SpiderTest {
@Ignore("long time")
@Test
public void testStartAndStop() throws InterruptedException {
- Spider spider = Spider.create(new SimplePageProcessor("http://www.oschina.net/", "http://www.oschina.net/*")).addPipeline(new Pipeline() {
+ Spider spider = Spider.create(new SimplePageProcessor( "http://www.oschina.net/*")).addPipeline(new Pipeline() {
@Override
public void process(ResultItems resultItems, Task task) {
System.out.println(1);
}
- }).thread(1);
+ }).thread(1).addUrl("http://www.oschina.net/");
spider.start();
Thread.sleep(10000);
spider.stop();
diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java
index 5440b33..9c93915 100644
--- a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java
+++ b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java
@@ -3,9 +3,10 @@ package us.codecraft.webmagic.downloader;
import com.github.dreamhead.moco.HttpServer;
import com.github.dreamhead.moco.Runnable;
import com.github.dreamhead.moco.Runner;
+import org.apache.commons.collections.map.HashedMap;
import org.apache.commons.io.IOUtils;
import org.apache.http.client.methods.CloseableHttpResponse;
-import org.apache.http.client.methods.RequestBuilder;
+import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
@@ -14,11 +15,14 @@ import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
+import us.codecraft.webmagic.model.HttpRequestBody;
import us.codecraft.webmagic.selector.Html;
+import us.codecraft.webmagic.utils.CharsetUtils;
import us.codecraft.webmagic.utils.HttpConstant;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
+import java.util.Map;
import static com.github.dreamhead.moco.Moco.*;
import static org.assertj.core.api.Assertions.assertThat;
@@ -30,7 +34,7 @@ import static org.junit.Assert.assertTrue;
*/
public class HttpClientDownloaderTest {
- public static final String PAGE_ALWAYS_NOT_EXISTS = "http://localhost:13421/404";
+ public static final String PAGE_ALWAYS_NOT_EXISTS = "http://localhost:13423/404";
@Test
public void testDownloader() {
@@ -59,7 +63,7 @@ public class HttpClientDownloaderTest {
@Test
public void testGetHtmlCharset() throws Exception {
- HttpServer server = httpserver(12306);
+ HttpServer server = httpserver(13423);
server.get(by(uri("/header"))).response(header("Content-Type", "text/html; charset=gbk"));
server.get(by(uri("/meta4"))).response(with(text("\n" +
" \n" +
@@ -76,30 +80,30 @@ public class HttpClientDownloaderTest {
Runner.running(server, new Runnable() {
@Override
public void run() {
- String charset = getCharsetByUrl("http://127.0.0.1:12306/header");
+ String charset = getCharsetByUrl("http://127.0.0.1:13423/header");
assertEquals(charset, "gbk");
- charset = getCharsetByUrl("http://127.0.0.1:12306/meta4");
+ charset = getCharsetByUrl("http://127.0.0.1:13423/meta4");
assertEquals(charset, "gbk");
- charset = getCharsetByUrl("http://127.0.0.1:12306/meta5");
+ charset = getCharsetByUrl("http://127.0.0.1:13423/meta5");
assertEquals(charset, "gbk");
}
private String getCharsetByUrl(String url) {
HttpClientDownloader downloader = new HttpClientDownloader();
Site site = Site.me();
- CloseableHttpClient httpClient = new HttpClientGenerator().getClient(site, null);
+ CloseableHttpClient httpClient = new HttpClientGenerator().getClient(site);
// encoding in http header Content-Type
Request requestGBK = new Request(url);
CloseableHttpResponse httpResponse = null;
try {
- httpResponse = httpClient.execute(downloader.getHttpUriRequest(requestGBK, site, null,null));
+ httpResponse = httpClient.execute(new HttpUriRequestConverter().convert(requestGBK, site, null).getHttpUriRequest());
} catch (IOException e) {
e.printStackTrace();
}
String charset = null;
try {
byte[] contentBytes = IOUtils.toByteArray(httpResponse.getEntity().getContent());
- charset = downloader.getHtmlCharset(httpResponse,contentBytes);
+ charset = CharsetUtils.detectCharset(httpResponse.getEntity().getContentType().getValue(), contentBytes);
} catch (IOException e) {
e.printStackTrace();
}
@@ -110,53 +114,108 @@ public class HttpClientDownloaderTest {
@Test
public void test_selectRequestMethod() throws Exception {
- HttpServer server = httpserver(12306);
+ HttpServer server = httpserver(13423);
server.get(eq(query("q"), "webmagic")).response("get");
server.post(eq(form("q"), "webmagic")).response("post");
server.put(eq(form("q"), "webmagic")).response("put");
server.delete(eq(query("q"), "webmagic")).response("delete");
server.request(and(by(method("HEAD")),eq(query("q"), "webmagic"))).response(header("method","head"));
server.request(and(by(method("TRACE")),eq(query("q"), "webmagic"))).response("trace");
+ final HttpUriRequestConverter httpUriRequestConverter = new HttpUriRequestConverter();
+ final Site site = Site.me();
+ Runner.running(server, new Runnable() {
+ @Override
+ public void run() throws Exception {
+ Request request = new Request();
+ request.setUrl("http://127.0.0.1:13423/search?q=webmagic");
+ request.setMethod(HttpConstant.Method.GET);
+ Map params = new HashedMap();
+ params.put("q","webmagic");
+ HttpUriRequest httpUriRequest = httpUriRequestConverter.convert(request,site,null).getHttpUriRequest();
+ assertThat(EntityUtils.toString(HttpClients.custom().build().execute(httpUriRequest).getEntity())).isEqualTo("get");
+ request.setMethod(HttpConstant.Method.DELETE);
+ httpUriRequest = httpUriRequestConverter.convert(request, site, null).getHttpUriRequest();
+ assertThat(EntityUtils.toString(HttpClients.custom().build().execute(httpUriRequest).getEntity())).isEqualTo("delete");
+ request.setMethod(HttpConstant.Method.HEAD);
+ httpUriRequest = httpUriRequestConverter.convert(request, site, null).getHttpUriRequest();
+ assertThat(HttpClients.custom().build().execute(httpUriRequest).getFirstHeader("method").getValue()).isEqualTo("head");
+ request.setMethod(HttpConstant.Method.TRACE);
+ httpUriRequest = httpUriRequestConverter.convert(request, site, null).getHttpUriRequest();
+ assertThat(EntityUtils.toString(HttpClients.custom().build().execute(httpUriRequest).getEntity())).isEqualTo("trace");
+ request.setUrl("http://127.0.0.1:13423/search");
+ request.setMethod(HttpConstant.Method.POST);
+ request.setRequestBody(HttpRequestBody.form(params, "utf-8"));
+ httpUriRequest = httpUriRequestConverter.convert(request, site, null).getHttpUriRequest();
+ assertThat(EntityUtils.toString(HttpClients.custom().build().execute(httpUriRequest).getEntity())).isEqualTo("post");
+ request.setMethod(HttpConstant.Method.PUT);
+ httpUriRequest = httpUriRequestConverter.convert(request, site, null).getHttpUriRequest();
+ assertThat(EntityUtils.toString(HttpClients.custom().build().execute(httpUriRequest).getEntity())).isEqualTo("put");
+ }
+ });
+ }
+
+ @Test
+ public void test_set_request_cookie() throws Exception {
+ HttpServer server = httpserver(13423);
+ server.get(eq(cookie("cookie"), "cookie-webmagic")).response("ok");
Runner.running(server, new Runnable() {
@Override
public void run() throws Exception {
HttpClientDownloader httpClientDownloader = new HttpClientDownloader();
Request request = new Request();
- request.setUrl("http://127.0.0.1:12306/search");
- request.putParams("q", "webmagic");
- request.setMethod(HttpConstant.Method.GET);
- RequestBuilder requestBuilder = httpClientDownloader.selectRequestMethod(request).setUri(request.getUrl());
- assertThat(EntityUtils.toString(HttpClients.custom().build().execute(requestBuilder.build()).getEntity())).isEqualTo("get");
- request.setMethod(HttpConstant.Method.POST);
- requestBuilder = httpClientDownloader.selectRequestMethod(request).setUri(request.getUrl());
- assertThat(EntityUtils.toString(HttpClients.custom().build().execute(requestBuilder.build()).getEntity())).isEqualTo("post");
- request.setMethod(HttpConstant.Method.PUT);
- requestBuilder = httpClientDownloader.selectRequestMethod(request).setUri(request.getUrl());
- assertThat(EntityUtils.toString(HttpClients.custom().build().execute(requestBuilder.build()).getEntity())).isEqualTo("put");
- request.setMethod(HttpConstant.Method.DELETE);
- requestBuilder = httpClientDownloader.selectRequestMethod(request).setUri(request.getUrl());
- assertThat(EntityUtils.toString(HttpClients.custom().build().execute(requestBuilder.build()).getEntity())).isEqualTo("delete");
- request.setMethod(HttpConstant.Method.HEAD);
- requestBuilder = httpClientDownloader.selectRequestMethod(request).setUri(request.getUrl());
- assertThat(HttpClients.custom().build().execute(requestBuilder.build()).getFirstHeader("method").getValue()).isEqualTo("head");
- request.setMethod(HttpConstant.Method.TRACE);
- requestBuilder = httpClientDownloader.selectRequestMethod(request).setUri(request.getUrl());
- assertThat(EntityUtils.toString(HttpClients.custom().build().execute(requestBuilder.build()).getEntity())).isEqualTo("trace");
+ request.setUrl("http://127.0.0.1:13423");
+ request.addCookie("cookie","cookie-webmagic");
+ Page page = httpClientDownloader.download(request, Site.me().toTask());
+ assertThat(page.getRawText()).isEqualTo("ok");
+ }
+ });
+ }
+
+ @Test
+ public void test_set_request_header() throws Exception { // a header added on the Request must reach the server
+ HttpServer server = httpserver(13423);
+ server.get(eq(header("header"), "header-webmagic")).response("ok"); // stub answers "ok" only for GETs whose "header" header equals "header-webmagic"
+ Runner.running(server, new Runnable() {
+ @Override
+ public void run() throws Exception {
+ HttpClientDownloader httpClientDownloader = new HttpClientDownloader();
+ Request request = new Request();
+ request.setUrl("http://127.0.0.1:13423");
+ request.addHeader("header","header-webmagic");
+ Page page = httpClientDownloader.download(request, Site.me().toTask());
+ assertThat(page.getRawText()).isEqualTo("ok"); // "ok" comes back only if the stub's header condition matched
+ }
+ });
+ }
+
+ @Test
+ public void test_set_site_cookie() throws Exception {
+ HttpServer server = httpserver(13423);
+ server.get(eq(cookie("cookie"), "cookie-webmagic")).response("ok");
+ Runner.running(server, new Runnable() {
+ @Override
+ public void run() throws Exception {
+ HttpClientDownloader httpClientDownloader = new HttpClientDownloader();
+ Request request = new Request();
+ request.setUrl("http://127.0.0.1:13423");
+ Site site = Site.me().addCookie("cookie", "cookie-webmagic").setDomain("127.0.0.1");
+ Page page = httpClientDownloader.download(request, site.toTask());
+ assertThat(page.getRawText()).isEqualTo("ok");
}
});
}
@Test
public void test_download_when_task_is_null() throws Exception {
- HttpServer server = httpserver(12306);
+ HttpServer server = httpserver(13423);
server.response("foo");
Runner.running(server, new Runnable() {
@Override
public void run() throws Exception {
final HttpClientDownloader httpClientDownloader = new HttpClientDownloader();
Request request = new Request();
- request.setUrl("http://127.0.0.1:12306/");
- Page page = httpClientDownloader.download(request, null);
+ request.setUrl("http://127.0.0.1:13423/");
+ Page page = httpClientDownloader.download(request, Site.me().toTask());
assertThat(page.getRawText()).isEqualTo("foo");
}
});
diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java
index f218356..86af367 100644
--- a/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java
+++ b/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java
@@ -2,13 +2,10 @@ package us.codecraft.webmagic.proxy;
import org.apache.http.HttpHost;
import org.junit.BeforeClass;
-import org.junit.Test;
import java.util.ArrayList;
import java.util.List;
-import static org.assertj.core.api.Assertions.assertThat;
-
/**
* @author yxssfxwzy@sina.com May 30, 2014
*
@@ -27,30 +24,6 @@ public class ProxyTest {
}
}
- @Test
- public void testProxy() {
- SimpleProxyPool proxyPool = new SimpleProxyPool(httpProxyList,false);
- proxyPool.setReuseInterval(500);
- assertThat(proxyPool.getIdleNum()).isEqualTo(4);
- for (int i = 0; i < 2; i++) {
- List fetchList = new ArrayList();
- while (proxyPool.getIdleNum() != 0) {
- Proxy proxy = proxyPool.getProxy();
- HttpHost httphost = proxy.getHttpHost();
- // httphostList.add(httphost);
- System.out.println(httphost.getHostName() + ":" + httphost.getPort());
- Fetch tmp = new Fetch(httphost);
- tmp.start();
- fetchList.add(tmp);
- }
- for (Fetch fetch : fetchList) {
- proxyPool.returnProxy(fetch.hp, Proxy.SUCCESS);
- }
- System.out.println(proxyPool.allProxyStatus());
-
- }
- }
-
class Fetch extends Thread {
HttpHost hp;
diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/LinksSelectorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/LinksSelectorTest.java
new file mode 100644
index 0000000..3fcb71b
--- /dev/null
+++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/LinksSelectorTest.java
@@ -0,0 +1,21 @@
+package us.codecraft.webmagic.selector;
+
+import org.junit.Test;
+
+import java.util.List;
+
+/**
+ * Runs LinksSelector.selectList over the html field and prints the links it returns.
+ * @author code4crafter@gmail.com
+ * Date: 17/4/8 Time: 9:41 PM
+ */
+public class LinksSelectorTest {
+
+ private String html = ""; // NOTE(review): empty here; sample markup may have been lost - confirm
+
+ @Test
+ public void testLinks() throws Exception {
+ List links = new LinksSelector().selectList(html);
+ System.out.println(links); // result is only printed; nothing is asserted
+ }
+}
diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml
index a48bdd0..0848817 100644
--- a/webmagic-extension/pom.xml
+++ b/webmagic-extension/pom.xml
@@ -3,7 +3,7 @@
us.codecraft
webmagic-parent
- 0.6.2-SNAPSHOT
+ 0.7.0-SNAPSHOT
4.0.0
diff --git a/webmagic-extension/pom.xml.versionsBackup b/webmagic-extension/pom.xml.versionsBackup
deleted file mode 100644
index 47496ec..0000000
--- a/webmagic-extension/pom.xml.versionsBackup
+++ /dev/null
@@ -1,29 +0,0 @@
-
-
-
- us.codecraft
- webmagic-parent
- 0.5.2
-
- 4.0.0
-
- webmagic-extension
-
-
-
- redis.clients
- jedis
- 2.0.0
-
-
- us.codecraft
- webmagic-core
- ${project.version}
-
-
- junit
- junit
-
-
-
-
\ No newline at end of file
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/FileCache.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/FileCache.java
deleted file mode 100644
index 3c7e6ff..0000000
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/FileCache.java
+++ /dev/null
@@ -1,124 +0,0 @@
-package us.codecraft.webmagic.downloader;
-
-import org.apache.commons.codec.digest.DigestUtils;
-import org.apache.commons.lang3.StringUtils;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import us.codecraft.webmagic.*;
-import us.codecraft.webmagic.utils.Experimental;
-import us.codecraft.webmagic.pipeline.Pipeline;
-import us.codecraft.webmagic.processor.PageProcessor;
-import us.codecraft.webmagic.processor.SimplePageProcessor;
-import us.codecraft.webmagic.selector.Html;
-import us.codecraft.webmagic.selector.PlainText;
-import us.codecraft.webmagic.utils.FilePersistentBase;
-import us.codecraft.webmagic.utils.UrlUtils;
-
-import java.io.*;
-
-/**
- * Download file and saved to file for cache.
- *
- * @author code4crafter@gmail.com
- * @since 0.2.1
- */
-@Experimental
-public class FileCache extends FilePersistentBase implements Downloader, Pipeline, PageProcessor {
-
- private Downloader downloaderWhenFileMiss;
-
- private final PageProcessor pageProcessor;
-
- private Logger logger = LoggerFactory.getLogger(getClass());
-
- public FileCache(String startUrl, String urlPattern) {
- this(startUrl, urlPattern, "/data/webmagic/temp/");
- }
-
- public FileCache(String startUrl, String urlPattern, String path) {
- this.pageProcessor = new SimplePageProcessor(startUrl, urlPattern);
- setPath(path);
- downloaderWhenFileMiss = new HttpClientDownloader();
- }
-
- public FileCache setDownloaderWhenFileMiss(Downloader downloaderWhenFileMiss) {
- this.downloaderWhenFileMiss = downloaderWhenFileMiss;
- return this;
- }
-
- @Override
- public Page download(Request request, Task task) {
- String path = this.path + "/" + task.getUUID() + "/";
- Page page = null;
- try {
- final File file = getFile(path + DigestUtils.md5Hex(request.getUrl()));
- BufferedReader bufferedReader = new BufferedReader(new FileReader(file));
- String line = bufferedReader.readLine();
- if (line.equals("url:\t" + request.getUrl())) {
- final String html = getHtml(bufferedReader);
- page = new Page();
- page.setRequest(request);
- page.setUrl(PlainText.create(request.getUrl()));
- page.setHtml(Html.create(UrlUtils.fixAllRelativeHrefs(html, request.getUrl())));
- }
- } catch (IOException e) {
- if (e instanceof FileNotFoundException) {
- logger.info("File not exist for url " + request.getUrl());
- } else {
- logger.warn("File read error for url " + request.getUrl(), e);
- }
- }
- if (page == null) {
- page = downloadWhenMiss(request, task);
- }
- return page;
- }
-
- @Override
- public void setThread(int thread) {
-
- }
-
- private String getHtml(BufferedReader bufferedReader) throws IOException {
- String line;
- StringBuilder htmlBuilder = new StringBuilder();
- line = bufferedReader.readLine();
- line = StringUtils.removeStart(line, "html:\t");
- htmlBuilder.append(line);
- while ((line = bufferedReader.readLine()) != null) {
- htmlBuilder.append(line);
- }
- return htmlBuilder.toString();
- }
-
- private Page downloadWhenMiss(Request request, Task task) {
- Page page = null;
- if (downloaderWhenFileMiss != null) {
- page = downloaderWhenFileMiss.download(request, task);
- }
- return page;
- }
-
- @Override
- public void process(ResultItems resultItems, Task task) {
- String path = this.path + PATH_SEPERATOR + task.getUUID() + PATH_SEPERATOR;
- try {
- PrintWriter printWriter = new PrintWriter(new FileWriter(getFile(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".html")));
- printWriter.println("url:\t" + resultItems.getRequest().getUrl());
- printWriter.println("html:\t" + resultItems.get("html"));
- printWriter.close();
- } catch (IOException e) {
- logger.warn("write file error", e);
- }
- }
-
- @Override
- public void process(Page page) {
- pageProcessor.process(page);
- }
-
- @Override
- public Site getSite() {
- return pageProcessor.getSite();
- }
-}
diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/FileCacheTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/FileCacheTest.java
deleted file mode 100644
index f73b344..0000000
--- a/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/FileCacheTest.java
+++ /dev/null
@@ -1,18 +0,0 @@
-package us.codecraft.webmagic.downloader;
-
-import org.junit.Ignore;
-import org.junit.Test;
-import us.codecraft.webmagic.Spider;
-
-/**
- * @author code4crafter@gmail.com
- */
-public class FileCacheTest {
-
- @Ignore("takes long")
- @Test
- public void test() {
- FileCache fileCache = new FileCache("http://my.oschina.net/flashsword/blog", "http://my.oschina.net/flashsword/blog/*");
- Spider.create(fileCache).downloader(fileCache).pipeline(fileCache).run();
- }
-}
diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/processor/GithubRepoProcessor.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/processor/GithubRepoProcessor.java
index bf9e381..1c8742c 100644
--- a/webmagic-extension/src/test/java/us/codecraft/webmagic/processor/GithubRepoProcessor.java
+++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/processor/GithubRepoProcessor.java
@@ -19,7 +19,7 @@ public class GithubRepoProcessor implements PageProcessor {
@Override
public Site getSite() {
- return Site.me().addStartUrl("https://github.com/code4craft/webmagic");
+ return Site.me();
}
@Test
diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml
index eed2b77..a447e39 100644
--- a/webmagic-samples/pom.xml
+++ b/webmagic-samples/pom.xml
@@ -3,7 +3,7 @@
webmagic-parent
us.codecraft
- 0.6.2-SNAPSHOT
+ 0.7.0-SNAPSHOT
4.0.0
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/DianpingFtlDataScanner.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/DianpingFtlDataScanner.java
index 7239e36..77def20 100644
--- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/DianpingFtlDataScanner.java
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/DianpingFtlDataScanner.java
@@ -21,7 +21,7 @@ public class DianpingFtlDataScanner implements AfterExtractor {
private List data;
public static void main(String[] args) {
- OOSpider.create(Site.me().addStartUrl("http://w.alpha.dp/").setSleepTime(0), DianpingFtlDataScanner.class)
+ OOSpider.create(Site.me().setSleepTime(0), DianpingFtlDataScanner.class)
.thread(5).run();
}
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/GithubRepo.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/GithubRepo.java
index e8998ec..941bdbd 100644
--- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/GithubRepo.java
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/GithubRepo.java
@@ -41,9 +41,10 @@ public class GithubRepo implements HasKey {
private String url;
public static void main(String[] args) {
- OOSpider.create(Site.me().addStartUrl("https://github.com/explore").setSleepTime(0).setRetryTimes(3),
+ OOSpider.create(Site.me().setSleepTime(0).setRetryTimes(3),
new JsonFilePageModelPipeline(), GithubRepo.class)
- .scheduler(new FileCacheQueueScheduler("/data/webmagic/cache/")).thread(15).run();
+ .addUrl("https://github.com/explore")
+ .setScheduler(new FileCacheQueueScheduler("/data/webmagic/cache/")).thread(15).run();
}
@Override
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/IteyeBlog.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/IteyeBlog.java
index 7e3dc51..6a10f47 100644
--- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/IteyeBlog.java
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/IteyeBlog.java
@@ -28,7 +28,7 @@ public class IteyeBlog implements Blog{
}
public static void main(String[] args) {
- OOSpider.create(Site.me().addStartUrl("http://flashsword20.iteye.com/blog"), IteyeBlog.class).run();
+ OOSpider.create(Site.me(), IteyeBlog.class).addUrl("http://flashsword20.iteye.com/blog").run();
}
public String getTitle() {
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Kr36NewsModel.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Kr36NewsModel.java
index a1ef3fd..a1cc545 100644
--- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Kr36NewsModel.java
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Kr36NewsModel.java
@@ -32,12 +32,12 @@ public class Kr36NewsModel {
public static void main(String[] args) throws IOException, JMException {
//Just for benchmark
- Spider thread = OOSpider.create(Site.me().addStartUrl("http://www.36kr.com/").setSleepTime(0), new PageModelPipeline() {
+ Spider thread = OOSpider.create(Site.me().setSleepTime(0), new PageModelPipeline() {
@Override
public void process(Object o, Task task) {
}
- }, Kr36NewsModel.class).thread(20);
+ }, Kr36NewsModel.class).thread(20).addUrl("http://www.36kr.com/");
thread.start();
SpiderMonitor spiderMonitor = SpiderMonitor.instance();
spiderMonitor.register(thread);
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaAnswer.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaAnswer.java
index 112f86a..cd93093 100644
--- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaAnswer.java
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaAnswer.java
@@ -22,7 +22,7 @@ public class OschinaAnswer implements AfterExtractor{
private String content;
public static void main(String[] args) {
- OOSpider.create(Site.me().addStartUrl("http://www.oschina.net/question/567527_120597"), OschinaAnswer.class).run();
+ OOSpider.create(Site.me(), OschinaAnswer.class).addUrl("http://www.oschina.net/question/567527_120597").run();
}
@Override
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java
index 468b855..286e6f5 100644
--- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java
@@ -26,7 +26,7 @@ public class OschinaBlog{
public static void main(String[] args) {
OOSpider.create(Site.me()
- .setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36").addStartUrl("http://my.oschina.net/flashsword/blog")
+ .setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36")
.setSleepTime(0)
.setRetryTimes(3)
,new PageModelPipeline() {
@@ -34,7 +34,7 @@ public class OschinaBlog{
public void process(Object o, Task task) {
}
- }, OschinaBlog.class).thread(10).run();
+ }, OschinaBlog.class).thread(10).addUrl("http://my.oschina.net/flashsword/blog").run();
}
public String getTitle() {
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java
index 25baa1f..8bd7d58 100644
--- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java
@@ -35,7 +35,7 @@ public class DiandianBlogProcessor implements PageProcessor {
public Site getSite() {
//site定义抽取配置,以及开始url等
if (site == null) {
- site = Site.me().setDomain("progressdaily.diandian.com").addStartUrl("http://progressdaily.diandian.com/").
+ site = Site.me().setDomain("progressdaily.diandian.com").
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
}
return site;
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java
index 3ceba0a..61458d0 100644
--- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java
@@ -34,13 +34,13 @@ public class DiaoyuwengProcessor implements PageProcessor {
@Override
public Site getSite() {
if (site==null){
- site= Site.me().setDomain("www.diaoyuweng.com").addStartUrl("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space").
+ site= Site.me().setDomain("www.diaoyuweng.com").
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31").setCharset("GBK").setSleepTime(500);
}
return site;
}
public static void main(String[] args) {
- Spider.create(new DiaoyuwengProcessor()).run();
+ Spider.create(new DiaoyuwengProcessor()).addUrl("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space").run();
}
}
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java
index 3d27be8..8091b65 100644
--- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java
@@ -25,10 +25,10 @@ public class F58PageProcesser implements PageProcessor {
@Override
public Site getSite() {
- return Site.me().setDomain("sh.58.com").addStartUrl("http://sh1.51a8.com/").setCycleRetryTimes(2); //To change body of implemented methods use File | Settings | File Templates.
+ return Site.me().setDomain("sh.58.com").setCycleRetryTimes(2); //To change body of implemented methods use File | Settings | File Templates.
}
public static void main(String[] args) {
- Spider.create(new F58PageProcesser()).setScheduler(new RedisScheduler("localhost")).run();
+ Spider.create(new F58PageProcesser()).setScheduler(new RedisScheduler("localhost")).addUrl("http://sh1.51a8.com/").run();
}
}
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java
index 000cb99..1cc90b0 100644
--- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java
@@ -21,11 +21,11 @@ public class HuxiuProcessor implements PageProcessor {
@Override
public Site getSite() {
- return Site.me().setDomain("www.huxiu.com").addStartUrl("http://www.huxiu.com/");
+ return Site.me().setDomain("www.huxiu.com");
}
public static void main(String[] args) {
- Spider.create(new HuxiuProcessor()).run();
+ Spider.create(new HuxiuProcessor()).addUrl("http://www.huxiu.com/").run();
}
}
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java
index 3ef3957..280f8f1 100644
--- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java
@@ -29,7 +29,7 @@ public class InfoQMiniBookProcessor implements PageProcessor {
@Override
public Site getSite() {
if (site == null) {
- site = Site.me().setDomain("www.infoq.com").addStartUrl("http://www.infoq.com/cn/minibooks").addCookie("RegisteredUserCookie", "sDDDc8dIAgZSq67uJSXhtpQaHEi1XDOH").
+ site = Site.me().setDomain("www.infoq.com").addCookie("RegisteredUserCookie", "sDDDc8dIAgZSq67uJSXhtpQaHEi1XDOH").
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
}
return site;
@@ -38,6 +38,7 @@ public class InfoQMiniBookProcessor implements PageProcessor {
public static void main(String[] args) {
Spider.create(new InfoQMiniBookProcessor())
.thread(5)
+ .addUrl("http://www.infoq.com/cn/minibooks")
.run();
}
}
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java
index 26b85e8..6dce807 100644
--- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java
@@ -22,12 +22,12 @@ public class IteyeBlogProcessor implements PageProcessor {
@Override
public Site getSite() {
if (site == null) {
- site = Site.me().setDomain("yanghaoli.iteye.com").addStartUrl("http://yanghaoli.iteye.com/");
+ site = Site.me().setDomain("yanghaoli.iteye.com");
}
return site;
}
public static void main(String[] args) {
- Spider.create(new IteyeBlogProcessor()).thread(5).run();
+ Spider.create(new IteyeBlogProcessor()).thread(5).addUrl("http://yanghaoli.iteye.com/").run();
}
}
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java
index 0ab6c64..b373f52 100644
--- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java
@@ -22,11 +22,11 @@ public class KaichibaProcessor implements PageProcessor {
@Override
public Site getSite() {
- return Site.me().setDomain("kaichiba.com").addStartUrl("http://kaichiba.com/shop/41725781").setCharset("utf-8").
+ return Site.me().setDomain("kaichiba.com").setCharset("utf-8").
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
}
public static void main(String[] args) {
- Spider.create(new KaichibaProcessor()).run();
+ Spider.create(new KaichibaProcessor()).addUrl("http://kaichiba.com/shop/41725781").run();
}
}
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java
index bfa347d..cb4c498 100644
--- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java
@@ -28,11 +28,11 @@ public class MeicanProcessor implements PageProcessor {
@Override
public Site getSite() {
- return Site.me().setDomain("meican.com").addStartUrl("http://www.meican.com/shanghai/districts").setCharset("utf-8").
+ return Site.me().setDomain("meican.com").setCharset("utf-8").
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
}
public static void main(String[] args) {
- Spider.create(new MeicanProcessor()).run();
+ Spider.create(new MeicanProcessor()).addUrl("http://www.meican.com/shanghai/districts").run();
}
}
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java
index 16dcb0c..ce0f817 100644
--- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java
@@ -1,7 +1,8 @@
package us.codecraft.webmagic.samples;
-import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Page;
+import us.codecraft.webmagic.Site;
+import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import java.util.List;
@@ -22,6 +23,10 @@ public class NjuBBSProcessor implements PageProcessor {
@Override
public Site getSite() {
- return Site.me().setDomain("bbs.nju.edu.cn").addStartUrl("http://bbs.nju.edu.cn/board?board=Pictures");
+ return Site.me().setDomain("bbs.nju.edu.cn");
+ }
+
+ public static void main(String[] args) {
+ Spider.create(new NjuBBSProcessor()).addUrl("http://bbs.nju.edu.cn/board?board=Pictures").run();
}
}
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java
deleted file mode 100644
index e6db04e..0000000
--- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java
+++ /dev/null
@@ -1,41 +0,0 @@
-package us.codecraft.webmagic.samples;
-
-import us.codecraft.webmagic.Page;
-import us.codecraft.webmagic.Site;
-import us.codecraft.webmagic.Spider;
-import us.codecraft.webmagic.monitor.SpiderMonitor;
-import us.codecraft.webmagic.processor.PageProcessor;
-import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover;
-import us.codecraft.webmagic.scheduler.QueueScheduler;
-
-import javax.management.JMException;
-import java.util.List;
-
-/**
- * @author code4crafter@gmail.com
- */
-public class OschinaBlogPageProcesser implements PageProcessor {
-
- private Site site = Site.me().setDomain("my.oschina.net").addStartUrl("http://my.oschina.net/flashsword/blog");
-
- @Override
- public void process(Page page) {
- List links = page.getHtml().links().regex("http://my\\.oschina\\.net/flashsword/blog/\\d+").all();
- page.addTargetRequests(links);
- page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1/text()").toString());
- page.putField("content", page.getHtml().xpath("//div[@class='BlogContent']/tidyText()").toString());
- page.putField("tags",page.getHtml().xpath("//div[@class='BlogTags']/a/text()").all());
- }
-
- @Override
- public Site getSite() {
- return site;
-
- }
-
- public static void main(String[] args) throws JMException {
- Spider spider = Spider.create(new OschinaBlogPageProcesser()).setScheduler(new QueueScheduler().setDuplicateRemover(new BloomFilterDuplicateRemover(2000)));
- SpiderMonitor.instance().register(spider);
- spider.run();
- }
-}
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaPageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaPageProcesser.java
deleted file mode 100644
index b75cc83..0000000
--- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaPageProcesser.java
+++ /dev/null
@@ -1,27 +0,0 @@
-package us.codecraft.webmagic.samples;
-
-import us.codecraft.webmagic.Site;
-import us.codecraft.webmagic.Page;
-import us.codecraft.webmagic.processor.PageProcessor;
-
-import java.util.List;
-
-/**
- * @author code4crafter@gmail.com
- */
-public class OschinaPageProcesser implements PageProcessor {
-
- @Override
- public void process(Page page) {
- List strings = page.getHtml().regex("]*href=[\"']{1}(http://www\\.oschina\\.net/question/[\\w]+)[\"']{1}").all();
- page.addTargetRequests(strings);
- page.putField("title", page.getHtml().xpath("//div[@class='QTitle']/h1/a"));
- page.putField("content", page.getHtml().xpath("//div[@class='Question']//div[@class='Content']/div[@class='detail']"));
- }
-
- @Override
- public Site getSite() {
- return Site.me().setDomain("www.oschina.net").addStartUrl("http://www.oschina.net/").
- setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
- }
-}
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/QzoneBlogProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/QzoneBlogProcessor.java
index d9cee2b..037b333 100644
--- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/QzoneBlogProcessor.java
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/QzoneBlogProcessor.java
@@ -24,7 +24,7 @@ public class QzoneBlogProcessor implements PageProcessor {
@Override
public Site getSite() {
- return Site.me().setDomain("www.diandian.com").addStartUrl("http://17dujingdian.com/").
+ return Site.me().setDomain("www.diandian.com").
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
}
}
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/TianyaPageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/TianyaPageProcesser.java
index d14b442..6cc8f99 100644
--- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/TianyaPageProcesser.java
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/TianyaPageProcesser.java
@@ -21,6 +21,6 @@ public class TianyaPageProcesser implements PageProcessor {
@Override
public Site getSite() {
- return Site.me().setDomain("http://bbs.tianya.cn/").addStartUrl("http://bbs.tianya.cn/"); //To change body of implemented methods use File | Settings | File Templates.
+ return Site.me().setDomain("http://bbs.tianya.cn/"); //To change body of implemented methods use File | Settings | File Templates.
}
}
diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java
index 2fd690d..f8dfb97 100644
--- a/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java
+++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java
@@ -28,10 +28,10 @@ public class SpiderTest {
// PageProcessor pageProcessor = new MeicanProcessor();
// Spider.me().pipeline(new FilePipeline()).scheduler(new FileCacheQueueScheduler(pageProcessor.getSite(),"/data/temp/webmagic/cache/")).
// processor(pageProcessor).run();
- SimplePageProcessor pageProcessor2 = new SimplePageProcessor("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space", "http://www.diaoyuweng.com/thread-*-1-1.html");
+ SimplePageProcessor pageProcessor2 = new SimplePageProcessor( "http://www.diaoyuweng.com/thread-*-1-1.html");
System.out.println(pageProcessor2.getSite().getCharset());
pageProcessor2.getSite().setSleepTime(500);
- Spider.create(pageProcessor2).addPipeline(new FilePipeline()).scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).
+ Spider.create(pageProcessor2).addUrl("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space").addPipeline(new FilePipeline()).scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).
run();
diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/model/ProcessorBenchmark.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/model/ProcessorBenchmark.java
index 193908d..7c61926 100644
--- a/webmagic-samples/src/test/java/us/codecraft/webmagic/model/ProcessorBenchmark.java
+++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/model/ProcessorBenchmark.java
@@ -17,7 +17,7 @@ public class ProcessorBenchmark {
@Ignore
@Test
public void test() {
- ModelPageProcessor modelPageProcessor = ModelPageProcessor.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog"), OschinaBlog.class);
+ ModelPageProcessor modelPageProcessor = ModelPageProcessor.create(Site.me(), OschinaBlog.class);
Page page = new Page();
page.setRequest(new Request("http://my.oschina.net/flashsword/blog"));
page.setUrl(new PlainText("http://my.oschina.net/flashsword/blog"));
diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml
index 9b8b732..1e33539 100644
--- a/webmagic-saxon/pom.xml
+++ b/webmagic-saxon/pom.xml
@@ -3,7 +3,7 @@
webmagic-parent
us.codecraft
- 0.6.2-SNAPSHOT
+ 0.7.0-SNAPSHOT
4.0.0
diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml
index 3c6f673..cd1ec64 100755
--- a/webmagic-scripts/pom.xml
+++ b/webmagic-scripts/pom.xml
@@ -3,7 +3,7 @@
webmagic-parent
us.codecraft
- 0.6.2-SNAPSHOT
+ 0.7.0-SNAPSHOT
4.0.0
diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml
index b66ca0c..bdc9d8a 100644
--- a/webmagic-selenium/pom.xml
+++ b/webmagic-selenium/pom.xml
@@ -3,7 +3,7 @@
webmagic-parent
us.codecraft
- 0.6.2-SNAPSHOT
+ 0.7.0-SNAPSHOT
4.0.0