commit
6ead04a758
2
pom.xml
2
pom.xml
|
@ -6,7 +6,7 @@
|
||||||
<version>7</version>
|
<version>7</version>
|
||||||
</parent>
|
</parent>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<version>0.6.2-SNAPSHOT</version>
|
<version>0.7.0-SNAPSHOT</version>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
<packaging>pom</packaging>
|
<packaging>pom</packaging>
|
||||||
<properties>
|
<properties>
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<artifactId>webmagic-parent</artifactId>
|
<artifactId>webmagic-parent</artifactId>
|
||||||
<version>0.6.2-SNAPSHOT</version>
|
<version>0.7.0-SNAPSHOT</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
|
|
@ -1,86 +0,0 @@
|
||||||
<?xml version="1.0" encoding="UTF-8"?>
|
|
||||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
|
||||||
<parent>
|
|
||||||
<groupId>us.codecraft</groupId>
|
|
||||||
<artifactId>webmagic-parent</artifactId>
|
|
||||||
<version>0.5.2</version>
|
|
||||||
</parent>
|
|
||||||
<modelVersion>4.0.0</modelVersion>
|
|
||||||
|
|
||||||
<artifactId>webmagic-core</artifactId>
|
|
||||||
|
|
||||||
<dependencies>
|
|
||||||
<dependency>
|
|
||||||
<groupId>org.apache.httpcomponents</groupId>
|
|
||||||
<artifactId>httpclient</artifactId>
|
|
||||||
</dependency>
|
|
||||||
|
|
||||||
<dependency>
|
|
||||||
<groupId>junit</groupId>
|
|
||||||
<artifactId>junit</artifactId>
|
|
||||||
</dependency>
|
|
||||||
|
|
||||||
<dependency>
|
|
||||||
<groupId>com.google.guava</groupId>
|
|
||||||
<artifactId>guava</artifactId>
|
|
||||||
</dependency>
|
|
||||||
|
|
||||||
<dependency>
|
|
||||||
<groupId>org.apache.commons</groupId>
|
|
||||||
<artifactId>commons-lang3</artifactId>
|
|
||||||
</dependency>
|
|
||||||
|
|
||||||
<dependency>
|
|
||||||
<groupId>us.codecraft</groupId>
|
|
||||||
<artifactId>xsoup</artifactId>
|
|
||||||
</dependency>
|
|
||||||
|
|
||||||
<dependency>
|
|
||||||
<groupId>com.github.dreamhead</groupId>
|
|
||||||
<artifactId>moco-core</artifactId>
|
|
||||||
</dependency>
|
|
||||||
|
|
||||||
<dependency>
|
|
||||||
<groupId>org.slf4j</groupId>
|
|
||||||
<artifactId>slf4j-api</artifactId>
|
|
||||||
</dependency>
|
|
||||||
|
|
||||||
<dependency>
|
|
||||||
<groupId>org.slf4j</groupId>
|
|
||||||
<artifactId>slf4j-log4j12</artifactId>
|
|
||||||
</dependency>
|
|
||||||
|
|
||||||
<dependency>
|
|
||||||
<groupId>commons-collections</groupId>
|
|
||||||
<artifactId>commons-collections</artifactId>
|
|
||||||
</dependency>
|
|
||||||
|
|
||||||
<dependency>
|
|
||||||
<groupId>org.assertj</groupId>
|
|
||||||
<artifactId>assertj-core</artifactId>
|
|
||||||
</dependency>
|
|
||||||
|
|
||||||
<dependency>
|
|
||||||
<groupId>org.jsoup</groupId>
|
|
||||||
<artifactId>jsoup</artifactId>
|
|
||||||
</dependency>
|
|
||||||
|
|
||||||
<dependency>
|
|
||||||
<groupId>org.apache.commons</groupId>
|
|
||||||
<artifactId>commons-io</artifactId>
|
|
||||||
</dependency>
|
|
||||||
|
|
||||||
<dependency>
|
|
||||||
<groupId>com.jayway.jsonpath</groupId>
|
|
||||||
<artifactId>json-path</artifactId>
|
|
||||||
<version>0.8.1</version>
|
|
||||||
</dependency>
|
|
||||||
|
|
||||||
<dependency>
|
|
||||||
<groupId>com.alibaba</groupId>
|
|
||||||
<artifactId>fastjson</artifactId>
|
|
||||||
</dependency>
|
|
||||||
|
|
||||||
</dependencies>
|
|
||||||
|
|
||||||
</project>
|
|
|
@ -1,16 +1,15 @@
|
||||||
package us.codecraft.webmagic;
|
package us.codecraft.webmagic;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
import org.apache.http.Header;
|
|
||||||
|
|
||||||
import us.codecraft.webmagic.selector.Html;
|
import us.codecraft.webmagic.selector.Html;
|
||||||
import us.codecraft.webmagic.selector.Json;
|
import us.codecraft.webmagic.selector.Json;
|
||||||
import us.codecraft.webmagic.selector.Selectable;
|
import us.codecraft.webmagic.selector.Selectable;
|
||||||
import us.codecraft.webmagic.utils.UrlUtils;
|
import us.codecraft.webmagic.utils.UrlUtils;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Object storing extracted result and urls to fetch.<br>
|
* Object storing extracted result and urls to fetch.<br>
|
||||||
* Not thread safe.<br>
|
* Not thread safe.<br>
|
||||||
|
@ -40,17 +39,14 @@ public class Page {
|
||||||
|
|
||||||
private Selectable url;
|
private Selectable url;
|
||||||
|
|
||||||
|
private Map<String,List<String>> headers;
|
||||||
|
|
||||||
private int statusCode;
|
private int statusCode;
|
||||||
|
|
||||||
private boolean needCycleRetry;
|
private boolean needCycleRetry;
|
||||||
|
|
||||||
private List<Request> targetRequests = new ArrayList<Request>();
|
private List<Request> targetRequests = new ArrayList<Request>();
|
||||||
|
|
||||||
/**
|
|
||||||
* Http响应头
|
|
||||||
*/
|
|
||||||
private Header[] headers=null;
|
|
||||||
|
|
||||||
public Page() {
|
public Page() {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -77,7 +73,7 @@ public class Page {
|
||||||
*/
|
*/
|
||||||
public Html getHtml() {
|
public Html getHtml() {
|
||||||
if (html == null) {
|
if (html == null) {
|
||||||
html = new Html(UrlUtils.fixAllRelativeHrefs(rawText, request.getUrl()));
|
html = new Html(rawText, request.getUrl());
|
||||||
}
|
}
|
||||||
return html;
|
return html;
|
||||||
}
|
}
|
||||||
|
@ -217,14 +213,14 @@ public class Page {
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
public Header[] getHeaders() {
|
public Map<String, List<String>> getHeaders() {
|
||||||
return headers;
|
return headers;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void setHeaders(Map<String, List<String>> headers) {
|
||||||
|
this.headers = headers;
|
||||||
|
}
|
||||||
|
|
||||||
public void setHeaders(Header[] headers) {
|
|
||||||
this.headers = headers;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String toString() {
|
public String toString() {
|
||||||
return "Page{" +
|
return "Page{" +
|
||||||
|
@ -232,7 +228,9 @@ public class Page {
|
||||||
", resultItems=" + resultItems +
|
", resultItems=" + resultItems +
|
||||||
", rawText='" + rawText + '\'' +
|
", rawText='" + rawText + '\'' +
|
||||||
", url=" + url +
|
", url=" + url +
|
||||||
|
", headers=" + headers +
|
||||||
", statusCode=" + statusCode +
|
", statusCode=" + statusCode +
|
||||||
|
", needCycleRetry=" + needCycleRetry +
|
||||||
", targetRequests=" + targetRequests +
|
", targetRequests=" + targetRequests +
|
||||||
", headers=" + headers+
|
", headers=" + headers+
|
||||||
'}';
|
'}';
|
||||||
|
|
|
@ -1,20 +1,11 @@
|
||||||
package us.codecraft.webmagic;
|
package us.codecraft.webmagic;
|
||||||
|
|
||||||
import java.io.Serializable;
|
import us.codecraft.webmagic.model.HttpRequestBody;
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.HashMap;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Map;
|
|
||||||
|
|
||||||
import org.apache.http.Header;
|
|
||||||
import org.apache.http.HttpEntity;
|
|
||||||
import org.apache.http.cookie.Cookie;
|
|
||||||
import org.apache.http.entity.StringEntity;
|
|
||||||
import org.apache.http.impl.cookie.BasicClientCookie;
|
|
||||||
import org.apache.http.message.BasicHeader;
|
|
||||||
|
|
||||||
import us.codecraft.webmagic.utils.Experimental;
|
import us.codecraft.webmagic.utils.Experimental;
|
||||||
import us.codecraft.webmagic.utils.UrlUtils;
|
|
||||||
|
import java.io.Serializable;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Object contains url to crawl.<br>
|
* Object contains url to crawl.<br>
|
||||||
|
@ -28,33 +19,24 @@ public class Request implements Serializable {
|
||||||
private static final long serialVersionUID = 2062192774891352043L;
|
private static final long serialVersionUID = 2062192774891352043L;
|
||||||
|
|
||||||
public static final String CYCLE_TRIED_TIMES = "_cycle_tried_times";
|
public static final String CYCLE_TRIED_TIMES = "_cycle_tried_times";
|
||||||
public static final String STATUS_CODE = "statusCode";
|
|
||||||
public static final String PROXY = "proxy";
|
|
||||||
|
|
||||||
private String url;
|
private String url;
|
||||||
|
|
||||||
private String method;
|
private String method;
|
||||||
|
|
||||||
|
private HttpRequestBody requestBody;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Store additional information in extras.
|
* Store additional information in extras.
|
||||||
*/
|
*/
|
||||||
private Map<String, Object> extras;
|
private Map<String, Object> extras;
|
||||||
/**
|
|
||||||
* POST/GET param set
|
|
||||||
* */
|
|
||||||
private Map<String,String> params=new HashMap<String, String>();
|
|
||||||
|
|
||||||
/**
|
|
||||||
* support for json,xml or more,在post时,设置此选项会使params参数和nameValuePair extra失效。
|
|
||||||
*/
|
|
||||||
private HttpEntity entity;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* cookies for current url, if not set use Site's cookies
|
* cookies for current url, if not set use Site's cookies
|
||||||
*/
|
*/
|
||||||
private List<Cookie> cookies=new ArrayList<Cookie>();
|
private Map<String, String> cookies = new HashMap<String, String>();
|
||||||
|
|
||||||
private List<Header> headers=new ArrayList<Header>();
|
private Map<String, String> headers = new HashMap<String, String>();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Priority of the request.<br>
|
* Priority of the request.<br>
|
||||||
|
@ -133,27 +115,11 @@ public class Request implements Serializable {
|
||||||
this.method = method;
|
this.method = method;
|
||||||
}
|
}
|
||||||
|
|
||||||
public Map<String, String> getParams() {
|
@Override
|
||||||
return params;
|
public int hashCode() {
|
||||||
}
|
int result = url != null ? url.hashCode() : 0;
|
||||||
/**
|
result = 31 * result + (method != null ? method.hashCode() : 0);
|
||||||
* set params for request
|
return result;
|
||||||
* <br>
|
|
||||||
* DO NOT set this for request already has params, like 'https://github.com/search?q=webmagic'
|
|
||||||
* @param params params
|
|
||||||
* */
|
|
||||||
public void setParams(Map<String, String> params) {
|
|
||||||
this.params = params;
|
|
||||||
}
|
|
||||||
/**
|
|
||||||
* set params for request
|
|
||||||
* <br>
|
|
||||||
* DO NOT set this for request already has params, like 'https://github.com/search?q=webmagic'
|
|
||||||
* @param key key
|
|
||||||
* @param value value
|
|
||||||
* */
|
|
||||||
public void putParams(String key,String value) {
|
|
||||||
params.put(key,value);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -164,63 +130,33 @@ public class Request implements Serializable {
|
||||||
Request request = (Request) o;
|
Request request = (Request) o;
|
||||||
|
|
||||||
if (url != null ? !url.equals(request.url) : request.url != null) return false;
|
if (url != null ? !url.equals(request.url) : request.url != null) return false;
|
||||||
if (method != null ? !method.equals(request.method) : request.method != null) return false;
|
return method != null ? method.equals(request.method) : request.method == null;
|
||||||
return params != null ? params.equals(request.params) : request.params == null;
|
|
||||||
}
|
}
|
||||||
public void addHeader(String name,String value){
|
|
||||||
Header header=new BasicHeader(name,value);
|
|
||||||
headers.add(header);
|
|
||||||
}
|
|
||||||
public List<Header> getHeaders(){
|
|
||||||
return headers;
|
|
||||||
}
|
|
||||||
public void addCookie(String key,String value){
|
|
||||||
BasicClientCookie c=new BasicClientCookie(key, value);
|
|
||||||
c.setDomain(UrlUtils.getDomain(url));
|
|
||||||
cookies.add(c);
|
|
||||||
}
|
|
||||||
public List<Cookie> getCookies() {
|
|
||||||
return cookies;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setCookies(List<Cookie> cookies) {
|
public Request addCookie(String name, String value) {
|
||||||
this.cookies = cookies;
|
cookies.put(name, value);
|
||||||
}
|
return this;
|
||||||
/**
|
|
||||||
* 设置json参数
|
|
||||||
*/
|
|
||||||
public void setJsonParam(String jsonStr,String encoding){
|
|
||||||
StringEntity e=new StringEntity(jsonStr,encoding==null?"UTF-8":encoding);
|
|
||||||
e.setContentEncoding(encoding==null?"UTF-8":encoding);
|
|
||||||
e.setContentType("application/json");
|
|
||||||
entity=e;
|
|
||||||
}
|
}
|
||||||
/**
|
|
||||||
* 设置xml参数
|
|
||||||
*/
|
|
||||||
public void setXmlParam(String xmlStr,String encoding){
|
|
||||||
StringEntity e=new StringEntity(xmlStr,encoding==null?"UTF-8":encoding);
|
|
||||||
e.setContentEncoding(encoding==null?"UTF-8":encoding);
|
|
||||||
e.setContentType("text/xml");
|
|
||||||
entity=e;
|
|
||||||
}
|
|
||||||
public HttpEntity getEntity() {
|
|
||||||
return entity;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setEntity(HttpEntity entity) {
|
public Request addHeader(String name, String value) {
|
||||||
this.entity = entity;
|
headers.put(name, value);
|
||||||
}
|
return this;
|
||||||
@Override
|
}
|
||||||
public int hashCode() {
|
|
||||||
int result = url != null ? url.hashCode() : 0;
|
public Map<String, String> getCookies() {
|
||||||
result = 31 * result + (method != null ? method.hashCode() : 0);
|
return cookies;
|
||||||
result = 31 * result + (params != null ? params.hashCode() : 0);
|
}
|
||||||
result = 31 * result + (headers != null ? headers.hashCode() : 0);
|
|
||||||
result = 31 * result + (entity != null ? entity.hashCode() : 0);
|
public Map<String, String> getHeaders() {
|
||||||
result = 31 * result + (cookies != null ? cookies.hashCode() : 0);
|
return headers;
|
||||||
|
}
|
||||||
return result;
|
|
||||||
|
public HttpRequestBody getRequestBody() {
|
||||||
|
return requestBody;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setRequestBody(HttpRequestBody requestBody) {
|
||||||
|
this.requestBody = requestBody;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -229,10 +165,8 @@ public class Request implements Serializable {
|
||||||
"url='" + url + '\'' +
|
"url='" + url + '\'' +
|
||||||
", method='" + method + '\'' +
|
", method='" + method + '\'' +
|
||||||
", extras=" + extras +
|
", extras=" + extras +
|
||||||
", params=" + params +
|
|
||||||
", priority=" + priority +
|
", priority=" + priority +
|
||||||
", headers=" + headers +
|
", headers=" + headers +
|
||||||
", entity=" + entity +
|
|
||||||
", cookies="+ cookies+
|
", cookies="+ cookies+
|
||||||
'}';
|
'}';
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,12 +1,5 @@
|
||||||
package us.codecraft.webmagic;
|
package us.codecraft.webmagic;
|
||||||
|
|
||||||
import org.apache.http.HttpHost;
|
|
||||||
import org.apache.http.auth.UsernamePasswordCredentials;
|
|
||||||
import us.codecraft.webmagic.proxy.Proxy;
|
|
||||||
import us.codecraft.webmagic.proxy.ProxyPool;
|
|
||||||
import us.codecraft.webmagic.proxy.SimpleProxyPool;
|
|
||||||
import us.codecraft.webmagic.utils.UrlUtils;
|
|
||||||
|
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -28,11 +21,6 @@ public class Site {
|
||||||
|
|
||||||
private String charset;
|
private String charset;
|
||||||
|
|
||||||
/**
|
|
||||||
* startUrls is the urls the crawler to start with.
|
|
||||||
*/
|
|
||||||
private List<Request> startRequests = new ArrayList<Request>();
|
|
||||||
|
|
||||||
private int sleepTime = 5000;
|
private int sleepTime = 5000;
|
||||||
|
|
||||||
private int retryTimes = 0;
|
private int retryTimes = 0;
|
||||||
|
@ -49,24 +37,8 @@ public class Site {
|
||||||
|
|
||||||
private Map<String, String> headers = new HashMap<String, String>();
|
private Map<String, String> headers = new HashMap<String, String>();
|
||||||
|
|
||||||
private HttpHost httpProxy;
|
|
||||||
|
|
||||||
private UsernamePasswordCredentials usernamePasswordCredentials; //代理用户名密码设置
|
|
||||||
|
|
||||||
private ProxyPool httpProxyPool;
|
|
||||||
|
|
||||||
private boolean useGzip = true;
|
private boolean useGzip = true;
|
||||||
|
|
||||||
/**
|
|
||||||
* @see us.codecraft.webmagic.utils.HttpConstant.Header
|
|
||||||
* @deprecated
|
|
||||||
*/
|
|
||||||
public static interface HeaderConst {
|
|
||||||
|
|
||||||
public static final String REFERER = "Referer";
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
static {
|
static {
|
||||||
DEFAULT_STATUS_CODE_SET.add(200);
|
DEFAULT_STATUS_CODE_SET.add(200);
|
||||||
}
|
}
|
||||||
|
@ -225,52 +197,6 @@ public class Site {
|
||||||
return acceptStatCode;
|
return acceptStatCode;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* get start urls
|
|
||||||
*
|
|
||||||
* @return start urls
|
|
||||||
* @see #getStartRequests
|
|
||||||
* @deprecated
|
|
||||||
*/
|
|
||||||
@Deprecated
|
|
||||||
public List<String> getStartUrls() {
|
|
||||||
return UrlUtils.convertToUrls(startRequests);
|
|
||||||
}
|
|
||||||
|
|
||||||
public List<Request> getStartRequests() {
|
|
||||||
return startRequests;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Add a url to start url.<br>
|
|
||||||
* Because urls are more a Spider's property than Site, move it to {@link Spider#addUrl(String...)}}
|
|
||||||
*
|
|
||||||
* @param startUrl startUrl
|
|
||||||
* @return this
|
|
||||||
* @see Spider#addUrl(String...)
|
|
||||||
* @deprecated
|
|
||||||
*/
|
|
||||||
public Site addStartUrl(String startUrl) {
|
|
||||||
return addStartRequest(new Request(startUrl));
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Add a url to start url.<br>
|
|
||||||
* Because urls are more a Spider's property than Site, move it to {@link Spider#addRequest(Request...)}}
|
|
||||||
*
|
|
||||||
* @param startRequest startRequest
|
|
||||||
* @return this
|
|
||||||
* @see Spider#addRequest(Request...)
|
|
||||||
* @deprecated
|
|
||||||
*/
|
|
||||||
public Site addStartRequest(Request startRequest) {
|
|
||||||
this.startRequests.add(startRequest);
|
|
||||||
if (domain == null && startRequest.getUrl() != null) {
|
|
||||||
domain = UrlUtils.getDomain(startRequest.getUrl());
|
|
||||||
}
|
|
||||||
return this;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Set the interval between the processing of two pages.<br>
|
* Set the interval between the processing of two pages.<br>
|
||||||
* Time unit is micro seconds.<br>
|
* Time unit is micro seconds.<br>
|
||||||
|
@ -350,21 +276,6 @@ public class Site {
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
public HttpHost getHttpProxy() {
|
|
||||||
return httpProxy;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* set up httpProxy for this site
|
|
||||||
*
|
|
||||||
* @param httpProxy httpProxy
|
|
||||||
* @return this
|
|
||||||
*/
|
|
||||||
public Site setHttpProxy(HttpHost httpProxy) {
|
|
||||||
this.httpProxy = httpProxy;
|
|
||||||
return this;
|
|
||||||
}
|
|
||||||
|
|
||||||
public boolean isUseGzip() {
|
public boolean isUseGzip() {
|
||||||
return useGzip;
|
return useGzip;
|
||||||
}
|
}
|
||||||
|
@ -400,7 +311,11 @@ public class Site {
|
||||||
return new Task() {
|
return new Task() {
|
||||||
@Override
|
@Override
|
||||||
public String getUUID() {
|
public String getUUID() {
|
||||||
return Site.this.getDomain();
|
String uuid = Site.this.getDomain();
|
||||||
|
if (uuid == null) {
|
||||||
|
uuid = UUID.randomUUID().toString();
|
||||||
|
}
|
||||||
|
return uuid;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -428,8 +343,6 @@ public class Site {
|
||||||
return false;
|
return false;
|
||||||
if (domain != null ? !domain.equals(site.domain) : site.domain != null) return false;
|
if (domain != null ? !domain.equals(site.domain) : site.domain != null) return false;
|
||||||
if (headers != null ? !headers.equals(site.headers) : site.headers != null) return false;
|
if (headers != null ? !headers.equals(site.headers) : site.headers != null) return false;
|
||||||
if (startRequests != null ? !startRequests.equals(site.startRequests) : site.startRequests != null)
|
|
||||||
return false;
|
|
||||||
if (userAgent != null ? !userAgent.equals(site.userAgent) : site.userAgent != null) return false;
|
if (userAgent != null ? !userAgent.equals(site.userAgent) : site.userAgent != null) return false;
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
|
@ -441,7 +354,6 @@ public class Site {
|
||||||
result = 31 * result + (userAgent != null ? userAgent.hashCode() : 0);
|
result = 31 * result + (userAgent != null ? userAgent.hashCode() : 0);
|
||||||
result = 31 * result + (defaultCookies != null ? defaultCookies.hashCode() : 0);
|
result = 31 * result + (defaultCookies != null ? defaultCookies.hashCode() : 0);
|
||||||
result = 31 * result + (charset != null ? charset.hashCode() : 0);
|
result = 31 * result + (charset != null ? charset.hashCode() : 0);
|
||||||
result = 31 * result + (startRequests != null ? startRequests.hashCode() : 0);
|
|
||||||
result = 31 * result + sleepTime;
|
result = 31 * result + sleepTime;
|
||||||
result = 31 * result + retryTimes;
|
result = 31 * result + retryTimes;
|
||||||
result = 31 * result + cycleRetryTimes;
|
result = 31 * result + cycleRetryTimes;
|
||||||
|
@ -458,7 +370,6 @@ public class Site {
|
||||||
", userAgent='" + userAgent + '\'' +
|
", userAgent='" + userAgent + '\'' +
|
||||||
", cookies=" + defaultCookies +
|
", cookies=" + defaultCookies +
|
||||||
", charset='" + charset + '\'' +
|
", charset='" + charset + '\'' +
|
||||||
", startRequests=" + startRequests +
|
|
||||||
", sleepTime=" + sleepTime +
|
", sleepTime=" + sleepTime +
|
||||||
", retryTimes=" + retryTimes +
|
", retryTimes=" + retryTimes +
|
||||||
", cycleRetryTimes=" + cycleRetryTimes +
|
", cycleRetryTimes=" + cycleRetryTimes +
|
||||||
|
@ -468,53 +379,4 @@ public class Site {
|
||||||
'}';
|
'}';
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Set httpProxyPool, String[0]:ip, String[1]:port <br>
|
|
||||||
*
|
|
||||||
* @param proxyPool proxyPool
|
|
||||||
* @return this
|
|
||||||
*/
|
|
||||||
public Site setHttpProxyPool(ProxyPool proxyPool) {
|
|
||||||
this.httpProxyPool = proxyPool;
|
|
||||||
return this;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Set httpProxyPool, String[0]:ip, String[1]:port <br>
|
|
||||||
*
|
|
||||||
* @param httpProxyList httpProxyList
|
|
||||||
* @param isUseLastProxy isUseLastProxy
|
|
||||||
* @return this
|
|
||||||
*/
|
|
||||||
public Site setHttpProxyPool(List<String[]> httpProxyList, boolean isUseLastProxy) {
|
|
||||||
this.httpProxyPool=new SimpleProxyPool(httpProxyList, isUseLastProxy);
|
|
||||||
return this;
|
|
||||||
}
|
|
||||||
|
|
||||||
public Site enableHttpProxyPool() {
|
|
||||||
this.httpProxyPool=new SimpleProxyPool();
|
|
||||||
return this;
|
|
||||||
}
|
|
||||||
|
|
||||||
public UsernamePasswordCredentials getUsernamePasswordCredentials() {
|
|
||||||
return usernamePasswordCredentials;
|
|
||||||
}
|
|
||||||
|
|
||||||
public Site setUsernamePasswordCredentials(UsernamePasswordCredentials usernamePasswordCredentials) {
|
|
||||||
this.usernamePasswordCredentials = usernamePasswordCredentials;
|
|
||||||
return this;
|
|
||||||
}
|
|
||||||
|
|
||||||
public ProxyPool getHttpProxyPool() {
|
|
||||||
return httpProxyPool;
|
|
||||||
}
|
|
||||||
|
|
||||||
public Proxy getHttpProxyFromPool() {
|
|
||||||
return httpProxyPool.getProxy();
|
|
||||||
}
|
|
||||||
|
|
||||||
public void returnHttpProxyToPool(HttpHost proxy,int statusCode) {
|
|
||||||
httpProxyPool.returnProxy(proxy,statusCode);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -126,7 +126,6 @@ public class Spider implements Runnable, Task {
|
||||||
public Spider(PageProcessor pageProcessor) {
|
public Spider(PageProcessor pageProcessor) {
|
||||||
this.pageProcessor = pageProcessor;
|
this.pageProcessor = pageProcessor;
|
||||||
this.site = pageProcessor.getSite();
|
this.site = pageProcessor.getSite();
|
||||||
this.startRequests = pageProcessor.getSite().getStartRequests();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -419,8 +418,6 @@ public class Spider implements Runnable, Task {
|
||||||
pipeline.process(page.getResultItems(), this);
|
pipeline.process(page.getResultItems(), this);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
//for proxy status management
|
|
||||||
request.putExtra(Request.STATUS_CODE, page.getStatusCode());
|
|
||||||
sleep(site.getSleepTime());
|
sleep(site.getSleepTime());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -482,7 +479,9 @@ public class Spider implements Runnable, Task {
|
||||||
public <T> List<T> getAll(Collection<String> urls) {
|
public <T> List<T> getAll(Collection<String> urls) {
|
||||||
destroyWhenExit = false;
|
destroyWhenExit = false;
|
||||||
spawnUrl = false;
|
spawnUrl = false;
|
||||||
startRequests.clear();
|
if (startRequests!=null){
|
||||||
|
startRequests.clear();
|
||||||
|
}
|
||||||
for (Request request : UrlUtils.convertToRequests(urls)) {
|
for (Request request : UrlUtils.convertToRequests(urls)) {
|
||||||
addRequest(request);
|
addRequest(request);
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,46 +1,26 @@
|
||||||
package us.codecraft.webmagic.downloader;
|
package us.codecraft.webmagic.downloader;
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.nio.charset.Charset;
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.HashMap;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Map;
|
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import org.apache.commons.collections.CollectionUtils;
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.http.Header;
|
|
||||||
import org.apache.http.HttpHost;
|
|
||||||
import org.apache.http.HttpResponse;
|
import org.apache.http.HttpResponse;
|
||||||
import org.apache.http.NameValuePair;
|
|
||||||
import org.apache.http.annotation.ThreadSafe;
|
import org.apache.http.annotation.ThreadSafe;
|
||||||
import org.apache.http.client.CookieStore;
|
|
||||||
import org.apache.http.client.config.CookieSpecs;
|
|
||||||
import org.apache.http.client.config.RequestConfig;
|
|
||||||
import org.apache.http.client.entity.UrlEncodedFormEntity;
|
|
||||||
import org.apache.http.client.methods.CloseableHttpResponse;
|
import org.apache.http.client.methods.CloseableHttpResponse;
|
||||||
import org.apache.http.client.methods.HttpUriRequest;
|
|
||||||
import org.apache.http.client.methods.RequestBuilder;
|
|
||||||
import org.apache.http.client.protocol.HttpClientContext;
|
|
||||||
import org.apache.http.cookie.Cookie;
|
|
||||||
import org.apache.http.impl.client.BasicCookieStore;
|
|
||||||
import org.apache.http.impl.client.CloseableHttpClient;
|
import org.apache.http.impl.client.CloseableHttpClient;
|
||||||
import org.apache.http.message.BasicNameValuePair;
|
|
||||||
import org.apache.http.util.EntityUtils;
|
import org.apache.http.util.EntityUtils;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import us.codecraft.webmagic.Page;
|
import us.codecraft.webmagic.Page;
|
||||||
import us.codecraft.webmagic.Request;
|
import us.codecraft.webmagic.Request;
|
||||||
import us.codecraft.webmagic.Site;
|
import us.codecraft.webmagic.Site;
|
||||||
import us.codecraft.webmagic.Task;
|
import us.codecraft.webmagic.Task;
|
||||||
import us.codecraft.webmagic.proxy.Proxy;
|
import us.codecraft.webmagic.proxy.ProxyProvider;
|
||||||
import us.codecraft.webmagic.selector.PlainText;
|
import us.codecraft.webmagic.selector.PlainText;
|
||||||
import us.codecraft.webmagic.utils.CharsetUtils;
|
import us.codecraft.webmagic.utils.CharsetUtils;
|
||||||
import us.codecraft.webmagic.utils.HttpConstant;
|
import us.codecraft.webmagic.utils.HttpClientUtils;
|
||||||
import us.codecraft.webmagic.utils.WMCollections;
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.nio.charset.Charset;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -58,9 +38,23 @@ public class HttpClientDownloader extends AbstractDownloader {
|
||||||
|
|
||||||
private HttpClientGenerator httpClientGenerator = new HttpClientGenerator();
|
private HttpClientGenerator httpClientGenerator = new HttpClientGenerator();
|
||||||
|
|
||||||
private CloseableHttpClient getHttpClient(Site site, Proxy proxy) {
|
private HttpUriRequestConverter httpUriRequestConverter = new HttpUriRequestConverter();
|
||||||
|
|
||||||
|
private ProxyProvider proxyProvider;
|
||||||
|
|
||||||
|
private boolean responseHeader = true;
|
||||||
|
|
||||||
|
public void setHttpUriRequestConverter(HttpUriRequestConverter httpUriRequestConverter) {
|
||||||
|
this.httpUriRequestConverter = httpUriRequestConverter;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setProxyProvider(ProxyProvider proxyProvider) {
|
||||||
|
this.proxyProvider = proxyProvider;
|
||||||
|
}
|
||||||
|
|
||||||
|
private CloseableHttpClient getHttpClient(Site site) {
|
||||||
if (site == null) {
|
if (site == null) {
|
||||||
return httpClientGenerator.getClient(null, proxy);
|
return httpClientGenerator.getClient(null);
|
||||||
}
|
}
|
||||||
String domain = site.getDomain();
|
String domain = site.getDomain();
|
||||||
CloseableHttpClient httpClient = httpClients.get(domain);
|
CloseableHttpClient httpClient = httpClients.get(domain);
|
||||||
|
@ -68,7 +62,7 @@ public class HttpClientDownloader extends AbstractDownloader {
|
||||||
synchronized (this) {
|
synchronized (this) {
|
||||||
httpClient = httpClients.get(domain);
|
httpClient = httpClients.get(domain);
|
||||||
if (httpClient == null) {
|
if (httpClient == null) {
|
||||||
httpClient = httpClientGenerator.getClient(site, proxy);
|
httpClient = httpClientGenerator.getClient(site);
|
||||||
httpClients.put(domain, httpClient);
|
httpClients.put(domain, httpClient);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -78,54 +72,19 @@ public class HttpClientDownloader extends AbstractDownloader {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Page download(Request request, Task task) {
|
public Page download(Request request, Task task) {
|
||||||
Site site = null;
|
if (task == null || task.getSite() == null) {
|
||||||
if (task != null) {
|
throw new NullPointerException("task or site can not be null");
|
||||||
site = task.getSite();
|
|
||||||
}
|
}
|
||||||
Set<Integer> acceptStatCode;
|
logger.debug("downloading page {}", request.getUrl());
|
||||||
String charset = null;
|
|
||||||
Map<String, String> headers = null;
|
|
||||||
if (site != null) {
|
|
||||||
acceptStatCode = site.getAcceptStatCode();
|
|
||||||
charset = site.getCharset();
|
|
||||||
headers = site.getHeaders();
|
|
||||||
} else {
|
|
||||||
acceptStatCode = WMCollections.newHashSet(200);
|
|
||||||
}
|
|
||||||
logger.info("downloading page {}", request.getUrl());
|
|
||||||
CloseableHttpResponse httpResponse = null;
|
CloseableHttpResponse httpResponse = null;
|
||||||
int statusCode = 0;
|
Site site = task.getSite();
|
||||||
|
CloseableHttpClient httpClient = getHttpClient(site);
|
||||||
|
HttpClientRequestContext requestContext = httpUriRequestConverter.convert(request, site, proxyProvider != null ? proxyProvider.getProxy(task) : null);
|
||||||
try {
|
try {
|
||||||
HttpHost proxyHost = null;
|
httpResponse = httpClient.execute(requestContext.getHttpUriRequest(), requestContext.getHttpClientContext());
|
||||||
Proxy proxy = null; //TODO
|
int statusCode = httpResponse.getStatusLine().getStatusCode();
|
||||||
if (site != null && site.getHttpProxyPool() != null && site.getHttpProxyPool().isEnable()) {
|
if (site.getAcceptStatCode().contains(statusCode)) {
|
||||||
proxy = site.getHttpProxyFromPool();
|
Page page = handleResponse(request, site.getCharset(), httpResponse, task);
|
||||||
proxyHost = proxy.getHttpHost();
|
|
||||||
} else if (site != null && site.getHttpProxy() != null){
|
|
||||||
proxyHost = site.getHttpProxy();
|
|
||||||
}
|
|
||||||
|
|
||||||
HttpUriRequest httpUriRequest = getHttpUriRequest(request, site, headers, proxyHost);
|
|
||||||
HttpClientContext context=null;
|
|
||||||
if(request.getCookies()!=null && CollectionUtils.isNotEmpty(request.getCookies())){
|
|
||||||
context=new HttpClientContext();
|
|
||||||
CookieStore cookieStore=new BasicCookieStore();
|
|
||||||
for(Cookie c:request.getCookies()){
|
|
||||||
cookieStore.addCookie(c);
|
|
||||||
}
|
|
||||||
context.setCookieStore(cookieStore);
|
|
||||||
}
|
|
||||||
if(request.getHeaders()!=null && CollectionUtils.isNotEmpty(request.getHeaders())){
|
|
||||||
for(Header h:request.getHeaders()){
|
|
||||||
httpUriRequest.setHeader(h);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
httpResponse = getHttpClient(site, proxy).execute(httpUriRequest,context);
|
|
||||||
statusCode = httpResponse.getStatusLine().getStatusCode();
|
|
||||||
request.putExtra(Request.STATUS_CODE, statusCode);
|
|
||||||
if (statusAccept(acceptStatCode, statusCode)) {
|
|
||||||
Page page = handleResponse(request, charset, httpResponse, task);
|
|
||||||
page.setHeaders(httpResponse.getAllHeaders());
|
|
||||||
onSuccess(request);
|
onSuccess(request);
|
||||||
return page;
|
return page;
|
||||||
} else {
|
} else {
|
||||||
|
@ -144,11 +103,6 @@ public class HttpClientDownloader extends AbstractDownloader {
|
||||||
//ensure the connection is released back to pool
|
//ensure the connection is released back to pool
|
||||||
EntityUtils.consumeQuietly(httpResponse.getEntity());
|
EntityUtils.consumeQuietly(httpResponse.getEntity());
|
||||||
}
|
}
|
||||||
request.putExtra(Request.STATUS_CODE, statusCode);
|
|
||||||
if (site != null && site.getHttpProxyPool() != null && site.getHttpProxyPool().isEnable()) {
|
|
||||||
site.returnHttpProxyToPool((HttpHost) request.getExtra(Request.PROXY), (Integer) request
|
|
||||||
.getExtra(Request.STATUS_CODE));
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -157,91 +111,20 @@ public class HttpClientDownloader extends AbstractDownloader {
|
||||||
httpClientGenerator.setPoolSize(thread);
|
httpClientGenerator.setPoolSize(thread);
|
||||||
}
|
}
|
||||||
|
|
||||||
protected boolean statusAccept(Set<Integer> acceptStatCode, int statusCode) {
|
|
||||||
return acceptStatCode.contains(statusCode);
|
|
||||||
}
|
|
||||||
|
|
||||||
protected HttpUriRequest getHttpUriRequest(Request request, Site site, Map<String, String> headers, HttpHost proxy) {
|
|
||||||
RequestBuilder requestBuilder = selectRequestMethod(request).setUri(request.getUrl());
|
|
||||||
if (headers != null) {
|
|
||||||
for (Map.Entry<String, String> headerEntry : headers.entrySet()) {
|
|
||||||
requestBuilder.addHeader(headerEntry.getKey(), headerEntry.getValue());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
RequestConfig.Builder requestConfigBuilder = RequestConfig.custom();
|
|
||||||
if (site != null) {
|
|
||||||
requestConfigBuilder.setConnectionRequestTimeout(site.getTimeOut())
|
|
||||||
.setSocketTimeout(site.getTimeOut())
|
|
||||||
.setConnectTimeout(site.getTimeOut())
|
|
||||||
.setCookieSpec(CookieSpecs.BEST_MATCH);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (proxy != null) {
|
|
||||||
requestConfigBuilder.setProxy(proxy);
|
|
||||||
request.putExtra(Request.PROXY, proxy);
|
|
||||||
}
|
|
||||||
requestBuilder.setConfig(requestConfigBuilder.build());
|
|
||||||
return requestBuilder.build();
|
|
||||||
}
|
|
||||||
|
|
||||||
protected RequestBuilder selectRequestMethod(Request request) {
|
|
||||||
String method = request.getMethod();
|
|
||||||
if (method == null || method.equalsIgnoreCase(HttpConstant.Method.GET)) {
|
|
||||||
//default get
|
|
||||||
return addQueryParams(RequestBuilder.get(),request.getParams());
|
|
||||||
} else if (method.equalsIgnoreCase(HttpConstant.Method.POST)) {
|
|
||||||
if(request.getEntity()!=null){
|
|
||||||
return RequestBuilder.post().setEntity(request.getEntity());
|
|
||||||
}else{
|
|
||||||
return addFormParams(RequestBuilder.post(), (NameValuePair[]) request.getExtra("nameValuePair"), request.getParams());
|
|
||||||
}
|
|
||||||
} else if (method.equalsIgnoreCase(HttpConstant.Method.HEAD)) {
|
|
||||||
return addQueryParams(RequestBuilder.head(),request.getParams());
|
|
||||||
} else if (method.equalsIgnoreCase(HttpConstant.Method.PUT)) {
|
|
||||||
return addFormParams(RequestBuilder.put(), (NameValuePair[]) request.getExtra("nameValuePair"), request.getParams());
|
|
||||||
} else if (method.equalsIgnoreCase(HttpConstant.Method.DELETE)) {
|
|
||||||
return addQueryParams(RequestBuilder.delete(),request.getParams());
|
|
||||||
} else if (method.equalsIgnoreCase(HttpConstant.Method.TRACE)) {
|
|
||||||
return addQueryParams(RequestBuilder.trace(),request.getParams());
|
|
||||||
}
|
|
||||||
throw new IllegalArgumentException("Illegal HTTP Method " + method);
|
|
||||||
}
|
|
||||||
|
|
||||||
private RequestBuilder addFormParams(RequestBuilder requestBuilder, NameValuePair[] nameValuePair, Map<String, String> params) {
|
|
||||||
List<NameValuePair> allNameValuePair=new ArrayList<NameValuePair>();
|
|
||||||
if (nameValuePair != null && nameValuePair.length > 0) {
|
|
||||||
allNameValuePair= Arrays.asList(nameValuePair);
|
|
||||||
}
|
|
||||||
if (params != null) {
|
|
||||||
for (String key : params.keySet()) {
|
|
||||||
allNameValuePair.add(new BasicNameValuePair(key, params.get(key)));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
requestBuilder.setEntity(new UrlEncodedFormEntity(allNameValuePair, Charset.forName("utf8")));
|
|
||||||
return requestBuilder;
|
|
||||||
}
|
|
||||||
|
|
||||||
private RequestBuilder addQueryParams(RequestBuilder requestBuilder, Map<String, String> params) {
|
|
||||||
if (params != null) {
|
|
||||||
for (Map.Entry<String, String> entry : params.entrySet()) {
|
|
||||||
requestBuilder.addParameter(entry.getKey(), entry.getValue());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return requestBuilder;
|
|
||||||
}
|
|
||||||
|
|
||||||
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
|
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
|
||||||
String content = getContent(charset, httpResponse);
|
String content = getResponseContent(charset, httpResponse);
|
||||||
Page page = new Page();
|
Page page = new Page();
|
||||||
page.setRawText(content);
|
page.setRawText(content);
|
||||||
page.setUrl(new PlainText(request.getUrl()));
|
page.setUrl(new PlainText(request.getUrl()));
|
||||||
page.setRequest(request);
|
page.setRequest(request);
|
||||||
page.setStatusCode(httpResponse.getStatusLine().getStatusCode());
|
page.setStatusCode(httpResponse.getStatusLine().getStatusCode());
|
||||||
|
if (responseHeader) {
|
||||||
|
page.setHeaders(HttpClientUtils.convertHeaders(httpResponse.getAllHeaders()));
|
||||||
|
}
|
||||||
return page;
|
return page;
|
||||||
}
|
}
|
||||||
|
|
||||||
protected String getContent(String charset, HttpResponse httpResponse) throws IOException {
|
private String getResponseContent(String charset, HttpResponse httpResponse) throws IOException {
|
||||||
if (charset == null) {
|
if (charset == null) {
|
||||||
byte[] contentBytes = IOUtils.toByteArray(httpResponse.getEntity().getContent());
|
byte[] contentBytes = IOUtils.toByteArray(httpResponse.getEntity().getContent());
|
||||||
String htmlCharset = getHtmlCharset(httpResponse, contentBytes);
|
String htmlCharset = getHtmlCharset(httpResponse, contentBytes);
|
||||||
|
@ -256,7 +139,7 @@ public class HttpClientDownloader extends AbstractDownloader {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
protected String getHtmlCharset(HttpResponse httpResponse, byte[] contentBytes) throws IOException {
|
private String getHtmlCharset(HttpResponse httpResponse, byte[] contentBytes) throws IOException {
|
||||||
return CharsetUtils.detectCharset(httpResponse.getEntity().getContentType().getValue(), contentBytes);
|
return CharsetUtils.detectCharset(httpResponse.getEntity().getContentType().getValue(), contentBytes);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,13 +1,9 @@
|
||||||
package us.codecraft.webmagic.downloader;
|
package us.codecraft.webmagic.downloader;
|
||||||
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
|
||||||
import org.apache.http.HttpException;
|
import org.apache.http.HttpException;
|
||||||
import org.apache.http.HttpRequest;
|
import org.apache.http.HttpRequest;
|
||||||
import org.apache.http.HttpRequestInterceptor;
|
import org.apache.http.HttpRequestInterceptor;
|
||||||
import org.apache.http.auth.AuthScope;
|
|
||||||
import org.apache.http.auth.UsernamePasswordCredentials;
|
|
||||||
import org.apache.http.client.CookieStore;
|
import org.apache.http.client.CookieStore;
|
||||||
import org.apache.http.client.CredentialsProvider;
|
|
||||||
import org.apache.http.config.Registry;
|
import org.apache.http.config.Registry;
|
||||||
import org.apache.http.config.RegistryBuilder;
|
import org.apache.http.config.RegistryBuilder;
|
||||||
import org.apache.http.config.SocketConfig;
|
import org.apache.http.config.SocketConfig;
|
||||||
|
@ -21,7 +17,6 @@ import org.apache.http.protocol.HttpContext;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
import us.codecraft.webmagic.Site;
|
import us.codecraft.webmagic.Site;
|
||||||
import us.codecraft.webmagic.proxy.Proxy;
|
|
||||||
|
|
||||||
import javax.net.ssl.SSLContext;
|
import javax.net.ssl.SSLContext;
|
||||||
import javax.net.ssl.TrustManager;
|
import javax.net.ssl.TrustManager;
|
||||||
|
@ -92,38 +87,20 @@ public class HttpClientGenerator {
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
public CloseableHttpClient getClient(Site site, Proxy proxy) {
|
public CloseableHttpClient getClient(Site site) {
|
||||||
return generateClient(site, proxy);
|
return generateClient(site);
|
||||||
}
|
}
|
||||||
|
|
||||||
private CloseableHttpClient generateClient(Site site, Proxy proxy) {
|
private CloseableHttpClient generateClient(Site site) {
|
||||||
CredentialsProvider credsProvider = null;
|
|
||||||
HttpClientBuilder httpClientBuilder = HttpClients.custom();
|
HttpClientBuilder httpClientBuilder = HttpClients.custom();
|
||||||
|
|
||||||
if (proxy != null && StringUtils.isNotBlank(proxy.getUser()) && StringUtils.isNotBlank(proxy.getPassword()))
|
|
||||||
{
|
|
||||||
credsProvider= new BasicCredentialsProvider();
|
|
||||||
credsProvider.setCredentials(
|
|
||||||
new AuthScope(proxy.getHttpHost().getAddress().getHostAddress(), proxy.getHttpHost().getPort()),
|
|
||||||
new UsernamePasswordCredentials(proxy.getUser(), proxy.getPassword()));
|
|
||||||
httpClientBuilder.setDefaultCredentialsProvider(credsProvider);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (site != null && site.getHttpProxy()!= null && site.getUsernamePasswordCredentials() != null){
|
|
||||||
credsProvider = new BasicCredentialsProvider();
|
|
||||||
credsProvider.setCredentials(
|
|
||||||
new AuthScope(site.getHttpProxy()),//可以访问的范围
|
|
||||||
site.getUsernamePasswordCredentials());//用户名和密码
|
|
||||||
httpClientBuilder.setDefaultCredentialsProvider(credsProvider);
|
|
||||||
}
|
|
||||||
|
|
||||||
httpClientBuilder.setConnectionManager(connectionManager);
|
httpClientBuilder.setConnectionManager(connectionManager);
|
||||||
if (site != null && site.getUserAgent() != null) {
|
if (site.getUserAgent() != null) {
|
||||||
httpClientBuilder.setUserAgent(site.getUserAgent());
|
httpClientBuilder.setUserAgent(site.getUserAgent());
|
||||||
} else {
|
} else {
|
||||||
httpClientBuilder.setUserAgent("");
|
httpClientBuilder.setUserAgent("");
|
||||||
}
|
}
|
||||||
if (site == null || site.isUseGzip()) {
|
if (site.isUseGzip()) {
|
||||||
httpClientBuilder.addInterceptorFirst(new HttpRequestInterceptor() {
|
httpClientBuilder.addInterceptorFirst(new HttpRequestInterceptor() {
|
||||||
|
|
||||||
public void process(
|
public void process(
|
||||||
|
@ -140,16 +117,12 @@ public class HttpClientGenerator {
|
||||||
|
|
||||||
SocketConfig.Builder socketConfigBuilder = SocketConfig.custom();
|
SocketConfig.Builder socketConfigBuilder = SocketConfig.custom();
|
||||||
socketConfigBuilder.setSoKeepAlive(true).setTcpNoDelay(true);
|
socketConfigBuilder.setSoKeepAlive(true).setTcpNoDelay(true);
|
||||||
if (site != null) {
|
socketConfigBuilder.setSoTimeout(site.getTimeOut());
|
||||||
socketConfigBuilder.setSoTimeout(site.getTimeOut());
|
|
||||||
}
|
|
||||||
SocketConfig socketConfig = socketConfigBuilder.build();
|
SocketConfig socketConfig = socketConfigBuilder.build();
|
||||||
httpClientBuilder.setDefaultSocketConfig(socketConfig);
|
httpClientBuilder.setDefaultSocketConfig(socketConfig);
|
||||||
connectionManager.setDefaultSocketConfig(socketConfig);
|
connectionManager.setDefaultSocketConfig(socketConfig);
|
||||||
if (site != null) {
|
httpClientBuilder.setRetryHandler(new DefaultHttpRequestRetryHandler(site.getRetryTimes(), true));
|
||||||
httpClientBuilder.setRetryHandler(new DefaultHttpRequestRetryHandler(site.getRetryTimes(), true));
|
generateCookie(httpClientBuilder, site);
|
||||||
generateCookie(httpClientBuilder, site);
|
|
||||||
}
|
|
||||||
return httpClientBuilder.build();
|
return httpClientBuilder.build();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,34 @@
|
||||||
|
package us.codecraft.webmagic.downloader;
|
||||||
|
|
||||||
|
import org.apache.http.client.methods.HttpUriRequest;
|
||||||
|
import org.apache.http.client.protocol.HttpClientContext;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author code4crafter@gmail.com
|
||||||
|
* Date: 17/4/8
|
||||||
|
* Time: 19:43
|
||||||
|
* @since 0.7.0
|
||||||
|
*/
|
||||||
|
public class HttpClientRequestContext {
|
||||||
|
|
||||||
|
private HttpUriRequest httpUriRequest;
|
||||||
|
|
||||||
|
private HttpClientContext httpClientContext;
|
||||||
|
|
||||||
|
public HttpUriRequest getHttpUriRequest() {
|
||||||
|
return httpUriRequest;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setHttpUriRequest(HttpUriRequest httpUriRequest) {
|
||||||
|
this.httpUriRequest = httpUriRequest;
|
||||||
|
}
|
||||||
|
|
||||||
|
public HttpClientContext getHttpClientContext() {
|
||||||
|
return httpClientContext;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setHttpClientContext(HttpClientContext httpClientContext) {
|
||||||
|
this.httpClientContext = httpClientContext;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,116 @@
|
||||||
|
package us.codecraft.webmagic.downloader;
|
||||||
|
|
||||||
|
import org.apache.http.HttpHost;
|
||||||
|
import org.apache.http.auth.AuthState;
|
||||||
|
import org.apache.http.auth.UsernamePasswordCredentials;
|
||||||
|
import org.apache.http.client.CookieStore;
|
||||||
|
import org.apache.http.client.config.CookieSpecs;
|
||||||
|
import org.apache.http.client.config.RequestConfig;
|
||||||
|
import org.apache.http.client.methods.HttpUriRequest;
|
||||||
|
import org.apache.http.client.methods.RequestBuilder;
|
||||||
|
import org.apache.http.client.protocol.HttpClientContext;
|
||||||
|
import org.apache.http.entity.ByteArrayEntity;
|
||||||
|
import org.apache.http.impl.auth.BasicScheme;
|
||||||
|
import org.apache.http.impl.client.BasicCookieStore;
|
||||||
|
import org.apache.http.impl.cookie.BasicClientCookie;
|
||||||
|
import us.codecraft.webmagic.Request;
|
||||||
|
import us.codecraft.webmagic.Site;
|
||||||
|
import us.codecraft.webmagic.proxy.Proxy;
|
||||||
|
import us.codecraft.webmagic.utils.HttpConstant;
|
||||||
|
import us.codecraft.webmagic.utils.UrlUtils;
|
||||||
|
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author code4crafter@gmail.com
|
||||||
|
* Date: 17/3/18
|
||||||
|
* Time: 11:28
|
||||||
|
*
|
||||||
|
* @since 0.7.0
|
||||||
|
*/
|
||||||
|
public class HttpUriRequestConverter {
|
||||||
|
|
||||||
|
public HttpClientRequestContext convert(Request request, Site site, Proxy proxy) {
|
||||||
|
HttpClientRequestContext httpClientRequestContext = new HttpClientRequestContext();
|
||||||
|
httpClientRequestContext.setHttpUriRequest(convertHttpUriRequest(request, site, proxy));
|
||||||
|
httpClientRequestContext.setHttpClientContext(convertHttpClientContext(request, site, proxy));
|
||||||
|
return httpClientRequestContext;
|
||||||
|
}
|
||||||
|
|
||||||
|
private HttpClientContext convertHttpClientContext(Request request, Site site, Proxy proxy) {
|
||||||
|
HttpClientContext httpContext = new HttpClientContext();
|
||||||
|
if (proxy != null) {
|
||||||
|
AuthState authState = new AuthState();
|
||||||
|
authState.update(new BasicScheme(), new UsernamePasswordCredentials(proxy.getUsername(), proxy.getPassword()));
|
||||||
|
httpContext.setAttribute(HttpClientContext.PROXY_AUTH_STATE, authState);
|
||||||
|
}
|
||||||
|
if (request.getCookies() != null && !request.getCookies().isEmpty()) {
|
||||||
|
CookieStore cookieStore = new BasicCookieStore();
|
||||||
|
for (Map.Entry<String, String> cookieEntry : request.getCookies().entrySet()) {
|
||||||
|
BasicClientCookie cookie1 = new BasicClientCookie(cookieEntry.getKey(), cookieEntry.getValue());
|
||||||
|
cookie1.setDomain(UrlUtils.removePort(UrlUtils.getDomain(request.getUrl())));
|
||||||
|
cookieStore.addCookie(cookie1);
|
||||||
|
}
|
||||||
|
httpContext.setCookieStore(cookieStore);
|
||||||
|
}
|
||||||
|
return httpContext;
|
||||||
|
}
|
||||||
|
|
||||||
|
private HttpUriRequest convertHttpUriRequest(Request request, Site site, Proxy proxy) {
|
||||||
|
RequestBuilder requestBuilder = selectRequestMethod(request).setUri(request.getUrl());
|
||||||
|
if (site.getHeaders() != null) {
|
||||||
|
for (Map.Entry<String, String> headerEntry : site.getHeaders().entrySet()) {
|
||||||
|
requestBuilder.addHeader(headerEntry.getKey(), headerEntry.getValue());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
RequestConfig.Builder requestConfigBuilder = RequestConfig.custom();
|
||||||
|
if (site != null) {
|
||||||
|
requestConfigBuilder.setConnectionRequestTimeout(site.getTimeOut())
|
||||||
|
.setSocketTimeout(site.getTimeOut())
|
||||||
|
.setConnectTimeout(site.getTimeOut())
|
||||||
|
.setCookieSpec(CookieSpecs.STANDARD);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (proxy != null) {
|
||||||
|
requestConfigBuilder.setProxy(new HttpHost(proxy.getHost(), proxy.getPort()));
|
||||||
|
}
|
||||||
|
requestBuilder.setConfig(requestConfigBuilder.build());
|
||||||
|
HttpUriRequest httpUriRequest = requestBuilder.build();
|
||||||
|
if (request.getHeaders() != null && !request.getHeaders().isEmpty()) {
|
||||||
|
for (Map.Entry<String, String> header : request.getHeaders().entrySet()) {
|
||||||
|
httpUriRequest.addHeader(header.getKey(), header.getValue());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return httpUriRequest;
|
||||||
|
}
|
||||||
|
|
||||||
|
private RequestBuilder selectRequestMethod(Request request) {
|
||||||
|
String method = request.getMethod();
|
||||||
|
if (method == null || method.equalsIgnoreCase(HttpConstant.Method.GET)) {
|
||||||
|
//default get
|
||||||
|
return RequestBuilder.get();
|
||||||
|
} else if (method.equalsIgnoreCase(HttpConstant.Method.POST)) {
|
||||||
|
return addFormParams(RequestBuilder.post(),request);
|
||||||
|
} else if (method.equalsIgnoreCase(HttpConstant.Method.HEAD)) {
|
||||||
|
return RequestBuilder.head();
|
||||||
|
} else if (method.equalsIgnoreCase(HttpConstant.Method.PUT)) {
|
||||||
|
return addFormParams(RequestBuilder.put(), request);
|
||||||
|
} else if (method.equalsIgnoreCase(HttpConstant.Method.DELETE)) {
|
||||||
|
return RequestBuilder.delete();
|
||||||
|
} else if (method.equalsIgnoreCase(HttpConstant.Method.TRACE)) {
|
||||||
|
return RequestBuilder.trace();
|
||||||
|
}
|
||||||
|
throw new IllegalArgumentException("Illegal HTTP Method " + method);
|
||||||
|
}
|
||||||
|
|
||||||
|
private RequestBuilder addFormParams(RequestBuilder requestBuilder, Request request) {
|
||||||
|
if (request.getRequestBody() != null) {
|
||||||
|
ByteArrayEntity entity = new ByteArrayEntity(request.getRequestBody().getBody());
|
||||||
|
entity.setContentType(request.getRequestBody().getContentType());
|
||||||
|
requestBuilder.setEntity(entity);
|
||||||
|
}
|
||||||
|
return requestBuilder;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,72 @@
|
||||||
|
package us.codecraft.webmagic.model;
|
||||||
|
|
||||||
|
import org.apache.http.NameValuePair;
|
||||||
|
import org.apache.http.client.utils.URLEncodedUtils;
|
||||||
|
import org.apache.http.message.BasicNameValuePair;
|
||||||
|
|
||||||
|
import java.io.UnsupportedEncodingException;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author code4crafter@gmail.com
|
||||||
|
* Date: 17/4/8
|
||||||
|
*/
|
||||||
|
public class HttpRequestBody {
|
||||||
|
|
||||||
|
public static abstract class ContentType {
|
||||||
|
|
||||||
|
public static final String JSON = "application/json";
|
||||||
|
|
||||||
|
public static final String XML = "text/xml";
|
||||||
|
|
||||||
|
public static final String FORM = "application/x-www-form-urlencoded";
|
||||||
|
|
||||||
|
public static final String MULTIPART = "multipart/form-data";
|
||||||
|
}
|
||||||
|
|
||||||
|
private final byte[] body;
|
||||||
|
|
||||||
|
private final String contentType;
|
||||||
|
|
||||||
|
private final String encoding;
|
||||||
|
|
||||||
|
public HttpRequestBody(byte[] body, String contentType, String encoding) {
|
||||||
|
this.body = body;
|
||||||
|
this.contentType = contentType;
|
||||||
|
this.encoding = encoding;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getContentType() {
|
||||||
|
return contentType;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getEncoding() {
|
||||||
|
return encoding;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static HttpRequestBody json(String json, String encoding) throws UnsupportedEncodingException {
|
||||||
|
return new HttpRequestBody(json.getBytes(encoding), ContentType.JSON, encoding);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static HttpRequestBody xml(String xml, String encoding) throws UnsupportedEncodingException {
|
||||||
|
return new HttpRequestBody(xml.getBytes(encoding), ContentType.XML, encoding);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static HttpRequestBody custom(byte[] body, String contentType, String encoding) throws UnsupportedEncodingException {
|
||||||
|
return new HttpRequestBody(body, contentType, encoding);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static HttpRequestBody form(Map<String,Object> params, String encoding) throws UnsupportedEncodingException {
|
||||||
|
List<NameValuePair> nameValuePairs = new ArrayList<NameValuePair>(params.size());
|
||||||
|
for (Map.Entry<String, Object> entry : params.entrySet()) {
|
||||||
|
nameValuePairs.add(new BasicNameValuePair(entry.getKey(), String.valueOf(entry.getValue())));
|
||||||
|
}
|
||||||
|
return new HttpRequestBody(URLEncodedUtils.format(nameValuePairs, encoding).getBytes(encoding), ContentType.FORM, encoding);
|
||||||
|
}
|
||||||
|
|
||||||
|
public byte[] getBody() {
|
||||||
|
return body;
|
||||||
|
}
|
||||||
|
}
|
|
@ -2,7 +2,6 @@ package us.codecraft.webmagic.processor;
|
||||||
|
|
||||||
import us.codecraft.webmagic.Page;
|
import us.codecraft.webmagic.Page;
|
||||||
import us.codecraft.webmagic.Site;
|
import us.codecraft.webmagic.Site;
|
||||||
import us.codecraft.webmagic.utils.UrlUtils;
|
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
|
@ -18,9 +17,8 @@ public class SimplePageProcessor implements PageProcessor {
|
||||||
|
|
||||||
private Site site;
|
private Site site;
|
||||||
|
|
||||||
public SimplePageProcessor(String startUrl, String urlPattern) {
|
public SimplePageProcessor(String urlPattern) {
|
||||||
this.site = Site.me().addStartUrl(startUrl).
|
this.site = Site.me();
|
||||||
setDomain(UrlUtils.getDomain(startUrl));
|
|
||||||
//compile "*" expression to regex
|
//compile "*" expression to regex
|
||||||
this.urlPattern = "(" + urlPattern.replace(".", "\\.").replace("*", "[^\"'#]*") + ")";
|
this.urlPattern = "(" + urlPattern.replace(".", "\\.").replace("*", "[^\"'#]*") + ")";
|
||||||
|
|
||||||
|
|
|
@ -1,199 +1,41 @@
|
||||||
package us.codecraft.webmagic.proxy;
|
package us.codecraft.webmagic.proxy;
|
||||||
|
|
||||||
import org.apache.http.HttpHost;
|
|
||||||
|
|
||||||
import java.io.Serializable;
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.concurrent.Delayed;
|
|
||||||
import java.util.concurrent.TimeUnit;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* >>>> Proxy lifecycle
|
|
||||||
|
|
||||||
+----------+ +-----+
|
|
||||||
| last use | | new |
|
|
||||||
+-----+----+ +---+-+
|
|
||||||
| +------+ |
|
|
||||||
+->| init |<--+
|
|
||||||
+--+---+
|
|
||||||
|
|
|
||||||
v
|
|
||||||
+--------+
|
|
||||||
+--->| borrow |
|
|
||||||
| +---+----+
|
|
||||||
| |+------------------+
|
|
||||||
| v
|
|
||||||
| +--------+
|
|
||||||
| | in use | Response Time
|
|
||||||
| +---+----+
|
|
||||||
| |+------------------+
|
|
||||||
| v
|
|
||||||
| +--------+
|
|
||||||
| | return |
|
|
||||||
| +---+----+
|
|
||||||
| |+-------------------+
|
|
||||||
| v
|
|
||||||
| +-------+ reuse interval
|
|
||||||
| | delay | (delay time)
|
|
||||||
| +---+---+
|
|
||||||
| |+-------------------+
|
|
||||||
| v
|
|
||||||
| +------+
|
|
||||||
| | idle | idle time
|
|
||||||
| +---+--+
|
|
||||||
| |+-------------------+
|
|
||||||
+--------+
|
|
||||||
*/
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Object has these status of lifecycle above.<br>
|
|
||||||
*
|
*
|
||||||
* @author yxssfxwzy@sina.com <br>
|
|
||||||
* @since 0.5.1
|
|
||||||
* @see SimpleProxyPool
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
public class Proxy implements Delayed, Serializable {
|
public class Proxy {
|
||||||
|
|
||||||
private static final long serialVersionUID = 228939737383625551L;
|
private String host;
|
||||||
public static final int ERROR_403 = 403;
|
private int port;
|
||||||
public static final int ERROR_404 = 404;
|
private String username;
|
||||||
public static final int ERROR_BANNED = 10000;// banned by website
|
|
||||||
public static final int ERROR_Proxy = 10001;// the proxy itself failed
|
|
||||||
public static final int SUCCESS = 200;
|
|
||||||
|
|
||||||
private final HttpHost httpHost;
|
|
||||||
private String user;
|
|
||||||
private String password;
|
private String password;
|
||||||
|
|
||||||
|
|
||||||
private int reuseTimeInterval = 1500;// ms
|
public Proxy(String host, int port) {
|
||||||
private Long canReuseTime = 0L;
|
this.host = host;
|
||||||
private Long lastBorrowTime = System.currentTimeMillis();
|
this.port = port;
|
||||||
private Long responseTime = 0L;
|
}
|
||||||
|
|
||||||
private int failedNum = 0;
|
public Proxy(String host, int port, String username, String password) {
|
||||||
private int successNum = 0;
|
this.host = host;
|
||||||
private int borrowNum = 0;
|
this.port = port;
|
||||||
|
this.username = username;
|
||||||
private List<Integer> failedErrorType = new ArrayList<Integer>();
|
|
||||||
|
|
||||||
public Proxy(HttpHost httpHost, String user, String password) {
|
|
||||||
this.httpHost = httpHost;
|
|
||||||
this.user = user;
|
|
||||||
this.password = password;
|
this.password = password;
|
||||||
this.canReuseTime = System.nanoTime() + TimeUnit.NANOSECONDS.convert(reuseTimeInterval, TimeUnit.MILLISECONDS);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public Proxy(HttpHost httpHost, int reuseInterval, String user, String password) {
|
public String getHost() {
|
||||||
this.httpHost = httpHost;
|
return host;
|
||||||
this.user = user;
|
|
||||||
this.password = password;
|
|
||||||
this.canReuseTime = System.nanoTime() + TimeUnit.NANOSECONDS.convert(reuseInterval, TimeUnit.MILLISECONDS);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public int getSuccessNum() {
|
public int getPort() {
|
||||||
return successNum;
|
return port;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void successNumIncrement(int increment) {
|
public String getUsername() {
|
||||||
this.successNum += increment;
|
return username;
|
||||||
}
|
}
|
||||||
|
|
||||||
public Long getLastUseTime() {
|
public String getPassword() {
|
||||||
return lastBorrowTime;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setLastBorrowTime(Long lastBorrowTime) {
|
|
||||||
this.lastBorrowTime = lastBorrowTime;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void recordResponse() {
|
|
||||||
this.responseTime = (System.currentTimeMillis() - lastBorrowTime + responseTime) / 2;
|
|
||||||
this.lastBorrowTime = System.currentTimeMillis();
|
|
||||||
}
|
|
||||||
|
|
||||||
public List<Integer> getFailedErrorType() {
|
|
||||||
return failedErrorType;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setFailedErrorType(List<Integer> failedErrorType) {
|
|
||||||
this.failedErrorType = failedErrorType;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void fail(int failedErrorType) {
|
|
||||||
this.failedNum++;
|
|
||||||
this.failedErrorType.add(failedErrorType);
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setFailedNum(int failedNum) {
|
|
||||||
this.failedNum = failedNum;
|
|
||||||
}
|
|
||||||
|
|
||||||
public int getFailedNum() {
|
|
||||||
return failedNum;
|
|
||||||
}
|
|
||||||
|
|
||||||
public String getFailedType() {
|
|
||||||
String re = "";
|
|
||||||
for (Integer i : this.failedErrorType) {
|
|
||||||
re += i + " . ";
|
|
||||||
}
|
|
||||||
return re;
|
|
||||||
}
|
|
||||||
|
|
||||||
public HttpHost getHttpHost() {
|
|
||||||
return httpHost;
|
|
||||||
}
|
|
||||||
|
|
||||||
public int getReuseTimeInterval() {
|
|
||||||
return reuseTimeInterval;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setReuseTimeInterval(int reuseTimeInterval) {
|
|
||||||
this.reuseTimeInterval = reuseTimeInterval;
|
|
||||||
this.canReuseTime = System.nanoTime() + TimeUnit.NANOSECONDS.convert(reuseTimeInterval, TimeUnit.MILLISECONDS);
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public long getDelay(TimeUnit unit) {
|
|
||||||
return unit.convert(canReuseTime - System.nanoTime(), TimeUnit.NANOSECONDS);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public int compareTo(Delayed o) {
|
|
||||||
Proxy that = (Proxy) o;
|
|
||||||
return canReuseTime > that.canReuseTime ? 1 : (canReuseTime < that.canReuseTime ? -1 : 0);
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public String toString() {
|
|
||||||
|
|
||||||
String re = String.format("host: %15s >> %5dms >> success: %-3.2f%% >> borrow: %d", httpHost.getAddress().getHostAddress(), responseTime,
|
|
||||||
successNum * 100.0 / borrowNum, borrowNum);
|
|
||||||
return re;
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
public String getUser()
|
|
||||||
{
|
|
||||||
return user;
|
|
||||||
|
|
||||||
}
|
|
||||||
public String getPassword()
|
|
||||||
{
|
|
||||||
return password;
|
return password;
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
public void borrowNumIncrement(int increment) {
|
|
||||||
this.borrowNum += increment;
|
|
||||||
}
|
|
||||||
|
|
||||||
public int getBorrowNum() {
|
|
||||||
return borrowNum;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,12 +0,0 @@
|
||||||
package us.codecraft.webmagic.proxy;
|
|
||||||
|
|
||||||
import org.apache.http.HttpHost;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Created by edwardsbean on 15-2-28.
|
|
||||||
*/
|
|
||||||
public interface ProxyPool {
|
|
||||||
public void returnProxy(HttpHost host, int statusCode);
|
|
||||||
public Proxy getProxy();
|
|
||||||
public boolean isEnable();
|
|
||||||
}
|
|
|
@ -0,0 +1,14 @@
|
||||||
|
package us.codecraft.webmagic.proxy;
|
||||||
|
|
||||||
|
import us.codecraft.webmagic.Task;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Created by edwardsbean on 15-2-28.
|
||||||
|
*/
|
||||||
|
public interface ProxyProvider {
|
||||||
|
|
||||||
|
void returnProxy(Proxy proxy, boolean banned, Task task);
|
||||||
|
|
||||||
|
Proxy getProxy(Task task);
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,13 @@
|
||||||
|
package us.codecraft.webmagic.proxy;
|
||||||
|
|
||||||
|
import org.apache.http.HttpResponse;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author code4crafter@gmail.com
|
||||||
|
* Date: 17/3/20
|
||||||
|
* Time: 下午10:52
|
||||||
|
*/
|
||||||
|
public interface ResponseChecker {
|
||||||
|
|
||||||
|
boolean isBanned(HttpResponse httpResponse);
|
||||||
|
}
|
|
@ -1,310 +0,0 @@
|
||||||
package us.codecraft.webmagic.proxy;
|
|
||||||
|
|
||||||
import org.apache.http.HttpHost;
|
|
||||||
import org.slf4j.Logger;
|
|
||||||
import org.slf4j.LoggerFactory;
|
|
||||||
import us.codecraft.webmagic.utils.FilePersistentBase;
|
|
||||||
import us.codecraft.webmagic.utils.ProxyUtils;
|
|
||||||
|
|
||||||
import java.io.*;
|
|
||||||
import java.net.InetAddress;
|
|
||||||
import java.net.UnknownHostException;
|
|
||||||
import java.util.*;
|
|
||||||
import java.util.Map.Entry;
|
|
||||||
import java.util.concurrent.BlockingQueue;
|
|
||||||
import java.util.concurrent.ConcurrentHashMap;
|
|
||||||
import java.util.concurrent.DelayQueue;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Pooled Proxy Object
|
|
||||||
*
|
|
||||||
* @author yxssfxwzy@sina.com <br>
|
|
||||||
* @see Proxy
|
|
||||||
* @since 0.5.1
|
|
||||||
*/
|
|
||||||
public class SimpleProxyPool implements ProxyPool {
|
|
||||||
|
|
||||||
private Logger logger = LoggerFactory.getLogger(getClass());
|
|
||||||
|
|
||||||
private BlockingQueue<Proxy> proxyQueue = new DelayQueue<Proxy>();
|
|
||||||
private Map<String, Proxy> allProxy = new ConcurrentHashMap<String, Proxy>();
|
|
||||||
|
|
||||||
private int reuseInterval = 1500;// ms
|
|
||||||
private int reviveTime = 2 * 60 * 60 * 1000;// ms
|
|
||||||
private int saveProxyInterval = 10 * 60 * 1000;// ms
|
|
||||||
|
|
||||||
private boolean isEnable = false;
|
|
||||||
private boolean validateWhenInit = false;
|
|
||||||
// private boolean isUseLastProxy = true;
|
|
||||||
private String proxyFilePath = "/data/webmagic/lastUse.proxy";
|
|
||||||
|
|
||||||
private FilePersistentBase fBase = new FilePersistentBase();
|
|
||||||
|
|
||||||
private Timer timer = new Timer(true);
|
|
||||||
private TimerTask saveProxyTask = new TimerTask() {
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void run() {
|
|
||||||
saveProxyList();
|
|
||||||
logger.info(allProxyStatus());
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
public SimpleProxyPool() {
|
|
||||||
this(null, true);
|
|
||||||
}
|
|
||||||
|
|
||||||
public SimpleProxyPool(List<String[]> httpProxyList) {
|
|
||||||
this(httpProxyList, true);
|
|
||||||
}
|
|
||||||
|
|
||||||
public SimpleProxyPool(List<String[]> httpProxyList, boolean isUseLastProxy) {
|
|
||||||
if (httpProxyList != null) {
|
|
||||||
addProxy(httpProxyList.toArray(new String[httpProxyList.size()][]));
|
|
||||||
}
|
|
||||||
if (isUseLastProxy) {
|
|
||||||
if (!new File(proxyFilePath).exists()) {
|
|
||||||
setFilePath();
|
|
||||||
}
|
|
||||||
readProxyList();
|
|
||||||
timer.schedule(saveProxyTask, 0, saveProxyInterval);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private void setFilePath() {
|
|
||||||
String tmpDir = System.getProperty("java.io.tmpdir");
|
|
||||||
String path = tmpDir + FilePersistentBase.PATH_SEPERATOR + "webmagic" + FilePersistentBase.PATH_SEPERATOR + "lastUse.proxy";
|
|
||||||
if (tmpDir != null && new File(tmpDir).isDirectory()) {
|
|
||||||
fBase.setPath(tmpDir + FilePersistentBase.PATH_SEPERATOR + "webmagic");
|
|
||||||
File f = fBase.getFile(path);
|
|
||||||
if (!f.exists()) {
|
|
||||||
try {
|
|
||||||
f.createNewFile();
|
|
||||||
|
|
||||||
} catch (IOException e) {
|
|
||||||
logger.error("proxy file create error", e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
} else {
|
|
||||||
logger.error("java tmp dir not exists");
|
|
||||||
}
|
|
||||||
this.proxyFilePath = path;
|
|
||||||
}
|
|
||||||
|
|
||||||
private void saveProxyList() {
|
|
||||||
if (allProxy.size() == 0) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
try {
|
|
||||||
ObjectOutputStream os = new ObjectOutputStream(new FileOutputStream(fBase.getFile(proxyFilePath)));
|
|
||||||
os.writeObject(prepareForSaving());
|
|
||||||
os.close();
|
|
||||||
logger.info("save proxy");
|
|
||||||
} catch (FileNotFoundException e) {
|
|
||||||
logger.error("proxy file not found", e);
|
|
||||||
} catch (IOException e) {
|
|
||||||
e.printStackTrace();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private Map<String, Proxy> prepareForSaving() {
|
|
||||||
Map<String, Proxy> tmp = new HashMap<String, Proxy>();
|
|
||||||
for (Entry<String, Proxy> e : allProxy.entrySet()) {
|
|
||||||
Proxy p = e.getValue();
|
|
||||||
p.setFailedNum(0);
|
|
||||||
tmp.put(e.getKey(), p);
|
|
||||||
}
|
|
||||||
return tmp;
|
|
||||||
}
|
|
||||||
|
|
||||||
private void readProxyList() {
|
|
||||||
try {
|
|
||||||
ObjectInputStream is = new ObjectInputStream(new FileInputStream(fBase.getFile(proxyFilePath)));
|
|
||||||
addProxy((Map<String, Proxy>) is.readObject());
|
|
||||||
is.close();
|
|
||||||
} catch (FileNotFoundException e) {
|
|
||||||
logger.info("last use proxy file not found", e);
|
|
||||||
} catch (IOException e) {
|
|
||||||
// e.printStackTrace();
|
|
||||||
} catch (ClassNotFoundException e) {
|
|
||||||
// e.printStackTrace();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private void addProxy(Map<String, Proxy> httpProxyMap) {
|
|
||||||
isEnable = true;
|
|
||||||
for (Entry<String, Proxy> entry : httpProxyMap.entrySet()) {
|
|
||||||
try {
|
|
||||||
if (allProxy.containsKey(entry.getKey())) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if (!validateWhenInit || ProxyUtils.validateProxy(entry.getValue().getHttpHost())) {
|
|
||||||
entry.getValue().setFailedNum(0);
|
|
||||||
entry.getValue().setReuseTimeInterval(reuseInterval);
|
|
||||||
proxyQueue.add(entry.getValue());
|
|
||||||
allProxy.put(entry.getKey(), entry.getValue());
|
|
||||||
}
|
|
||||||
} catch (NumberFormatException e) {
|
|
||||||
logger.error("HttpHost init error:", e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
logger.info("proxy pool size>>>>" + allProxy.size());
|
|
||||||
}
|
|
||||||
|
|
||||||
public void addProxy(String[]... httpProxyList) {
|
|
||||||
isEnable = true;
|
|
||||||
for (String[] s : httpProxyList) {
|
|
||||||
try {
|
|
||||||
if (allProxy.containsKey(s[2])) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
HttpHost item = new HttpHost(InetAddress.getByName(s[2]), Integer.valueOf(s[3]));
|
|
||||||
if (!validateWhenInit || ProxyUtils.validateProxy(item)) {
|
|
||||||
Proxy p = new Proxy(item, reuseInterval, s[0], s[1]);
|
|
||||||
proxyQueue.add(p);
|
|
||||||
allProxy.put(s[2], p);
|
|
||||||
}
|
|
||||||
} catch (NumberFormatException e) {
|
|
||||||
logger.error("HttpHost init error:", e);
|
|
||||||
} catch (UnknownHostException e) {
|
|
||||||
logger.error("HttpHost init error:", e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
logger.info("proxy pool size>>>>" + allProxy.size());
|
|
||||||
}
|
|
||||||
|
|
||||||
public Proxy getProxy() {
|
|
||||||
Proxy proxy = null;
|
|
||||||
try {
|
|
||||||
Long time = System.currentTimeMillis();
|
|
||||||
proxy = proxyQueue.take();
|
|
||||||
double costTime = (System.currentTimeMillis() - time) / 1000.0;
|
|
||||||
if (costTime > reuseInterval) {
|
|
||||||
logger.info("get proxy time >>>> " + costTime);
|
|
||||||
}
|
|
||||||
Proxy p = allProxy.get(proxy.getHttpHost().getAddress().getHostAddress());
|
|
||||||
p.setLastBorrowTime(System.currentTimeMillis());
|
|
||||||
p.borrowNumIncrement(1);
|
|
||||||
} catch (InterruptedException e) {
|
|
||||||
logger.error("get proxy error", e);
|
|
||||||
}
|
|
||||||
if (proxy == null) {
|
|
||||||
throw new NoSuchElementException();
|
|
||||||
}
|
|
||||||
return proxy;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void returnProxy(HttpHost host, int statusCode) {
|
|
||||||
Proxy p = allProxy.get(host.getAddress().getHostAddress());
|
|
||||||
if (p == null) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
switch (statusCode) {
|
|
||||||
case Proxy.SUCCESS:
|
|
||||||
p.setReuseTimeInterval(reuseInterval);
|
|
||||||
p.setFailedNum(0);
|
|
||||||
p.setFailedErrorType(new ArrayList<Integer>());
|
|
||||||
p.recordResponse();
|
|
||||||
p.successNumIncrement(1);
|
|
||||||
break;
|
|
||||||
case Proxy.ERROR_403:
|
|
||||||
// banned,try longer interval
|
|
||||||
p.fail(Proxy.ERROR_403);
|
|
||||||
p.setReuseTimeInterval(reuseInterval * p.getFailedNum());
|
|
||||||
logger.info(host + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0);
|
|
||||||
break;
|
|
||||||
case Proxy.ERROR_BANNED:
|
|
||||||
p.fail(Proxy.ERROR_BANNED);
|
|
||||||
p.setReuseTimeInterval(10 * 60 * 1000 * p.getFailedNum());
|
|
||||||
logger.warn("this proxy is banned >>>> " + p.getHttpHost());
|
|
||||||
logger.info(host + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0);
|
|
||||||
break;
|
|
||||||
case Proxy.ERROR_404:
|
|
||||||
// p.fail(Proxy.ERROR_404);
|
|
||||||
// p.setReuseTimeInterval(reuseInterval * p.getFailedNum());
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
p.fail(statusCode);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
if (p.getFailedNum() > 20) {
|
|
||||||
p.setReuseTimeInterval(reviveTime);
|
|
||||||
logger.error("remove proxy >>>> " + host + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size());
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
if (p.getFailedNum() > 0 && p.getFailedNum() % 5 == 0) {
|
|
||||||
if (!ProxyUtils.validateProxy(host)) {
|
|
||||||
p.setReuseTimeInterval(reviveTime);
|
|
||||||
logger.error("remove proxy >>>> " + host + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size());
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
try {
|
|
||||||
proxyQueue.put(p);
|
|
||||||
} catch (InterruptedException e) {
|
|
||||||
logger.warn("proxyQueue return proxy error", e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public String allProxyStatus() {
|
|
||||||
String re = "all proxy info >>>> \n";
|
|
||||||
for (Entry<String, Proxy> entry : allProxy.entrySet()) {
|
|
||||||
re += entry.getValue().toString() + "\n";
|
|
||||||
}
|
|
||||||
return re;
|
|
||||||
}
|
|
||||||
|
|
||||||
public int getIdleNum() {
|
|
||||||
return proxyQueue.size();
|
|
||||||
}
|
|
||||||
|
|
||||||
public int getReuseInterval() {
|
|
||||||
return reuseInterval;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setReuseInterval(int reuseInterval) {
|
|
||||||
this.reuseInterval = reuseInterval;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void enable(boolean isEnable) {
|
|
||||||
this.isEnable = isEnable;
|
|
||||||
}
|
|
||||||
|
|
||||||
public boolean isEnable() {
|
|
||||||
return isEnable;
|
|
||||||
}
|
|
||||||
|
|
||||||
public int getReviveTime() {
|
|
||||||
return reviveTime;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setReviveTime(int reviveTime) {
|
|
||||||
this.reviveTime = reviveTime;
|
|
||||||
}
|
|
||||||
|
|
||||||
public boolean isValidateWhenInit() {
|
|
||||||
return validateWhenInit;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void validateWhenInit(boolean validateWhenInit) {
|
|
||||||
this.validateWhenInit = validateWhenInit;
|
|
||||||
}
|
|
||||||
|
|
||||||
public int getSaveProxyInterval() {
|
|
||||||
return saveProxyInterval;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setSaveProxyInterval(int saveProxyInterval) {
|
|
||||||
this.saveProxyInterval = saveProxyInterval;
|
|
||||||
}
|
|
||||||
|
|
||||||
public String getProxyFilePath() {
|
|
||||||
return proxyFilePath;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setProxyFilePath(String proxyFilePath) {
|
|
||||||
this.proxyFilePath = proxyFilePath;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
|
@ -0,0 +1,159 @@
|
||||||
|
package us.codecraft.webmagic.proxy;
|
||||||
|
|
||||||
|
import java.io.Serializable;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.concurrent.Delayed;
|
||||||
|
import java.util.concurrent.TimeUnit;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* >>>> Proxy lifecycle
|
||||||
|
|
||||||
|
+----------+ +-----+
|
||||||
|
| last use | | new |
|
||||||
|
+-----+----+ +---+-+
|
||||||
|
| +------+ |
|
||||||
|
+->| init |<--+
|
||||||
|
+--+---+
|
||||||
|
|
|
||||||
|
v
|
||||||
|
+--------+
|
||||||
|
+--->| borrow |
|
||||||
|
| +---+----+
|
||||||
|
| |+------------------+
|
||||||
|
| v
|
||||||
|
| +--------+
|
||||||
|
| | in use | Respone Time
|
||||||
|
| +---+----+
|
||||||
|
| |+------------------+
|
||||||
|
| v
|
||||||
|
| +--------+
|
||||||
|
| | return |
|
||||||
|
| +---+----+
|
||||||
|
| |+-------------------+
|
||||||
|
| v
|
||||||
|
| +-------+ reuse interval
|
||||||
|
| | delay | (delay time)
|
||||||
|
| +---+---+
|
||||||
|
| |+-------------------+
|
||||||
|
| v
|
||||||
|
| +------+
|
||||||
|
| | idle | idle time
|
||||||
|
| +---+--+
|
||||||
|
| |+-------------------+
|
||||||
|
+--------+
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Object has these status of lifecycle above.<br>
|
||||||
|
*
|
||||||
|
* @author yxssfxwzy@sina.com <br>
|
||||||
|
* @since 0.5.1
|
||||||
|
* @see TimerReuseProxyPool
|
||||||
|
*/
|
||||||
|
|
||||||
|
public class TimerReuseProxy extends Proxy implements Delayed, Serializable {
|
||||||
|
|
||||||
|
private static final long serialVersionUID = 228939737383625551L;
|
||||||
|
public static final int ERROR_403 = 403;
|
||||||
|
public static final int ERROR_404 = 404;
|
||||||
|
public static final int ERROR_BANNED = 10000;// banned by website
|
||||||
|
public static final int ERROR_Proxy = 10001;// the proxy itself failed
|
||||||
|
public static final int SUCCESS = 200;
|
||||||
|
|
||||||
|
private int reuseTimeInterval = 1500;// ms
|
||||||
|
private Long canReuseTime = 0L;
|
||||||
|
private Long lastBorrowTime = System.currentTimeMillis();
|
||||||
|
private Long responseTime = 0L;
|
||||||
|
|
||||||
|
private int failedNum = 0;
|
||||||
|
private int successNum = 0;
|
||||||
|
private int borrowNum = 0;
|
||||||
|
|
||||||
|
private List<Integer> failedErrorType = new ArrayList<Integer>();
|
||||||
|
|
||||||
|
public TimerReuseProxy(String host, int port, String username, String password) {
|
||||||
|
super(host, port, username, password);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public int getSuccessNum() {
|
||||||
|
return successNum;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void successNumIncrement(int increment) {
|
||||||
|
this.successNum += increment;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Long getLastUseTime() {
|
||||||
|
return lastBorrowTime;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setLastBorrowTime(Long lastBorrowTime) {
|
||||||
|
this.lastBorrowTime = lastBorrowTime;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void recordResponse() {
|
||||||
|
this.responseTime = (System.currentTimeMillis() - lastBorrowTime + responseTime) / 2;
|
||||||
|
this.lastBorrowTime = System.currentTimeMillis();
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<Integer> getFailedErrorType() {
|
||||||
|
return failedErrorType;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setFailedErrorType(List<Integer> failedErrorType) {
|
||||||
|
this.failedErrorType = failedErrorType;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void fail(int failedErrorType) {
|
||||||
|
this.failedNum++;
|
||||||
|
this.failedErrorType.add(failedErrorType);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setFailedNum(int failedNum) {
|
||||||
|
this.failedNum = failedNum;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getFailedNum() {
|
||||||
|
return failedNum;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getFailedType() {
|
||||||
|
String re = "";
|
||||||
|
for (Integer i : this.failedErrorType) {
|
||||||
|
re += i + " . ";
|
||||||
|
}
|
||||||
|
return re;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getReuseTimeInterval() {
|
||||||
|
return reuseTimeInterval;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setReuseTimeInterval(int reuseTimeInterval) {
|
||||||
|
this.reuseTimeInterval = reuseTimeInterval;
|
||||||
|
this.canReuseTime = System.nanoTime() + TimeUnit.NANOSECONDS.convert(reuseTimeInterval, TimeUnit.MILLISECONDS);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long getDelay(TimeUnit unit) {
|
||||||
|
return unit.convert(canReuseTime - System.nanoTime(), TimeUnit.NANOSECONDS);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int compareTo(Delayed o) {
|
||||||
|
TimerReuseProxy that = (TimerReuseProxy) o;
|
||||||
|
return canReuseTime > that.canReuseTime ? 1 : (canReuseTime < that.canReuseTime ? -1 : 0);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public void borrowNumIncrement(int increment) {
|
||||||
|
this.borrowNum += increment;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getBorrowNum() {
|
||||||
|
return borrowNum;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,204 @@
|
||||||
|
package us.codecraft.webmagic.proxy;
|
||||||
|
|
||||||
|
import us.codecraft.webmagic.Task;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Pooled Proxy Object
|
||||||
|
*
|
||||||
|
* @author yxssfxwzy@sina.com <br>
|
||||||
|
* @see Proxy
|
||||||
|
* @since 0.5.1
|
||||||
|
*/
|
||||||
|
public class TimerReuseProxyPool implements ProxyProvider {
|
||||||
|
@Override
|
||||||
|
public void returnProxy(Proxy proxy, boolean banned, Task task) {
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Proxy getProxy(Task task) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
// private Logger logger = LoggerFactory.getLogger(getClass());
|
||||||
|
//
|
||||||
|
// private BlockingQueue<TimerReuseProxy> proxyQueue = new DelayQueue<TimerReuseProxy>();
|
||||||
|
// private Map<String, TimerReuseProxy> allProxy = new ConcurrentHashMap<String, TimerReuseProxy>();
|
||||||
|
//
|
||||||
|
// private int reuseInterval = 1500;// ms
|
||||||
|
// private int reviveTime = 2 * 60 * 60 * 1000;// ms
|
||||||
|
// private int saveProxyInterval = 10 * 60 * 1000;// ms
|
||||||
|
//
|
||||||
|
// private boolean isEnable = false;
|
||||||
|
// private boolean validateWhenInit = false;
|
||||||
|
// // private boolean isUseLastProxy = true;
|
||||||
|
//
|
||||||
|
// public TimerReuseProxyPool(List<String[]> httpProxyList) {
|
||||||
|
// this(httpProxyList, true);
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
// private void addProxy(Map<String, Proxy> httpProxyMap) {
|
||||||
|
// isEnable = true;
|
||||||
|
// for (Entry<String, Proxy> entry : httpProxyMap.entrySet()) {
|
||||||
|
// try {
|
||||||
|
// if (allProxy.containsKey(entry.getKey())) {
|
||||||
|
// continue;
|
||||||
|
// }
|
||||||
|
// if (!validateWhenInit || ProxyUtils.validateProxy(entry.getValue().getHttpHost())) {
|
||||||
|
// entry.getValue().setFailedNum(0);
|
||||||
|
// entry.getValue().setReuseTimeInterval(reuseInterval);
|
||||||
|
// proxyQueue.add(entry.getValue());
|
||||||
|
// allProxy.put(entry.getKey(), entry.getValue());
|
||||||
|
// }
|
||||||
|
// } catch (NumberFormatException e) {
|
||||||
|
// logger.error("HttpHost init error:", e);
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
// logger.info("proxy pool size>>>>" + allProxy.size());
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
// public void addProxy(Proxy... httpProxyList) {
|
||||||
|
// isEnable = true;
|
||||||
|
// for (Proxy proxy : httpProxyList) {
|
||||||
|
// if (!validateWhenInit || ProxyUtils.validateProxy(proxy.getProxyHost())) {
|
||||||
|
// TimerReuseProxy p = new TimerReuseProxy(proxy.getProxyHost(), proxy.getUsername(), proxy.getPassword(), reuseInterval);
|
||||||
|
// proxyQueue.add(p);
|
||||||
|
// allProxy.put(p.getProxyHost().getHost(), p);
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
// logger.info("proxy pool size>>>>" + allProxy.size());
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
// public TimerReuseProxy getProxy() {
|
||||||
|
// TimerReuseProxy proxy = null;
|
||||||
|
// try {
|
||||||
|
// Long time = System.currentTimeMillis();
|
||||||
|
// proxy = proxyQueue.take();
|
||||||
|
// double costTime = (System.currentTimeMillis() - time) / 1000.0;
|
||||||
|
// if (costTime > reuseInterval) {
|
||||||
|
// logger.info("get proxy time >>>> " + costTime);
|
||||||
|
// }
|
||||||
|
// TimerReuseProxy p = allProxy.get(proxy.getProxyHost().getHost());
|
||||||
|
// p.setLastBorrowTime(System.currentTimeMillis());
|
||||||
|
// p.borrowNumIncrement(1);
|
||||||
|
// } catch (InterruptedException e) {
|
||||||
|
// logger.error("get proxy error", e);
|
||||||
|
// }
|
||||||
|
// if (proxy == null) {
|
||||||
|
// throw new NoSuchElementException();
|
||||||
|
// }
|
||||||
|
// return proxy;
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
// public void returnProxy(Proxy proxy, int statusCode) {
|
||||||
|
// TimerReuseProxy p = allProxy.get(proxy.getProxyHost());
|
||||||
|
// if (p == null) {
|
||||||
|
// return;
|
||||||
|
// }
|
||||||
|
// switch (statusCode) {
|
||||||
|
// case TimerReuseProxy.SUCCESS:
|
||||||
|
// p.setReuseTimeInterval(reuseInterval);
|
||||||
|
// p.setFailedNum(0);
|
||||||
|
// p.setFailedErrorType(new ArrayList<Integer>());
|
||||||
|
// p.recordResponse();
|
||||||
|
// p.successNumIncrement(1);
|
||||||
|
// break;
|
||||||
|
// case TimerReuseProxy.ERROR_403:
|
||||||
|
// // banned,try longer interval
|
||||||
|
// p.fail(TimerReuseProxy.ERROR_403);
|
||||||
|
// p.setReuseTimeInterval(reuseInterval * p.getFailedNum());
|
||||||
|
// logger.info(proxy + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0);
|
||||||
|
// break;
|
||||||
|
// case TimerReuseProxy.ERROR_BANNED:
|
||||||
|
// p.fail(TimerReuseProxy.ERROR_BANNED);
|
||||||
|
// p.setReuseTimeInterval(10 * 60 * 1000 * p.getFailedNum());
|
||||||
|
// logger.info(proxy + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0);
|
||||||
|
// break;
|
||||||
|
// case TimerReuseProxy.ERROR_404:
|
||||||
|
// // p.fail(Proxy.ERROR_404);
|
||||||
|
// // p.setReuseTimeInterval(reuseInterval * p.getFailedNum());
|
||||||
|
// break;
|
||||||
|
// default:
|
||||||
|
// p.fail(statusCode);
|
||||||
|
// break;
|
||||||
|
// }
|
||||||
|
// if (p.getFailedNum() > 20) {
|
||||||
|
// p.setReuseTimeInterval(reviveTime);
|
||||||
|
// logger.error("remove proxy >>>> " + proxy + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size());
|
||||||
|
// return;
|
||||||
|
// }
|
||||||
|
// if (p.getFailedNum() > 0 && p.getFailedNum() % 5 == 0) {
|
||||||
|
// if (!ProxyUtils.validateProxy(proxy)) {
|
||||||
|
// p.setReuseTimeInterval(reviveTime);
|
||||||
|
// logger.error("remove proxy >>>> " + proxy + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size());
|
||||||
|
// return;
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
// try {
|
||||||
|
// proxyQueue.put(p);
|
||||||
|
// } catch (InterruptedException e) {
|
||||||
|
// logger.warn("proxyQueue return proxy error", e);
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
// public String allProxyStatus() {
|
||||||
|
// String re = "all proxy info >>>> \n";
|
||||||
|
// for (Entry<String, Proxy> entry : allProxy.entrySet()) {
|
||||||
|
// re += entry.getValue().toString() + "\n";
|
||||||
|
// }
|
||||||
|
// return re;
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
// public int getIdleNum() {
|
||||||
|
// return proxyQueue.size();
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
// public int getReuseInterval() {
|
||||||
|
// return reuseInterval;
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
// public void setReuseInterval(int reuseInterval) {
|
||||||
|
// this.reuseInterval = reuseInterval;
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
// public void enable(boolean isEnable) {
|
||||||
|
// this.isEnable = isEnable;
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
// public boolean isEnable() {
|
||||||
|
// return isEnable;
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
// public int getReviveTime() {
|
||||||
|
// return reviveTime;
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
// public void setReviveTime(int reviveTime) {
|
||||||
|
// this.reviveTime = reviveTime;
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
// public boolean isValidateWhenInit() {
|
||||||
|
// return validateWhenInit;
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
// public void validateWhenInit(boolean validateWhenInit) {
|
||||||
|
// this.validateWhenInit = validateWhenInit;
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
// public int getSaveProxyInterval() {
|
||||||
|
// return saveProxyInterval;
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
// public void setSaveProxyInterval(int saveProxyInterval) {
|
||||||
|
// this.saveProxyInterval = saveProxyInterval;
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
// public String getProxyFilePath() {
|
||||||
|
// return proxyFilePath;
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
// public void setProxyFilePath(String proxyFilePath) {
|
||||||
|
// this.proxyFilePath = proxyFilePath;
|
||||||
|
// }
|
||||||
|
|
||||||
|
}
|
|
@ -44,6 +44,16 @@ public class Html extends HtmlNode {
|
||||||
*/
|
*/
|
||||||
private Document document;
|
private Document document;
|
||||||
|
|
||||||
|
public Html(String text, String url) {
|
||||||
|
try {
|
||||||
|
disableJsoupHtmlEntityEscape();
|
||||||
|
this.document = Jsoup.parse(text, url);
|
||||||
|
} catch (Exception e) {
|
||||||
|
this.document = null;
|
||||||
|
logger.warn("parse document error ", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
public Html(String text) {
|
public Html(String text) {
|
||||||
try {
|
try {
|
||||||
disableJsoupHtmlEntityEscape();
|
disableJsoupHtmlEntityEscape();
|
||||||
|
|
|
@ -34,7 +34,7 @@ public class HtmlNode extends AbstractSelectable {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Selectable links() {
|
public Selectable links() {
|
||||||
return xpath("//a/@href");
|
return selectElements(new LinksSelector());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -0,0 +1,51 @@
|
||||||
|
package us.codecraft.webmagic.selector;
|
||||||
|
|
||||||
|
import org.jsoup.helper.StringUtil;
|
||||||
|
import org.jsoup.nodes.Element;
|
||||||
|
import org.jsoup.select.Elements;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Links selector based on jsoup. Use absolute url. <br>
|
||||||
|
*
|
||||||
|
* @author code4crafter@gmail.com <br>
|
||||||
|
* @since 0.7.0
|
||||||
|
*/
|
||||||
|
public class LinksSelector extends BaseElementSelector {
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String select(Element element) {
|
||||||
|
throw new UnsupportedOperationException();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public List<String> selectList(Element element) {
|
||||||
|
Elements elements = element.select("a");
|
||||||
|
List<String> links = new ArrayList<String>(elements.size());
|
||||||
|
for (Element element0 : elements) {
|
||||||
|
if (!StringUtil.isBlank(element0.baseUri())) {
|
||||||
|
links.add(element0.attr("abs:href"));
|
||||||
|
} else {
|
||||||
|
links.add(element0.attr("href"));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return links;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Element selectElement(Element element) {
|
||||||
|
throw new UnsupportedOperationException();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public List<Element> selectElements(Element element) {
|
||||||
|
throw new UnsupportedOperationException();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean hasAttribute() {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,28 @@
|
||||||
|
package us.codecraft.webmagic.utils;
|
||||||
|
|
||||||
|
import org.apache.http.Header;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author code4crafter@gmail.com
|
||||||
|
* Date: 17/3/27
|
||||||
|
*/
|
||||||
|
public abstract class HttpClientUtils {
|
||||||
|
|
||||||
|
public static Map<String,List<String>> convertHeaders(Header[] headers){
|
||||||
|
Map<String,List<String>> results = new HashMap<String, List<String>>();
|
||||||
|
for (Header header : headers) {
|
||||||
|
List<String> list = results.get(header.getName());
|
||||||
|
if (list == null) {
|
||||||
|
list = new ArrayList<String>();
|
||||||
|
results.put(header.getName(), list);
|
||||||
|
}
|
||||||
|
list.add(header.getValue());
|
||||||
|
}
|
||||||
|
return results;
|
||||||
|
}
|
||||||
|
}
|
|
@ -1,19 +1,12 @@
|
||||||
package us.codecraft.webmagic.utils;
|
package us.codecraft.webmagic.utils;
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.net.Inet6Address;
|
|
||||||
import java.net.InetAddress;
|
|
||||||
import java.net.InetSocketAddress;
|
|
||||||
import java.net.NetworkInterface;
|
|
||||||
import java.net.Socket;
|
|
||||||
import java.net.SocketException;
|
|
||||||
import java.net.UnknownHostException;
|
|
||||||
import java.util.Enumeration;
|
|
||||||
import java.util.regex.Pattern;
|
|
||||||
|
|
||||||
import org.apache.http.HttpHost;
|
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
import us.codecraft.webmagic.proxy.Proxy;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.net.InetSocketAddress;
|
||||||
|
import java.net.Socket;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Pooled Proxy Object
|
* Pooled Proxy Object
|
||||||
|
@ -23,68 +16,19 @@ import org.slf4j.LoggerFactory;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
public class ProxyUtils {
|
public class ProxyUtils {
|
||||||
private static InetAddress localAddr;
|
|
||||||
private static String networkInterface = "eth7";
|
|
||||||
|
|
||||||
private static final Logger logger = LoggerFactory.getLogger(ProxyUtils.class);
|
private static final Logger logger = LoggerFactory.getLogger(ProxyUtils.class);
|
||||||
static {
|
|
||||||
init();
|
|
||||||
}
|
|
||||||
|
|
||||||
private static void init() {
|
public static boolean validateProxy(Proxy p) {
|
||||||
// first way to get local IP
|
|
||||||
try {
|
|
||||||
localAddr = InetAddress.getLocalHost();
|
|
||||||
logger.info("local IP:" + localAddr.getHostAddress());
|
|
||||||
} catch (UnknownHostException e) {
|
|
||||||
logger.info("try again\n");
|
|
||||||
}
|
|
||||||
if (localAddr != null) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
// other way to get local IP
|
|
||||||
Enumeration<InetAddress> localAddrs;
|
|
||||||
try {
|
|
||||||
// modify your network interface name
|
|
||||||
NetworkInterface ni = NetworkInterface.getByName(networkInterface);
|
|
||||||
if (ni == null) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
localAddrs = ni.getInetAddresses();
|
|
||||||
if (localAddrs == null || !localAddrs.hasMoreElements()) {
|
|
||||||
logger.error("choose NetworkInterface\n" + getNetworkInterface());
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
while (localAddrs.hasMoreElements()) {
|
|
||||||
InetAddress tmp = localAddrs.nextElement();
|
|
||||||
if (!tmp.isLoopbackAddress() && !tmp.isLinkLocalAddress() && !(tmp instanceof Inet6Address)) {
|
|
||||||
localAddr = tmp;
|
|
||||||
logger.info("local IP:" + localAddr.getHostAddress());
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} catch (Exception e) {
|
|
||||||
logger.error("Failure when init ProxyUtil", e);
|
|
||||||
logger.error("choose NetworkInterface\n" + getNetworkInterface());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public static boolean validateProxy(HttpHost p) {
|
|
||||||
if (localAddr == null) {
|
|
||||||
logger.error("cannot get local IP");
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
boolean isReachable = false;
|
|
||||||
Socket socket = null;
|
Socket socket = null;
|
||||||
try {
|
try {
|
||||||
socket = new Socket();
|
socket = new Socket();
|
||||||
socket.bind(new InetSocketAddress(localAddr, 0));
|
InetSocketAddress endpointSocketAddr = new InetSocketAddress(p.getHost(), p.getPort());
|
||||||
InetSocketAddress endpointSocketAddr = new InetSocketAddress(p.getAddress().getHostAddress(), p.getPort());
|
|
||||||
socket.connect(endpointSocketAddr, 3000);
|
socket.connect(endpointSocketAddr, 3000);
|
||||||
logger.debug("SUCCESS - connection established! Local: " + localAddr.getHostAddress() + " remote: " + p);
|
return true;
|
||||||
isReachable = true;
|
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
logger.warn("FAILRE - CAN not connect! Local: " + localAddr.getHostAddress() + " remote: " + p);
|
logger.warn("FAILRE - CAN not connect! remote: " + p);
|
||||||
|
return false;
|
||||||
} finally {
|
} finally {
|
||||||
if (socket != null) {
|
if (socket != null) {
|
||||||
try {
|
try {
|
||||||
|
@ -94,30 +38,7 @@ public class ProxyUtils {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return isReachable;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private static String getNetworkInterface() {
|
|
||||||
|
|
||||||
String networkInterfaceName = ">>>> modify networkInterface in us.codecraft.webmagic.utils.ProxyUtils";
|
|
||||||
Enumeration<NetworkInterface> enumeration = null;
|
|
||||||
try {
|
|
||||||
enumeration = NetworkInterface.getNetworkInterfaces();
|
|
||||||
} catch (SocketException e1) {
|
|
||||||
e1.printStackTrace();
|
|
||||||
}
|
|
||||||
while (enumeration.hasMoreElements()) {
|
|
||||||
NetworkInterface networkInterface = enumeration.nextElement();
|
|
||||||
|
|
||||||
Enumeration<InetAddress> addr = networkInterface.getInetAddresses();
|
|
||||||
while (addr.hasMoreElements()) {
|
|
||||||
String s = addr.nextElement().getHostAddress();
|
|
||||||
Pattern IPV4_PATTERN = Pattern.compile("^(25[0-5]|2[0-4]\\d|[0-1]?\\d?\\d)(\\.(25[0-5]|2[0-4]\\d|[0-1]?\\d?\\d)){3}$");
|
|
||||||
if (s != null && IPV4_PATTERN.matcher(s).matches()) {
|
|
||||||
networkInterfaceName += networkInterface.toString() + "IP:" + s + "\n\n";
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return networkInterfaceName;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -80,7 +80,7 @@ public class UrlUtils {
|
||||||
if (i > 0) {
|
if (i > 0) {
|
||||||
domain = StringUtils.substring(domain, 0, i);
|
domain = StringUtils.substring(domain, 0, i);
|
||||||
}
|
}
|
||||||
return domain;
|
return removePort(domain);
|
||||||
}
|
}
|
||||||
|
|
||||||
public static String removePort(String domain) {
|
public static String removePort(String domain) {
|
||||||
|
|
|
@ -48,4 +48,14 @@ public class HtmlTest {
|
||||||
Selectable selectable = html.xpath("//a[1]").nodes().get(0);
|
Selectable selectable = html.xpath("//a[1]").nodes().get(0);
|
||||||
assertThat(selectable.xpath("/a/@href").get()).isEqualTo("/xx/xx");
|
assertThat(selectable.xpath("/a/@href").get()).isEqualTo("/xx/xx");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testGetHrefsByJsoup(){
|
||||||
|
Html html = new Html("<html><a href='issues'>issues</a><img src='webmagic.jpg'/></html>","https://github.com/code4craft/webmagic/");
|
||||||
|
assertThat(html.xpath("//a[1]/@abs:href").get()).isEqualTo("https://github.com/code4craft/webmagic/issues");
|
||||||
|
assertThat(html.xpath("//img/@abs:src").get()).isEqualTo("https://github.com/code4craft/webmagic/webmagic.jpg");
|
||||||
|
html = new Html("<html><base href='https://github.com/code4craft/webmagic/'><a href='issues'>issues</a><img src='webmagic.jpg'/></base></html>");
|
||||||
|
assertThat(html.xpath("//a[1]/@abs:href").get()).isEqualTo("https://github.com/code4craft/webmagic/issues");
|
||||||
|
assertThat(html.xpath("//img/@abs:src").get()).isEqualTo("https://github.com/code4craft/webmagic/webmagic.jpg");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -19,12 +19,12 @@ public class SpiderTest {
|
||||||
@Ignore("long time")
|
@Ignore("long time")
|
||||||
@Test
|
@Test
|
||||||
public void testStartAndStop() throws InterruptedException {
|
public void testStartAndStop() throws InterruptedException {
|
||||||
Spider spider = Spider.create(new SimplePageProcessor("http://www.oschina.net/", "http://www.oschina.net/*")).addPipeline(new Pipeline() {
|
Spider spider = Spider.create(new SimplePageProcessor( "http://www.oschina.net/*")).addPipeline(new Pipeline() {
|
||||||
@Override
|
@Override
|
||||||
public void process(ResultItems resultItems, Task task) {
|
public void process(ResultItems resultItems, Task task) {
|
||||||
System.out.println(1);
|
System.out.println(1);
|
||||||
}
|
}
|
||||||
}).thread(1);
|
}).thread(1).addUrl("http://www.oschina.net/");
|
||||||
spider.start();
|
spider.start();
|
||||||
Thread.sleep(10000);
|
Thread.sleep(10000);
|
||||||
spider.stop();
|
spider.stop();
|
||||||
|
|
|
@ -3,9 +3,10 @@ package us.codecraft.webmagic.downloader;
|
||||||
import com.github.dreamhead.moco.HttpServer;
|
import com.github.dreamhead.moco.HttpServer;
|
||||||
import com.github.dreamhead.moco.Runnable;
|
import com.github.dreamhead.moco.Runnable;
|
||||||
import com.github.dreamhead.moco.Runner;
|
import com.github.dreamhead.moco.Runner;
|
||||||
|
import org.apache.commons.collections.map.HashedMap;
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.http.client.methods.CloseableHttpResponse;
|
import org.apache.http.client.methods.CloseableHttpResponse;
|
||||||
import org.apache.http.client.methods.RequestBuilder;
|
import org.apache.http.client.methods.HttpUriRequest;
|
||||||
import org.apache.http.impl.client.CloseableHttpClient;
|
import org.apache.http.impl.client.CloseableHttpClient;
|
||||||
import org.apache.http.impl.client.HttpClients;
|
import org.apache.http.impl.client.HttpClients;
|
||||||
import org.apache.http.util.EntityUtils;
|
import org.apache.http.util.EntityUtils;
|
||||||
|
@ -14,11 +15,14 @@ import us.codecraft.webmagic.Page;
|
||||||
import us.codecraft.webmagic.Request;
|
import us.codecraft.webmagic.Request;
|
||||||
import us.codecraft.webmagic.Site;
|
import us.codecraft.webmagic.Site;
|
||||||
import us.codecraft.webmagic.Task;
|
import us.codecraft.webmagic.Task;
|
||||||
|
import us.codecraft.webmagic.model.HttpRequestBody;
|
||||||
import us.codecraft.webmagic.selector.Html;
|
import us.codecraft.webmagic.selector.Html;
|
||||||
|
import us.codecraft.webmagic.utils.CharsetUtils;
|
||||||
import us.codecraft.webmagic.utils.HttpConstant;
|
import us.codecraft.webmagic.utils.HttpConstant;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.UnsupportedEncodingException;
|
import java.io.UnsupportedEncodingException;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
import static com.github.dreamhead.moco.Moco.*;
|
import static com.github.dreamhead.moco.Moco.*;
|
||||||
import static org.assertj.core.api.Assertions.assertThat;
|
import static org.assertj.core.api.Assertions.assertThat;
|
||||||
|
@ -30,7 +34,7 @@ import static org.junit.Assert.assertTrue;
|
||||||
*/
|
*/
|
||||||
public class HttpClientDownloaderTest {
|
public class HttpClientDownloaderTest {
|
||||||
|
|
||||||
public static final String PAGE_ALWAYS_NOT_EXISTS = "http://localhost:13421/404";
|
public static final String PAGE_ALWAYS_NOT_EXISTS = "http://localhost:13423/404";
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testDownloader() {
|
public void testDownloader() {
|
||||||
|
@ -59,7 +63,7 @@ public class HttpClientDownloaderTest {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testGetHtmlCharset() throws Exception {
|
public void testGetHtmlCharset() throws Exception {
|
||||||
HttpServer server = httpserver(12306);
|
HttpServer server = httpserver(13423);
|
||||||
server.get(by(uri("/header"))).response(header("Content-Type", "text/html; charset=gbk"));
|
server.get(by(uri("/header"))).response(header("Content-Type", "text/html; charset=gbk"));
|
||||||
server.get(by(uri("/meta4"))).response(with(text("<html>\n" +
|
server.get(by(uri("/meta4"))).response(with(text("<html>\n" +
|
||||||
" <head>\n" +
|
" <head>\n" +
|
||||||
|
@ -76,30 +80,30 @@ public class HttpClientDownloaderTest {
|
||||||
Runner.running(server, new Runnable() {
|
Runner.running(server, new Runnable() {
|
||||||
@Override
|
@Override
|
||||||
public void run() {
|
public void run() {
|
||||||
String charset = getCharsetByUrl("http://127.0.0.1:12306/header");
|
String charset = getCharsetByUrl("http://127.0.0.1:13423/header");
|
||||||
assertEquals(charset, "gbk");
|
assertEquals(charset, "gbk");
|
||||||
charset = getCharsetByUrl("http://127.0.0.1:12306/meta4");
|
charset = getCharsetByUrl("http://127.0.0.1:13423/meta4");
|
||||||
assertEquals(charset, "gbk");
|
assertEquals(charset, "gbk");
|
||||||
charset = getCharsetByUrl("http://127.0.0.1:12306/meta5");
|
charset = getCharsetByUrl("http://127.0.0.1:13423/meta5");
|
||||||
assertEquals(charset, "gbk");
|
assertEquals(charset, "gbk");
|
||||||
}
|
}
|
||||||
|
|
||||||
private String getCharsetByUrl(String url) {
|
private String getCharsetByUrl(String url) {
|
||||||
HttpClientDownloader downloader = new HttpClientDownloader();
|
HttpClientDownloader downloader = new HttpClientDownloader();
|
||||||
Site site = Site.me();
|
Site site = Site.me();
|
||||||
CloseableHttpClient httpClient = new HttpClientGenerator().getClient(site, null);
|
CloseableHttpClient httpClient = new HttpClientGenerator().getClient(site);
|
||||||
// encoding in http header Content-Type
|
// encoding in http header Content-Type
|
||||||
Request requestGBK = new Request(url);
|
Request requestGBK = new Request(url);
|
||||||
CloseableHttpResponse httpResponse = null;
|
CloseableHttpResponse httpResponse = null;
|
||||||
try {
|
try {
|
||||||
httpResponse = httpClient.execute(downloader.getHttpUriRequest(requestGBK, site, null,null));
|
httpResponse = httpClient.execute(new HttpUriRequestConverter().convert(requestGBK, site, null).getHttpUriRequest());
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
e.printStackTrace();
|
e.printStackTrace();
|
||||||
}
|
}
|
||||||
String charset = null;
|
String charset = null;
|
||||||
try {
|
try {
|
||||||
byte[] contentBytes = IOUtils.toByteArray(httpResponse.getEntity().getContent());
|
byte[] contentBytes = IOUtils.toByteArray(httpResponse.getEntity().getContent());
|
||||||
charset = downloader.getHtmlCharset(httpResponse,contentBytes);
|
charset = CharsetUtils.detectCharset(httpResponse.getEntity().getContentType().getValue(), contentBytes);
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
e.printStackTrace();
|
e.printStackTrace();
|
||||||
}
|
}
|
||||||
|
@ -110,53 +114,108 @@ public class HttpClientDownloaderTest {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void test_selectRequestMethod() throws Exception {
|
public void test_selectRequestMethod() throws Exception {
|
||||||
HttpServer server = httpserver(12306);
|
HttpServer server = httpserver(13423);
|
||||||
server.get(eq(query("q"), "webmagic")).response("get");
|
server.get(eq(query("q"), "webmagic")).response("get");
|
||||||
server.post(eq(form("q"), "webmagic")).response("post");
|
server.post(eq(form("q"), "webmagic")).response("post");
|
||||||
server.put(eq(form("q"), "webmagic")).response("put");
|
server.put(eq(form("q"), "webmagic")).response("put");
|
||||||
server.delete(eq(query("q"), "webmagic")).response("delete");
|
server.delete(eq(query("q"), "webmagic")).response("delete");
|
||||||
server.request(and(by(method("HEAD")),eq(query("q"), "webmagic"))).response(header("method","head"));
|
server.request(and(by(method("HEAD")),eq(query("q"), "webmagic"))).response(header("method","head"));
|
||||||
server.request(and(by(method("TRACE")),eq(query("q"), "webmagic"))).response("trace");
|
server.request(and(by(method("TRACE")),eq(query("q"), "webmagic"))).response("trace");
|
||||||
|
final HttpUriRequestConverter httpUriRequestConverter = new HttpUriRequestConverter();
|
||||||
|
final Site site = Site.me();
|
||||||
|
Runner.running(server, new Runnable() {
|
||||||
|
@Override
|
||||||
|
public void run() throws Exception {
|
||||||
|
Request request = new Request();
|
||||||
|
request.setUrl("http://127.0.0.1:13423/search?q=webmagic");
|
||||||
|
request.setMethod(HttpConstant.Method.GET);
|
||||||
|
Map<String,Object> params = new HashedMap();
|
||||||
|
params.put("q","webmagic");
|
||||||
|
HttpUriRequest httpUriRequest = httpUriRequestConverter.convert(request,site,null).getHttpUriRequest();
|
||||||
|
assertThat(EntityUtils.toString(HttpClients.custom().build().execute(httpUriRequest).getEntity())).isEqualTo("get");
|
||||||
|
request.setMethod(HttpConstant.Method.DELETE);
|
||||||
|
httpUriRequest = httpUriRequestConverter.convert(request, site, null).getHttpUriRequest();
|
||||||
|
assertThat(EntityUtils.toString(HttpClients.custom().build().execute(httpUriRequest).getEntity())).isEqualTo("delete");
|
||||||
|
request.setMethod(HttpConstant.Method.HEAD);
|
||||||
|
httpUriRequest = httpUriRequestConverter.convert(request, site, null).getHttpUriRequest();
|
||||||
|
assertThat(HttpClients.custom().build().execute(httpUriRequest).getFirstHeader("method").getValue()).isEqualTo("head");
|
||||||
|
request.setMethod(HttpConstant.Method.TRACE);
|
||||||
|
httpUriRequest = httpUriRequestConverter.convert(request, site, null).getHttpUriRequest();
|
||||||
|
assertThat(EntityUtils.toString(HttpClients.custom().build().execute(httpUriRequest).getEntity())).isEqualTo("trace");
|
||||||
|
request.setUrl("http://127.0.0.1:13423/search");
|
||||||
|
request.setMethod(HttpConstant.Method.POST);
|
||||||
|
request.setRequestBody(HttpRequestBody.form(params, "utf-8"));
|
||||||
|
httpUriRequest = httpUriRequestConverter.convert(request, site, null).getHttpUriRequest();
|
||||||
|
assertThat(EntityUtils.toString(HttpClients.custom().build().execute(httpUriRequest).getEntity())).isEqualTo("post");
|
||||||
|
request.setMethod(HttpConstant.Method.PUT);
|
||||||
|
httpUriRequest = httpUriRequestConverter.convert(request, site, null).getHttpUriRequest();
|
||||||
|
assertThat(EntityUtils.toString(HttpClients.custom().build().execute(httpUriRequest).getEntity())).isEqualTo("put");
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void test_set_request_cookie() throws Exception {
|
||||||
|
HttpServer server = httpserver(13423);
|
||||||
|
server.get(eq(cookie("cookie"), "cookie-webmagic")).response("ok");
|
||||||
Runner.running(server, new Runnable() {
|
Runner.running(server, new Runnable() {
|
||||||
@Override
|
@Override
|
||||||
public void run() throws Exception {
|
public void run() throws Exception {
|
||||||
HttpClientDownloader httpClientDownloader = new HttpClientDownloader();
|
HttpClientDownloader httpClientDownloader = new HttpClientDownloader();
|
||||||
Request request = new Request();
|
Request request = new Request();
|
||||||
request.setUrl("http://127.0.0.1:12306/search");
|
request.setUrl("http://127.0.0.1:13423");
|
||||||
request.putParams("q", "webmagic");
|
request.addCookie("cookie","cookie-webmagic");
|
||||||
request.setMethod(HttpConstant.Method.GET);
|
Page page = httpClientDownloader.download(request, Site.me().toTask());
|
||||||
RequestBuilder requestBuilder = httpClientDownloader.selectRequestMethod(request).setUri(request.getUrl());
|
assertThat(page.getRawText()).isEqualTo("ok");
|
||||||
assertThat(EntityUtils.toString(HttpClients.custom().build().execute(requestBuilder.build()).getEntity())).isEqualTo("get");
|
}
|
||||||
request.setMethod(HttpConstant.Method.POST);
|
});
|
||||||
requestBuilder = httpClientDownloader.selectRequestMethod(request).setUri(request.getUrl());
|
}
|
||||||
assertThat(EntityUtils.toString(HttpClients.custom().build().execute(requestBuilder.build()).getEntity())).isEqualTo("post");
|
|
||||||
request.setMethod(HttpConstant.Method.PUT);
|
@Test
|
||||||
requestBuilder = httpClientDownloader.selectRequestMethod(request).setUri(request.getUrl());
|
public void test_set_request_header() throws Exception {
|
||||||
assertThat(EntityUtils.toString(HttpClients.custom().build().execute(requestBuilder.build()).getEntity())).isEqualTo("put");
|
HttpServer server = httpserver(13423);
|
||||||
request.setMethod(HttpConstant.Method.DELETE);
|
server.get(eq(header("header"), "header-webmagic")).response("ok");
|
||||||
requestBuilder = httpClientDownloader.selectRequestMethod(request).setUri(request.getUrl());
|
Runner.running(server, new Runnable() {
|
||||||
assertThat(EntityUtils.toString(HttpClients.custom().build().execute(requestBuilder.build()).getEntity())).isEqualTo("delete");
|
@Override
|
||||||
request.setMethod(HttpConstant.Method.HEAD);
|
public void run() throws Exception {
|
||||||
requestBuilder = httpClientDownloader.selectRequestMethod(request).setUri(request.getUrl());
|
HttpClientDownloader httpClientDownloader = new HttpClientDownloader();
|
||||||
assertThat(HttpClients.custom().build().execute(requestBuilder.build()).getFirstHeader("method").getValue()).isEqualTo("head");
|
Request request = new Request();
|
||||||
request.setMethod(HttpConstant.Method.TRACE);
|
request.setUrl("http://127.0.0.1:13423");
|
||||||
requestBuilder = httpClientDownloader.selectRequestMethod(request).setUri(request.getUrl());
|
request.addHeader("header","header-webmagic");
|
||||||
assertThat(EntityUtils.toString(HttpClients.custom().build().execute(requestBuilder.build()).getEntity())).isEqualTo("trace");
|
Page page = httpClientDownloader.download(request, Site.me().toTask());
|
||||||
|
assertThat(page.getRawText()).isEqualTo("ok");
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void test_set_site_cookie() throws Exception {
|
||||||
|
HttpServer server = httpserver(13423);
|
||||||
|
server.get(eq(cookie("cookie"), "cookie-webmagic")).response("ok");
|
||||||
|
Runner.running(server, new Runnable() {
|
||||||
|
@Override
|
||||||
|
public void run() throws Exception {
|
||||||
|
HttpClientDownloader httpClientDownloader = new HttpClientDownloader();
|
||||||
|
Request request = new Request();
|
||||||
|
request.setUrl("http://127.0.0.1:13423");
|
||||||
|
Site site = Site.me().addCookie("cookie", "cookie-webmagic").setDomain("127.0.0.1");
|
||||||
|
Page page = httpClientDownloader.download(request, site.toTask());
|
||||||
|
assertThat(page.getRawText()).isEqualTo("ok");
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void test_download_when_task_is_null() throws Exception {
|
public void test_download_when_task_is_null() throws Exception {
|
||||||
HttpServer server = httpserver(12306);
|
HttpServer server = httpserver(13423);
|
||||||
server.response("foo");
|
server.response("foo");
|
||||||
Runner.running(server, new Runnable() {
|
Runner.running(server, new Runnable() {
|
||||||
@Override
|
@Override
|
||||||
public void run() throws Exception {
|
public void run() throws Exception {
|
||||||
final HttpClientDownloader httpClientDownloader = new HttpClientDownloader();
|
final HttpClientDownloader httpClientDownloader = new HttpClientDownloader();
|
||||||
Request request = new Request();
|
Request request = new Request();
|
||||||
request.setUrl("http://127.0.0.1:12306/");
|
request.setUrl("http://127.0.0.1:13423/");
|
||||||
Page page = httpClientDownloader.download(request, null);
|
Page page = httpClientDownloader.download(request, Site.me().toTask());
|
||||||
assertThat(page.getRawText()).isEqualTo("foo");
|
assertThat(page.getRawText()).isEqualTo("foo");
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
|
@ -2,13 +2,10 @@ package us.codecraft.webmagic.proxy;
|
||||||
|
|
||||||
import org.apache.http.HttpHost;
|
import org.apache.http.HttpHost;
|
||||||
import org.junit.BeforeClass;
|
import org.junit.BeforeClass;
|
||||||
import org.junit.Test;
|
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
import static org.assertj.core.api.Assertions.assertThat;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @author yxssfxwzy@sina.com May 30, 2014
|
* @author yxssfxwzy@sina.com May 30, 2014
|
||||||
*
|
*
|
||||||
|
@ -27,30 +24,6 @@ public class ProxyTest {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
|
||||||
public void testProxy() {
|
|
||||||
SimpleProxyPool proxyPool = new SimpleProxyPool(httpProxyList,false);
|
|
||||||
proxyPool.setReuseInterval(500);
|
|
||||||
assertThat(proxyPool.getIdleNum()).isEqualTo(4);
|
|
||||||
for (int i = 0; i < 2; i++) {
|
|
||||||
List<Fetch> fetchList = new ArrayList<Fetch>();
|
|
||||||
while (proxyPool.getIdleNum() != 0) {
|
|
||||||
Proxy proxy = proxyPool.getProxy();
|
|
||||||
HttpHost httphost = proxy.getHttpHost();
|
|
||||||
// httphostList.add(httphost);
|
|
||||||
System.out.println(httphost.getHostName() + ":" + httphost.getPort());
|
|
||||||
Fetch tmp = new Fetch(httphost);
|
|
||||||
tmp.start();
|
|
||||||
fetchList.add(tmp);
|
|
||||||
}
|
|
||||||
for (Fetch fetch : fetchList) {
|
|
||||||
proxyPool.returnProxy(fetch.hp, Proxy.SUCCESS);
|
|
||||||
}
|
|
||||||
System.out.println(proxyPool.allProxyStatus());
|
|
||||||
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
class Fetch extends Thread {
|
class Fetch extends Thread {
|
||||||
HttpHost hp;
|
HttpHost hp;
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,21 @@
|
||||||
|
package us.codecraft.webmagic.selector;
|
||||||
|
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author code4crafter@gmail.com
|
||||||
|
* Date: 17/4/8
|
||||||
|
* Time: 下午9:41
|
||||||
|
*/
|
||||||
|
public class LinksSelectorTest {
|
||||||
|
|
||||||
|
private String html = "<div><a href='http://whatever.com/aaa'></a></div><div><a href='http://whatever.com/bbb'></a></div>";
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testLinks() throws Exception {
|
||||||
|
List<String> links = new LinksSelector().selectList(html);
|
||||||
|
System.out.println(links);
|
||||||
|
}
|
||||||
|
}
|
|
@ -3,7 +3,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<artifactId>webmagic-parent</artifactId>
|
<artifactId>webmagic-parent</artifactId>
|
||||||
<version>0.6.2-SNAPSHOT</version>
|
<version>0.7.0-SNAPSHOT</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
|
|
@ -1,29 +0,0 @@
|
||||||
<?xml version="1.0" encoding="UTF-8"?>
|
|
||||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
|
||||||
<parent>
|
|
||||||
<groupId>us.codecraft</groupId>
|
|
||||||
<artifactId>webmagic-parent</artifactId>
|
|
||||||
<version>0.5.2</version>
|
|
||||||
</parent>
|
|
||||||
<modelVersion>4.0.0</modelVersion>
|
|
||||||
|
|
||||||
<artifactId>webmagic-extension</artifactId>
|
|
||||||
|
|
||||||
<dependencies>
|
|
||||||
<dependency>
|
|
||||||
<groupId>redis.clients</groupId>
|
|
||||||
<artifactId>jedis</artifactId>
|
|
||||||
<version>2.0.0</version>
|
|
||||||
</dependency>
|
|
||||||
<dependency>
|
|
||||||
<groupId>us.codecraft</groupId>
|
|
||||||
<artifactId>webmagic-core</artifactId>
|
|
||||||
<version>${project.version}</version>
|
|
||||||
</dependency>
|
|
||||||
<dependency>
|
|
||||||
<groupId>junit</groupId>
|
|
||||||
<artifactId>junit</artifactId>
|
|
||||||
</dependency>
|
|
||||||
</dependencies>
|
|
||||||
|
|
||||||
</project>
|
|
|
@ -1,124 +0,0 @@
|
||||||
package us.codecraft.webmagic.downloader;
|
|
||||||
|
|
||||||
import org.apache.commons.codec.digest.DigestUtils;
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
|
||||||
import org.slf4j.Logger;
|
|
||||||
import org.slf4j.LoggerFactory;
|
|
||||||
import us.codecraft.webmagic.*;
|
|
||||||
import us.codecraft.webmagic.utils.Experimental;
|
|
||||||
import us.codecraft.webmagic.pipeline.Pipeline;
|
|
||||||
import us.codecraft.webmagic.processor.PageProcessor;
|
|
||||||
import us.codecraft.webmagic.processor.SimplePageProcessor;
|
|
||||||
import us.codecraft.webmagic.selector.Html;
|
|
||||||
import us.codecraft.webmagic.selector.PlainText;
|
|
||||||
import us.codecraft.webmagic.utils.FilePersistentBase;
|
|
||||||
import us.codecraft.webmagic.utils.UrlUtils;
|
|
||||||
|
|
||||||
import java.io.*;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Download file and saved to file for cache.<br>
|
|
||||||
*
|
|
||||||
* @author code4crafter@gmail.com
|
|
||||||
* @since 0.2.1
|
|
||||||
*/
|
|
||||||
@Experimental
|
|
||||||
public class FileCache extends FilePersistentBase implements Downloader, Pipeline, PageProcessor {
|
|
||||||
|
|
||||||
private Downloader downloaderWhenFileMiss;
|
|
||||||
|
|
||||||
private final PageProcessor pageProcessor;
|
|
||||||
|
|
||||||
private Logger logger = LoggerFactory.getLogger(getClass());
|
|
||||||
|
|
||||||
public FileCache(String startUrl, String urlPattern) {
|
|
||||||
this(startUrl, urlPattern, "/data/webmagic/temp/");
|
|
||||||
}
|
|
||||||
|
|
||||||
public FileCache(String startUrl, String urlPattern, String path) {
|
|
||||||
this.pageProcessor = new SimplePageProcessor(startUrl, urlPattern);
|
|
||||||
setPath(path);
|
|
||||||
downloaderWhenFileMiss = new HttpClientDownloader();
|
|
||||||
}
|
|
||||||
|
|
||||||
public FileCache setDownloaderWhenFileMiss(Downloader downloaderWhenFileMiss) {
|
|
||||||
this.downloaderWhenFileMiss = downloaderWhenFileMiss;
|
|
||||||
return this;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public Page download(Request request, Task task) {
|
|
||||||
String path = this.path + "/" + task.getUUID() + "/";
|
|
||||||
Page page = null;
|
|
||||||
try {
|
|
||||||
final File file = getFile(path + DigestUtils.md5Hex(request.getUrl()));
|
|
||||||
BufferedReader bufferedReader = new BufferedReader(new FileReader(file));
|
|
||||||
String line = bufferedReader.readLine();
|
|
||||||
if (line.equals("url:\t" + request.getUrl())) {
|
|
||||||
final String html = getHtml(bufferedReader);
|
|
||||||
page = new Page();
|
|
||||||
page.setRequest(request);
|
|
||||||
page.setUrl(PlainText.create(request.getUrl()));
|
|
||||||
page.setHtml(Html.create(UrlUtils.fixAllRelativeHrefs(html, request.getUrl())));
|
|
||||||
}
|
|
||||||
} catch (IOException e) {
|
|
||||||
if (e instanceof FileNotFoundException) {
|
|
||||||
logger.info("File not exist for url " + request.getUrl());
|
|
||||||
} else {
|
|
||||||
logger.warn("File read error for url " + request.getUrl(), e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (page == null) {
|
|
||||||
page = downloadWhenMiss(request, task);
|
|
||||||
}
|
|
||||||
return page;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void setThread(int thread) {
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
private String getHtml(BufferedReader bufferedReader) throws IOException {
|
|
||||||
String line;
|
|
||||||
StringBuilder htmlBuilder = new StringBuilder();
|
|
||||||
line = bufferedReader.readLine();
|
|
||||||
line = StringUtils.removeStart(line, "html:\t");
|
|
||||||
htmlBuilder.append(line);
|
|
||||||
while ((line = bufferedReader.readLine()) != null) {
|
|
||||||
htmlBuilder.append(line);
|
|
||||||
}
|
|
||||||
return htmlBuilder.toString();
|
|
||||||
}
|
|
||||||
|
|
||||||
private Page downloadWhenMiss(Request request, Task task) {
|
|
||||||
Page page = null;
|
|
||||||
if (downloaderWhenFileMiss != null) {
|
|
||||||
page = downloaderWhenFileMiss.download(request, task);
|
|
||||||
}
|
|
||||||
return page;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void process(ResultItems resultItems, Task task) {
|
|
||||||
String path = this.path + PATH_SEPERATOR + task.getUUID() + PATH_SEPERATOR;
|
|
||||||
try {
|
|
||||||
PrintWriter printWriter = new PrintWriter(new FileWriter(getFile(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".html")));
|
|
||||||
printWriter.println("url:\t" + resultItems.getRequest().getUrl());
|
|
||||||
printWriter.println("html:\t" + resultItems.get("html"));
|
|
||||||
printWriter.close();
|
|
||||||
} catch (IOException e) {
|
|
||||||
logger.warn("write file error", e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void process(Page page) {
|
|
||||||
pageProcessor.process(page);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public Site getSite() {
|
|
||||||
return pageProcessor.getSite();
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,18 +0,0 @@
|
||||||
package us.codecraft.webmagic.downloader;
|
|
||||||
|
|
||||||
import org.junit.Ignore;
|
|
||||||
import org.junit.Test;
|
|
||||||
import us.codecraft.webmagic.Spider;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @author code4crafter@gmail.com <br>
|
|
||||||
*/
|
|
||||||
public class FileCacheTest {
|
|
||||||
|
|
||||||
@Ignore("takes long")
|
|
||||||
@Test
|
|
||||||
public void test() {
|
|
||||||
FileCache fileCache = new FileCache("http://my.oschina.net/flashsword/blog", "http://my.oschina.net/flashsword/blog/*");
|
|
||||||
Spider.create(fileCache).downloader(fileCache).pipeline(fileCache).run();
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -19,7 +19,7 @@ public class GithubRepoProcessor implements PageProcessor {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Site getSite() {
|
public Site getSite() {
|
||||||
return Site.me().addStartUrl("https://github.com/code4craft/webmagic");
|
return Site.me();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<artifactId>webmagic-parent</artifactId>
|
<artifactId>webmagic-parent</artifactId>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<version>0.6.2-SNAPSHOT</version>
|
<version>0.7.0-SNAPSHOT</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
|
|
@ -21,7 +21,7 @@ public class DianpingFtlDataScanner implements AfterExtractor {
|
||||||
private List<String> data;
|
private List<String> data;
|
||||||
|
|
||||||
public static void main(String[] args) {
|
public static void main(String[] args) {
|
||||||
OOSpider.create(Site.me().addStartUrl("http://w.alpha.dp/").setSleepTime(0), DianpingFtlDataScanner.class)
|
OOSpider.create(Site.me().setSleepTime(0), DianpingFtlDataScanner.class)
|
||||||
.thread(5).run();
|
.thread(5).run();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -41,9 +41,10 @@ public class GithubRepo implements HasKey {
|
||||||
private String url;
|
private String url;
|
||||||
|
|
||||||
public static void main(String[] args) {
|
public static void main(String[] args) {
|
||||||
OOSpider.create(Site.me().addStartUrl("https://github.com/explore").setSleepTime(0).setRetryTimes(3),
|
OOSpider.create(Site.me().setSleepTime(0).setRetryTimes(3),
|
||||||
new JsonFilePageModelPipeline(), GithubRepo.class)
|
new JsonFilePageModelPipeline(), GithubRepo.class)
|
||||||
.scheduler(new FileCacheQueueScheduler("/data/webmagic/cache/")).thread(15).run();
|
.addUrl("https://github.com/explore")
|
||||||
|
.setScheduler(new FileCacheQueueScheduler("/data/webmagic/cache/")).thread(15).run();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -28,7 +28,7 @@ public class IteyeBlog implements Blog{
|
||||||
}
|
}
|
||||||
|
|
||||||
public static void main(String[] args) {
|
public static void main(String[] args) {
|
||||||
OOSpider.create(Site.me().addStartUrl("http://flashsword20.iteye.com/blog"), IteyeBlog.class).run();
|
OOSpider.create(Site.me(), IteyeBlog.class).addUrl("http://flashsword20.iteye.com/blog").run();
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getTitle() {
|
public String getTitle() {
|
||||||
|
|
|
@ -32,12 +32,12 @@ public class Kr36NewsModel {
|
||||||
|
|
||||||
public static void main(String[] args) throws IOException, JMException {
|
public static void main(String[] args) throws IOException, JMException {
|
||||||
//Just for benchmark
|
//Just for benchmark
|
||||||
Spider thread = OOSpider.create(Site.me().addStartUrl("http://www.36kr.com/").setSleepTime(0), new PageModelPipeline() {
|
Spider thread = OOSpider.create(Site.me().setSleepTime(0), new PageModelPipeline() {
|
||||||
@Override
|
@Override
|
||||||
public void process(Object o, Task task) {
|
public void process(Object o, Task task) {
|
||||||
|
|
||||||
}
|
}
|
||||||
}, Kr36NewsModel.class).thread(20);
|
}, Kr36NewsModel.class).thread(20).addUrl("http://www.36kr.com/");
|
||||||
thread.start();
|
thread.start();
|
||||||
SpiderMonitor spiderMonitor = SpiderMonitor.instance();
|
SpiderMonitor spiderMonitor = SpiderMonitor.instance();
|
||||||
spiderMonitor.register(thread);
|
spiderMonitor.register(thread);
|
||||||
|
|
|
@ -22,7 +22,7 @@ public class OschinaAnswer implements AfterExtractor{
|
||||||
private String content;
|
private String content;
|
||||||
|
|
||||||
public static void main(String[] args) {
|
public static void main(String[] args) {
|
||||||
OOSpider.create(Site.me().addStartUrl("http://www.oschina.net/question/567527_120597"), OschinaAnswer.class).run();
|
OOSpider.create(Site.me(), OschinaAnswer.class).addUrl("http://www.oschina.net/question/567527_120597").run();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -26,7 +26,7 @@ public class OschinaBlog{
|
||||||
|
|
||||||
public static void main(String[] args) {
|
public static void main(String[] args) {
|
||||||
OOSpider.create(Site.me()
|
OOSpider.create(Site.me()
|
||||||
.setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36").addStartUrl("http://my.oschina.net/flashsword/blog")
|
.setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36")
|
||||||
.setSleepTime(0)
|
.setSleepTime(0)
|
||||||
.setRetryTimes(3)
|
.setRetryTimes(3)
|
||||||
,new PageModelPipeline() {
|
,new PageModelPipeline() {
|
||||||
|
@ -34,7 +34,7 @@ public class OschinaBlog{
|
||||||
public void process(Object o, Task task) {
|
public void process(Object o, Task task) {
|
||||||
|
|
||||||
}
|
}
|
||||||
}, OschinaBlog.class).thread(10).run();
|
}, OschinaBlog.class).thread(10).addUrl("http://my.oschina.net/flashsword/blog").run();
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getTitle() {
|
public String getTitle() {
|
||||||
|
|
|
@ -35,7 +35,7 @@ public class DiandianBlogProcessor implements PageProcessor {
|
||||||
public Site getSite() {
|
public Site getSite() {
|
||||||
//site定义抽取配置,以及开始url等
|
//site定义抽取配置,以及开始url等
|
||||||
if (site == null) {
|
if (site == null) {
|
||||||
site = Site.me().setDomain("progressdaily.diandian.com").addStartUrl("http://progressdaily.diandian.com/").
|
site = Site.me().setDomain("progressdaily.diandian.com").
|
||||||
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
|
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
|
||||||
}
|
}
|
||||||
return site;
|
return site;
|
||||||
|
|
|
@ -34,13 +34,13 @@ public class DiaoyuwengProcessor implements PageProcessor {
|
||||||
@Override
|
@Override
|
||||||
public Site getSite() {
|
public Site getSite() {
|
||||||
if (site==null){
|
if (site==null){
|
||||||
site= Site.me().setDomain("www.diaoyuweng.com").addStartUrl("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space").
|
site= Site.me().setDomain("www.diaoyuweng.com").
|
||||||
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31").setCharset("GBK").setSleepTime(500);
|
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31").setCharset("GBK").setSleepTime(500);
|
||||||
}
|
}
|
||||||
return site;
|
return site;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static void main(String[] args) {
|
public static void main(String[] args) {
|
||||||
Spider.create(new DiaoyuwengProcessor()).run();
|
Spider.create(new DiaoyuwengProcessor()).addUrl("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space").run();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -25,10 +25,10 @@ public class F58PageProcesser implements PageProcessor {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Site getSite() {
|
public Site getSite() {
|
||||||
return Site.me().setDomain("sh.58.com").addStartUrl("http://sh1.51a8.com/").setCycleRetryTimes(2); //To change body of implemented methods use File | Settings | File Templates.
|
return Site.me().setDomain("sh.58.com").setCycleRetryTimes(2); //To change body of implemented methods use File | Settings | File Templates.
|
||||||
}
|
}
|
||||||
|
|
||||||
public static void main(String[] args) {
|
public static void main(String[] args) {
|
||||||
Spider.create(new F58PageProcesser()).setScheduler(new RedisScheduler("localhost")).run();
|
Spider.create(new F58PageProcesser()).setScheduler(new RedisScheduler("localhost")).addUrl("http://sh1.51a8.com/").run();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -21,11 +21,11 @@ public class HuxiuProcessor implements PageProcessor {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Site getSite() {
|
public Site getSite() {
|
||||||
return Site.me().setDomain("www.huxiu.com").addStartUrl("http://www.huxiu.com/");
|
return Site.me().setDomain("www.huxiu.com");
|
||||||
}
|
}
|
||||||
|
|
||||||
public static void main(String[] args) {
|
public static void main(String[] args) {
|
||||||
Spider.create(new HuxiuProcessor()).run();
|
Spider.create(new HuxiuProcessor()).addUrl("http://www.huxiu.com/").run();
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -29,7 +29,7 @@ public class InfoQMiniBookProcessor implements PageProcessor {
|
||||||
@Override
|
@Override
|
||||||
public Site getSite() {
|
public Site getSite() {
|
||||||
if (site == null) {
|
if (site == null) {
|
||||||
site = Site.me().setDomain("www.infoq.com").addStartUrl("http://www.infoq.com/cn/minibooks").addCookie("RegisteredUserCookie", "sDDDc8dIAgZSq67uJSXhtpQaHEi1XDOH").
|
site = Site.me().setDomain("www.infoq.com").addCookie("RegisteredUserCookie", "sDDDc8dIAgZSq67uJSXhtpQaHEi1XDOH").
|
||||||
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
|
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
|
||||||
}
|
}
|
||||||
return site;
|
return site;
|
||||||
|
@ -38,6 +38,7 @@ public class InfoQMiniBookProcessor implements PageProcessor {
|
||||||
public static void main(String[] args) {
|
public static void main(String[] args) {
|
||||||
Spider.create(new InfoQMiniBookProcessor())
|
Spider.create(new InfoQMiniBookProcessor())
|
||||||
.thread(5)
|
.thread(5)
|
||||||
|
.addUrl("http://www.infoq.com/cn/minibooks")
|
||||||
.run();
|
.run();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -22,12 +22,12 @@ public class IteyeBlogProcessor implements PageProcessor {
|
||||||
@Override
|
@Override
|
||||||
public Site getSite() {
|
public Site getSite() {
|
||||||
if (site == null) {
|
if (site == null) {
|
||||||
site = Site.me().setDomain("yanghaoli.iteye.com").addStartUrl("http://yanghaoli.iteye.com/");
|
site = Site.me().setDomain("yanghaoli.iteye.com");
|
||||||
}
|
}
|
||||||
return site;
|
return site;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static void main(String[] args) {
|
public static void main(String[] args) {
|
||||||
Spider.create(new IteyeBlogProcessor()).thread(5).run();
|
Spider.create(new IteyeBlogProcessor()).thread(5).addUrl("http://yanghaoli.iteye.com/").run();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -22,11 +22,11 @@ public class KaichibaProcessor implements PageProcessor {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Site getSite() {
|
public Site getSite() {
|
||||||
return Site.me().setDomain("kaichiba.com").addStartUrl("http://kaichiba.com/shop/41725781").setCharset("utf-8").
|
return Site.me().setDomain("kaichiba.com").setCharset("utf-8").
|
||||||
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
|
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
|
||||||
}
|
}
|
||||||
|
|
||||||
public static void main(String[] args) {
|
public static void main(String[] args) {
|
||||||
Spider.create(new KaichibaProcessor()).run();
|
Spider.create(new KaichibaProcessor()).addUrl("http://kaichiba.com/shop/41725781").run();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -28,11 +28,11 @@ public class MeicanProcessor implements PageProcessor {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Site getSite() {
|
public Site getSite() {
|
||||||
return Site.me().setDomain("meican.com").addStartUrl("http://www.meican.com/shanghai/districts").setCharset("utf-8").
|
return Site.me().setDomain("meican.com").setCharset("utf-8").
|
||||||
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
|
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
|
||||||
}
|
}
|
||||||
|
|
||||||
public static void main(String[] args) {
|
public static void main(String[] args) {
|
||||||
Spider.create(new MeicanProcessor()).run();
|
Spider.create(new MeicanProcessor()).addUrl("http://www.meican.com/shanghai/districts").run();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,7 +1,8 @@
|
||||||
package us.codecraft.webmagic.samples;
|
package us.codecraft.webmagic.samples;
|
||||||
|
|
||||||
import us.codecraft.webmagic.Site;
|
|
||||||
import us.codecraft.webmagic.Page;
|
import us.codecraft.webmagic.Page;
|
||||||
|
import us.codecraft.webmagic.Site;
|
||||||
|
import us.codecraft.webmagic.Spider;
|
||||||
import us.codecraft.webmagic.processor.PageProcessor;
|
import us.codecraft.webmagic.processor.PageProcessor;
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
@ -22,6 +23,10 @@ public class NjuBBSProcessor implements PageProcessor {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Site getSite() {
|
public Site getSite() {
|
||||||
return Site.me().setDomain("bbs.nju.edu.cn").addStartUrl("http://bbs.nju.edu.cn/board?board=Pictures");
|
return Site.me().setDomain("bbs.nju.edu.cn");
|
||||||
|
}
|
||||||
|
|
||||||
|
public static void main(String[] args) {
|
||||||
|
Spider.create(new NjuBBSProcessor()).addUrl("http://bbs.nju.edu.cn/board?board=Pictures").run();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,41 +0,0 @@
|
||||||
package us.codecraft.webmagic.samples;
|
|
||||||
|
|
||||||
import us.codecraft.webmagic.Page;
|
|
||||||
import us.codecraft.webmagic.Site;
|
|
||||||
import us.codecraft.webmagic.Spider;
|
|
||||||
import us.codecraft.webmagic.monitor.SpiderMonitor;
|
|
||||||
import us.codecraft.webmagic.processor.PageProcessor;
|
|
||||||
import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover;
|
|
||||||
import us.codecraft.webmagic.scheduler.QueueScheduler;
|
|
||||||
|
|
||||||
import javax.management.JMException;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @author code4crafter@gmail.com <br>
|
|
||||||
*/
|
|
||||||
public class OschinaBlogPageProcesser implements PageProcessor {
|
|
||||||
|
|
||||||
private Site site = Site.me().setDomain("my.oschina.net").addStartUrl("http://my.oschina.net/flashsword/blog");
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void process(Page page) {
|
|
||||||
List<String> links = page.getHtml().links().regex("http://my\\.oschina\\.net/flashsword/blog/\\d+").all();
|
|
||||||
page.addTargetRequests(links);
|
|
||||||
page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1/text()").toString());
|
|
||||||
page.putField("content", page.getHtml().xpath("//div[@class='BlogContent']/tidyText()").toString());
|
|
||||||
page.putField("tags",page.getHtml().xpath("//div[@class='BlogTags']/a/text()").all());
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public Site getSite() {
|
|
||||||
return site;
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
public static void main(String[] args) throws JMException {
|
|
||||||
Spider spider = Spider.create(new OschinaBlogPageProcesser()).setScheduler(new QueueScheduler().setDuplicateRemover(new BloomFilterDuplicateRemover(2000)));
|
|
||||||
SpiderMonitor.instance().register(spider);
|
|
||||||
spider.run();
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,27 +0,0 @@
|
||||||
package us.codecraft.webmagic.samples;
|
|
||||||
|
|
||||||
import us.codecraft.webmagic.Site;
|
|
||||||
import us.codecraft.webmagic.Page;
|
|
||||||
import us.codecraft.webmagic.processor.PageProcessor;
|
|
||||||
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @author code4crafter@gmail.com <br>
|
|
||||||
*/
|
|
||||||
public class OschinaPageProcesser implements PageProcessor {
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void process(Page page) {
|
|
||||||
List<String> strings = page.getHtml().regex("<a[^<>]*href=[\"']{1}(http://www\\.oschina\\.net/question/[\\w]+)[\"']{1}").all();
|
|
||||||
page.addTargetRequests(strings);
|
|
||||||
page.putField("title", page.getHtml().xpath("//div[@class='QTitle']/h1/a"));
|
|
||||||
page.putField("content", page.getHtml().xpath("//div[@class='Question']//div[@class='Content']/div[@class='detail']"));
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public Site getSite() {
|
|
||||||
return Site.me().setDomain("www.oschina.net").addStartUrl("http://www.oschina.net/").
|
|
||||||
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -24,7 +24,7 @@ public class QzoneBlogProcessor implements PageProcessor {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Site getSite() {
|
public Site getSite() {
|
||||||
return Site.me().setDomain("www.diandian.com").addStartUrl("http://17dujingdian.com/").
|
return Site.me().setDomain("www.diandian.com").
|
||||||
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
|
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -21,6 +21,6 @@ public class TianyaPageProcesser implements PageProcessor {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Site getSite() {
|
public Site getSite() {
|
||||||
return Site.me().setDomain("http://bbs.tianya.cn/").addStartUrl("http://bbs.tianya.cn/"); //To change body of implemented methods use File | Settings | File Templates.
|
return Site.me().setDomain("http://bbs.tianya.cn/"); //To change body of implemented methods use File | Settings | File Templates.
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -28,10 +28,10 @@ public class SpiderTest {
|
||||||
// PageProcessor pageProcessor = new MeicanProcessor();
|
// PageProcessor pageProcessor = new MeicanProcessor();
|
||||||
// Spider.me().pipeline(new FilePipeline()).scheduler(new FileCacheQueueScheduler(pageProcessor.getSite(),"/data/temp/webmagic/cache/")).
|
// Spider.me().pipeline(new FilePipeline()).scheduler(new FileCacheQueueScheduler(pageProcessor.getSite(),"/data/temp/webmagic/cache/")).
|
||||||
// processor(pageProcessor).run();
|
// processor(pageProcessor).run();
|
||||||
SimplePageProcessor pageProcessor2 = new SimplePageProcessor("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space", "http://www.diaoyuweng.com/thread-*-1-1.html");
|
SimplePageProcessor pageProcessor2 = new SimplePageProcessor( "http://www.diaoyuweng.com/thread-*-1-1.html");
|
||||||
System.out.println(pageProcessor2.getSite().getCharset());
|
System.out.println(pageProcessor2.getSite().getCharset());
|
||||||
pageProcessor2.getSite().setSleepTime(500);
|
pageProcessor2.getSite().setSleepTime(500);
|
||||||
Spider.create(pageProcessor2).addPipeline(new FilePipeline()).scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).
|
Spider.create(pageProcessor2).addUrl("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space").addPipeline(new FilePipeline()).scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).
|
||||||
run();
|
run();
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -17,7 +17,7 @@ public class ProcessorBenchmark {
|
||||||
@Ignore
|
@Ignore
|
||||||
@Test
|
@Test
|
||||||
public void test() {
|
public void test() {
|
||||||
ModelPageProcessor modelPageProcessor = ModelPageProcessor.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog"), OschinaBlog.class);
|
ModelPageProcessor modelPageProcessor = ModelPageProcessor.create(Site.me(), OschinaBlog.class);
|
||||||
Page page = new Page();
|
Page page = new Page();
|
||||||
page.setRequest(new Request("http://my.oschina.net/flashsword/blog"));
|
page.setRequest(new Request("http://my.oschina.net/flashsword/blog"));
|
||||||
page.setUrl(new PlainText("http://my.oschina.net/flashsword/blog"));
|
page.setUrl(new PlainText("http://my.oschina.net/flashsword/blog"));
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<artifactId>webmagic-parent</artifactId>
|
<artifactId>webmagic-parent</artifactId>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<version>0.6.2-SNAPSHOT</version>
|
<version>0.7.0-SNAPSHOT</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<artifactId>webmagic-parent</artifactId>
|
<artifactId>webmagic-parent</artifactId>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<version>0.6.2-SNAPSHOT</version>
|
<version>0.7.0-SNAPSHOT</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<artifactId>webmagic-parent</artifactId>
|
<artifactId>webmagic-parent</artifactId>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<version>0.6.2-SNAPSHOT</version>
|
<version>0.7.0-SNAPSHOT</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue