From a7f9e7cad5a17b14a1e9ab22009d893fd670fc4b Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 18 Mar 2017 12:16:21 +0800 Subject: [PATCH 01/25] =?UTF-8?q?=E9=87=8D=E6=9E=84=E4=B8=80=E9=83=A8?= =?UTF-8?q?=E5=88=86httpclient?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../main/java/us/codecraft/webmagic/Site.java | 6 +- .../downloader/HttpClientDownloader.java | 135 ++++--------- .../downloader/HttpClientGenerator.java | 29 +-- .../downloader/HttpUriRequestConverter.java | 98 +++++++++ .../us/codecraft/webmagic/proxy/Proxy.java | 190 ++---------------- .../codecraft/webmagic/proxy/ProxyHost.java | 34 ++++ .../codecraft/webmagic/proxy/ProxyPool.java | 9 +- .../webmagic/proxy/TimerReuseProxy.java | 163 +++++++++++++++ ...roxyPool.java => TimerReuseProxyPool.java} | 46 ++--- .../codecraft/webmagic/utils/ProxyUtils.java | 25 ++- .../codecraft/webmagic/proxy/ProxyTest.java | 2 +- 11 files changed, 393 insertions(+), 344 deletions(-) create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyHost.java create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/proxy/TimerReuseProxy.java rename webmagic-core/src/main/java/us/codecraft/webmagic/proxy/{SimpleProxyPool.java => TimerReuseProxyPool.java} (85%) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java index ac9f9ce..146bb0d 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java @@ -4,7 +4,7 @@ import org.apache.http.HttpHost; import org.apache.http.auth.UsernamePasswordCredentials; import us.codecraft.webmagic.proxy.Proxy; import us.codecraft.webmagic.proxy.ProxyPool; -import us.codecraft.webmagic.proxy.SimpleProxyPool; +import 
us.codecraft.webmagic.proxy.TimerReuseProxyPool; import us.codecraft.webmagic.utils.UrlUtils; import java.util.*; @@ -487,12 +487,12 @@ public class Site { * @return this */ public Site setHttpProxyPool(List httpProxyList, boolean isUseLastProxy) { - this.httpProxyPool=new SimpleProxyPool(httpProxyList, isUseLastProxy); + this.httpProxyPool=new TimerReuseProxyPool(httpProxyList, isUseLastProxy); return this; } public Site enableHttpProxyPool() { - this.httpProxyPool=new SimpleProxyPool(); + this.httpProxyPool=new TimerReuseProxyPool(); return this; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index fa907a1..816e6c5 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -3,16 +3,16 @@ package us.codecraft.webmagic.downloader; import org.apache.commons.io.IOUtils; import org.apache.http.HttpHost; import org.apache.http.HttpResponse; -import org.apache.http.NameValuePair; import org.apache.http.annotation.ThreadSafe; -import org.apache.http.client.config.CookieSpecs; -import org.apache.http.client.config.RequestConfig; -import org.apache.http.client.entity.UrlEncodedFormEntity; +import org.apache.http.auth.AuthState; +import org.apache.http.auth.UsernamePasswordCredentials; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpUriRequest; -import org.apache.http.client.methods.RequestBuilder; +import org.apache.http.client.protocol.HttpClientContext; +import org.apache.http.impl.auth.BasicScheme; import org.apache.http.impl.client.CloseableHttpClient; -import org.apache.http.message.BasicNameValuePair; +import org.apache.http.protocol.BasicHttpContext; +import org.apache.http.protocol.HttpContext; import org.apache.http.util.EntityUtils; 
import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -23,12 +23,13 @@ import us.codecraft.webmagic.Task; import us.codecraft.webmagic.proxy.Proxy; import us.codecraft.webmagic.selector.PlainText; import us.codecraft.webmagic.utils.CharsetUtils; -import us.codecraft.webmagic.utils.HttpConstant; import us.codecraft.webmagic.utils.WMCollections; import java.io.IOException; import java.nio.charset.Charset; -import java.util.*; +import java.util.HashMap; +import java.util.Map; +import java.util.Set; /** @@ -46,9 +47,15 @@ public class HttpClientDownloader extends AbstractDownloader { private HttpClientGenerator httpClientGenerator = new HttpClientGenerator(); - private CloseableHttpClient getHttpClient(Site site, Proxy proxy) { + private HttpUriRequestConverter httpUriRequestConverter = new HttpUriRequestConverter(); + + public void setHttpUriRequestConverter(HttpUriRequestConverter httpUriRequestConverter) { + this.httpUriRequestConverter = httpUriRequestConverter; + } + + private CloseableHttpClient getHttpClient(Site site) { if (site == null) { - return httpClientGenerator.getClient(null, proxy); + return httpClientGenerator.getClient(null); } String domain = site.getDomain(); CloseableHttpClient httpClient = httpClients.get(domain); @@ -56,7 +63,7 @@ public class HttpClientDownloader extends AbstractDownloader { synchronized (this) { httpClient = httpClients.get(domain); if (httpClient == null) { - httpClient = httpClientGenerator.getClient(site, proxy); + httpClient = httpClientGenerator.getClient(site); httpClients.put(domain, httpClient); } } @@ -66,35 +73,31 @@ public class HttpClientDownloader extends AbstractDownloader { @Override public Page download(Request request, Task task) { - Site site = null; - if (task != null) { - site = task.getSite(); + if (task == null || task.getSite() == null) { + throw new NullPointerException("task or site can not be null"); } - Set acceptStatCode; - String charset = null; - Map headers = null; - if (site != null) { - 
acceptStatCode = site.getAcceptStatCode(); - charset = site.getCharset(); - headers = site.getHeaders(); - } else { - acceptStatCode = WMCollections.newHashSet(200); - } - logger.info("downloading page {}", request.getUrl()); + logger.debug("downloading page {}", request.getUrl()); CloseableHttpResponse httpResponse = null; int statusCode = 0; + Site site = task.getSite(); try { - HttpHost proxyHost = null; - Proxy proxy = null; //TODO - if (site != null && site.getHttpProxyPool() != null && site.getHttpProxyPool().isEnable()) { + Proxy proxy = null; + if (site.getHttpProxyPool() != null && site.getHttpProxyPool().isEnable()) { proxy = site.getHttpProxyFromPool(); - proxyHost = proxy.getHttpHost(); } else if (site != null && site.getHttpProxy() != null){ - proxyHost = site.getHttpProxy(); + proxy = site.getHttpProxy(); + request.putExtra(Request.PROXY, site.getHttpProxy()); } - - HttpUriRequest httpUriRequest = getHttpUriRequest(request, site, headers, proxyHost); - httpResponse = getHttpClient(site, proxy).execute(httpUriRequest); + request.putExtra(Request.PROXY, proxy); + + HttpContext httpContext = new BasicHttpContext(); + + HttpUriRequest httpUriRequest = httpUriRequestConverter.convert(request,site); + AuthState authState = new AuthState(); + authState.update(new BasicScheme(), new UsernamePasswordCredentials("userName", "password")); + httpContext.setAttribute(HttpClientContext.PROXY_AUTH_STATE, authState); + CloseableHttpClient httpClient = getHttpClient(site, proxy); + httpResponse = httpClient.execute(httpUriRequest, httpContext); statusCode = httpResponse.getStatusLine().getStatusCode(); request.putExtra(Request.STATUS_CODE, statusCode); if (statusAccept(acceptStatCode, statusCode)) { @@ -134,72 +137,6 @@ public class HttpClientDownloader extends AbstractDownloader { return acceptStatCode.contains(statusCode); } - protected HttpUriRequest getHttpUriRequest(Request request, Site site, Map headers, HttpHost proxy) { - RequestBuilder requestBuilder = 
selectRequestMethod(request).setUri(request.getUrl()); - if (headers != null) { - for (Map.Entry headerEntry : headers.entrySet()) { - requestBuilder.addHeader(headerEntry.getKey(), headerEntry.getValue()); - } - } - - RequestConfig.Builder requestConfigBuilder = RequestConfig.custom(); - if (site != null) { - requestConfigBuilder.setConnectionRequestTimeout(site.getTimeOut()) - .setSocketTimeout(site.getTimeOut()) - .setConnectTimeout(site.getTimeOut()) - .setCookieSpec(CookieSpecs.BEST_MATCH); - } - - if (proxy != null) { - requestConfigBuilder.setProxy(proxy); - request.putExtra(Request.PROXY, proxy); - } - requestBuilder.setConfig(requestConfigBuilder.build()); - return requestBuilder.build(); - } - - protected RequestBuilder selectRequestMethod(Request request) { - String method = request.getMethod(); - if (method == null || method.equalsIgnoreCase(HttpConstant.Method.GET)) { - //default get - return addQueryParams(RequestBuilder.get(),request.getParams()); - } else if (method.equalsIgnoreCase(HttpConstant.Method.POST)) { - return addFormParams(RequestBuilder.post(), (NameValuePair[]) request.getExtra("nameValuePair"), request.getParams()); - } else if (method.equalsIgnoreCase(HttpConstant.Method.HEAD)) { - return addQueryParams(RequestBuilder.head(),request.getParams()); - } else if (method.equalsIgnoreCase(HttpConstant.Method.PUT)) { - return addFormParams(RequestBuilder.put(), (NameValuePair[]) request.getExtra("nameValuePair"), request.getParams()); - } else if (method.equalsIgnoreCase(HttpConstant.Method.DELETE)) { - return addQueryParams(RequestBuilder.delete(),request.getParams()); - } else if (method.equalsIgnoreCase(HttpConstant.Method.TRACE)) { - return addQueryParams(RequestBuilder.trace(),request.getParams()); - } - throw new IllegalArgumentException("Illegal HTTP Method " + method); - } - - private RequestBuilder addFormParams(RequestBuilder requestBuilder, NameValuePair[] nameValuePair, Map params) { - List allNameValuePair=new ArrayList(); - if 
(nameValuePair != null && nameValuePair.length > 0) { - allNameValuePair= Arrays.asList(nameValuePair); - } - if (params != null) { - for (String key : params.keySet()) { - allNameValuePair.add(new BasicNameValuePair(key, params.get(key))); - } - } - requestBuilder.setEntity(new UrlEncodedFormEntity(allNameValuePair, Charset.forName("utf8"))); - return requestBuilder; - } - - private RequestBuilder addQueryParams(RequestBuilder requestBuilder, Map params) { - if (params != null) { - for (Map.Entry entry : params.entrySet()) { - requestBuilder.addParameter(entry.getKey(), entry.getValue()); - } - } - return requestBuilder; - } - protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException { String content = getContent(charset, httpResponse); Page page = new Page(); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java index aec5309..1da64e7 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java @@ -1,13 +1,9 @@ package us.codecraft.webmagic.downloader; -import org.apache.commons.lang3.StringUtils; import org.apache.http.HttpException; import org.apache.http.HttpRequest; import org.apache.http.HttpRequestInterceptor; -import org.apache.http.auth.AuthScope; -import org.apache.http.auth.UsernamePasswordCredentials; import org.apache.http.client.CookieStore; -import org.apache.http.client.CredentialsProvider; import org.apache.http.config.Registry; import org.apache.http.config.RegistryBuilder; import org.apache.http.config.SocketConfig; @@ -21,7 +17,6 @@ import org.apache.http.protocol.HttpContext; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.proxy.Proxy; import 
javax.net.ssl.SSLContext; import javax.net.ssl.TrustManager; @@ -92,31 +87,13 @@ public class HttpClientGenerator { return this; } - public CloseableHttpClient getClient(Site site, Proxy proxy) { - return generateClient(site, proxy); + public CloseableHttpClient getClient(Site site) { + return generateClient(site); } - private CloseableHttpClient generateClient(Site site, Proxy proxy) { - CredentialsProvider credsProvider = null; + private CloseableHttpClient generateClient(Site site) { HttpClientBuilder httpClientBuilder = HttpClients.custom(); - if (proxy != null && StringUtils.isNotBlank(proxy.getUser()) && StringUtils.isNotBlank(proxy.getPassword())) - { - credsProvider= new BasicCredentialsProvider(); - credsProvider.setCredentials( - new AuthScope(proxy.getHttpHost().getAddress().getHostAddress(), proxy.getHttpHost().getPort()), - new UsernamePasswordCredentials(proxy.getUser(), proxy.getPassword())); - httpClientBuilder.setDefaultCredentialsProvider(credsProvider); - } - - if (site != null && site.getHttpProxy()!= null && site.getUsernamePasswordCredentials() != null){ - credsProvider = new BasicCredentialsProvider(); - credsProvider.setCredentials( - new AuthScope(site.getHttpProxy()),//可以访问的范围 - site.getUsernamePasswordCredentials());//用户名和密码 - httpClientBuilder.setDefaultCredentialsProvider(credsProvider); - } - httpClientBuilder.setConnectionManager(connectionManager); if (site != null && site.getUserAgent() != null) { httpClientBuilder.setUserAgent(site.getUserAgent()); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java new file mode 100644 index 0000000..7e77676 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java @@ -0,0 +1,98 @@ +package us.codecraft.webmagic.downloader; + +import org.apache.http.HttpHost; +import org.apache.http.NameValuePair; +import 
org.apache.http.client.config.CookieSpecs; +import org.apache.http.client.config.RequestConfig; +import org.apache.http.client.entity.UrlEncodedFormEntity; +import org.apache.http.client.methods.HttpUriRequest; +import org.apache.http.client.methods.RequestBuilder; +import org.apache.http.message.BasicNameValuePair; +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.proxy.Proxy; +import us.codecraft.webmagic.utils.HttpConstant; + +import java.nio.charset.Charset; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Map; + +/** + * @author code4crafter@gmail.com + * Date: 17/3/18 + * Time: 上午11:28 + */ +public class HttpUriRequestConverter { + + public HttpUriRequest convert(Request request, Site site, Proxy proxy) { + return null; + } + + private HttpUriRequest getHttpUriRequest(Request request, Site site, Map headers, HttpHost proxy) { + RequestBuilder requestBuilder = selectRequestMethod(request).setUri(request.getUrl()); + if (headers != null) { + for (Map.Entry headerEntry : headers.entrySet()) { + requestBuilder.addHeader(headerEntry.getKey(), headerEntry.getValue()); + } + } + + RequestConfig.Builder requestConfigBuilder = RequestConfig.custom(); + if (site != null) { + requestConfigBuilder.setConnectionRequestTimeout(site.getTimeOut()) + .setSocketTimeout(site.getTimeOut()) + .setConnectTimeout(site.getTimeOut()) + .setCookieSpec(CookieSpecs.BEST_MATCH); + } + + if (proxy != null) { + requestConfigBuilder.setProxy(proxy); + } + requestBuilder.setConfig(requestConfigBuilder.build()); + return requestBuilder.build(); + } + + private RequestBuilder selectRequestMethod(Request request) { + String method = request.getMethod(); + if (method == null || method.equalsIgnoreCase(HttpConstant.Method.GET)) { + //default get + return addQueryParams(RequestBuilder.get(),request.getParams()); + } else if (method.equalsIgnoreCase(HttpConstant.Method.POST)) { + return 
addFormParams(RequestBuilder.post(), (NameValuePair[]) request.getExtra("nameValuePair"), request.getParams()); + } else if (method.equalsIgnoreCase(HttpConstant.Method.HEAD)) { + return addQueryParams(RequestBuilder.head(),request.getParams()); + } else if (method.equalsIgnoreCase(HttpConstant.Method.PUT)) { + return addFormParams(RequestBuilder.put(), (NameValuePair[]) request.getExtra("nameValuePair"), request.getParams()); + } else if (method.equalsIgnoreCase(HttpConstant.Method.DELETE)) { + return addQueryParams(RequestBuilder.delete(),request.getParams()); + } else if (method.equalsIgnoreCase(HttpConstant.Method.TRACE)) { + return addQueryParams(RequestBuilder.trace(),request.getParams()); + } + throw new IllegalArgumentException("Illegal HTTP Method " + method); + } + + private RequestBuilder addFormParams(RequestBuilder requestBuilder, NameValuePair[] nameValuePair, Map params) { + List allNameValuePair=new ArrayList(); + if (nameValuePair != null && nameValuePair.length > 0) { + allNameValuePair= Arrays.asList(nameValuePair); + } + if (params != null) { + for (String key : params.keySet()) { + allNameValuePair.add(new BasicNameValuePair(key, params.get(key))); + } + } + requestBuilder.setEntity(new UrlEncodedFormEntity(allNameValuePair, Charset.forName("utf8"))); + return requestBuilder; + } + + private RequestBuilder addQueryParams(RequestBuilder requestBuilder, Map params) { + if (params != null) { + for (Map.Entry entry : params.entrySet()) { + requestBuilder.addParameter(entry.getKey(), entry.getValue()); + } + } + return requestBuilder; + } + +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java index dbe3a18..b078fd9 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java @@ -1,199 +1,47 @@ package us.codecraft.webmagic.proxy; -import 
org.apache.http.HttpHost; - -import java.io.Serializable; -import java.util.ArrayList; -import java.util.List; -import java.util.concurrent.Delayed; -import java.util.concurrent.TimeUnit; - -/** - * >>>> Proxy lifecycle - - +----------+ +-----+ - | last use | | new | - +-----+----+ +---+-+ - | +------+ | - +->| init |<--+ - +--+---+ - | - v - +--------+ - +--->| borrow | - | +---+----+ - | |+------------------+ - | v - | +--------+ - | | in use | Respone Time - | +---+----+ - | |+------------------+ - | v - | +--------+ - | | return | - | +---+----+ - | |+-------------------+ - | v - | +-------+ reuse interval - | | delay | (delay time) - | +---+---+ - | |+-------------------+ - | v - | +------+ - | | idle | idle time - | +---+--+ - | |+-------------------+ - +--------+ - */ - /** - * Object has these status of lifecycle above.
* - * @author yxssfxwzy@sina.com
- * @since 0.5.1 - * @see SimpleProxyPool */ -public class Proxy implements Delayed, Serializable { +public class Proxy { - private static final long serialVersionUID = 228939737383625551L; - public static final int ERROR_403 = 403; - public static final int ERROR_404 = 404; - public static final int ERROR_BANNED = 10000;// banned by website - public static final int ERROR_Proxy = 10001;// the proxy itself failed - public static final int SUCCESS = 200; - - private final HttpHost httpHost; + private ProxyHost proxyHost; private String user; private String password; - - private int reuseTimeInterval = 1500;// ms - private Long canReuseTime = 0L; - private Long lastBorrowTime = System.currentTimeMillis(); - private Long responseTime = 0L; - - private int failedNum = 0; - private int successNum = 0; - private int borrowNum = 0; - - private List failedErrorType = new ArrayList(); - - public Proxy(HttpHost httpHost, String user, String password) { - this.httpHost = httpHost; + public Proxy(ProxyHost proxyHost, String user, String password) { + this.proxyHost = proxyHost; this.user = user; this.password = password; - this.canReuseTime = System.nanoTime() + TimeUnit.NANOSECONDS.convert(reuseTimeInterval, TimeUnit.MILLISECONDS); } - public Proxy(HttpHost httpHost, int reuseInterval, String user, String password) { - this.httpHost = httpHost; - this.user = user; - this.password = password; - this.canReuseTime = System.nanoTime() + TimeUnit.NANOSECONDS.convert(reuseInterval, TimeUnit.MILLISECONDS); + public Proxy(ProxyHost proxyHost) { + this.proxyHost = proxyHost; } - public int getSuccessNum() { - return successNum; + public ProxyHost getProxyHost() { + return proxyHost; } - public void successNumIncrement(int increment) { - this.successNum += increment; + public void setProxyHost(ProxyHost proxyHost) { + this.proxyHost = proxyHost; } - public Long getLastUseTime() { - return lastBorrowTime; - } - - public void setLastBorrowTime(Long lastBorrowTime) { - this.lastBorrowTime 
= lastBorrowTime; - } - - public void recordResponse() { - this.responseTime = (System.currentTimeMillis() - lastBorrowTime + responseTime) / 2; - this.lastBorrowTime = System.currentTimeMillis(); - } - - public List getFailedErrorType() { - return failedErrorType; - } - - public void setFailedErrorType(List failedErrorType) { - this.failedErrorType = failedErrorType; - } - - public void fail(int failedErrorType) { - this.failedNum++; - this.failedErrorType.add(failedErrorType); - } - - public void setFailedNum(int failedNum) { - this.failedNum = failedNum; - } - - public int getFailedNum() { - return failedNum; - } - - public String getFailedType() { - String re = ""; - for (Integer i : this.failedErrorType) { - re += i + " . "; - } - return re; - } - - public HttpHost getHttpHost() { - return httpHost; - } - - public int getReuseTimeInterval() { - return reuseTimeInterval; - } - - public void setReuseTimeInterval(int reuseTimeInterval) { - this.reuseTimeInterval = reuseTimeInterval; - this.canReuseTime = System.nanoTime() + TimeUnit.NANOSECONDS.convert(reuseTimeInterval, TimeUnit.MILLISECONDS); - - } - - @Override - public long getDelay(TimeUnit unit) { - return unit.convert(canReuseTime - System.nanoTime(), TimeUnit.NANOSECONDS); - } - - @Override - public int compareTo(Delayed o) { - Proxy that = (Proxy) o; - return canReuseTime > that.canReuseTime ? 1 : (canReuseTime < that.canReuseTime ? 
-1 : 0); - - } - - @Override - public String toString() { - - String re = String.format("host: %15s >> %5dms >> success: %-3.2f%% >> borrow: %d", httpHost.getAddress().getHostAddress(), responseTime, - successNum * 100.0 / borrowNum, borrowNum); - return re; - - } - - public String getUser() - { + public String getUser() { return user; - } - public String getPassword() - { + + public void setUser(String user) { + this.user = user; + } + + public String getPassword() { return password; - } - public void borrowNumIncrement(int increment) { - this.borrowNum += increment; + public void setPassword(String password) { + this.password = password; } - public int getBorrowNum() { - return borrowNum; - } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyHost.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyHost.java new file mode 100644 index 0000000..11e8c87 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyHost.java @@ -0,0 +1,34 @@ +package us.codecraft.webmagic.proxy; + +/** + * @author code4crafter@gmail.com + * Date: 17/3/18 + * Time: 下午12:04 + */ +public class ProxyHost { + + private String host; + + private int port; + + public String getHost() { + return host; + } + + public ProxyHost(String host, int port) { + this.host = host; + this.port = port; + } + + public void setHost(String host) { + this.host = host; + } + + public int getPort() { + return port; + } + + public void setPort(int port) { + this.port = port; + } +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyPool.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyPool.java index 40b1913..418b445 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyPool.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyPool.java @@ -6,7 +6,10 @@ import org.apache.http.HttpHost; * Created by edwardsbean on 15-2-28. 
*/ public interface ProxyPool { - public void returnProxy(HttpHost host, int statusCode); - public Proxy getProxy(); - public boolean isEnable(); + + void returnProxy(HttpHost host, int statusCode); + + Proxy getProxy(); + + boolean isEnable(); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/TimerReuseProxy.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/TimerReuseProxy.java new file mode 100644 index 0000000..8f59252 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/TimerReuseProxy.java @@ -0,0 +1,163 @@ +package us.codecraft.webmagic.proxy; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.Delayed; +import java.util.concurrent.TimeUnit; + +/** + * >>>> Proxy lifecycle + + +----------+ +-----+ + | last use | | new | + +-----+----+ +---+-+ + | +------+ | + +->| init |<--+ + +--+---+ + | + v + +--------+ + +--->| borrow | + | +---+----+ + | |+------------------+ + | v + | +--------+ + | | in use | Respone Time + | +---+----+ + | |+------------------+ + | v + | +--------+ + | | return | + | +---+----+ + | |+-------------------+ + | v + | +-------+ reuse interval + | | delay | (delay time) + | +---+---+ + | |+-------------------+ + | v + | +------+ + | | idle | idle time + | +---+--+ + | |+-------------------+ + +--------+ + */ + +/** + * Object has these status of lifecycle above.
+ * + * @author yxssfxwzy@sina.com
+ * @since 0.5.1 + * @see TimerReuseProxyPool + */ + +public class TimerReuseProxy extends Proxy implements Delayed, Serializable { + + private static final long serialVersionUID = 228939737383625551L; + public static final int ERROR_403 = 403; + public static final int ERROR_404 = 404; + public static final int ERROR_BANNED = 10000;// banned by website + public static final int ERROR_Proxy = 10001;// the proxy itself failed + public static final int SUCCESS = 200; + + private int reuseTimeInterval = 1500;// ms + private Long canReuseTime = 0L; + private Long lastBorrowTime = System.currentTimeMillis(); + private Long responseTime = 0L; + + private int failedNum = 0; + private int successNum = 0; + private int borrowNum = 0; + + private List failedErrorType = new ArrayList(); + + public TimerReuseProxy(ProxyHost proxyHost, String user, String password) { + super(proxyHost, user, password); + } + + public TimerReuseProxy(ProxyHost proxyHost, String user, String password, int reuseTimeInterval) { + super(proxyHost, user, password); + this.reuseTimeInterval = reuseTimeInterval; + } + + public int getSuccessNum() { + return successNum; + } + + public void successNumIncrement(int increment) { + this.successNum += increment; + } + + public Long getLastUseTime() { + return lastBorrowTime; + } + + public void setLastBorrowTime(Long lastBorrowTime) { + this.lastBorrowTime = lastBorrowTime; + } + + public void recordResponse() { + this.responseTime = (System.currentTimeMillis() - lastBorrowTime + responseTime) / 2; + this.lastBorrowTime = System.currentTimeMillis(); + } + + public List getFailedErrorType() { + return failedErrorType; + } + + public void setFailedErrorType(List failedErrorType) { + this.failedErrorType = failedErrorType; + } + + public void fail(int failedErrorType) { + this.failedNum++; + this.failedErrorType.add(failedErrorType); + } + + public void setFailedNum(int failedNum) { + this.failedNum = failedNum; + } + + public int getFailedNum() { + return 
failedNum; + } + + public String getFailedType() { + String re = ""; + for (Integer i : this.failedErrorType) { + re += i + " . "; + } + return re; + } + + public int getReuseTimeInterval() { + return reuseTimeInterval; + } + + public void setReuseTimeInterval(int reuseTimeInterval) { + this.reuseTimeInterval = reuseTimeInterval; + this.canReuseTime = System.nanoTime() + TimeUnit.NANOSECONDS.convert(reuseTimeInterval, TimeUnit.MILLISECONDS); + + } + + @Override + public long getDelay(TimeUnit unit) { + return unit.convert(canReuseTime - System.nanoTime(), TimeUnit.NANOSECONDS); + } + + @Override + public int compareTo(Delayed o) { + TimerReuseProxy that = (TimerReuseProxy) o; + return canReuseTime > that.canReuseTime ? 1 : (canReuseTime < that.canReuseTime ? -1 : 0); + + } + + public void borrowNumIncrement(int increment) { + this.borrowNum += increment; + } + + public int getBorrowNum() { + return borrowNum; + } +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyPool.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/TimerReuseProxyPool.java similarity index 85% rename from webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyPool.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/proxy/TimerReuseProxyPool.java index f7cd049..4752fee 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyPool.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/TimerReuseProxyPool.java @@ -22,12 +22,12 @@ import java.util.concurrent.DelayQueue; * @see Proxy * @since 0.5.1 */ -public class SimpleProxyPool implements ProxyPool { +public class TimerReuseProxyPool implements ProxyPool { private Logger logger = LoggerFactory.getLogger(getClass()); - private BlockingQueue proxyQueue = new DelayQueue(); - private Map allProxy = new ConcurrentHashMap(); + private BlockingQueue proxyQueue = new DelayQueue(); + private Map allProxy = new ConcurrentHashMap(); private int 
reuseInterval = 1500;// ms private int reviveTime = 2 * 60 * 60 * 1000;// ms @@ -50,15 +50,15 @@ public class SimpleProxyPool implements ProxyPool { } }; - public SimpleProxyPool() { + public TimerReuseProxyPool() { this(null, true); } - public SimpleProxyPool(List httpProxyList) { + public TimerReuseProxyPool(List httpProxyList) { this(httpProxyList, true); } - public SimpleProxyPool(List httpProxyList, boolean isUseLastProxy) { + public TimerReuseProxyPool(List httpProxyList, boolean isUseLastProxy) { if (httpProxyList != null) { addProxy(httpProxyList.toArray(new String[httpProxyList.size()][])); } @@ -109,9 +109,9 @@ public class SimpleProxyPool implements ProxyPool { } private Map prepareForSaving() { - Map tmp = new HashMap(); - for (Entry e : allProxy.entrySet()) { - Proxy p = e.getValue(); + Map tmp = new HashMap(); + for (Entry e : allProxy.entrySet()) { + TimerReuseProxy p = e.getValue(); p.setFailedNum(0); tmp.put(e.getKey(), p); } @@ -152,30 +152,20 @@ public class SimpleProxyPool implements ProxyPool { logger.info("proxy pool size>>>>" + allProxy.size()); } - public void addProxy(String[]... httpProxyList) { + public void addProxy(Proxy... 
httpProxyList) { isEnable = true; - for (String[] s : httpProxyList) { - try { - if (allProxy.containsKey(s[2])) { - continue; - } - HttpHost item = new HttpHost(InetAddress.getByName(s[2]), Integer.valueOf(s[3])); - if (!validateWhenInit || ProxyUtils.validateProxy(item)) { - Proxy p = new Proxy(item, reuseInterval, s[0], s[1]); - proxyQueue.add(p); - allProxy.put(s[2], p); - } - } catch (NumberFormatException e) { - logger.error("HttpHost init error:", e); - } catch (UnknownHostException e) { - logger.error("HttpHost init error:", e); + for (Proxy proxy : httpProxyList) { + if (!validateWhenInit || ProxyUtils.validateProxy(proxy.getProxyHost())) { + TimerReuseProxy p = new TimerReuseProxy(proxy.getProxyHost(), proxy.getUser(), proxy.getPassword(), reuseInterval); + proxyQueue.add(p); + allProxy.put(p.getProxyHost().getHost(), p); } } logger.info("proxy pool size>>>>" + allProxy.size()); } - public Proxy getProxy() { - Proxy proxy = null; + public TimerReuseProxy getProxy() { + TimerReuseProxy proxy = null; try { Long time = System.currentTimeMillis(); proxy = proxyQueue.take(); @@ -183,7 +173,7 @@ public class SimpleProxyPool implements ProxyPool { if (costTime > reuseInterval) { logger.info("get proxy time >>>> " + costTime); } - Proxy p = allProxy.get(proxy.getHttpHost().getAddress().getHostAddress()); + TimerReuseProxy p = allProxy.get(proxy.getProxyHost().getHost()); p.setLastBorrowTime(System.currentTimeMillis()); p.borrowNumIncrement(1); } catch (InterruptedException e) { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/ProxyUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/ProxyUtils.java index f44c2ac..f9f9a8c 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/ProxyUtils.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/ProxyUtils.java @@ -1,19 +1,14 @@ package us.codecraft.webmagic.utils; -import java.io.IOException; -import java.net.Inet6Address; -import java.net.InetAddress; 
-import java.net.InetSocketAddress; -import java.net.NetworkInterface; -import java.net.Socket; -import java.net.SocketException; -import java.net.UnknownHostException; -import java.util.Enumeration; -import java.util.regex.Pattern; - import org.apache.http.HttpHost; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import us.codecraft.webmagic.proxy.ProxyHost; + +import java.io.IOException; +import java.net.*; +import java.util.Enumeration; +import java.util.regex.Pattern; /** * Pooled Proxy Object @@ -69,7 +64,11 @@ public class ProxyUtils { } } - public static boolean validateProxy(HttpHost p) { + public static HttpHost convert(ProxyHost p){ + return new HttpHost(p.getHost(),p.getPort()); + } + + public static boolean validateProxy(ProxyHost p) { if (localAddr == null) { logger.error("cannot get local IP"); return false; @@ -79,7 +78,7 @@ public class ProxyUtils { try { socket = new Socket(); socket.bind(new InetSocketAddress(localAddr, 0)); - InetSocketAddress endpointSocketAddr = new InetSocketAddress(p.getAddress().getHostAddress(), p.getPort()); + InetSocketAddress endpointSocketAddr = new InetSocketAddress(p.getHost(), p.getPort()); socket.connect(endpointSocketAddr, 3000); logger.debug("SUCCESS - connection established! 
Local: " + localAddr.getHostAddress() + " remote: " + p); isReachable = true; diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java index f218356..6477323 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java @@ -29,7 +29,7 @@ public class ProxyTest { @Test public void testProxy() { - SimpleProxyPool proxyPool = new SimpleProxyPool(httpProxyList,false); + TimerReuseProxyPool proxyPool = new TimerReuseProxyPool(httpProxyList,false); proxyPool.setReuseInterval(500); assertThat(proxyPool.getIdleNum()).isEqualTo(4); for (int i = 0; i < 2; i++) { From b71f379512d74598a3a4877c60e653c072f652c7 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 18 Mar 2017 12:18:00 +0800 Subject: [PATCH 02/25] fix --- .../src/main/java/us/codecraft/webmagic/Site.java | 4 ---- .../webmagic/proxy/TimerReuseProxyPool.java | 14 +++++++------- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java index 146bb0d..501f758 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java @@ -513,8 +513,4 @@ public class Site { return httpProxyPool.getProxy(); } - public void returnHttpProxyToPool(HttpHost proxy,int statusCode) { - httpProxyPool.returnProxy(proxy,statusCode); - } - } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/TimerReuseProxyPool.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/TimerReuseProxyPool.java index 4752fee..b375fae 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/TimerReuseProxyPool.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/TimerReuseProxyPool.java @@ -186,31 +186,31 @@ public 
class TimerReuseProxyPool implements ProxyPool { } public void returnProxy(HttpHost host, int statusCode) { - Proxy p = allProxy.get(host.getAddress().getHostAddress()); + TimerReuseProxy p = allProxy.get(host.getAddress().getHostAddress()); if (p == null) { return; } switch (statusCode) { - case Proxy.SUCCESS: + case TimerReuseProxy.SUCCESS: p.setReuseTimeInterval(reuseInterval); p.setFailedNum(0); p.setFailedErrorType(new ArrayList()); p.recordResponse(); p.successNumIncrement(1); break; - case Proxy.ERROR_403: + case TimerReuseProxy.ERROR_403: // banned,try longer interval - p.fail(Proxy.ERROR_403); + p.fail(TimerReuseProxy.ERROR_403); p.setReuseTimeInterval(reuseInterval * p.getFailedNum()); logger.info(host + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0); break; - case Proxy.ERROR_BANNED: - p.fail(Proxy.ERROR_BANNED); + case TimerReuseProxy.ERROR_BANNED: + p.fail(TimerReuseProxy.ERROR_BANNED); p.setReuseTimeInterval(10 * 60 * 1000 * p.getFailedNum()); logger.warn("this proxy is banned >>>> " + p.getHttpHost()); logger.info(host + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0); break; - case Proxy.ERROR_404: + case TimerReuseProxy.ERROR_404: // p.fail(Proxy.ERROR_404); // p.setReuseTimeInterval(reuseInterval * p.getFailedNum()); break; From 1d86f7c048b27cf15f84d1690740f0b338026137 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Mon, 20 Mar 2017 22:40:14 +0800 Subject: [PATCH 03/25] compile passed in httpclientDownloader --- .../java/us/codecraft/webmagic/Request.java | 1 - .../java/us/codecraft/webmagic/Spider.java | 2 - .../downloader/HttpClientDownloader.java | 41 ++++++------------- .../downloader/HttpUriRequestConverter.java | 3 +- .../us/codecraft/webmagic/proxy/Proxy.java | 14 +++---- .../codecraft/webmagic/proxy/ProxyPool.java | 4 +- .../webmagic/proxy/TimerReuseProxyPool.java | 18 ++++---- 7 files changed, 30 insertions(+), 53 deletions(-) diff --git 
a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java index c8c5978..0a38fcc 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java @@ -18,7 +18,6 @@ public class Request implements Serializable { private static final long serialVersionUID = 2062192774891352043L; public static final String CYCLE_TRIED_TIMES = "_cycle_tried_times"; - public static final String STATUS_CODE = "statusCode"; public static final String PROXY = "proxy"; private String url; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 49734b7..213cf3f 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -419,8 +419,6 @@ public class Spider implements Runnable, Task { pipeline.process(page.getResultItems(), this); } } - //for proxy status management - request.putExtra(Request.STATUS_CODE, page.getStatusCode()); sleep(site.getSleepTime()); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 816e6c5..052c6fa 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -1,7 +1,6 @@ package us.codecraft.webmagic.downloader; import org.apache.commons.io.IOUtils; -import org.apache.http.HttpHost; import org.apache.http.HttpResponse; import org.apache.http.annotation.ThreadSafe; import org.apache.http.auth.AuthState; @@ -23,13 +22,11 @@ import us.codecraft.webmagic.Task; import us.codecraft.webmagic.proxy.Proxy; import us.codecraft.webmagic.selector.PlainText; import 
us.codecraft.webmagic.utils.CharsetUtils; -import us.codecraft.webmagic.utils.WMCollections; import java.io.IOException; import java.nio.charset.Charset; import java.util.HashMap; import java.util.Map; -import java.util.Set; /** @@ -80,28 +77,22 @@ public class HttpClientDownloader extends AbstractDownloader { CloseableHttpResponse httpResponse = null; int statusCode = 0; Site site = task.getSite(); - try { - Proxy proxy = null; - if (site.getHttpProxyPool() != null && site.getHttpProxyPool().isEnable()) { - proxy = site.getHttpProxyFromPool(); - } else if (site != null && site.getHttpProxy() != null){ - proxy = site.getHttpProxy(); - request.putExtra(Request.PROXY, site.getHttpProxy()); - } + Proxy proxy = null; + HttpContext httpContext = new BasicHttpContext(); + if (site.getHttpProxyPool() != null && site.getHttpProxyPool().isEnable()) { + proxy = site.getHttpProxyFromPool(); request.putExtra(Request.PROXY, proxy); - - HttpContext httpContext = new BasicHttpContext(); - - HttpUriRequest httpUriRequest = httpUriRequestConverter.convert(request,site); AuthState authState = new AuthState(); - authState.update(new BasicScheme(), new UsernamePasswordCredentials("userName", "password")); + authState.update(new BasicScheme(), new UsernamePasswordCredentials(proxy.getUsername(), proxy.getPassword())); httpContext.setAttribute(HttpClientContext.PROXY_AUTH_STATE, authState); - CloseableHttpClient httpClient = getHttpClient(site, proxy); + } + HttpUriRequest httpUriRequest = httpUriRequestConverter.convert(request, site); + CloseableHttpClient httpClient = getHttpClient(site); + try { httpResponse = httpClient.execute(httpUriRequest, httpContext); statusCode = httpResponse.getStatusLine().getStatusCode(); - request.putExtra(Request.STATUS_CODE, statusCode); - if (statusAccept(acceptStatCode, statusCode)) { - Page page = handleResponse(request, charset, httpResponse, task); + if (site.getAcceptStatCode().contains(statusCode)) { + Page page = handleResponse(request, 
site.getCharset(), httpResponse, task); onSuccess(request); return page; } else { @@ -120,10 +111,8 @@ public class HttpClientDownloader extends AbstractDownloader { //ensure the connection is released back to pool EntityUtils.consumeQuietly(httpResponse.getEntity()); } - request.putExtra(Request.STATUS_CODE, statusCode); - if (site != null && site.getHttpProxyPool() != null && site.getHttpProxyPool().isEnable()) { - site.returnHttpProxyToPool((HttpHost) request.getExtra(Request.PROXY), (Integer) request - .getExtra(Request.STATUS_CODE)); + if (proxy != null) { + site.getHttpProxyPool().returnProxy(proxy, statusCode); } } } @@ -133,10 +122,6 @@ public class HttpClientDownloader extends AbstractDownloader { httpClientGenerator.setPoolSize(thread); } - protected boolean statusAccept(Set acceptStatCode, int statusCode) { - return acceptStatCode.contains(statusCode); - } - protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException { String content = getContent(charset, httpResponse); Page page = new Page(); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java index 7e77676..beda2e6 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java @@ -10,7 +10,6 @@ import org.apache.http.client.methods.RequestBuilder; import org.apache.http.message.BasicNameValuePair; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.proxy.Proxy; import us.codecraft.webmagic.utils.HttpConstant; import java.nio.charset.Charset; @@ -26,7 +25,7 @@ import java.util.Map; */ public class HttpUriRequestConverter { - public HttpUriRequest convert(Request request, Site site, Proxy proxy) { + public HttpUriRequest 
convert(Request request, Site site) { return null; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java index b078fd9..1d872d4 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java @@ -7,12 +7,12 @@ package us.codecraft.webmagic.proxy; public class Proxy { private ProxyHost proxyHost; - private String user; + private String username; private String password; - public Proxy(ProxyHost proxyHost, String user, String password) { + public Proxy(ProxyHost proxyHost, String username, String password) { this.proxyHost = proxyHost; - this.user = user; + this.username = username; this.password = password; } @@ -28,12 +28,12 @@ public class Proxy { this.proxyHost = proxyHost; } - public String getUser() { - return user; + public String getUsername() { + return username; } - public void setUser(String user) { - this.user = user; + public void setUsername(String username) { + this.username = username; } public String getPassword() { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyPool.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyPool.java index 418b445..50e4029 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyPool.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyPool.java @@ -1,13 +1,11 @@ package us.codecraft.webmagic.proxy; -import org.apache.http.HttpHost; - /** * Created by edwardsbean on 15-2-28. 
*/ public interface ProxyPool { - void returnProxy(HttpHost host, int statusCode); + void returnProxy(Proxy proxy, int statusCode); Proxy getProxy(); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/TimerReuseProxyPool.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/TimerReuseProxyPool.java index b375fae..a336c71 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/TimerReuseProxyPool.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/TimerReuseProxyPool.java @@ -7,8 +7,6 @@ import us.codecraft.webmagic.utils.FilePersistentBase; import us.codecraft.webmagic.utils.ProxyUtils; import java.io.*; -import java.net.InetAddress; -import java.net.UnknownHostException; import java.util.*; import java.util.Map.Entry; import java.util.concurrent.BlockingQueue; @@ -156,7 +154,7 @@ public class TimerReuseProxyPool implements ProxyPool { isEnable = true; for (Proxy proxy : httpProxyList) { if (!validateWhenInit || ProxyUtils.validateProxy(proxy.getProxyHost())) { - TimerReuseProxy p = new TimerReuseProxy(proxy.getProxyHost(), proxy.getUser(), proxy.getPassword(), reuseInterval); + TimerReuseProxy p = new TimerReuseProxy(proxy.getProxyHost(), proxy.getUsername(), proxy.getPassword(), reuseInterval); proxyQueue.add(p); allProxy.put(p.getProxyHost().getHost(), p); } @@ -185,8 +183,8 @@ public class TimerReuseProxyPool implements ProxyPool { return proxy; } - public void returnProxy(HttpHost host, int statusCode) { - TimerReuseProxy p = allProxy.get(host.getAddress().getHostAddress()); + public void returnProxy(Proxy proxy, int statusCode) { + TimerReuseProxy p = allProxy.get(proxy.getProxyHost()); if (p == null) { return; } @@ -202,13 +200,13 @@ public class TimerReuseProxyPool implements ProxyPool { // banned,try longer interval p.fail(TimerReuseProxy.ERROR_403); p.setReuseTimeInterval(reuseInterval * p.getFailedNum()); - logger.info(host + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0); + 
logger.info(proxy + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0); break; case TimerReuseProxy.ERROR_BANNED: p.fail(TimerReuseProxy.ERROR_BANNED); p.setReuseTimeInterval(10 * 60 * 1000 * p.getFailedNum()); logger.warn("this proxy is banned >>>> " + p.getHttpHost()); - logger.info(host + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0); + logger.info(proxy + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0); break; case TimerReuseProxy.ERROR_404: // p.fail(Proxy.ERROR_404); @@ -220,13 +218,13 @@ public class TimerReuseProxyPool implements ProxyPool { } if (p.getFailedNum() > 20) { p.setReuseTimeInterval(reviveTime); - logger.error("remove proxy >>>> " + host + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size()); + logger.error("remove proxy >>>> " + proxy + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size()); return; } if (p.getFailedNum() > 0 && p.getFailedNum() % 5 == 0) { - if (!ProxyUtils.validateProxy(host)) { + if (!ProxyUtils.validateProxy(proxy)) { p.setReuseTimeInterval(reviveTime); - logger.error("remove proxy >>>> " + host + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size()); + logger.error("remove proxy >>>> " + proxy + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size()); return; } } From 46297deaa1f15d2167f72004957e7e2f6beec2d0 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Mon, 20 Mar 2017 22:43:32 +0800 Subject: [PATCH 04/25] HttpUriRequestConverter --- .../webmagic/downloader/HttpClientDownloader.java | 2 +- .../downloader/HttpUriRequestConverter.java | 13 +++++-------- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 052c6fa..7ca483a 100644 --- 
a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -86,7 +86,7 @@ public class HttpClientDownloader extends AbstractDownloader { authState.update(new BasicScheme(), new UsernamePasswordCredentials(proxy.getUsername(), proxy.getPassword())); httpContext.setAttribute(HttpClientContext.PROXY_AUTH_STATE, authState); } - HttpUriRequest httpUriRequest = httpUriRequestConverter.convert(request, site); + HttpUriRequest httpUriRequest = httpUriRequestConverter.convert(request, site, proxy); CloseableHttpClient httpClient = getHttpClient(site); try { httpResponse = httpClient.execute(httpUriRequest, httpContext); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java index beda2e6..0ec4b0e 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java @@ -10,6 +10,7 @@ import org.apache.http.client.methods.RequestBuilder; import org.apache.http.message.BasicNameValuePair; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.proxy.Proxy; import us.codecraft.webmagic.utils.HttpConstant; import java.nio.charset.Charset; @@ -25,14 +26,10 @@ import java.util.Map; */ public class HttpUriRequestConverter { - public HttpUriRequest convert(Request request, Site site) { - return null; - } - - private HttpUriRequest getHttpUriRequest(Request request, Site site, Map headers, HttpHost proxy) { + public HttpUriRequest convert(Request request, Site site, Proxy proxy) { RequestBuilder requestBuilder = selectRequestMethod(request).setUri(request.getUrl()); - if (headers != null) { - for (Map.Entry headerEntry : headers.entrySet()) { + if 
(site.getHeaders() != null) { + for (Map.Entry headerEntry : site.getHeaders().entrySet()) { requestBuilder.addHeader(headerEntry.getKey(), headerEntry.getValue()); } } @@ -46,7 +43,7 @@ public class HttpUriRequestConverter { } if (proxy != null) { - requestConfigBuilder.setProxy(proxy); + requestConfigBuilder.setProxy(new HttpHost(proxy.getProxyHost().getHost(), proxy.getProxyHost().getPort())); } requestBuilder.setConfig(requestConfigBuilder.build()); return requestBuilder.build(); From 25c81013ca6573f1646613f1c5c7e3572dff734a Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Mon, 20 Mar 2017 22:47:24 +0800 Subject: [PATCH 05/25] new proxy pool api --- .../src/main/java/us/codecraft/webmagic/Site.java | 5 ----- .../webmagic/downloader/HttpClientDownloader.java | 6 +++--- .../main/java/us/codecraft/webmagic/proxy/ProxyPool.java | 7 ++++--- 3 files changed, 7 insertions(+), 11 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java index 501f758..d342069 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java @@ -2,7 +2,6 @@ package us.codecraft.webmagic; import org.apache.http.HttpHost; import org.apache.http.auth.UsernamePasswordCredentials; -import us.codecraft.webmagic.proxy.Proxy; import us.codecraft.webmagic.proxy.ProxyPool; import us.codecraft.webmagic.proxy.TimerReuseProxyPool; import us.codecraft.webmagic.utils.UrlUtils; @@ -509,8 +508,4 @@ public class Site { return httpProxyPool; } - public Proxy getHttpProxyFromPool() { - return httpProxyPool.getProxy(); - } - } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 7ca483a..93a8a7c 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ 
b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -79,8 +79,8 @@ public class HttpClientDownloader extends AbstractDownloader { Site site = task.getSite(); Proxy proxy = null; HttpContext httpContext = new BasicHttpContext(); - if (site.getHttpProxyPool() != null && site.getHttpProxyPool().isEnable()) { - proxy = site.getHttpProxyFromPool(); + if (site.getHttpProxyPool() != null) { + proxy = site.getHttpProxyPool().getProxy(task); request.putExtra(Request.PROXY, proxy); AuthState authState = new AuthState(); authState.update(new BasicScheme(), new UsernamePasswordCredentials(proxy.getUsername(), proxy.getPassword())); @@ -112,7 +112,7 @@ public class HttpClientDownloader extends AbstractDownloader { EntityUtils.consumeQuietly(httpResponse.getEntity()); } if (proxy != null) { - site.getHttpProxyPool().returnProxy(proxy, statusCode); + site.getHttpProxyPool().returnProxy(proxy, statusCode, task); } } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyPool.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyPool.java index 50e4029..ad307a6 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyPool.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyPool.java @@ -1,13 +1,14 @@ package us.codecraft.webmagic.proxy; +import us.codecraft.webmagic.Task; + /** * Created by edwardsbean on 15-2-28. 
*/ public interface ProxyPool { - void returnProxy(Proxy proxy, int statusCode); + void returnProxy(Proxy proxy, int statusCode, Task task); - Proxy getProxy(); + Proxy getProxy(Task task); - boolean isEnable(); } From 474b7c9d578102791c0091f234ab777acbede9f5 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Mon, 20 Mar 2017 23:13:49 +0800 Subject: [PATCH 06/25] refactor --- .../webmagic/proxy/BannedChecker.java | 13 +++ .../codecraft/webmagic/proxy/ProxyPool.java | 2 +- .../webmagic/proxy/TimerReuseProxyPool.java | 94 +------------------ 3 files changed, 15 insertions(+), 94 deletions(-) create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/proxy/BannedChecker.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/BannedChecker.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/BannedChecker.java new file mode 100644 index 0000000..db17de2 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/BannedChecker.java @@ -0,0 +1,13 @@ +package us.codecraft.webmagic.proxy; + +import org.apache.http.HttpResponse; + +/** + * @author code4crafter@gmail.com + * Date: 17/3/20 + * Time: 下午10:52 + */ +public interface BannedChecker { + + boolean isBanned(HttpResponse httpResponse); +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyPool.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyPool.java index ad307a6..fcc1f8d 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyPool.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyPool.java @@ -7,7 +7,7 @@ import us.codecraft.webmagic.Task; */ public interface ProxyPool { - void returnProxy(Proxy proxy, int statusCode, Task task); + void returnProxy(Proxy proxy, boolean banned, Task task); Proxy getProxy(Task task); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/TimerReuseProxyPool.java 
b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/TimerReuseProxyPool.java index a336c71..6fde604 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/TimerReuseProxyPool.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/TimerReuseProxyPool.java @@ -34,102 +34,11 @@ public class TimerReuseProxyPool implements ProxyPool { private boolean isEnable = false; private boolean validateWhenInit = false; // private boolean isUseLastProxy = true; - private String proxyFilePath = "/data/webmagic/lastUse.proxy"; - - private FilePersistentBase fBase = new FilePersistentBase(); - - private Timer timer = new Timer(true); - private TimerTask saveProxyTask = new TimerTask() { - - @Override - public void run() { - saveProxyList(); - logger.info(allProxyStatus()); - } - }; - - public TimerReuseProxyPool() { - this(null, true); - } - + public TimerReuseProxyPool(List httpProxyList) { this(httpProxyList, true); } - public TimerReuseProxyPool(List httpProxyList, boolean isUseLastProxy) { - if (httpProxyList != null) { - addProxy(httpProxyList.toArray(new String[httpProxyList.size()][])); - } - if (isUseLastProxy) { - if (!new File(proxyFilePath).exists()) { - setFilePath(); - } - readProxyList(); - timer.schedule(saveProxyTask, 0, saveProxyInterval); - } - } - - private void setFilePath() { - String tmpDir = System.getProperty("java.io.tmpdir"); - String path = tmpDir + FilePersistentBase.PATH_SEPERATOR + "webmagic" + FilePersistentBase.PATH_SEPERATOR + "lastUse.proxy"; - if (tmpDir != null && new File(tmpDir).isDirectory()) { - fBase.setPath(tmpDir + FilePersistentBase.PATH_SEPERATOR + "webmagic"); - File f = fBase.getFile(path); - if (!f.exists()) { - try { - f.createNewFile(); - - } catch (IOException e) { - logger.error("proxy file create error", e); - } - } - - } else { - logger.error("java tmp dir not exists"); - } - this.proxyFilePath = path; - } - - private void saveProxyList() { - if (allProxy.size() == 0) { - return; - } - try { - 
ObjectOutputStream os = new ObjectOutputStream(new FileOutputStream(fBase.getFile(proxyFilePath))); - os.writeObject(prepareForSaving()); - os.close(); - logger.info("save proxy"); - } catch (FileNotFoundException e) { - logger.error("proxy file not found", e); - } catch (IOException e) { - e.printStackTrace(); - } - } - - private Map prepareForSaving() { - Map tmp = new HashMap(); - for (Entry e : allProxy.entrySet()) { - TimerReuseProxy p = e.getValue(); - p.setFailedNum(0); - tmp.put(e.getKey(), p); - } - return tmp; - } - - private void readProxyList() { - try { - ObjectInputStream is = new ObjectInputStream(new FileInputStream(fBase.getFile(proxyFilePath))); - addProxy((Map) is.readObject()); - is.close(); - } catch (FileNotFoundException e) { - logger.info("last use proxy file not found", e); - } catch (IOException e) { - // e.printStackTrace(); - } catch (ClassNotFoundException e) { - // e.printStackTrace(); - } - } - private void addProxy(Map httpProxyMap) { isEnable = true; for (Entry entry : httpProxyMap.entrySet()) { @@ -205,7 +114,6 @@ public class TimerReuseProxyPool implements ProxyPool { case TimerReuseProxy.ERROR_BANNED: p.fail(TimerReuseProxy.ERROR_BANNED); p.setReuseTimeInterval(10 * 60 * 1000 * p.getFailedNum()); - logger.warn("this proxy is banned >>>> " + p.getHttpHost()); logger.info(proxy + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0); break; case TimerReuseProxy.ERROR_404: From 68050fc88ee91e3fb18804d3efc3f61f919ca7e2 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Tue, 21 Mar 2017 07:19:12 +0800 Subject: [PATCH 07/25] test pass --- .../main/java/us/codecraft/webmagic/Site.java | 52 +-- .../downloader/HttpClientDownloader.java | 14 +- .../downloader/HttpUriRequestConverter.java | 2 +- .../us/codecraft/webmagic/proxy/Proxy.java | 34 +- .../codecraft/webmagic/proxy/ProxyHost.java | 34 -- .../{ProxyPool.java => ProxyProvider.java} | 2 +- .../webmagic/proxy/TimerReuseProxy.java | 8 +- 
.../webmagic/proxy/TimerReuseProxyPool.java | 378 +++++++++--------- .../codecraft/webmagic/utils/ProxyUtils.java | 94 +---- .../downloader/HttpClientDownloaderTest.java | 35 +- .../codecraft/webmagic/proxy/ProxyTest.java | 27 -- 11 files changed, 248 insertions(+), 432 deletions(-) delete mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyHost.java rename webmagic-core/src/main/java/us/codecraft/webmagic/proxy/{ProxyPool.java => ProxyProvider.java} (87%) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java index d342069..87eab14 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java @@ -2,8 +2,7 @@ package us.codecraft.webmagic; import org.apache.http.HttpHost; import org.apache.http.auth.UsernamePasswordCredentials; -import us.codecraft.webmagic.proxy.ProxyPool; -import us.codecraft.webmagic.proxy.TimerReuseProxyPool; +import us.codecraft.webmagic.proxy.ProxyProvider; import us.codecraft.webmagic.utils.UrlUtils; import java.util.*; @@ -52,7 +51,7 @@ public class Site { private UsernamePasswordCredentials usernamePasswordCredentials; //代理用户名密码设置 - private ProxyPool httpProxyPool; + private ProxyProvider httpProxyPool; private boolean useGzip = true; @@ -399,7 +398,11 @@ public class Site { return new Task() { @Override public String getUUID() { - return Site.this.getDomain(); + String uuid = Site.this.getDomain(); + if (uuid == null) { + uuid = UUID.randomUUID().toString(); + } + return uuid; } @Override @@ -467,45 +470,4 @@ public class Site { '}'; } - /** - * Set httpProxyPool, String[0]:ip, String[1]:port
- * - * @param proxyPool proxyPool - * @return this - */ - public Site setHttpProxyPool(ProxyPool proxyPool) { - this.httpProxyPool = proxyPool; - return this; - } - - /** - * Set httpProxyPool, String[0]:ip, String[1]:port
- * - * @param httpProxyList httpProxyList - * @param isUseLastProxy isUseLastProxy - * @return this - */ - public Site setHttpProxyPool(List httpProxyList, boolean isUseLastProxy) { - this.httpProxyPool=new TimerReuseProxyPool(httpProxyList, isUseLastProxy); - return this; - } - - public Site enableHttpProxyPool() { - this.httpProxyPool=new TimerReuseProxyPool(); - return this; - } - - public UsernamePasswordCredentials getUsernamePasswordCredentials() { - return usernamePasswordCredentials; - } - - public Site setUsernamePasswordCredentials(UsernamePasswordCredentials usernamePasswordCredentials) { - this.usernamePasswordCredentials = usernamePasswordCredentials; - return this; - } - - public ProxyPool getHttpProxyPool() { - return httpProxyPool; - } - } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 93a8a7c..3a44af6 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -20,6 +20,7 @@ import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.proxy.Proxy; +import us.codecraft.webmagic.proxy.ProxyProvider; import us.codecraft.webmagic.selector.PlainText; import us.codecraft.webmagic.utils.CharsetUtils; @@ -45,11 +46,17 @@ public class HttpClientDownloader extends AbstractDownloader { private HttpClientGenerator httpClientGenerator = new HttpClientGenerator(); private HttpUriRequestConverter httpUriRequestConverter = new HttpUriRequestConverter(); + + private ProxyProvider proxyProvider; public void setHttpUriRequestConverter(HttpUriRequestConverter httpUriRequestConverter) { this.httpUriRequestConverter = httpUriRequestConverter; } + public void setProxyProvider(ProxyProvider proxyProvider) { + 
this.proxyProvider = proxyProvider; + } + private CloseableHttpClient getHttpClient(Site site) { if (site == null) { return httpClientGenerator.getClient(null); @@ -79,8 +86,8 @@ public class HttpClientDownloader extends AbstractDownloader { Site site = task.getSite(); Proxy proxy = null; HttpContext httpContext = new BasicHttpContext(); - if (site.getHttpProxyPool() != null) { - proxy = site.getHttpProxyPool().getProxy(task); + if (proxyProvider != null) { + proxy = proxyProvider.getProxy(task); request.putExtra(Request.PROXY, proxy); AuthState authState = new AuthState(); authState.update(new BasicScheme(), new UsernamePasswordCredentials(proxy.getUsername(), proxy.getPassword())); @@ -111,9 +118,6 @@ public class HttpClientDownloader extends AbstractDownloader { //ensure the connection is released back to pool EntityUtils.consumeQuietly(httpResponse.getEntity()); } - if (proxy != null) { - site.getHttpProxyPool().returnProxy(proxy, statusCode, task); - } } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java index 0ec4b0e..951d332 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java @@ -43,7 +43,7 @@ public class HttpUriRequestConverter { } if (proxy != null) { - requestConfigBuilder.setProxy(new HttpHost(proxy.getProxyHost().getHost(), proxy.getProxyHost().getPort())); + requestConfigBuilder.setProxy(new HttpHost(proxy.getHost(), proxy.getPort())); } requestBuilder.setConfig(requestConfigBuilder.build()); return requestBuilder.build(); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java index 1d872d4..a38ccaa 100644 --- 
a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java @@ -6,42 +6,36 @@ package us.codecraft.webmagic.proxy; public class Proxy { - private ProxyHost proxyHost; + private String host; + private int port; private String username; private String password; - public Proxy(ProxyHost proxyHost, String username, String password) { - this.proxyHost = proxyHost; + public Proxy(String host, int port) { + this.host = host; + this.port = port; + } + + public Proxy(String host, int port, String username, String password) { + this.host = host; + this.port = port; this.username = username; this.password = password; } - public Proxy(ProxyHost proxyHost) { - this.proxyHost = proxyHost; + public String getHost() { + return host; } - public ProxyHost getProxyHost() { - return proxyHost; - } - - public void setProxyHost(ProxyHost proxyHost) { - this.proxyHost = proxyHost; + public int getPort() { + return port; } public String getUsername() { return username; } - public void setUsername(String username) { - this.username = username; - } - public String getPassword() { return password; } - - public void setPassword(String password) { - this.password = password; - } - } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyHost.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyHost.java deleted file mode 100644 index 11e8c87..0000000 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyHost.java +++ /dev/null @@ -1,34 +0,0 @@ -package us.codecraft.webmagic.proxy; - -/** - * @author code4crafter@gmail.com - * Date: 17/3/18 - * Time: 下午12:04 - */ -public class ProxyHost { - - private String host; - - private int port; - - public String getHost() { - return host; - } - - public ProxyHost(String host, int port) { - this.host = host; - this.port = port; - } - - public void setHost(String host) { - this.host = host; - } - - public int getPort() { - return 
port; - } - - public void setPort(int port) { - this.port = port; - } -} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyPool.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java similarity index 87% rename from webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyPool.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java index fcc1f8d..4266d78 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyPool.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java @@ -5,7 +5,7 @@ import us.codecraft.webmagic.Task; /** * Created by edwardsbean on 15-2-28. */ -public interface ProxyPool { +public interface ProxyProvider { void returnProxy(Proxy proxy, boolean banned, Task task); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/TimerReuseProxy.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/TimerReuseProxy.java index 8f59252..7002df4 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/TimerReuseProxy.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/TimerReuseProxy.java @@ -72,14 +72,10 @@ public class TimerReuseProxy extends Proxy implements Delayed, Serializable { private List failedErrorType = new ArrayList(); - public TimerReuseProxy(ProxyHost proxyHost, String user, String password) { - super(proxyHost, user, password); + public TimerReuseProxy(String host, int port, String username, String password) { + super(host, port, username, password); } - public TimerReuseProxy(ProxyHost proxyHost, String user, String password, int reuseTimeInterval) { - super(proxyHost, user, password); - this.reuseTimeInterval = reuseTimeInterval; - } public int getSuccessNum() { return successNum; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/TimerReuseProxyPool.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/TimerReuseProxyPool.java index 
6fde604..6dbac5d 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/TimerReuseProxyPool.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/TimerReuseProxyPool.java @@ -1,17 +1,6 @@ package us.codecraft.webmagic.proxy; -import org.apache.http.HttpHost; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import us.codecraft.webmagic.utils.FilePersistentBase; -import us.codecraft.webmagic.utils.ProxyUtils; - -import java.io.*; -import java.util.*; -import java.util.Map.Entry; -import java.util.concurrent.BlockingQueue; -import java.util.concurrent.ConcurrentHashMap; -import java.util.concurrent.DelayQueue; +import us.codecraft.webmagic.Task; /** * Pooled Proxy Object @@ -20,187 +9,196 @@ import java.util.concurrent.DelayQueue; * @see Proxy * @since 0.5.1 */ -public class TimerReuseProxyPool implements ProxyPool { - - private Logger logger = LoggerFactory.getLogger(getClass()); - - private BlockingQueue proxyQueue = new DelayQueue(); - private Map allProxy = new ConcurrentHashMap(); - - private int reuseInterval = 1500;// ms - private int reviveTime = 2 * 60 * 60 * 1000;// ms - private int saveProxyInterval = 10 * 60 * 1000;// ms - - private boolean isEnable = false; - private boolean validateWhenInit = false; - // private boolean isUseLastProxy = true; - - public TimerReuseProxyPool(List httpProxyList) { - this(httpProxyList, true); +public class TimerReuseProxyPool implements ProxyProvider { + @Override + public void returnProxy(Proxy proxy, boolean banned, Task task) { + } - private void addProxy(Map httpProxyMap) { - isEnable = true; - for (Entry entry : httpProxyMap.entrySet()) { - try { - if (allProxy.containsKey(entry.getKey())) { - continue; - } - if (!validateWhenInit || ProxyUtils.validateProxy(entry.getValue().getHttpHost())) { - entry.getValue().setFailedNum(0); - entry.getValue().setReuseTimeInterval(reuseInterval); - proxyQueue.add(entry.getValue()); - allProxy.put(entry.getKey(), entry.getValue()); - } - } catch 
(NumberFormatException e) { - logger.error("HttpHost init error:", e); - } - } - logger.info("proxy pool size>>>>" + allProxy.size()); + @Override + public Proxy getProxy(Task task) { + return null; } - public void addProxy(Proxy... httpProxyList) { - isEnable = true; - for (Proxy proxy : httpProxyList) { - if (!validateWhenInit || ProxyUtils.validateProxy(proxy.getProxyHost())) { - TimerReuseProxy p = new TimerReuseProxy(proxy.getProxyHost(), proxy.getUsername(), proxy.getPassword(), reuseInterval); - proxyQueue.add(p); - allProxy.put(p.getProxyHost().getHost(), p); - } - } - logger.info("proxy pool size>>>>" + allProxy.size()); - } - - public TimerReuseProxy getProxy() { - TimerReuseProxy proxy = null; - try { - Long time = System.currentTimeMillis(); - proxy = proxyQueue.take(); - double costTime = (System.currentTimeMillis() - time) / 1000.0; - if (costTime > reuseInterval) { - logger.info("get proxy time >>>> " + costTime); - } - TimerReuseProxy p = allProxy.get(proxy.getProxyHost().getHost()); - p.setLastBorrowTime(System.currentTimeMillis()); - p.borrowNumIncrement(1); - } catch (InterruptedException e) { - logger.error("get proxy error", e); - } - if (proxy == null) { - throw new NoSuchElementException(); - } - return proxy; - } - - public void returnProxy(Proxy proxy, int statusCode) { - TimerReuseProxy p = allProxy.get(proxy.getProxyHost()); - if (p == null) { - return; - } - switch (statusCode) { - case TimerReuseProxy.SUCCESS: - p.setReuseTimeInterval(reuseInterval); - p.setFailedNum(0); - p.setFailedErrorType(new ArrayList()); - p.recordResponse(); - p.successNumIncrement(1); - break; - case TimerReuseProxy.ERROR_403: - // banned,try longer interval - p.fail(TimerReuseProxy.ERROR_403); - p.setReuseTimeInterval(reuseInterval * p.getFailedNum()); - logger.info(proxy + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0); - break; - case TimerReuseProxy.ERROR_BANNED: - p.fail(TimerReuseProxy.ERROR_BANNED); - p.setReuseTimeInterval(10 * 
60 * 1000 * p.getFailedNum()); - logger.info(proxy + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0); - break; - case TimerReuseProxy.ERROR_404: - // p.fail(Proxy.ERROR_404); - // p.setReuseTimeInterval(reuseInterval * p.getFailedNum()); - break; - default: - p.fail(statusCode); - break; - } - if (p.getFailedNum() > 20) { - p.setReuseTimeInterval(reviveTime); - logger.error("remove proxy >>>> " + proxy + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size()); - return; - } - if (p.getFailedNum() > 0 && p.getFailedNum() % 5 == 0) { - if (!ProxyUtils.validateProxy(proxy)) { - p.setReuseTimeInterval(reviveTime); - logger.error("remove proxy >>>> " + proxy + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size()); - return; - } - } - try { - proxyQueue.put(p); - } catch (InterruptedException e) { - logger.warn("proxyQueue return proxy error", e); - } - } - - public String allProxyStatus() { - String re = "all proxy info >>>> \n"; - for (Entry entry : allProxy.entrySet()) { - re += entry.getValue().toString() + "\n"; - } - return re; - } - - public int getIdleNum() { - return proxyQueue.size(); - } - - public int getReuseInterval() { - return reuseInterval; - } - - public void setReuseInterval(int reuseInterval) { - this.reuseInterval = reuseInterval; - } - - public void enable(boolean isEnable) { - this.isEnable = isEnable; - } - - public boolean isEnable() { - return isEnable; - } - - public int getReviveTime() { - return reviveTime; - } - - public void setReviveTime(int reviveTime) { - this.reviveTime = reviveTime; - } - - public boolean isValidateWhenInit() { - return validateWhenInit; - } - - public void validateWhenInit(boolean validateWhenInit) { - this.validateWhenInit = validateWhenInit; - } - - public int getSaveProxyInterval() { - return saveProxyInterval; - } - - public void setSaveProxyInterval(int saveProxyInterval) { - this.saveProxyInterval = saveProxyInterval; - } - - public String 
getProxyFilePath() { - return proxyFilePath; - } - - public void setProxyFilePath(String proxyFilePath) { - this.proxyFilePath = proxyFilePath; - } +// private Logger logger = LoggerFactory.getLogger(getClass()); +// +// private BlockingQueue proxyQueue = new DelayQueue(); +// private Map allProxy = new ConcurrentHashMap(); +// +// private int reuseInterval = 1500;// ms +// private int reviveTime = 2 * 60 * 60 * 1000;// ms +// private int saveProxyInterval = 10 * 60 * 1000;// ms +// +// private boolean isEnable = false; +// private boolean validateWhenInit = false; +// // private boolean isUseLastProxy = true; +// +// public TimerReuseProxyPool(List httpProxyList) { +// this(httpProxyList, true); +// } +// +// private void addProxy(Map httpProxyMap) { +// isEnable = true; +// for (Entry entry : httpProxyMap.entrySet()) { +// try { +// if (allProxy.containsKey(entry.getKey())) { +// continue; +// } +// if (!validateWhenInit || ProxyUtils.validateProxy(entry.getValue().getHttpHost())) { +// entry.getValue().setFailedNum(0); +// entry.getValue().setReuseTimeInterval(reuseInterval); +// proxyQueue.add(entry.getValue()); +// allProxy.put(entry.getKey(), entry.getValue()); +// } +// } catch (NumberFormatException e) { +// logger.error("HttpHost init error:", e); +// } +// } +// logger.info("proxy pool size>>>>" + allProxy.size()); +// } +// +// public void addProxy(Proxy... 
httpProxyList) { +// isEnable = true; +// for (Proxy proxy : httpProxyList) { +// if (!validateWhenInit || ProxyUtils.validateProxy(proxy.getProxyHost())) { +// TimerReuseProxy p = new TimerReuseProxy(proxy.getProxyHost(), proxy.getUsername(), proxy.getPassword(), reuseInterval); +// proxyQueue.add(p); +// allProxy.put(p.getProxyHost().getHost(), p); +// } +// } +// logger.info("proxy pool size>>>>" + allProxy.size()); +// } +// +// public TimerReuseProxy getProxy() { +// TimerReuseProxy proxy = null; +// try { +// Long time = System.currentTimeMillis(); +// proxy = proxyQueue.take(); +// double costTime = (System.currentTimeMillis() - time) / 1000.0; +// if (costTime > reuseInterval) { +// logger.info("get proxy time >>>> " + costTime); +// } +// TimerReuseProxy p = allProxy.get(proxy.getProxyHost().getHost()); +// p.setLastBorrowTime(System.currentTimeMillis()); +// p.borrowNumIncrement(1); +// } catch (InterruptedException e) { +// logger.error("get proxy error", e); +// } +// if (proxy == null) { +// throw new NoSuchElementException(); +// } +// return proxy; +// } +// +// public void returnProxy(Proxy proxy, int statusCode) { +// TimerReuseProxy p = allProxy.get(proxy.getProxyHost()); +// if (p == null) { +// return; +// } +// switch (statusCode) { +// case TimerReuseProxy.SUCCESS: +// p.setReuseTimeInterval(reuseInterval); +// p.setFailedNum(0); +// p.setFailedErrorType(new ArrayList()); +// p.recordResponse(); +// p.successNumIncrement(1); +// break; +// case TimerReuseProxy.ERROR_403: +// // banned,try longer interval +// p.fail(TimerReuseProxy.ERROR_403); +// p.setReuseTimeInterval(reuseInterval * p.getFailedNum()); +// logger.info(proxy + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0); +// break; +// case TimerReuseProxy.ERROR_BANNED: +// p.fail(TimerReuseProxy.ERROR_BANNED); +// p.setReuseTimeInterval(10 * 60 * 1000 * p.getFailedNum()); +// logger.info(proxy + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 
1000.0); +// break; +// case TimerReuseProxy.ERROR_404: +// // p.fail(Proxy.ERROR_404); +// // p.setReuseTimeInterval(reuseInterval * p.getFailedNum()); +// break; +// default: +// p.fail(statusCode); +// break; +// } +// if (p.getFailedNum() > 20) { +// p.setReuseTimeInterval(reviveTime); +// logger.error("remove proxy >>>> " + proxy + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size()); +// return; +// } +// if (p.getFailedNum() > 0 && p.getFailedNum() % 5 == 0) { +// if (!ProxyUtils.validateProxy(proxy)) { +// p.setReuseTimeInterval(reviveTime); +// logger.error("remove proxy >>>> " + proxy + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size()); +// return; +// } +// } +// try { +// proxyQueue.put(p); +// } catch (InterruptedException e) { +// logger.warn("proxyQueue return proxy error", e); +// } +// } +// +// public String allProxyStatus() { +// String re = "all proxy info >>>> \n"; +// for (Entry entry : allProxy.entrySet()) { +// re += entry.getValue().toString() + "\n"; +// } +// return re; +// } +// +// public int getIdleNum() { +// return proxyQueue.size(); +// } +// +// public int getReuseInterval() { +// return reuseInterval; +// } +// +// public void setReuseInterval(int reuseInterval) { +// this.reuseInterval = reuseInterval; +// } +// +// public void enable(boolean isEnable) { +// this.isEnable = isEnable; +// } +// +// public boolean isEnable() { +// return isEnable; +// } +// +// public int getReviveTime() { +// return reviveTime; +// } +// +// public void setReviveTime(int reviveTime) { +// this.reviveTime = reviveTime; +// } +// +// public boolean isValidateWhenInit() { +// return validateWhenInit; +// } +// +// public void validateWhenInit(boolean validateWhenInit) { +// this.validateWhenInit = validateWhenInit; +// } +// +// public int getSaveProxyInterval() { +// return saveProxyInterval; +// } +// +// public void setSaveProxyInterval(int saveProxyInterval) { +// this.saveProxyInterval = 
saveProxyInterval; +// } +// +// public String getProxyFilePath() { +// return proxyFilePath; +// } +// +// public void setProxyFilePath(String proxyFilePath) { +// this.proxyFilePath = proxyFilePath; +// } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/ProxyUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/ProxyUtils.java index f9f9a8c..9b734c7 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/ProxyUtils.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/ProxyUtils.java @@ -1,14 +1,12 @@ package us.codecraft.webmagic.utils; -import org.apache.http.HttpHost; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import us.codecraft.webmagic.proxy.ProxyHost; +import us.codecraft.webmagic.proxy.Proxy; import java.io.IOException; -import java.net.*; -import java.util.Enumeration; -import java.util.regex.Pattern; +import java.net.InetSocketAddress; +import java.net.Socket; /** * Pooled Proxy Object @@ -18,72 +16,19 @@ import java.util.regex.Pattern; */ public class ProxyUtils { - private static InetAddress localAddr; - private static String networkInterface = "eth7"; private static final Logger logger = LoggerFactory.getLogger(ProxyUtils.class); - static { - init(); - } - private static void init() { - // first way to get local IP - try { - localAddr = InetAddress.getLocalHost(); - logger.info("local IP:" + localAddr.getHostAddress()); - } catch (UnknownHostException e) { - logger.info("try again\n"); - } - if (localAddr != null) { - return; - } - // other way to get local IP - Enumeration localAddrs; - try { - // modify your network interface name - NetworkInterface ni = NetworkInterface.getByName(networkInterface); - if (ni == null) { - return; - } - localAddrs = ni.getInetAddresses(); - if (localAddrs == null || !localAddrs.hasMoreElements()) { - logger.error("choose NetworkInterface\n" + getNetworkInterface()); - return; - } - while (localAddrs.hasMoreElements()) { - InetAddress tmp = 
localAddrs.nextElement(); - if (!tmp.isLoopbackAddress() && !tmp.isLinkLocalAddress() && !(tmp instanceof Inet6Address)) { - localAddr = tmp; - logger.info("local IP:" + localAddr.getHostAddress()); - break; - } - } - } catch (Exception e) { - logger.error("Failure when init ProxyUtil", e); - logger.error("choose NetworkInterface\n" + getNetworkInterface()); - } - } - - public static HttpHost convert(ProxyHost p){ - return new HttpHost(p.getHost(),p.getPort()); - } - - public static boolean validateProxy(ProxyHost p) { - if (localAddr == null) { - logger.error("cannot get local IP"); - return false; - } - boolean isReachable = false; + public static boolean validateProxy(Proxy p) { Socket socket = null; try { socket = new Socket(); - socket.bind(new InetSocketAddress(localAddr, 0)); InetSocketAddress endpointSocketAddr = new InetSocketAddress(p.getHost(), p.getPort()); socket.connect(endpointSocketAddr, 3000); - logger.debug("SUCCESS - connection established! Local: " + localAddr.getHostAddress() + " remote: " + p); - isReachable = true; + return true; } catch (IOException e) { - logger.warn("FAILRE - CAN not connect! Local: " + localAddr.getHostAddress() + " remote: " + p); + logger.warn("FAILRE - CAN not connect! 
remote: " + p); + return false; } finally { if (socket != null) { try { @@ -93,30 +38,7 @@ public class ProxyUtils { } } } - return isReachable; + } - private static String getNetworkInterface() { - - String networkInterfaceName = ">>>> modify networkInterface in us.codecraft.webmagic.utils.ProxyUtils"; - Enumeration enumeration = null; - try { - enumeration = NetworkInterface.getNetworkInterfaces(); - } catch (SocketException e1) { - e1.printStackTrace(); - } - while (enumeration.hasMoreElements()) { - NetworkInterface networkInterface = enumeration.nextElement(); - - Enumeration addr = networkInterface.getInetAddresses(); - while (addr.hasMoreElements()) { - String s = addr.nextElement().getHostAddress(); - Pattern IPV4_PATTERN = Pattern.compile("^(25[0-5]|2[0-4]\\d|[0-1]?\\d?\\d)(\\.(25[0-5]|2[0-4]\\d|[0-1]?\\d?\\d)){3}$"); - if (s != null && IPV4_PATTERN.matcher(s).matches()) { - networkInterfaceName += networkInterface.toString() + "IP:" + s + "\n\n"; - } - } - } - return networkInterfaceName; - } } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java index 5440b33..fd1f4c2 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java @@ -5,7 +5,7 @@ import com.github.dreamhead.moco.Runnable; import com.github.dreamhead.moco.Runner; import org.apache.commons.io.IOUtils; import org.apache.http.client.methods.CloseableHttpResponse; -import org.apache.http.client.methods.RequestBuilder; +import org.apache.http.client.methods.HttpUriRequest; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; import org.apache.http.util.EntityUtils; @@ -87,12 +87,12 @@ public class HttpClientDownloaderTest { private String getCharsetByUrl(String url) { 
HttpClientDownloader downloader = new HttpClientDownloader(); Site site = Site.me(); - CloseableHttpClient httpClient = new HttpClientGenerator().getClient(site, null); + CloseableHttpClient httpClient = new HttpClientGenerator().getClient(site); // encoding in http header Content-Type Request requestGBK = new Request(url); CloseableHttpResponse httpResponse = null; try { - httpResponse = httpClient.execute(downloader.getHttpUriRequest(requestGBK, site, null,null)); + httpResponse = httpClient.execute(new HttpUriRequestConverter().convert(requestGBK, site, null)); } catch (IOException e) { e.printStackTrace(); } @@ -117,31 +117,32 @@ public class HttpClientDownloaderTest { server.delete(eq(query("q"), "webmagic")).response("delete"); server.request(and(by(method("HEAD")),eq(query("q"), "webmagic"))).response(header("method","head")); server.request(and(by(method("TRACE")),eq(query("q"), "webmagic"))).response("trace"); + final HttpUriRequestConverter httpUriRequestConverter = new HttpUriRequestConverter(); + final Site site = Site.me(); Runner.running(server, new Runnable() { @Override public void run() throws Exception { - HttpClientDownloader httpClientDownloader = new HttpClientDownloader(); Request request = new Request(); request.setUrl("http://127.0.0.1:12306/search"); request.putParams("q", "webmagic"); request.setMethod(HttpConstant.Method.GET); - RequestBuilder requestBuilder = httpClientDownloader.selectRequestMethod(request).setUri(request.getUrl()); - assertThat(EntityUtils.toString(HttpClients.custom().build().execute(requestBuilder.build()).getEntity())).isEqualTo("get"); + HttpUriRequest httpUriRequest = httpUriRequestConverter.convert(request,site,null); + assertThat(EntityUtils.toString(HttpClients.custom().build().execute(httpUriRequest).getEntity())).isEqualTo("get"); request.setMethod(HttpConstant.Method.POST); - requestBuilder = httpClientDownloader.selectRequestMethod(request).setUri(request.getUrl()); - 
assertThat(EntityUtils.toString(HttpClients.custom().build().execute(requestBuilder.build()).getEntity())).isEqualTo("post"); + httpUriRequest = httpUriRequestConverter.convert(request, site, null); + assertThat(EntityUtils.toString(HttpClients.custom().build().execute(httpUriRequest).getEntity())).isEqualTo("post"); request.setMethod(HttpConstant.Method.PUT); - requestBuilder = httpClientDownloader.selectRequestMethod(request).setUri(request.getUrl()); - assertThat(EntityUtils.toString(HttpClients.custom().build().execute(requestBuilder.build()).getEntity())).isEqualTo("put"); + httpUriRequest = httpUriRequestConverter.convert(request, site, null); + assertThat(EntityUtils.toString(HttpClients.custom().build().execute(httpUriRequest).getEntity())).isEqualTo("put"); request.setMethod(HttpConstant.Method.DELETE); - requestBuilder = httpClientDownloader.selectRequestMethod(request).setUri(request.getUrl()); - assertThat(EntityUtils.toString(HttpClients.custom().build().execute(requestBuilder.build()).getEntity())).isEqualTo("delete"); + httpUriRequest = httpUriRequestConverter.convert(request, site, null); + assertThat(EntityUtils.toString(HttpClients.custom().build().execute(httpUriRequest).getEntity())).isEqualTo("delete"); request.setMethod(HttpConstant.Method.HEAD); - requestBuilder = httpClientDownloader.selectRequestMethod(request).setUri(request.getUrl()); - assertThat(HttpClients.custom().build().execute(requestBuilder.build()).getFirstHeader("method").getValue()).isEqualTo("head"); + httpUriRequest = httpUriRequestConverter.convert(request, site, null); + assertThat(HttpClients.custom().build().execute(httpUriRequest).getFirstHeader("method").getValue()).isEqualTo("head"); request.setMethod(HttpConstant.Method.TRACE); - requestBuilder = httpClientDownloader.selectRequestMethod(request).setUri(request.getUrl()); - assertThat(EntityUtils.toString(HttpClients.custom().build().execute(requestBuilder.build()).getEntity())).isEqualTo("trace"); + httpUriRequest = 
httpUriRequestConverter.convert(request, site, null); + assertThat(EntityUtils.toString(HttpClients.custom().build().execute(httpUriRequest).getEntity())).isEqualTo("trace"); } }); } @@ -156,7 +157,7 @@ public class HttpClientDownloaderTest { final HttpClientDownloader httpClientDownloader = new HttpClientDownloader(); Request request = new Request(); request.setUrl("http://127.0.0.1:12306/"); - Page page = httpClientDownloader.download(request, null); + Page page = httpClientDownloader.download(request, Site.me().toTask()); assertThat(page.getRawText()).isEqualTo("foo"); } }); diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java index 6477323..86af367 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java @@ -2,13 +2,10 @@ package us.codecraft.webmagic.proxy; import org.apache.http.HttpHost; import org.junit.BeforeClass; -import org.junit.Test; import java.util.ArrayList; import java.util.List; -import static org.assertj.core.api.Assertions.assertThat; - /** * @author yxssfxwzy@sina.com May 30, 2014 * @@ -27,30 +24,6 @@ public class ProxyTest { } } - @Test - public void testProxy() { - TimerReuseProxyPool proxyPool = new TimerReuseProxyPool(httpProxyList,false); - proxyPool.setReuseInterval(500); - assertThat(proxyPool.getIdleNum()).isEqualTo(4); - for (int i = 0; i < 2; i++) { - List fetchList = new ArrayList(); - while (proxyPool.getIdleNum() != 0) { - Proxy proxy = proxyPool.getProxy(); - HttpHost httphost = proxy.getHttpHost(); - // httphostList.add(httphost); - System.out.println(httphost.getHostName() + ":" + httphost.getPort()); - Fetch tmp = new Fetch(httphost); - tmp.start(); - fetchList.add(tmp); - } - for (Fetch fetch : fetchList) { - proxyPool.returnProxy(fetch.hp, Proxy.SUCCESS); - } - System.out.println(proxyPool.allProxyStatus()); - - } - } - 
class Fetch extends Thread { HttpHost hp; From c51ac6017c1371f2fd64b281a221edf125f5fb2f Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Tue, 21 Mar 2017 07:31:58 +0800 Subject: [PATCH 08/25] remove Site.addStartRequest() etc. #494 --- .../main/java/us/codecraft/webmagic/Site.java | 71 ------------------- .../java/us/codecraft/webmagic/Spider.java | 1 - 2 files changed, 72 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java index 87eab14..13733c2 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java @@ -3,7 +3,6 @@ package us.codecraft.webmagic; import org.apache.http.HttpHost; import org.apache.http.auth.UsernamePasswordCredentials; import us.codecraft.webmagic.proxy.ProxyProvider; -import us.codecraft.webmagic.utils.UrlUtils; import java.util.*; @@ -26,11 +25,6 @@ public class Site { private String charset; - /** - * startUrls is the urls the crawler to start with. - */ - private List startRequests = new ArrayList(); - private int sleepTime = 5000; private int retryTimes = 0; @@ -223,52 +217,6 @@ public class Site { return acceptStatCode; } - /** - * get start urls - * - * @return start urls - * @see #getStartRequests - * @deprecated - */ - @Deprecated - public List getStartUrls() { - return UrlUtils.convertToUrls(startRequests); - } - - public List getStartRequests() { - return startRequests; - } - - /** - * Add a url to start url.
- * Because urls are more a Spider's property than Site, move it to {@link Spider#addUrl(String...)}} - * - * @param startUrl startUrl - * @return this - * @see Spider#addUrl(String...) - * @deprecated - */ - public Site addStartUrl(String startUrl) { - return addStartRequest(new Request(startUrl)); - } - - /** - * Add a url to start url.
- * Because urls are more a Spider's property than Site, move it to {@link Spider#addRequest(Request...)}} - * - * @param startRequest startRequest - * @return this - * @see Spider#addRequest(Request...) - * @deprecated - */ - public Site addStartRequest(Request startRequest) { - this.startRequests.add(startRequest); - if (domain == null && startRequest.getUrl() != null) { - domain = UrlUtils.getDomain(startRequest.getUrl()); - } - return this; - } - /** * Set the interval between the processing of two pages.
* Time unit is micro seconds.
@@ -348,21 +296,6 @@ public class Site { return this; } - public HttpHost getHttpProxy() { - return httpProxy; - } - - /** - * set up httpProxy for this site - * - * @param httpProxy httpProxy - * @return this - */ - public Site setHttpProxy(HttpHost httpProxy) { - this.httpProxy = httpProxy; - return this; - } - public boolean isUseGzip() { return useGzip; } @@ -430,8 +363,6 @@ public class Site { return false; if (domain != null ? !domain.equals(site.domain) : site.domain != null) return false; if (headers != null ? !headers.equals(site.headers) : site.headers != null) return false; - if (startRequests != null ? !startRequests.equals(site.startRequests) : site.startRequests != null) - return false; if (userAgent != null ? !userAgent.equals(site.userAgent) : site.userAgent != null) return false; return true; @@ -443,7 +374,6 @@ public class Site { result = 31 * result + (userAgent != null ? userAgent.hashCode() : 0); result = 31 * result + (defaultCookies != null ? defaultCookies.hashCode() : 0); result = 31 * result + (charset != null ? charset.hashCode() : 0); - result = 31 * result + (startRequests != null ? 
startRequests.hashCode() : 0); result = 31 * result + sleepTime; result = 31 * result + retryTimes; result = 31 * result + cycleRetryTimes; @@ -460,7 +390,6 @@ public class Site { ", userAgent='" + userAgent + '\'' + ", cookies=" + defaultCookies + ", charset='" + charset + '\'' + - ", startRequests=" + startRequests + ", sleepTime=" + sleepTime + ", retryTimes=" + retryTimes + ", cycleRetryTimes=" + cycleRetryTimes + diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 213cf3f..c8d974f 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -126,7 +126,6 @@ public class Spider implements Runnable, Task { public Spider(PageProcessor pageProcessor) { this.pageProcessor = pageProcessor; this.site = pageProcessor.getSite(); - this.startRequests = pageProcessor.getSite().getStartRequests(); } /** From c13110c4cbae35443e19d7e57fc8f47d0a3358fd Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Tue, 21 Mar 2017 07:53:43 +0800 Subject: [PATCH 09/25] fix samples --- .../main/java/us/codecraft/webmagic/Site.java | 10 -- .../java/us/codecraft/webmagic/Spider.java | 4 +- .../downloader/HttpClientGenerator.java | 14 +- .../processor/SimplePageProcessor.java | 6 +- .../us/codecraft/webmagic/SpiderTest.java | 4 +- .../webmagic/downloader/FileCache.java | 124 ------------------ .../webmagic/downloader/FileCacheTest.java | 18 --- .../processor/GithubRepoProcessor.java | 2 +- .../samples/DiandianBlogProcessor.java | 2 +- .../webmagic/samples/DiaoyuwengProcessor.java | 4 +- .../webmagic/samples/F58PageProcesser.java | 4 +- .../webmagic/samples/HuxiuProcessor.java | 4 +- .../samples/InfoQMiniBookProcessor.java | 3 +- .../webmagic/samples/IteyeBlogProcessor.java | 4 +- .../webmagic/samples/KaichibaProcessor.java | 4 +- .../webmagic/samples/MeicanProcessor.java | 4 +- 
.../webmagic/samples/NjuBBSProcessor.java | 9 +- .../samples/OschinaBlogPageProcesser.java | 41 ------ .../samples/OschinaPageProcesser.java | 27 ---- .../webmagic/samples/QzoneBlogProcessor.java | 2 +- .../webmagic/samples/TianyaPageProcesser.java | 2 +- 21 files changed, 37 insertions(+), 255 deletions(-) delete mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/FileCache.java delete mode 100644 webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/FileCacheTest.java delete mode 100644 webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java delete mode 100644 webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaPageProcesser.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java index 13733c2..5606d12 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java @@ -1,9 +1,5 @@ package us.codecraft.webmagic; -import org.apache.http.HttpHost; -import org.apache.http.auth.UsernamePasswordCredentials; -import us.codecraft.webmagic.proxy.ProxyProvider; - import java.util.*; /** @@ -41,12 +37,6 @@ public class Site { private Map headers = new HashMap(); - private HttpHost httpProxy; - - private UsernamePasswordCredentials usernamePasswordCredentials; //代理用户名密码设置 - - private ProxyProvider httpProxyPool; - private boolean useGzip = true; /** diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index c8d974f..5e785af 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -479,7 +479,9 @@ public class Spider implements Runnable, Task { public List getAll(Collection urls) { destroyWhenExit = false; spawnUrl = false; - startRequests.clear(); + if 
(startRequests!=null){ + startRequests.clear(); + } for (Request request : UrlUtils.convertToRequests(urls)) { addRequest(request); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java index 1da64e7..9e17f60 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java @@ -95,12 +95,12 @@ public class HttpClientGenerator { HttpClientBuilder httpClientBuilder = HttpClients.custom(); httpClientBuilder.setConnectionManager(connectionManager); - if (site != null && site.getUserAgent() != null) { + if (site.getUserAgent() != null) { httpClientBuilder.setUserAgent(site.getUserAgent()); } else { httpClientBuilder.setUserAgent(""); } - if (site == null || site.isUseGzip()) { + if (site.isUseGzip()) { httpClientBuilder.addInterceptorFirst(new HttpRequestInterceptor() { public void process( @@ -117,16 +117,12 @@ public class HttpClientGenerator { SocketConfig.Builder socketConfigBuilder = SocketConfig.custom(); socketConfigBuilder.setSoKeepAlive(true).setTcpNoDelay(true); - if (site != null) { - socketConfigBuilder.setSoTimeout(site.getTimeOut()); - } + socketConfigBuilder.setSoTimeout(site.getTimeOut()); SocketConfig socketConfig = socketConfigBuilder.build(); httpClientBuilder.setDefaultSocketConfig(socketConfig); connectionManager.setDefaultSocketConfig(socketConfig); - if (site != null) { - httpClientBuilder.setRetryHandler(new DefaultHttpRequestRetryHandler(site.getRetryTimes(), true)); - generateCookie(httpClientBuilder, site); - } + httpClientBuilder.setRetryHandler(new DefaultHttpRequestRetryHandler(site.getRetryTimes(), true)); + generateCookie(httpClientBuilder, site); return httpClientBuilder.build(); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java 
b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java index a0572a9..842429b 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java @@ -2,7 +2,6 @@ package us.codecraft.webmagic.processor; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.utils.UrlUtils; import java.util.List; @@ -18,9 +17,8 @@ public class SimplePageProcessor implements PageProcessor { private Site site; - public SimplePageProcessor(String startUrl, String urlPattern) { - this.site = Site.me().addStartUrl(startUrl). - setDomain(UrlUtils.getDomain(startUrl)); + public SimplePageProcessor(String urlPattern) { + this.site = Site.me(); //compile "*" expression to regex this.urlPattern = "(" + urlPattern.replace(".", "\\.").replace("*", "[^\"'#]*") + ")"; diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/SpiderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/SpiderTest.java index ba29387..4f4a280 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/SpiderTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/SpiderTest.java @@ -19,12 +19,12 @@ public class SpiderTest { @Ignore("long time") @Test public void testStartAndStop() throws InterruptedException { - Spider spider = Spider.create(new SimplePageProcessor("http://www.oschina.net/", "http://www.oschina.net/*")).addPipeline(new Pipeline() { + Spider spider = Spider.create(new SimplePageProcessor( "http://www.oschina.net/*")).addPipeline(new Pipeline() { @Override public void process(ResultItems resultItems, Task task) { System.out.println(1); } - }).thread(1); + }).thread(1).addUrl("http://www.oschina.net/"); spider.start(); Thread.sleep(10000); spider.stop(); diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/FileCache.java 
b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/FileCache.java deleted file mode 100644 index 3c7e6ff..0000000 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/FileCache.java +++ /dev/null @@ -1,124 +0,0 @@ -package us.codecraft.webmagic.downloader; - -import org.apache.commons.codec.digest.DigestUtils; -import org.apache.commons.lang3.StringUtils; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import us.codecraft.webmagic.*; -import us.codecraft.webmagic.utils.Experimental; -import us.codecraft.webmagic.pipeline.Pipeline; -import us.codecraft.webmagic.processor.PageProcessor; -import us.codecraft.webmagic.processor.SimplePageProcessor; -import us.codecraft.webmagic.selector.Html; -import us.codecraft.webmagic.selector.PlainText; -import us.codecraft.webmagic.utils.FilePersistentBase; -import us.codecraft.webmagic.utils.UrlUtils; - -import java.io.*; - -/** - * Download file and saved to file for cache.
- * - * @author code4crafter@gmail.com - * @since 0.2.1 - */ -@Experimental -public class FileCache extends FilePersistentBase implements Downloader, Pipeline, PageProcessor { - - private Downloader downloaderWhenFileMiss; - - private final PageProcessor pageProcessor; - - private Logger logger = LoggerFactory.getLogger(getClass()); - - public FileCache(String startUrl, String urlPattern) { - this(startUrl, urlPattern, "/data/webmagic/temp/"); - } - - public FileCache(String startUrl, String urlPattern, String path) { - this.pageProcessor = new SimplePageProcessor(startUrl, urlPattern); - setPath(path); - downloaderWhenFileMiss = new HttpClientDownloader(); - } - - public FileCache setDownloaderWhenFileMiss(Downloader downloaderWhenFileMiss) { - this.downloaderWhenFileMiss = downloaderWhenFileMiss; - return this; - } - - @Override - public Page download(Request request, Task task) { - String path = this.path + "/" + task.getUUID() + "/"; - Page page = null; - try { - final File file = getFile(path + DigestUtils.md5Hex(request.getUrl())); - BufferedReader bufferedReader = new BufferedReader(new FileReader(file)); - String line = bufferedReader.readLine(); - if (line.equals("url:\t" + request.getUrl())) { - final String html = getHtml(bufferedReader); - page = new Page(); - page.setRequest(request); - page.setUrl(PlainText.create(request.getUrl())); - page.setHtml(Html.create(UrlUtils.fixAllRelativeHrefs(html, request.getUrl()))); - } - } catch (IOException e) { - if (e instanceof FileNotFoundException) { - logger.info("File not exist for url " + request.getUrl()); - } else { - logger.warn("File read error for url " + request.getUrl(), e); - } - } - if (page == null) { - page = downloadWhenMiss(request, task); - } - return page; - } - - @Override - public void setThread(int thread) { - - } - - private String getHtml(BufferedReader bufferedReader) throws IOException { - String line; - StringBuilder htmlBuilder = new StringBuilder(); - line = bufferedReader.readLine(); 
- line = StringUtils.removeStart(line, "html:\t"); - htmlBuilder.append(line); - while ((line = bufferedReader.readLine()) != null) { - htmlBuilder.append(line); - } - return htmlBuilder.toString(); - } - - private Page downloadWhenMiss(Request request, Task task) { - Page page = null; - if (downloaderWhenFileMiss != null) { - page = downloaderWhenFileMiss.download(request, task); - } - return page; - } - - @Override - public void process(ResultItems resultItems, Task task) { - String path = this.path + PATH_SEPERATOR + task.getUUID() + PATH_SEPERATOR; - try { - PrintWriter printWriter = new PrintWriter(new FileWriter(getFile(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".html"))); - printWriter.println("url:\t" + resultItems.getRequest().getUrl()); - printWriter.println("html:\t" + resultItems.get("html")); - printWriter.close(); - } catch (IOException e) { - logger.warn("write file error", e); - } - } - - @Override - public void process(Page page) { - pageProcessor.process(page); - } - - @Override - public Site getSite() { - return pageProcessor.getSite(); - } -} diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/FileCacheTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/FileCacheTest.java deleted file mode 100644 index f73b344..0000000 --- a/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/FileCacheTest.java +++ /dev/null @@ -1,18 +0,0 @@ -package us.codecraft.webmagic.downloader; - -import org.junit.Ignore; -import org.junit.Test; -import us.codecraft.webmagic.Spider; - -/** - * @author code4crafter@gmail.com
- */ -public class FileCacheTest { - - @Ignore("takes long") - @Test - public void test() { - FileCache fileCache = new FileCache("http://my.oschina.net/flashsword/blog", "http://my.oschina.net/flashsword/blog/*"); - Spider.create(fileCache).downloader(fileCache).pipeline(fileCache).run(); - } -} diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/processor/GithubRepoProcessor.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/processor/GithubRepoProcessor.java index bf9e381..1c8742c 100644 --- a/webmagic-extension/src/test/java/us/codecraft/webmagic/processor/GithubRepoProcessor.java +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/processor/GithubRepoProcessor.java @@ -19,7 +19,7 @@ public class GithubRepoProcessor implements PageProcessor { @Override public Site getSite() { - return Site.me().addStartUrl("https://github.com/code4craft/webmagic"); + return Site.me(); } @Test diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java index 25baa1f..8bd7d58 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java @@ -35,7 +35,7 @@ public class DiandianBlogProcessor implements PageProcessor { public Site getSite() { //site定义抽取配置,以及开始url等 if (site == null) { - site = Site.me().setDomain("progressdaily.diandian.com").addStartUrl("http://progressdaily.diandian.com/"). + site = Site.me().setDomain("progressdaily.diandian.com"). 
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); } return site; diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java index 3ceba0a..61458d0 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java @@ -34,13 +34,13 @@ public class DiaoyuwengProcessor implements PageProcessor { @Override public Site getSite() { if (site==null){ - site= Site.me().setDomain("www.diaoyuweng.com").addStartUrl("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space"). + site= Site.me().setDomain("www.diaoyuweng.com"). setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31").setCharset("GBK").setSleepTime(500); } return site; } public static void main(String[] args) { - Spider.create(new DiaoyuwengProcessor()).run(); + Spider.create(new DiaoyuwengProcessor()).addUrl("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space").run(); } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java index 3d27be8..8091b65 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java @@ -25,10 +25,10 @@ public class F58PageProcesser implements PageProcessor { @Override public Site getSite() { - return Site.me().setDomain("sh.58.com").addStartUrl("http://sh1.51a8.com/").setCycleRetryTimes(2); //To change body of implemented methods use File | Settings 
| File Templates. + return Site.me().setDomain("sh.58.com").setCycleRetryTimes(2); //To change body of implemented methods use File | Settings | File Templates. } public static void main(String[] args) { - Spider.create(new F58PageProcesser()).setScheduler(new RedisScheduler("localhost")).run(); + Spider.create(new F58PageProcesser()).setScheduler(new RedisScheduler("localhost")).addUrl("http://sh1.51a8.com/").run(); } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java index 000cb99..1cc90b0 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java @@ -21,11 +21,11 @@ public class HuxiuProcessor implements PageProcessor { @Override public Site getSite() { - return Site.me().setDomain("www.huxiu.com").addStartUrl("http://www.huxiu.com/"); + return Site.me().setDomain("www.huxiu.com"); } public static void main(String[] args) { - Spider.create(new HuxiuProcessor()).run(); + Spider.create(new HuxiuProcessor()).addUrl("http://www.huxiu.com/").run(); } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java index 3ef3957..280f8f1 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java @@ -29,7 +29,7 @@ public class InfoQMiniBookProcessor implements PageProcessor { @Override public Site getSite() { if (site == null) { - site = Site.me().setDomain("www.infoq.com").addStartUrl("http://www.infoq.com/cn/minibooks").addCookie("RegisteredUserCookie", "sDDDc8dIAgZSq67uJSXhtpQaHEi1XDOH"). 
+ site = Site.me().setDomain("www.infoq.com").addCookie("RegisteredUserCookie", "sDDDc8dIAgZSq67uJSXhtpQaHEi1XDOH"). setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); } return site; @@ -38,6 +38,7 @@ public class InfoQMiniBookProcessor implements PageProcessor { public static void main(String[] args) { Spider.create(new InfoQMiniBookProcessor()) .thread(5) + .addUrl("http://www.infoq.com/cn/minibooks") .run(); } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java index 26b85e8..6dce807 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java @@ -22,12 +22,12 @@ public class IteyeBlogProcessor implements PageProcessor { @Override public Site getSite() { if (site == null) { - site = Site.me().setDomain("yanghaoli.iteye.com").addStartUrl("http://yanghaoli.iteye.com/"); + site = Site.me().setDomain("yanghaoli.iteye.com"); } return site; } public static void main(String[] args) { - Spider.create(new IteyeBlogProcessor()).thread(5).run(); + Spider.create(new IteyeBlogProcessor()).thread(5).addUrl("http://yanghaoli.iteye.com/").run(); } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java index 0ab6c64..b373f52 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java @@ -22,11 +22,11 @@ public class KaichibaProcessor implements PageProcessor { @Override public Site getSite() { - return 
Site.me().setDomain("kaichiba.com").addStartUrl("http://kaichiba.com/shop/41725781").setCharset("utf-8"). + return Site.me().setDomain("kaichiba.com").setCharset("utf-8"). setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); } public static void main(String[] args) { - Spider.create(new KaichibaProcessor()).run(); + Spider.create(new KaichibaProcessor()).addUrl("http://kaichiba.com/shop/41725781").run(); } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java index bfa347d..cb4c498 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java @@ -28,11 +28,11 @@ public class MeicanProcessor implements PageProcessor { @Override public Site getSite() { - return Site.me().setDomain("meican.com").addStartUrl("http://www.meican.com/shanghai/districts").setCharset("utf-8"). + return Site.me().setDomain("meican.com").setCharset("utf-8"). 
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); } public static void main(String[] args) { - Spider.create(new MeicanProcessor()).run(); + Spider.create(new MeicanProcessor()).addUrl("http://www.meican.com/shanghai/districts").run(); } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java index 16dcb0c..ce0f817 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java @@ -1,7 +1,8 @@ package us.codecraft.webmagic.samples; -import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; import java.util.List; @@ -22,6 +23,10 @@ public class NjuBBSProcessor implements PageProcessor { @Override public Site getSite() { - return Site.me().setDomain("bbs.nju.edu.cn").addStartUrl("http://bbs.nju.edu.cn/board?board=Pictures"); + return Site.me().setDomain("bbs.nju.edu.cn"); + } + + public static void main(String[] args) { + Spider.create(new NjuBBSProcessor()).addUrl("http://bbs.nju.edu.cn/board?board=Pictures").run(); } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java deleted file mode 100644 index e6db04e..0000000 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java +++ /dev/null @@ -1,41 +0,0 @@ -package us.codecraft.webmagic.samples; - -import us.codecraft.webmagic.Page; -import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.Spider; -import us.codecraft.webmagic.monitor.SpiderMonitor; -import 
us.codecraft.webmagic.processor.PageProcessor; -import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover; -import us.codecraft.webmagic.scheduler.QueueScheduler; - -import javax.management.JMException; -import java.util.List; - -/** - * @author code4crafter@gmail.com
- */ -public class OschinaBlogPageProcesser implements PageProcessor { - - private Site site = Site.me().setDomain("my.oschina.net").addStartUrl("http://my.oschina.net/flashsword/blog"); - - @Override - public void process(Page page) { - List links = page.getHtml().links().regex("http://my\\.oschina\\.net/flashsword/blog/\\d+").all(); - page.addTargetRequests(links); - page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1/text()").toString()); - page.putField("content", page.getHtml().xpath("//div[@class='BlogContent']/tidyText()").toString()); - page.putField("tags",page.getHtml().xpath("//div[@class='BlogTags']/a/text()").all()); - } - - @Override - public Site getSite() { - return site; - - } - - public static void main(String[] args) throws JMException { - Spider spider = Spider.create(new OschinaBlogPageProcesser()).setScheduler(new QueueScheduler().setDuplicateRemover(new BloomFilterDuplicateRemover(2000))); - SpiderMonitor.instance().register(spider); - spider.run(); - } -} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaPageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaPageProcesser.java deleted file mode 100644 index b75cc83..0000000 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaPageProcesser.java +++ /dev/null @@ -1,27 +0,0 @@ -package us.codecraft.webmagic.samples; - -import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.Page; -import us.codecraft.webmagic.processor.PageProcessor; - -import java.util.List; - -/** - * @author code4crafter@gmail.com
- */ -public class OschinaPageProcesser implements PageProcessor { - - @Override - public void process(Page page) { - List strings = page.getHtml().regex("]*href=[\"']{1}(http://www\\.oschina\\.net/question/[\\w]+)[\"']{1}").all(); - page.addTargetRequests(strings); - page.putField("title", page.getHtml().xpath("//div[@class='QTitle']/h1/a")); - page.putField("content", page.getHtml().xpath("//div[@class='Question']//div[@class='Content']/div[@class='detail']")); - } - - @Override - public Site getSite() { - return Site.me().setDomain("www.oschina.net").addStartUrl("http://www.oschina.net/"). - setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); - } -} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/QzoneBlogProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/QzoneBlogProcessor.java index d9cee2b..037b333 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/QzoneBlogProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/QzoneBlogProcessor.java @@ -24,7 +24,7 @@ public class QzoneBlogProcessor implements PageProcessor { @Override public Site getSite() { - return Site.me().setDomain("www.diandian.com").addStartUrl("http://17dujingdian.com/"). + return Site.me().setDomain("www.diandian.com"). 
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/TianyaPageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/TianyaPageProcesser.java index d14b442..6cc8f99 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/TianyaPageProcesser.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/TianyaPageProcesser.java @@ -21,6 +21,6 @@ public class TianyaPageProcesser implements PageProcessor { @Override public Site getSite() { - return Site.me().setDomain("http://bbs.tianya.cn/").addStartUrl("http://bbs.tianya.cn/"); //To change body of implemented methods use File | Settings | File Templates. + return Site.me().setDomain("http://bbs.tianya.cn/"); //To change body of implemented methods use File | Settings | File Templates. } } From 8564d51e569bfa4ac81eb551f920b11ff87021d1 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Tue, 21 Mar 2017 07:57:56 +0800 Subject: [PATCH 10/25] remove site.addStartUrl in samples --- .../webmagic/model/samples/DianpingFtlDataScanner.java | 2 +- .../java/us/codecraft/webmagic/model/samples/GithubRepo.java | 5 +++-- .../java/us/codecraft/webmagic/model/samples/IteyeBlog.java | 2 +- .../us/codecraft/webmagic/model/samples/Kr36NewsModel.java | 4 ++-- .../us/codecraft/webmagic/model/samples/OschinaAnswer.java | 2 +- .../us/codecraft/webmagic/model/samples/OschinaBlog.java | 4 ++-- 6 files changed, 10 insertions(+), 9 deletions(-) diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/DianpingFtlDataScanner.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/DianpingFtlDataScanner.java index 7239e36..77def20 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/DianpingFtlDataScanner.java +++ 
b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/DianpingFtlDataScanner.java @@ -21,7 +21,7 @@ public class DianpingFtlDataScanner implements AfterExtractor { private List data; public static void main(String[] args) { - OOSpider.create(Site.me().addStartUrl("http://w.alpha.dp/").setSleepTime(0), DianpingFtlDataScanner.class) + OOSpider.create(Site.me().setSleepTime(0), DianpingFtlDataScanner.class) .thread(5).run(); } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/GithubRepo.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/GithubRepo.java index e8998ec..941bdbd 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/GithubRepo.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/GithubRepo.java @@ -41,9 +41,10 @@ public class GithubRepo implements HasKey { private String url; public static void main(String[] args) { - OOSpider.create(Site.me().addStartUrl("https://github.com/explore").setSleepTime(0).setRetryTimes(3), + OOSpider.create(Site.me().setSleepTime(0).setRetryTimes(3), new JsonFilePageModelPipeline(), GithubRepo.class) - .scheduler(new FileCacheQueueScheduler("/data/webmagic/cache/")).thread(15).run(); + .addUrl("https://github.com/explore") + .setScheduler(new FileCacheQueueScheduler("/data/webmagic/cache/")).thread(15).run(); } @Override diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/IteyeBlog.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/IteyeBlog.java index 7e3dc51..6a10f47 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/IteyeBlog.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/IteyeBlog.java @@ -28,7 +28,7 @@ public class IteyeBlog implements Blog{ } public static void main(String[] args) { - OOSpider.create(Site.me().addStartUrl("http://flashsword20.iteye.com/blog"), IteyeBlog.class).run(); + 
OOSpider.create(Site.me(), IteyeBlog.class).addUrl("http://flashsword20.iteye.com/blog").run(); } public String getTitle() { diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Kr36NewsModel.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Kr36NewsModel.java index a1ef3fd..a1cc545 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Kr36NewsModel.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Kr36NewsModel.java @@ -32,12 +32,12 @@ public class Kr36NewsModel { public static void main(String[] args) throws IOException, JMException { //Just for benchmark - Spider thread = OOSpider.create(Site.me().addStartUrl("http://www.36kr.com/").setSleepTime(0), new PageModelPipeline() { + Spider thread = OOSpider.create(Site.me().setSleepTime(0), new PageModelPipeline() { @Override public void process(Object o, Task task) { } - }, Kr36NewsModel.class).thread(20); + }, Kr36NewsModel.class).thread(20).addUrl("http://www.36kr.com/"); thread.start(); SpiderMonitor spiderMonitor = SpiderMonitor.instance(); spiderMonitor.register(thread); diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaAnswer.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaAnswer.java index 112f86a..cd93093 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaAnswer.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaAnswer.java @@ -22,7 +22,7 @@ public class OschinaAnswer implements AfterExtractor{ private String content; public static void main(String[] args) { - OOSpider.create(Site.me().addStartUrl("http://www.oschina.net/question/567527_120597"), OschinaAnswer.class).run(); + OOSpider.create(Site.me(), OschinaAnswer.class).addUrl("http://www.oschina.net/question/567527_120597").run(); } @Override diff --git 
a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java index 468b855..286e6f5 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java @@ -26,7 +26,7 @@ public class OschinaBlog{ public static void main(String[] args) { OOSpider.create(Site.me() - .setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36").addStartUrl("http://my.oschina.net/flashsword/blog") + .setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36") .setSleepTime(0) .setRetryTimes(3) ,new PageModelPipeline() { @@ -34,7 +34,7 @@ public class OschinaBlog{ public void process(Object o, Task task) { } - }, OschinaBlog.class).thread(10).run(); + }, OschinaBlog.class).thread(10).addUrl("http://my.oschina.net/flashsword/blog").run(); } public String getTitle() { From 3824232171f17a100ce3096d37552e34c03c8df3 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Tue, 21 Mar 2017 07:59:33 +0800 Subject: [PATCH 11/25] remove site.addStartUrl in samples --- .../src/test/java/us/codecraft/webmagic/SpiderTest.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java index 2fd690d..f8dfb97 100644 --- a/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java +++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java @@ -28,10 +28,10 @@ public class SpiderTest { // PageProcessor pageProcessor = new MeicanProcessor(); // Spider.me().pipeline(new FilePipeline()).scheduler(new 
FileCacheQueueScheduler(pageProcessor.getSite(),"/data/temp/webmagic/cache/")). // processor(pageProcessor).run(); - SimplePageProcessor pageProcessor2 = new SimplePageProcessor("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space", "http://www.diaoyuweng.com/thread-*-1-1.html"); + SimplePageProcessor pageProcessor2 = new SimplePageProcessor( "http://www.diaoyuweng.com/thread-*-1-1.html"); System.out.println(pageProcessor2.getSite().getCharset()); pageProcessor2.getSite().setSleepTime(500); - Spider.create(pageProcessor2).addPipeline(new FilePipeline()).scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")). + Spider.create(pageProcessor2).addUrl("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space").addPipeline(new FilePipeline()).scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")). run(); From ba000b364cc32a124822e60ae7a2d77c16e177ce Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Tue, 21 Mar 2017 08:00:15 +0800 Subject: [PATCH 12/25] remove site.addStartUrl in samples --- .../java/us/codecraft/webmagic/model/ProcessorBenchmark.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/model/ProcessorBenchmark.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/model/ProcessorBenchmark.java index 193908d..7c61926 100644 --- a/webmagic-samples/src/test/java/us/codecraft/webmagic/model/ProcessorBenchmark.java +++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/model/ProcessorBenchmark.java @@ -17,7 +17,7 @@ public class ProcessorBenchmark { @Ignore @Test public void test() { - ModelPageProcessor modelPageProcessor = ModelPageProcessor.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog"), OschinaBlog.class); + ModelPageProcessor modelPageProcessor = ModelPageProcessor.create(Site.me(), OschinaBlog.class); Page page = new Page(); page.setRequest(new 
Request("http://my.oschina.net/flashsword/blog")); page.setUrl(new PlainText("http://my.oschina.net/flashsword/blog")); From f23e138c728f95622170990e732be9f77c6b1f17 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Mon, 27 Mar 2017 09:52:25 +0800 Subject: [PATCH 13/25] add response headers to Page #508 --- .../main/java/us/codecraft/webmagic/Page.java | 13 +++++++++ .../java/us/codecraft/webmagic/Request.java | 1 - .../main/java/us/codecraft/webmagic/Site.java | 10 ------- .../downloader/HttpClientDownloader.java | 13 ++++++--- .../downloader/HttpUriRequestConverter.java | 2 +- ...annedChecker.java => ResponseChecker.java} | 2 +- .../webmagic/utils/HttpClientUtils.java | 28 +++++++++++++++++++ 7 files changed, 52 insertions(+), 17 deletions(-) rename webmagic-core/src/main/java/us/codecraft/webmagic/proxy/{BannedChecker.java => ResponseChecker.java} (86%) create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/utils/HttpClientUtils.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java index 7c0064d..f9495a4 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java @@ -8,6 +8,7 @@ import us.codecraft.webmagic.utils.UrlUtils; import java.util.ArrayList; import java.util.List; +import java.util.Map; /** * Object storing extracted result and urls to fetch.
@@ -38,6 +39,8 @@ public class Page { private Selectable url; + private Map> headers; + private int statusCode; private boolean needCycleRetry; @@ -210,6 +213,14 @@ public class Page { return this; } + public Map> getHeaders() { + return headers; + } + + public void setHeaders(Map> headers) { + this.headers = headers; + } + @Override public String toString() { return "Page{" + @@ -217,7 +228,9 @@ public class Page { ", resultItems=" + resultItems + ", rawText='" + rawText + '\'' + ", url=" + url + + ", headers=" + headers + ", statusCode=" + statusCode + + ", needCycleRetry=" + needCycleRetry + ", targetRequests=" + targetRequests + '}'; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java index 0a38fcc..21cd72e 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java @@ -18,7 +18,6 @@ public class Request implements Serializable { private static final long serialVersionUID = 2062192774891352043L; public static final String CYCLE_TRIED_TIMES = "_cycle_tried_times"; - public static final String PROXY = "proxy"; private String url; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java index 5606d12..520902d 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java @@ -39,16 +39,6 @@ public class Site { private boolean useGzip = true; - /** - * @see us.codecraft.webmagic.utils.HttpConstant.Header - * @deprecated - */ - public static interface HeaderConst { - - public static final String REFERER = "Referer"; - } - - static { DEFAULT_STATUS_CODE_SET.add(200); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java 
b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 3a44af6..e6523ec 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -23,6 +23,7 @@ import us.codecraft.webmagic.proxy.Proxy; import us.codecraft.webmagic.proxy.ProxyProvider; import us.codecraft.webmagic.selector.PlainText; import us.codecraft.webmagic.utils.CharsetUtils; +import us.codecraft.webmagic.utils.HttpClientUtils; import java.io.IOException; import java.nio.charset.Charset; @@ -49,6 +50,8 @@ public class HttpClientDownloader extends AbstractDownloader { private ProxyProvider proxyProvider; + private boolean responseHeader = true; + public void setHttpUriRequestConverter(HttpUriRequestConverter httpUriRequestConverter) { this.httpUriRequestConverter = httpUriRequestConverter; } @@ -88,13 +91,12 @@ public class HttpClientDownloader extends AbstractDownloader { HttpContext httpContext = new BasicHttpContext(); if (proxyProvider != null) { proxy = proxyProvider.getProxy(task); - request.putExtra(Request.PROXY, proxy); AuthState authState = new AuthState(); authState.update(new BasicScheme(), new UsernamePasswordCredentials(proxy.getUsername(), proxy.getPassword())); httpContext.setAttribute(HttpClientContext.PROXY_AUTH_STATE, authState); } - HttpUriRequest httpUriRequest = httpUriRequestConverter.convert(request, site, proxy); CloseableHttpClient httpClient = getHttpClient(site); + HttpUriRequest httpUriRequest = httpUriRequestConverter.convert(request, site, proxy); try { httpResponse = httpClient.execute(httpUriRequest, httpContext); statusCode = httpResponse.getStatusLine().getStatusCode(); @@ -133,10 +135,13 @@ public class HttpClientDownloader extends AbstractDownloader { page.setUrl(new PlainText(request.getUrl())); page.setRequest(request); page.setStatusCode(httpResponse.getStatusLine().getStatusCode()); + if 
(responseHeader) { + page.setHeaders(HttpClientUtils.convertHeaders(httpResponse.getAllHeaders())); + } return page; } - protected String getContent(String charset, HttpResponse httpResponse) throws IOException { + private String getContent(String charset, HttpResponse httpResponse) throws IOException { if (charset == null) { byte[] contentBytes = IOUtils.toByteArray(httpResponse.getEntity().getContent()); String htmlCharset = getHtmlCharset(httpResponse, contentBytes); @@ -151,7 +156,7 @@ public class HttpClientDownloader extends AbstractDownloader { } } - protected String getHtmlCharset(HttpResponse httpResponse, byte[] contentBytes) throws IOException { + private String getHtmlCharset(HttpResponse httpResponse, byte[] contentBytes) throws IOException { return CharsetUtils.detectCharset(httpResponse.getEntity().getContentType().getValue(), contentBytes); } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java index 951d332..db131d0 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java @@ -39,7 +39,7 @@ public class HttpUriRequestConverter { requestConfigBuilder.setConnectionRequestTimeout(site.getTimeOut()) .setSocketTimeout(site.getTimeOut()) .setConnectTimeout(site.getTimeOut()) - .setCookieSpec(CookieSpecs.BEST_MATCH); + .setCookieSpec(CookieSpecs.STANDARD); } if (proxy != null) { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/BannedChecker.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ResponseChecker.java similarity index 86% rename from webmagic-core/src/main/java/us/codecraft/webmagic/proxy/BannedChecker.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ResponseChecker.java index db17de2..3e68c11 100644 --- 
a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/BannedChecker.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ResponseChecker.java @@ -7,7 +7,7 @@ import org.apache.http.HttpResponse; * Date: 17/3/20 * Time: 下午10:52 */ -public interface BannedChecker { +public interface ResponseChecker { boolean isBanned(HttpResponse httpResponse); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/HttpClientUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/HttpClientUtils.java new file mode 100644 index 0000000..93f8fe9 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/HttpClientUtils.java @@ -0,0 +1,28 @@ +package us.codecraft.webmagic.utils; + +import org.apache.http.Header; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +/** + * @author code4crafter@gmail.com + * Date: 17/3/27 + */ +public abstract class HttpClientUtils { + + public static Map> convertHeaders(Header[] headers){ + Map> results = new HashMap>(); + for (Header header : headers) { + List list = results.get(header.getName()); + if (list == null) { + list = new ArrayList(); + results.put(header.getName(), list); + } + list.add(header.getValue()); + } + return results; + } +} From 6bd197859b2d7c0a342997008f0ee6c61d56092c Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Mon, 27 Mar 2017 09:55:50 +0800 Subject: [PATCH 14/25] fix test --- .../webmagic/downloader/HttpClientDownloaderTest.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java index fd1f4c2..e600bf9 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java @@ -15,6 
+15,7 @@ import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.selector.Html; +import us.codecraft.webmagic.utils.CharsetUtils; import us.codecraft.webmagic.utils.HttpConstant; import java.io.IOException; @@ -99,7 +100,7 @@ public class HttpClientDownloaderTest { String charset = null; try { byte[] contentBytes = IOUtils.toByteArray(httpResponse.getEntity().getContent()); - charset = downloader.getHtmlCharset(httpResponse,contentBytes); + charset = CharsetUtils.detectCharset(httpResponse.getEntity().getContentType().getValue(), contentBytes); } catch (IOException e) { e.printStackTrace(); } From b100dfe273cd681d0f897d0f298ea6cb8cf64ce9 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Thu, 30 Mar 2017 09:18:12 +0800 Subject: [PATCH 15/25] update version --- pom.xml | 2 +- webmagic-core/pom.xml | 2 +- webmagic-extension/pom.xml | 2 +- webmagic-samples/pom.xml | 2 +- webmagic-saxon/pom.xml | 2 +- webmagic-scripts/pom.xml | 2 +- webmagic-selenium/pom.xml | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/pom.xml b/pom.xml index 04b6dec..4279ec7 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ 7 us.codecraft - 0.6.2-SNAPSHOT + 0.7.0-SNAPSHOT 4.0.0 pom diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index ad96961..7ca5c7b 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.6.2-SNAPSHOT + 0.7.0-SNAPSHOT 4.0.0 diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index a48bdd0..0848817 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.6.2-SNAPSHOT + 0.7.0-SNAPSHOT 4.0.0 diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index eed2b77..a447e39 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.6.2-SNAPSHOT + 0.7.0-SNAPSHOT 4.0.0 diff 
--git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index 9b8b732..1e33539 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.6.2-SNAPSHOT + 0.7.0-SNAPSHOT 4.0.0 diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index 3c6f673..cd1ec64 100755 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.6.2-SNAPSHOT + 0.7.0-SNAPSHOT 4.0.0 diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index b66ca0c..bdc9d8a 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.6.2-SNAPSHOT + 0.7.0-SNAPSHOT 4.0.0 From 74110e6ec5cf5c89115dbafa814563ea5b8ce71a Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Thu, 30 Mar 2017 09:19:00 +0800 Subject: [PATCH 16/25] remove useless file --- webmagic-core/pom.xml.versionsBackup | 86 ----------------------- webmagic-extension/pom.xml.versionsBackup | 29 -------- 2 files changed, 115 deletions(-) delete mode 100644 webmagic-core/pom.xml.versionsBackup delete mode 100644 webmagic-extension/pom.xml.versionsBackup diff --git a/webmagic-core/pom.xml.versionsBackup b/webmagic-core/pom.xml.versionsBackup deleted file mode 100644 index b530bab..0000000 --- a/webmagic-core/pom.xml.versionsBackup +++ /dev/null @@ -1,86 +0,0 @@ - - - - us.codecraft - webmagic-parent - 0.5.2 - - 4.0.0 - - webmagic-core - - - - org.apache.httpcomponents - httpclient - - - - junit - junit - - - - com.google.guava - guava - - - - org.apache.commons - commons-lang3 - - - - us.codecraft - xsoup - - - - com.github.dreamhead - moco-core - - - - org.slf4j - slf4j-api - - - - org.slf4j - slf4j-log4j12 - - - - commons-collections - commons-collections - - - - org.assertj - assertj-core - - - - org.jsoup - jsoup - - - - org.apache.commons - commons-io - - - - com.jayway.jsonpath - json-path - 0.8.1 - - - - com.alibaba - fastjson - - - - - \ No newline at end of file 
diff --git a/webmagic-extension/pom.xml.versionsBackup b/webmagic-extension/pom.xml.versionsBackup deleted file mode 100644 index 47496ec..0000000 --- a/webmagic-extension/pom.xml.versionsBackup +++ /dev/null @@ -1,29 +0,0 @@ - - - - us.codecraft - webmagic-parent - 0.5.2 - - 4.0.0 - - webmagic-extension - - - - redis.clients - jedis - 2.0.0 - - - us.codecraft - webmagic-core - ${project.version} - - - junit - junit - - - - \ No newline at end of file From 395396c68ec257d1e982e696dab53653cb4bccfe Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 8 Apr 2017 11:59:52 +0800 Subject: [PATCH 17/25] =?UTF-8?q?=E5=A2=9E=E5=8A=A0HttpRequestBody?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../webmagic/model/HttpRequestBody.java | 69 +++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/model/HttpRequestBody.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/model/HttpRequestBody.java b/webmagic-core/src/main/java/us/codecraft/webmagic/model/HttpRequestBody.java new file mode 100644 index 0000000..39a92f7 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/model/HttpRequestBody.java @@ -0,0 +1,69 @@ +package us.codecraft.webmagic.model; + +import org.apache.http.NameValuePair; +import org.apache.http.client.utils.URLEncodedUtils; +import org.apache.http.message.BasicNameValuePair; + +import java.io.UnsupportedEncodingException; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; + +/** + * @author code4crafter@gmail.com + * Date: 17/4/8 + */ +public class HttpRequestBody { + + public static abstract class ContentType { + + public static final String JSON = "application/json"; + + public static final String XML = "text/xml"; + + public static final String FORM = "application/x-www-form-urlencoded"; + + public static final String MULTIPART = "multipart/form-data"; + } + + private final byte[] 
body; + + private final String contentType; + + private final String encoding; + + public HttpRequestBody(byte[] body, String contentType, String encoding) { + this.body = body; + this.contentType = contentType; + this.encoding = encoding; + } + + public String getContentType() { + return contentType; + } + + public String getEncoding() { + return encoding; + } + + public static HttpRequestBody json(String json, String encoding) throws UnsupportedEncodingException { + return new HttpRequestBody(json.getBytes(encoding), ContentType.JSON, encoding); + } + + public static HttpRequestBody xml(String xml, String encoding) throws UnsupportedEncodingException { + return new HttpRequestBody(xml.getBytes(encoding), ContentType.XML, encoding); + } + + public static HttpRequestBody custom(byte[] body, String contentType, String encoding) throws UnsupportedEncodingException { + return new HttpRequestBody(body, contentType, encoding); + } + + public static HttpRequestBody form(Map params, String encoding) throws UnsupportedEncodingException { + List nameValuePairs = new ArrayList(params.size()); + for (Map.Entry entry : params.entrySet()) { + nameValuePairs.add(new BasicNameValuePair(entry.getKey(), String.valueOf(entry.getValue()))); + } + return new HttpRequestBody(URLEncodedUtils.format(nameValuePairs, encoding).getBytes(encoding), ContentType.FORM, encoding); + } + +} From 83ada9749ee51383672f55002b4950ec2717f2e3 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 8 Apr 2017 12:16:34 +0800 Subject: [PATCH 18/25] fix test --- .../java/us/codecraft/webmagic/Request.java | 4 ++++ .../downloader/HttpClientDownloaderTest.java | 22 ++++++++++++------- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java index 9a63c8c..c4b9426 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java +++ 
b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java @@ -141,6 +141,10 @@ public class Request implements Serializable { return requestBody; } + public void setRequestBody(HttpRequestBody requestBody) { + this.requestBody = requestBody; + } + @Override public String toString() { return "Request{" + diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java index e600bf9..cac84f3 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java @@ -3,6 +3,7 @@ package us.codecraft.webmagic.downloader; import com.github.dreamhead.moco.HttpServer; import com.github.dreamhead.moco.Runnable; import com.github.dreamhead.moco.Runner; +import org.apache.commons.collections.map.HashedMap; import org.apache.commons.io.IOUtils; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpUriRequest; @@ -14,12 +15,14 @@ import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.model.HttpRequestBody; import us.codecraft.webmagic.selector.Html; import us.codecraft.webmagic.utils.CharsetUtils; import us.codecraft.webmagic.utils.HttpConstant; import java.io.IOException; import java.io.UnsupportedEncodingException; +import java.util.Map; import static com.github.dreamhead.moco.Moco.*; import static org.assertj.core.api.Assertions.assertThat; @@ -124,17 +127,12 @@ public class HttpClientDownloaderTest { @Override public void run() throws Exception { Request request = new Request(); - request.setUrl("http://127.0.0.1:12306/search"); - request.putParams("q", "webmagic"); + request.setUrl("http://127.0.0.1:12306/search?q=webmagic"); 
request.setMethod(HttpConstant.Method.GET); + Map params = new HashedMap(); + params.put("q","webmagic"); HttpUriRequest httpUriRequest = httpUriRequestConverter.convert(request,site,null); assertThat(EntityUtils.toString(HttpClients.custom().build().execute(httpUriRequest).getEntity())).isEqualTo("get"); - request.setMethod(HttpConstant.Method.POST); - httpUriRequest = httpUriRequestConverter.convert(request, site, null); - assertThat(EntityUtils.toString(HttpClients.custom().build().execute(httpUriRequest).getEntity())).isEqualTo("post"); - request.setMethod(HttpConstant.Method.PUT); - httpUriRequest = httpUriRequestConverter.convert(request, site, null); - assertThat(EntityUtils.toString(HttpClients.custom().build().execute(httpUriRequest).getEntity())).isEqualTo("put"); request.setMethod(HttpConstant.Method.DELETE); httpUriRequest = httpUriRequestConverter.convert(request, site, null); assertThat(EntityUtils.toString(HttpClients.custom().build().execute(httpUriRequest).getEntity())).isEqualTo("delete"); @@ -144,6 +142,14 @@ public class HttpClientDownloaderTest { request.setMethod(HttpConstant.Method.TRACE); httpUriRequest = httpUriRequestConverter.convert(request, site, null); assertThat(EntityUtils.toString(HttpClients.custom().build().execute(httpUriRequest).getEntity())).isEqualTo("trace"); + request.setUrl("http://127.0.0.1:12306/search"); + request.setMethod(HttpConstant.Method.POST); + request.setRequestBody(HttpRequestBody.form(params, "utf-8")); + httpUriRequest = httpUriRequestConverter.convert(request, site, null); + assertThat(EntityUtils.toString(HttpClients.custom().build().execute(httpUriRequest).getEntity())).isEqualTo("post"); + request.setMethod(HttpConstant.Method.PUT); + httpUriRequest = httpUriRequestConverter.convert(request, site, null); + assertThat(EntityUtils.toString(HttpClients.custom().build().execute(httpUriRequest).getEntity())).isEqualTo("put"); } }); } From 1cfbd13aaedfb1bd20dcd576570f6a0a65662200 Mon Sep 17 00:00:00 2001 From: 
"yihua.huang" Date: Sat, 8 Apr 2017 20:04:56 +0800 Subject: [PATCH 19/25] refactor in httpclientdownloader --- .../downloader/HttpClientDownloader.java | 50 ++----------------- .../downloader/HttpClientRequestContext.java | 33 ++++++++++++ .../downloader/HttpUriRequestConverter.java | 41 ++++++++++++++- 3 files changed, 77 insertions(+), 47 deletions(-) create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientRequestContext.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index b26bcf9..e14eff5 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -1,28 +1,10 @@ package us.codecraft.webmagic.downloader; -import org.apache.commons.collections.CollectionUtils; import org.apache.commons.io.IOUtils; -import org.apache.http.Header; -import org.apache.http.HttpHost; import org.apache.http.HttpResponse; import org.apache.http.annotation.ThreadSafe; -import org.apache.http.auth.AuthState; -import org.apache.http.auth.UsernamePasswordCredentials; -import org.apache.http.client.CookieStore; -import org.apache.http.client.config.CookieSpecs; -import org.apache.http.client.config.RequestConfig; -import org.apache.http.client.entity.UrlEncodedFormEntity; import org.apache.http.client.methods.CloseableHttpResponse; -import org.apache.http.client.methods.HttpUriRequest; -import org.apache.http.client.methods.RequestBuilder; -import org.apache.http.client.protocol.HttpClientContext; -import org.apache.http.cookie.Cookie; -import org.apache.http.impl.auth.BasicScheme; -import org.apache.http.impl.client.BasicCookieStore; import org.apache.http.impl.client.CloseableHttpClient; -import org.apache.http.message.BasicNameValuePair; -import
org.apache.http.protocol.BasicHttpContext; -import org.apache.http.protocol.HttpContext; import org.apache.http.util.EntityUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -30,16 +12,15 @@ import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Task; -import us.codecraft.webmagic.proxy.Proxy; import us.codecraft.webmagic.proxy.ProxyProvider; import us.codecraft.webmagic.selector.PlainText; import us.codecraft.webmagic.utils.CharsetUtils; import us.codecraft.webmagic.utils.HttpClientUtils; -import us.codecraft.webmagic.utils.HttpConstant; import java.io.IOException; import java.nio.charset.Charset; -import java.util.*; +import java.util.HashMap; +import java.util.Map; /** @@ -96,33 +77,12 @@ public class HttpClientDownloader extends AbstractDownloader { } logger.debug("downloading page {}", request.getUrl()); CloseableHttpResponse httpResponse = null; - int statusCode = 0; Site site = task.getSite(); - Proxy proxy = null; - HttpClientContext httpContext = new HttpClientContext(); - if (proxyProvider != null) { - proxy = proxyProvider.getProxy(task); - AuthState authState = new AuthState(); - authState.update(new BasicScheme(), new UsernamePasswordCredentials(proxy.getUsername(), proxy.getPassword())); - httpContext.setAttribute(HttpClientContext.PROXY_AUTH_STATE, authState); - } CloseableHttpClient httpClient = getHttpClient(site); - HttpUriRequest httpUriRequest = httpUriRequestConverter.convert(request, site, proxy); - if (request.getCookies() != null && CollectionUtils.isNotEmpty(request.getCookies())) { - CookieStore cookieStore = new BasicCookieStore(); - for (Cookie c : request.getCookies()) { - cookieStore.addCookie(c); - } - httpContext.setCookieStore(cookieStore); - } - if (request.getHeaders() != null && CollectionUtils.isNotEmpty(request.getHeaders())) { - for (Header h : request.getHeaders()) { - httpUriRequest.setHeader(h); - } - } + 
HttpClientRequestContext requestContext = httpUriRequestConverter.convert(request, site, proxyProvider != null ? proxyProvider.getProxy(task) : null); try { - httpResponse = httpClient.execute(httpUriRequest, httpContext); - statusCode = httpResponse.getStatusLine().getStatusCode(); + httpResponse = httpClient.execute(requestContext.getHttpUriRequest(), requestContext.getHttpClientContext()); + int statusCode = httpResponse.getStatusLine().getStatusCode(); if (site.getAcceptStatCode().contains(statusCode)) { Page page = handleResponse(request, site.getCharset(), httpResponse, task); onSuccess(request); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientRequestContext.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientRequestContext.java new file mode 100644 index 0000000..b0afc65 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientRequestContext.java @@ -0,0 +1,33 @@ +package us.codecraft.webmagic.downloader; + +import org.apache.http.client.methods.HttpUriRequest; +import org.apache.http.client.protocol.HttpClientContext; + +/** + * @author code4crafter@gmail.com + * Date: 17/4/8 + * Time: 19:43 + */ +public class HttpClientRequestContext { + + private HttpUriRequest httpUriRequest; + + private HttpClientContext httpClientContext; + + public HttpUriRequest getHttpUriRequest() { + return httpUriRequest; + } + + public void setHttpUriRequest(HttpUriRequest httpUriRequest) { + this.httpUriRequest = httpUriRequest; + } + + public HttpClientContext getHttpClientContext() { + return httpClientContext; + } + + public void setHttpClientContext(HttpClientContext httpClientContext) { + this.httpClientContext = httpClientContext; + } + +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java index 22aa31d..acf1a7c 100644 --- 
a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java @@ -1,11 +1,20 @@ package us.codecraft.webmagic.downloader; +import org.apache.commons.collections.CollectionUtils; +import org.apache.http.Header; import org.apache.http.HttpHost; +import org.apache.http.auth.AuthState; +import org.apache.http.auth.UsernamePasswordCredentials; +import org.apache.http.client.CookieStore; import org.apache.http.client.config.CookieSpecs; import org.apache.http.client.config.RequestConfig; import org.apache.http.client.methods.HttpUriRequest; import org.apache.http.client.methods.RequestBuilder; +import org.apache.http.client.protocol.HttpClientContext; +import org.apache.http.cookie.Cookie; import org.apache.http.entity.ByteArrayEntity; +import org.apache.http.impl.auth.BasicScheme; +import org.apache.http.impl.client.BasicCookieStore; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.proxy.Proxy; @@ -20,7 +29,29 @@ import java.util.Map; */ public class HttpUriRequestConverter { - public HttpUriRequest convert(Request request, Site site, Proxy proxy) { + public HttpClientRequestContext convert(Request request, Site site, Proxy proxy) { + HttpClientRequestContext httpClientRequestContext = new HttpClientRequestContext(); + httpClientRequestContext.setHttpUriRequest(convertHttpUriRequest(request, site, proxy)); + httpClientRequestContext.setHttpClientContext(convertHttpClientContext(request, site, proxy)); + return httpClientRequestContext; + } + + private HttpClientContext convertHttpClientContext(Request request, Site site, Proxy proxy) { + HttpClientContext httpContext = new HttpClientContext(); + AuthState authState = new AuthState(); + authState.update(new BasicScheme(), new UsernamePasswordCredentials(proxy.getUsername(), proxy.getPassword())); + 
httpContext.setAttribute(HttpClientContext.PROXY_AUTH_STATE, authState); + if (request.getCookies() != null && CollectionUtils.isNotEmpty(request.getCookies())) { + CookieStore cookieStore = new BasicCookieStore(); + for (Cookie c : request.getCookies()) { + cookieStore.addCookie(c); + } + httpContext.setCookieStore(cookieStore); + } + return httpContext; + } + + private HttpUriRequest convertHttpUriRequest(Request request, Site site, Proxy proxy) { RequestBuilder requestBuilder = selectRequestMethod(request).setUri(request.getUrl()); if (site.getHeaders() != null) { for (Map.Entry headerEntry : site.getHeaders().entrySet()) { @@ -40,7 +71,13 @@ public class HttpUriRequestConverter { requestConfigBuilder.setProxy(new HttpHost(proxy.getHost(), proxy.getPort())); } requestBuilder.setConfig(requestConfigBuilder.build()); - return requestBuilder.build(); + HttpUriRequest httpUriRequest = requestBuilder.build(); + if (request.getHeaders() != null && CollectionUtils.isNotEmpty(request.getHeaders())) { + for (Header h : request.getHeaders()) { + httpUriRequest.setHeader(h); + } + } + return httpUriRequest; } private RequestBuilder selectRequestMethod(Request request) { From b06a248c00a8e23db5e87db470d0c82363c89523 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 8 Apr 2017 20:06:04 +0800 Subject: [PATCH 20/25] fix test --- .../downloader/HttpClientDownloaderTest.java | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java index cac84f3..79bbb7b 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java @@ -96,7 +96,7 @@ public class HttpClientDownloaderTest { Request requestGBK = new Request(url); CloseableHttpResponse 
httpResponse = null; try { - httpResponse = httpClient.execute(new HttpUriRequestConverter().convert(requestGBK, site, null)); + httpResponse = httpClient.execute(new HttpUriRequestConverter().convert(requestGBK, site, null).getHttpUriRequest()); } catch (IOException e) { e.printStackTrace(); } @@ -131,24 +131,24 @@ public class HttpClientDownloaderTest { request.setMethod(HttpConstant.Method.GET); Map params = new HashedMap(); params.put("q","webmagic"); - HttpUriRequest httpUriRequest = httpUriRequestConverter.convert(request,site,null); + HttpUriRequest httpUriRequest = httpUriRequestConverter.convert(request,site,null).getHttpUriRequest(); assertThat(EntityUtils.toString(HttpClients.custom().build().execute(httpUriRequest).getEntity())).isEqualTo("get"); request.setMethod(HttpConstant.Method.DELETE); - httpUriRequest = httpUriRequestConverter.convert(request, site, null); + httpUriRequest = httpUriRequestConverter.convert(request, site, null).getHttpUriRequest(); assertThat(EntityUtils.toString(HttpClients.custom().build().execute(httpUriRequest).getEntity())).isEqualTo("delete"); request.setMethod(HttpConstant.Method.HEAD); - httpUriRequest = httpUriRequestConverter.convert(request, site, null); + httpUriRequest = httpUriRequestConverter.convert(request, site, null).getHttpUriRequest(); assertThat(HttpClients.custom().build().execute(httpUriRequest).getFirstHeader("method").getValue()).isEqualTo("head"); request.setMethod(HttpConstant.Method.TRACE); - httpUriRequest = httpUriRequestConverter.convert(request, site, null); + httpUriRequest = httpUriRequestConverter.convert(request, site, null).getHttpUriRequest(); assertThat(EntityUtils.toString(HttpClients.custom().build().execute(httpUriRequest).getEntity())).isEqualTo("trace"); request.setUrl("http://127.0.0.1:12306/search"); request.setMethod(HttpConstant.Method.POST); request.setRequestBody(HttpRequestBody.form(params, "utf-8")); - httpUriRequest = httpUriRequestConverter.convert(request, site, null); + 
httpUriRequest = httpUriRequestConverter.convert(request, site, null).getHttpUriRequest(); assertThat(EntityUtils.toString(HttpClients.custom().build().execute(httpUriRequest).getEntity())).isEqualTo("post"); request.setMethod(HttpConstant.Method.PUT); - httpUriRequest = httpUriRequestConverter.convert(request, site, null); + httpUriRequest = httpUriRequestConverter.convert(request, site, null).getHttpUriRequest(); assertThat(EntityUtils.toString(HttpClients.custom().build().execute(httpUriRequest).getEntity())).isEqualTo("put"); } }); From 2622b448b8ff2eca6823f118e31c2534d27a5f4a Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 8 Apr 2017 20:09:43 +0800 Subject: [PATCH 21/25] fix test --- .../java/us/codecraft/webmagic/Request.java | 14 +++++++++++--- .../downloader/HttpUriRequestConverter.java | 17 +++++------------ 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java index c4b9426..67be957 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java @@ -123,12 +123,20 @@ public class Request implements Serializable { public int hashCode() { int result = url != null ? url.hashCode() : 0; result = 31 * result + (method != null ? method.hashCode() : 0); - result = 31 * result + (headers != null ? headers.hashCode() : 0); - result = 31 * result + (cookies != null ? cookies.hashCode() : 0); - return result; } + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + Request request = (Request) o; + + if (url != null ? !url.equals(request.url) : request.url != null) return false; + return method != null ? 
method.equals(request.method) : request.method == null; + } + public List getCookies() { return cookies; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java index acf1a7c..42119a3 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java @@ -38,9 +38,11 @@ public class HttpUriRequestConverter { private HttpClientContext convertHttpClientContext(Request request, Site site, Proxy proxy) { HttpClientContext httpContext = new HttpClientContext(); - AuthState authState = new AuthState(); - authState.update(new BasicScheme(), new UsernamePasswordCredentials(proxy.getUsername(), proxy.getPassword())); - httpContext.setAttribute(HttpClientContext.PROXY_AUTH_STATE, authState); + if (proxy != null) { + AuthState authState = new AuthState(); + authState.update(new BasicScheme(), new UsernamePasswordCredentials(proxy.getUsername(), proxy.getPassword())); + httpContext.setAttribute(HttpClientContext.PROXY_AUTH_STATE, authState); + } if (request.getCookies() != null && CollectionUtils.isNotEmpty(request.getCookies())) { CookieStore cookieStore = new BasicCookieStore(); for (Cookie c : request.getCookies()) { @@ -108,13 +110,4 @@ public class HttpUriRequestConverter { return requestBuilder; } - private RequestBuilder addQueryParams(RequestBuilder requestBuilder, Map params) { - if (params != null) { - for (Map.Entry entry : params.entrySet()) { - requestBuilder.addParameter(entry.getKey(), entry.getValue()); - } - } - return requestBuilder; - } - } From abd020b45be4bd04d5efe82537a7a40ff3bc66e8 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 8 Apr 2017 20:16:17 +0800 Subject: [PATCH 22/25] some comments --- .../codecraft/webmagic/downloader/HttpClientDownloader.java | 4 ++-- 
.../webmagic/downloader/HttpClientRequestContext.java | 1 + .../webmagic/downloader/HttpUriRequestConverter.java | 4 +++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index e14eff5..284702d 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -112,7 +112,7 @@ public class HttpClientDownloader extends AbstractDownloader { } protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException { - String content = getContent(charset, httpResponse); + String content = getResponseContent(charset, httpResponse); Page page = new Page(); page.setRawText(content); page.setUrl(new PlainText(request.getUrl())); @@ -124,7 +124,7 @@ public class HttpClientDownloader extends AbstractDownloader { return page; } - private String getContent(String charset, HttpResponse httpResponse) throws IOException { + private String getResponseContent(String charset, HttpResponse httpResponse) throws IOException { if (charset == null) { byte[] contentBytes = IOUtils.toByteArray(httpResponse.getEntity().getContent()); String htmlCharset = getHtmlCharset(httpResponse, contentBytes); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientRequestContext.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientRequestContext.java index b0afc65..74e6d25 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientRequestContext.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientRequestContext.java @@ -7,6 +7,7 @@ import org.apache.http.client.protocol.HttpClientContext; * @author code4crafter@gmail.com * Date: 17/4/8 * 
Time: 19:43 + * @since 0.7.0 */ public class HttpClientRequestContext { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java index 42119a3..70554ea 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java @@ -25,7 +25,9 @@ import java.util.Map; /** * @author code4crafter@gmail.com * Date: 17/3/18 - * Time: 上午11:28 + * Time: 11:28 + * + * @since 0.7.0 */ public class HttpUriRequestConverter { From db67db8103b05506d131c74ce0036a5c875fb92a Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 8 Apr 2017 22:06:18 +0800 Subject: [PATCH 23/25] #523 remove fixAllRelativeHrefs by default, get absolute urls for links() --- .../main/java/us/codecraft/webmagic/Page.java | 9 ++-- .../us/codecraft/webmagic/selector/Html.java | 10 ++++ .../codecraft/webmagic/selector/HtmlNode.java | 2 +- .../webmagic/selector/LinksSelector.java | 51 +++++++++++++++++++ .../java/us/codecraft/webmagic/HtmlTest.java | 10 ++++ .../webmagic/selector/LinksSelectorTest.java | 21 ++++++++ 6 files changed, 96 insertions(+), 7 deletions(-) create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/selector/LinksSelector.java create mode 100644 webmagic-core/src/test/java/us/codecraft/webmagic/selector/LinksSelectorTest.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java index d24ceba..7dd48f8 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java @@ -1,16 +1,13 @@ package us.codecraft.webmagic; -import java.util.ArrayList; -import java.util.List; - import org.apache.commons.lang3.StringUtils; -import org.apache.http.Header; - import 
us.codecraft.webmagic.selector.Html; import us.codecraft.webmagic.selector.Json; import us.codecraft.webmagic.selector.Selectable; import us.codecraft.webmagic.utils.UrlUtils; +import java.util.ArrayList; +import java.util.List; import java.util.Map; /** @@ -76,7 +73,7 @@ public class Page { */ public Html getHtml() { if (html == null) { - html = new Html(UrlUtils.fixAllRelativeHrefs(rawText, request.getUrl())); + html = new Html(rawText, request.getUrl()); } return html; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java index d80e8b4..7b22639 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java @@ -44,6 +44,16 @@ public class Html extends HtmlNode { */ private Document document; + public Html(String text, String url) { + try { + disableJsoupHtmlEntityEscape(); + this.document = Jsoup.parse(text, url); + } catch (Exception e) { + this.document = null; + logger.warn("parse document error ", e); + } + } + public Html(String text) { try { disableJsoupHtmlEntityEscape(); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java index 030522f..89de5a6 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java @@ -34,7 +34,7 @@ public class HtmlNode extends AbstractSelectable { @Override public Selectable links() { - return xpath("//a/@href"); + return selectElements(new LinksSelector()); } @Override diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/LinksSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/LinksSelector.java new file mode 100644 index 0000000..5296a74 --- /dev/null +++ 
b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/LinksSelector.java @@ -0,0 +1,51 @@ +package us.codecraft.webmagic.selector; + +import org.jsoup.helper.StringUtil; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; + +import java.util.ArrayList; +import java.util.List; + +/** + * Links selector based on jsoup. Use absolute url.
+ * + * @author code4crafter@gmail.com
+ * @since 0.7.0 + */ +public class LinksSelector extends BaseElementSelector { + + @Override + public String select(Element element) { + throw new UnsupportedOperationException(); + } + + @Override + public List selectList(Element element) { + Elements elements = element.select("a"); + List links = new ArrayList(elements.size()); + for (Element element0 : elements) { + if (!StringUtil.isBlank(element0.baseUri())) { + links.add(element0.attr("abs:href")); + } else { + links.add(element0.attr("href")); + } + } + return links; + } + + @Override + public Element selectElement(Element element) { + throw new UnsupportedOperationException(); + } + + @Override + public List selectElements(Element element) { + throw new UnsupportedOperationException(); + } + + @Override + public boolean hasAttribute() { + return true; + } +} diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/HtmlTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/HtmlTest.java index 6cf5382..faf249f 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/HtmlTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/HtmlTest.java @@ -48,4 +48,14 @@ public class HtmlTest { Selectable selectable = html.xpath("//a[1]").nodes().get(0); assertThat(selectable.xpath("/a/@href").get()).isEqualTo("/xx/xx"); } + + @Test + public void testGetHrefsByJsoup(){ + Html html = new Html("issues","https://github.com/code4craft/webmagic/"); + assertThat(html.xpath("//a[1]/@abs:href").get()).isEqualTo("https://github.com/code4craft/webmagic/issues"); + assertThat(html.xpath("//img/@abs:src").get()).isEqualTo("https://github.com/code4craft/webmagic/webmagic.jpg"); + html = new Html("issues"); + assertThat(html.xpath("//a[1]/@abs:href").get()).isEqualTo("https://github.com/code4craft/webmagic/issues"); + assertThat(html.xpath("//img/@abs:src").get()).isEqualTo("https://github.com/code4craft/webmagic/webmagic.jpg"); + } } diff --git 
a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/LinksSelectorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/LinksSelectorTest.java new file mode 100644 index 0000000..3fcb71b --- /dev/null +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/LinksSelectorTest.java @@ -0,0 +1,21 @@ +package us.codecraft.webmagic.selector; + +import org.junit.Test; + +import java.util.List; + +/** + * @author code4crafter@gmail.com + * Date: 17/4/8 + * Time: 下午9:41 + */ +public class LinksSelectorTest { + + private String html = "
"; + + @Test + public void testLinks() throws Exception { + List links = new LinksSelector().selectList(html); + System.out.println(links); + } +} From a1ae632b62ffb63b36858b42116c35ed8246a284 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 8 Apr 2017 23:13:16 +0800 Subject: [PATCH 24/25] test for request cookies and headers --- .../java/us/codecraft/webmagic/Request.java | 24 ++++--- .../downloader/HttpUriRequestConverter.java | 19 ++--- .../downloader/HttpClientDownloaderTest.java | 71 ++++++++++++++++--- 3 files changed, 86 insertions(+), 28 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java index 67be957..d2ea247 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java @@ -1,14 +1,10 @@ package us.codecraft.webmagic; -import org.apache.http.Header; -import org.apache.http.cookie.Cookie; import us.codecraft.webmagic.model.HttpRequestBody; import us.codecraft.webmagic.utils.Experimental; import java.io.Serializable; -import java.util.ArrayList; import java.util.HashMap; -import java.util.List; import java.util.Map; /** @@ -38,9 +34,9 @@ public class Request implements Serializable { /** * cookies for current url, if not set use Site's cookies */ - private List cookies=new ArrayList(); - - private List
headers=new ArrayList
(); + private Map cookies = new HashMap(); + + private Map headers = new HashMap(); /** * Priority of the request.
@@ -137,11 +133,21 @@ public class Request implements Serializable { return method != null ? method.equals(request.method) : request.method == null; } - public List getCookies() { + public Request addCookie(String name, String value) { + cookies.put(name, value); + return this; + } + + public Request addHeader(String name, String value) { + headers.put(name, value); + return this; + } + + public Map getCookies() { return cookies; } - public List
getHeaders() { + public Map getHeaders() { return headers; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java index 70554ea..8ca0bf9 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java @@ -1,7 +1,5 @@ package us.codecraft.webmagic.downloader; -import org.apache.commons.collections.CollectionUtils; -import org.apache.http.Header; import org.apache.http.HttpHost; import org.apache.http.auth.AuthState; import org.apache.http.auth.UsernamePasswordCredentials; @@ -11,14 +9,15 @@ import org.apache.http.client.config.RequestConfig; import org.apache.http.client.methods.HttpUriRequest; import org.apache.http.client.methods.RequestBuilder; import org.apache.http.client.protocol.HttpClientContext; -import org.apache.http.cookie.Cookie; import org.apache.http.entity.ByteArrayEntity; import org.apache.http.impl.auth.BasicScheme; import org.apache.http.impl.client.BasicCookieStore; +import org.apache.http.impl.cookie.BasicClientCookie; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.proxy.Proxy; import us.codecraft.webmagic.utils.HttpConstant; +import us.codecraft.webmagic.utils.UrlUtils; import java.util.Map; @@ -45,10 +44,12 @@ public class HttpUriRequestConverter { authState.update(new BasicScheme(), new UsernamePasswordCredentials(proxy.getUsername(), proxy.getPassword())); httpContext.setAttribute(HttpClientContext.PROXY_AUTH_STATE, authState); } - if (request.getCookies() != null && CollectionUtils.isNotEmpty(request.getCookies())) { + if (request.getCookies() != null && !request.getCookies().isEmpty()) { CookieStore cookieStore = new BasicCookieStore(); - for (Cookie c : request.getCookies()) { - cookieStore.addCookie(c); + for 
(Map.Entry cookieEntry : request.getCookies().entrySet()) { + BasicClientCookie cookie1 = new BasicClientCookie(cookieEntry.getKey(), cookieEntry.getValue()); + cookie1.setDomain(UrlUtils.removePort(UrlUtils.getDomain(request.getUrl()))); + cookieStore.addCookie(cookie1); } httpContext.setCookieStore(cookieStore); } @@ -76,9 +77,9 @@ public class HttpUriRequestConverter { } requestBuilder.setConfig(requestConfigBuilder.build()); HttpUriRequest httpUriRequest = requestBuilder.build(); - if (request.getHeaders() != null && CollectionUtils.isNotEmpty(request.getHeaders())) { - for (Header h : request.getHeaders()) { - httpUriRequest.setHeader(h); + if (request.getHeaders() != null && !request.getHeaders().isEmpty()) { + for (Map.Entry header : request.getHeaders().entrySet()) { + httpUriRequest.addHeader(header.getKey(), header.getValue()); } } return httpUriRequest; diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java index 79bbb7b..9c93915 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java @@ -34,7 +34,7 @@ import static org.junit.Assert.assertTrue; */ public class HttpClientDownloaderTest { - public static final String PAGE_ALWAYS_NOT_EXISTS = "http://localhost:13421/404"; + public static final String PAGE_ALWAYS_NOT_EXISTS = "http://localhost:13423/404"; @Test public void testDownloader() { @@ -63,7 +63,7 @@ public class HttpClientDownloaderTest { @Test public void testGetHtmlCharset() throws Exception { - HttpServer server = httpserver(12306); + HttpServer server = httpserver(13423); server.get(by(uri("/header"))).response(header("Content-Type", "text/html; charset=gbk")); server.get(by(uri("/meta4"))).response(with(text("\n" + " \n" + @@ -80,11 +80,11 @@ public class 
HttpClientDownloaderTest { Runner.running(server, new Runnable() { @Override public void run() { - String charset = getCharsetByUrl("http://127.0.0.1:12306/header"); + String charset = getCharsetByUrl("http://127.0.0.1:13423/header"); assertEquals(charset, "gbk"); - charset = getCharsetByUrl("http://127.0.0.1:12306/meta4"); + charset = getCharsetByUrl("http://127.0.0.1:13423/meta4"); assertEquals(charset, "gbk"); - charset = getCharsetByUrl("http://127.0.0.1:12306/meta5"); + charset = getCharsetByUrl("http://127.0.0.1:13423/meta5"); assertEquals(charset, "gbk"); } @@ -114,7 +114,7 @@ public class HttpClientDownloaderTest { @Test public void test_selectRequestMethod() throws Exception { - HttpServer server = httpserver(12306); + HttpServer server = httpserver(13423); server.get(eq(query("q"), "webmagic")).response("get"); server.post(eq(form("q"), "webmagic")).response("post"); server.put(eq(form("q"), "webmagic")).response("put"); @@ -127,7 +127,7 @@ public class HttpClientDownloaderTest { @Override public void run() throws Exception { Request request = new Request(); - request.setUrl("http://127.0.0.1:12306/search?q=webmagic"); + request.setUrl("http://127.0.0.1:13423/search?q=webmagic"); request.setMethod(HttpConstant.Method.GET); Map params = new HashedMap(); params.put("q","webmagic"); @@ -142,7 +142,7 @@ public class HttpClientDownloaderTest { request.setMethod(HttpConstant.Method.TRACE); httpUriRequest = httpUriRequestConverter.convert(request, site, null).getHttpUriRequest(); assertThat(EntityUtils.toString(HttpClients.custom().build().execute(httpUriRequest).getEntity())).isEqualTo("trace"); - request.setUrl("http://127.0.0.1:12306/search"); + request.setUrl("http://127.0.0.1:13423/search"); request.setMethod(HttpConstant.Method.POST); request.setRequestBody(HttpRequestBody.form(params, "utf-8")); httpUriRequest = httpUriRequestConverter.convert(request, site, null).getHttpUriRequest(); @@ -154,16 +154,67 @@ public class HttpClientDownloaderTest { }); } + 
@Test + public void test_set_request_cookie() throws Exception { + HttpServer server = httpserver(13423); + server.get(eq(cookie("cookie"), "cookie-webmagic")).response("ok"); + Runner.running(server, new Runnable() { + @Override + public void run() throws Exception { + HttpClientDownloader httpClientDownloader = new HttpClientDownloader(); + Request request = new Request(); + request.setUrl("http://127.0.0.1:13423"); + request.addCookie("cookie","cookie-webmagic"); + Page page = httpClientDownloader.download(request, Site.me().toTask()); + assertThat(page.getRawText()).isEqualTo("ok"); + } + }); + } + + @Test + public void test_set_request_header() throws Exception { + HttpServer server = httpserver(13423); + server.get(eq(header("header"), "header-webmagic")).response("ok"); + Runner.running(server, new Runnable() { + @Override + public void run() throws Exception { + HttpClientDownloader httpClientDownloader = new HttpClientDownloader(); + Request request = new Request(); + request.setUrl("http://127.0.0.1:13423"); + request.addHeader("header","header-webmagic"); + Page page = httpClientDownloader.download(request, Site.me().toTask()); + assertThat(page.getRawText()).isEqualTo("ok"); + } + }); + } + + @Test + public void test_set_site_cookie() throws Exception { + HttpServer server = httpserver(13423); + server.get(eq(cookie("cookie"), "cookie-webmagic")).response("ok"); + Runner.running(server, new Runnable() { + @Override + public void run() throws Exception { + HttpClientDownloader httpClientDownloader = new HttpClientDownloader(); + Request request = new Request(); + request.setUrl("http://127.0.0.1:13423"); + Site site = Site.me().addCookie("cookie", "cookie-webmagic").setDomain("127.0.0.1"); + Page page = httpClientDownloader.download(request, site.toTask()); + assertThat(page.getRawText()).isEqualTo("ok"); + } + }); + } + @Test public void test_download_when_task_is_null() throws Exception { - HttpServer server = httpserver(12306); + HttpServer server = 
httpserver(13423); server.response("foo"); Runner.running(server, new Runnable() { @Override public void run() throws Exception { final HttpClientDownloader httpClientDownloader = new HttpClientDownloader(); Request request = new Request(); - request.setUrl("http://127.0.0.1:12306/"); + request.setUrl("http://127.0.0.1:13423/"); Page page = httpClientDownloader.download(request, Site.me().toTask()); assertThat(page.getRawText()).isEqualTo("foo"); } From 0f4d6e8b1201b047204bf5c2a0ddacae94d800dc Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 8 Apr 2017 23:17:00 +0800 Subject: [PATCH 25/25] #525 remove port in UrlUtils.getDomain() --- .../src/main/java/us/codecraft/webmagic/utils/UrlUtils.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java index ed7ae8c..72a9d3f 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java @@ -80,7 +80,7 @@ public class UrlUtils { if (i > 0) { domain = StringUtils.substring(domain, 0, i); } - return domain; + return removePort(domain); } public static String removePort(String domain) {