Downloader 提供刷新组件的api,方便在spider中操作
parent
19465089c3
commit
2e2a0fdf3e
|
@ -1,5 +1,6 @@
|
||||||
package us.codecraft.webmagic;
|
package us.codecraft.webmagic;
|
||||||
|
|
||||||
|
import com.sun.org.apache.regexp.internal.RE;
|
||||||
import us.codecraft.webmagic.utils.HttpConstant;
|
import us.codecraft.webmagic.utils.HttpConstant;
|
||||||
|
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
@ -35,8 +36,12 @@ public class Site {
|
||||||
|
|
||||||
private static final Set<Integer> DEFAULT_STATUS_CODE_SET = new HashSet<Integer>();
|
private static final Set<Integer> DEFAULT_STATUS_CODE_SET = new HashSet<Integer>();
|
||||||
|
|
||||||
|
private static final Set<Integer> DEFAULT_REFRESH_CODE_SET = new HashSet<>();
|
||||||
|
|
||||||
|
private Set<Integer> refreshCode = DEFAULT_REFRESH_CODE_SET;
|
||||||
private Set<Integer> acceptStatCode = DEFAULT_STATUS_CODE_SET;
|
private Set<Integer> acceptStatCode = DEFAULT_STATUS_CODE_SET;
|
||||||
|
|
||||||
|
|
||||||
private Map<String, String> headers = new HashMap<String, String>();
|
private Map<String, String> headers = new HashMap<String, String>();
|
||||||
|
|
||||||
private boolean useGzip = true;
|
private boolean useGzip = true;
|
||||||
|
@ -44,6 +49,7 @@ public class Site {
|
||||||
private boolean disableCookieManagement = false;
|
private boolean disableCookieManagement = false;
|
||||||
|
|
||||||
static {
|
static {
|
||||||
|
DEFAULT_REFRESH_CODE_SET.add(HttpConstant.StatusCode.FORBIDDEN);
|
||||||
DEFAULT_STATUS_CODE_SET.add(HttpConstant.StatusCode.CODE_200);
|
DEFAULT_STATUS_CODE_SET.add(HttpConstant.StatusCode.CODE_200);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -192,6 +198,15 @@ public class Site {
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public Site setRefreshCode(Set<Integer> refreshCode){
|
||||||
|
this.refreshCode = refreshCode;
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
public Set<Integer> getRefreshCode(){
|
||||||
|
return refreshCode;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* get acceptStatCode
|
* get acceptStatCode
|
||||||
*
|
*
|
||||||
|
|
|
@ -419,7 +419,10 @@ public class Spider implements Runnable, Task {
|
||||||
pipeline.process(page.getResultItems(), this);
|
pipeline.process(page.getResultItems(), this);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else if(site.getRefreshCode().contains(page.getStatusCode())) {
|
||||||
|
logger.info("page status code error, page {} , code: {}, start refresh downloader", request.getUrl(), page.getStatusCode());
|
||||||
|
downloader.refreshComponent(this);
|
||||||
|
}else {
|
||||||
logger.info("page status code error, page {} , code: {}", request.getUrl(), page.getStatusCode());
|
logger.info("page status code error, page {} , code: {}", request.getUrl(), page.getStatusCode());
|
||||||
}
|
}
|
||||||
sleep(site.getSleepTime());
|
sleep(site.getSleepTime());
|
||||||
|
|
|
@ -18,14 +18,18 @@ public interface Downloader {
|
||||||
* Downloads web pages and stores them in a Page object.
|
* Downloads web pages and stores them in a Page object.
|
||||||
*
|
*
|
||||||
* @param request request
|
* @param request request
|
||||||
* @param task task
|
* @param task task
|
||||||
* @return page
|
* @return page
|
||||||
*/
|
*/
|
||||||
public Page download(Request request, Task task);
|
Page download(Request request, Task task);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Tell the downloader how many threads the spider uses.
|
* Tell the downloader how many threads the spider uses.
|
||||||
|
*
|
||||||
* @param threadNum number of threads
|
* @param threadNum number of threads
|
||||||
*/
|
*/
|
||||||
public void setThread(int threadNum);
|
void setThread(int threadNum);
|
||||||
|
|
||||||
|
|
||||||
|
void refreshComponent(Task task);
|
||||||
}
|
}
|
||||||
|
|
|
@ -111,6 +111,17 @@ public class HttpClientDownloader extends AbstractDownloader {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void refreshComponent(Task task) {
|
||||||
|
if (proxyProvider != null ) {
|
||||||
|
proxyProvider.refreshProxy(task);
|
||||||
|
}
|
||||||
|
|
||||||
|
httpClients.remove(task.getSite().getDomain());
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void setThread(int thread) {
|
public void setThread(int thread) {
|
||||||
httpClientGenerator.setPoolSize(thread);
|
httpClientGenerator.setPoolSize(thread);
|
||||||
|
|
|
@ -1,13 +1,17 @@
|
||||||
package us.codecraft.webmagic.downloader;
|
package us.codecraft.webmagic.downloader;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.security.KeyManagementException;
|
import java.security.KeyManagementException;
|
||||||
|
import java.security.KeyStore;
|
||||||
|
import java.security.KeyStoreException;
|
||||||
import java.security.NoSuchAlgorithmException;
|
import java.security.NoSuchAlgorithmException;
|
||||||
import java.security.cert.CertificateException;
|
import java.security.cert.CertificateException;
|
||||||
import java.security.cert.X509Certificate;
|
import java.security.cert.X509Certificate;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
import javax.net.ssl.SSLContext;
|
import javax.net.ssl.SSLContext;
|
||||||
|
import javax.net.ssl.SSLContextSpi;
|
||||||
import javax.net.ssl.TrustManager;
|
import javax.net.ssl.TrustManager;
|
||||||
import javax.net.ssl.X509TrustManager;
|
import javax.net.ssl.X509TrustManager;
|
||||||
|
|
||||||
|
@ -24,6 +28,7 @@ import org.apache.http.conn.socket.ConnectionSocketFactory;
|
||||||
import org.apache.http.conn.socket.PlainConnectionSocketFactory;
|
import org.apache.http.conn.socket.PlainConnectionSocketFactory;
|
||||||
import org.apache.http.conn.ssl.DefaultHostnameVerifier;
|
import org.apache.http.conn.ssl.DefaultHostnameVerifier;
|
||||||
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
|
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
|
||||||
|
import org.apache.http.conn.ssl.TrustSelfSignedStrategy;
|
||||||
import org.apache.http.impl.client.BasicCookieStore;
|
import org.apache.http.impl.client.BasicCookieStore;
|
||||||
import org.apache.http.impl.client.CloseableHttpClient;
|
import org.apache.http.impl.client.CloseableHttpClient;
|
||||||
import org.apache.http.impl.client.DefaultHttpRequestRetryHandler;
|
import org.apache.http.impl.client.DefaultHttpRequestRetryHandler;
|
||||||
|
@ -32,6 +37,7 @@ import org.apache.http.impl.client.HttpClients;
|
||||||
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
|
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
|
||||||
import org.apache.http.impl.cookie.BasicClientCookie;
|
import org.apache.http.impl.cookie.BasicClientCookie;
|
||||||
import org.apache.http.protocol.HttpContext;
|
import org.apache.http.protocol.HttpContext;
|
||||||
|
import org.apache.http.ssl.SSLContexts;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
@ -69,7 +75,7 @@ public class HttpClientGenerator {
|
||||||
return new SSLConnectionSocketFactory(sslContext, supportedProtocols,
|
return new SSLConnectionSocketFactory(sslContext, supportedProtocols,
|
||||||
null,
|
null,
|
||||||
new DefaultHostnameVerifier()); // 优先绕过安全证书
|
new DefaultHostnameVerifier()); // 优先绕过安全证书
|
||||||
} catch (KeyManagementException e) {
|
} catch (KeyManagementException | CertificateException | KeyStoreException | IOException e) {
|
||||||
logger.error("ssl connection fail", e);
|
logger.error("ssl connection fail", e);
|
||||||
} catch (NoSuchAlgorithmException e) {
|
} catch (NoSuchAlgorithmException e) {
|
||||||
logger.error("ssl connection fail", e);
|
logger.error("ssl connection fail", e);
|
||||||
|
@ -77,8 +83,8 @@ public class HttpClientGenerator {
|
||||||
return SSLConnectionSocketFactory.getSocketFactory();
|
return SSLConnectionSocketFactory.getSocketFactory();
|
||||||
}
|
}
|
||||||
|
|
||||||
private SSLContext createIgnoreVerifySSL() throws NoSuchAlgorithmException, KeyManagementException {
|
private SSLContext createIgnoreVerifySSL() throws NoSuchAlgorithmException, KeyManagementException, CertificateException, KeyStoreException, IOException {
|
||||||
// 实现一个X509TrustManager接口,用于绕过验证,不用修改里面的方法
|
// 实现一个X509TrustManager接口,用于绕过验证,不用修改里面的方法
|
||||||
X509TrustManager trustManager = new X509TrustManager() {
|
X509TrustManager trustManager = new X509TrustManager() {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -96,10 +102,10 @@ public class HttpClientGenerator {
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
SSLContext sc = SSLContext.getInstance("TLS");
|
SSLContext sc = SSLContext.getInstance("SSLv3");
|
||||||
sc.init(null, new TrustManager[] { trustManager }, null);
|
sc.init(null, new TrustManager[] { trustManager }, null);
|
||||||
return sc;
|
return sc;
|
||||||
}
|
}
|
||||||
|
|
||||||
public HttpClientGenerator setPoolSize(int poolSize) {
|
public HttpClientGenerator setPoolSize(int poolSize) {
|
||||||
connectionManager.setMaxTotal(poolSize);
|
connectionManager.setMaxTotal(poolSize);
|
||||||
|
|
|
@ -28,6 +28,7 @@ public abstract class HttpConstant {
|
||||||
public static abstract class StatusCode {
|
public static abstract class StatusCode {
|
||||||
|
|
||||||
public static final int CODE_200 = 200;
|
public static final int CODE_200 = 200;
|
||||||
|
public static final int FORBIDDEN = 403;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -57,6 +57,11 @@ public class SpiderTest {
|
||||||
return Site.me().setSleepTime(0);
|
return Site.me().setSleepTime(0);
|
||||||
}
|
}
|
||||||
}).setDownloader(new Downloader() {
|
}).setDownloader(new Downloader() {
|
||||||
|
@Override
|
||||||
|
public void refreshComponent(Task task) {
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Page download(Request request, Task task) {
|
public Page download(Request request, Task task) {
|
||||||
return new Page().setRawText("");
|
return new Page().setRawText("");
|
||||||
|
|
|
@ -28,6 +28,11 @@ public class MockGithubDownloader implements Downloader {
|
||||||
return page;
|
return page;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void refreshComponent(Task task) {
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void setThread(int threadNum) {
|
public void setThread(int threadNum) {
|
||||||
}
|
}
|
||||||
|
|
|
@ -42,7 +42,12 @@ public class PhantomJSDownloader extends AbstractDownloader {
|
||||||
this.initPhantomjsCrawlPath();
|
this.initPhantomjsCrawlPath();
|
||||||
PhantomJSDownloader.phantomJsCommand = phantomJsCommand;
|
PhantomJSDownloader.phantomJsCommand = phantomJsCommand;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void refreshComponent(Task task) {
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 新增构造函数,支持crawl.js路径自定义,因为当其他项目依赖此jar包时,runtime.exec()执行phantomjs命令时无法使用jar包中的crawl.js
|
* 新增构造函数,支持crawl.js路径自定义,因为当其他项目依赖此jar包时,runtime.exec()执行phantomjs命令时无法使用jar包中的crawl.js
|
||||||
* <pre>
|
* <pre>
|
||||||
|
|
|
@ -9,6 +9,10 @@ import us.codecraft.webmagic.selector.PlainText;
|
||||||
* @author code4crafter@gmail.com
|
* @author code4crafter@gmail.com
|
||||||
*/
|
*/
|
||||||
public class MockGithubDownloader implements Downloader{
|
public class MockGithubDownloader implements Downloader{
|
||||||
|
@Override
|
||||||
|
public void refreshComponent(Task task) {
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
private String html = "\n" +
|
private String html = "\n" +
|
||||||
"\n" +
|
"\n" +
|
||||||
|
|
|
@ -59,6 +59,11 @@ public class SeleniumDownloader implements Downloader, Closeable {
|
||||||
// "/Users/Bingo/Downloads/phantomjs-1.9.7-macosx/bin/phantomjs");
|
// "/Users/Bingo/Downloads/phantomjs-1.9.7-macosx/bin/phantomjs");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void refreshComponent(Task task) {
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* set sleep time to wait until load success
|
* set sleep time to wait until load success
|
||||||
*
|
*
|
||||||
|
|
Loading…
Reference in New Issue