Downloader 提供刷新组件的api,方便在spider中操作
parent
19465089c3
commit
2e2a0fdf3e
|
@ -1,5 +1,6 @@
|
|||
package us.codecraft.webmagic;
|
||||
|
||||
import com.sun.org.apache.regexp.internal.RE;
|
||||
import us.codecraft.webmagic.utils.HttpConstant;
|
||||
|
||||
import java.util.*;
|
||||
|
@ -35,8 +36,12 @@ public class Site {
|
|||
|
||||
private static final Set<Integer> DEFAULT_STATUS_CODE_SET = new HashSet<Integer>();
|
||||
|
||||
private static final Set<Integer> DEFAULT_REFRESH_CODE_SET = new HashSet<>();
|
||||
|
||||
private Set<Integer> refreshCode = DEFAULT_REFRESH_CODE_SET;
|
||||
private Set<Integer> acceptStatCode = DEFAULT_STATUS_CODE_SET;
|
||||
|
||||
|
||||
private Map<String, String> headers = new HashMap<String, String>();
|
||||
|
||||
private boolean useGzip = true;
|
||||
|
@ -44,6 +49,7 @@ public class Site {
|
|||
private boolean disableCookieManagement = false;
|
||||
|
||||
// Seeds the shared default sets that new Site instances start with.
static {
    // Default value behind acceptStatCode: HTTP 200 only.
    DEFAULT_STATUS_CODE_SET.add(HttpConstant.StatusCode.CODE_200);
    // Default value behind refreshCode: HTTP 403 only.
    DEFAULT_REFRESH_CODE_SET.add(HttpConstant.StatusCode.FORBIDDEN);
}
|
||||
|
||||
|
@ -192,6 +198,15 @@ public class Site {
|
|||
return this;
|
||||
}
|
||||
|
||||
public Site setRefreshCode(Set<Integer> refreshCode){
|
||||
this.refreshCode = refreshCode;
|
||||
return this;
|
||||
}
|
||||
public Set<Integer> getRefreshCode(){
|
||||
return refreshCode;
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* get acceptStatCode
|
||||
*
|
||||
|
|
|
@ -419,7 +419,10 @@ public class Spider implements Runnable, Task {
|
|||
pipeline.process(page.getResultItems(), this);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
} else if(site.getRefreshCode().contains(page.getStatusCode())) {
|
||||
logger.info("page status code error, page {} , code: {}, start refresh downloader", request.getUrl(), page.getStatusCode());
|
||||
downloader.refreshComponent(this);
|
||||
}else {
|
||||
logger.info("page status code error, page {} , code: {}", request.getUrl(), page.getStatusCode());
|
||||
}
|
||||
sleep(site.getSleepTime());
|
||||
|
|
|
@ -21,11 +21,15 @@ public interface Downloader {
|
|||
* @param task task
|
||||
* @return page
|
||||
*/
|
||||
public Page download(Request request, Task task);
|
||||
Page download(Request request, Task task);
|
||||
|
||||
/**
|
||||
* Tell the downloader how many threads the spider used.
|
||||
*
|
||||
* @param threadNum number of threads
|
||||
*/
|
||||
public void setThread(int threadNum);
|
||||
void setThread(int threadNum);
|
||||
|
||||
|
||||
void refreshComponent(Task task);
|
||||
}
|
||||
|
|
|
@ -111,6 +111,17 @@ public class HttpClientDownloader extends AbstractDownloader {
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void refreshComponent(Task task) {
|
||||
if (proxyProvider != null ) {
|
||||
proxyProvider.refreshProxy(task);
|
||||
}
|
||||
|
||||
httpClients.remove(task.getSite().getDomain());
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setThread(int thread) {
|
||||
httpClientGenerator.setPoolSize(thread);
|
||||
|
|
|
@ -1,13 +1,17 @@
|
|||
package us.codecraft.webmagic.downloader;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.security.KeyManagementException;
|
||||
import java.security.KeyStore;
|
||||
import java.security.KeyStoreException;
|
||||
import java.security.NoSuchAlgorithmException;
|
||||
import java.security.cert.CertificateException;
|
||||
import java.security.cert.X509Certificate;
|
||||
import java.util.Map;
|
||||
|
||||
import javax.net.ssl.SSLContext;
|
||||
import javax.net.ssl.SSLContextSpi;
|
||||
import javax.net.ssl.TrustManager;
|
||||
import javax.net.ssl.X509TrustManager;
|
||||
|
||||
|
@ -24,6 +28,7 @@ import org.apache.http.conn.socket.ConnectionSocketFactory;
|
|||
import org.apache.http.conn.socket.PlainConnectionSocketFactory;
|
||||
import org.apache.http.conn.ssl.DefaultHostnameVerifier;
|
||||
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
|
||||
import org.apache.http.conn.ssl.TrustSelfSignedStrategy;
|
||||
import org.apache.http.impl.client.BasicCookieStore;
|
||||
import org.apache.http.impl.client.CloseableHttpClient;
|
||||
import org.apache.http.impl.client.DefaultHttpRequestRetryHandler;
|
||||
|
@ -32,6 +37,7 @@ import org.apache.http.impl.client.HttpClients;
|
|||
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
|
||||
import org.apache.http.impl.cookie.BasicClientCookie;
|
||||
import org.apache.http.protocol.HttpContext;
|
||||
import org.apache.http.ssl.SSLContexts;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
|
@ -69,7 +75,7 @@ public class HttpClientGenerator {
|
|||
return new SSLConnectionSocketFactory(sslContext, supportedProtocols,
|
||||
null,
|
||||
new DefaultHostnameVerifier()); // 优先绕过安全证书
|
||||
} catch (KeyManagementException e) {
|
||||
} catch (KeyManagementException | CertificateException | KeyStoreException | IOException e) {
|
||||
logger.error("ssl connection fail", e);
|
||||
} catch (NoSuchAlgorithmException e) {
|
||||
logger.error("ssl connection fail", e);
|
||||
|
@ -77,8 +83,8 @@ public class HttpClientGenerator {
|
|||
return SSLConnectionSocketFactory.getSocketFactory();
|
||||
}
|
||||
|
||||
private SSLContext createIgnoreVerifySSL() throws NoSuchAlgorithmException, KeyManagementException {
|
||||
// 实现一个X509TrustManager接口,用于绕过验证,不用修改里面的方法
|
||||
private SSLContext createIgnoreVerifySSL() throws NoSuchAlgorithmException, KeyManagementException, CertificateException, KeyStoreException, IOException {
|
||||
// 实现一个X509TrustManager接口,用于绕过验证,不用修改里面的方法
|
||||
X509TrustManager trustManager = new X509TrustManager() {
|
||||
|
||||
@Override
|
||||
|
@ -96,7 +102,7 @@ public class HttpClientGenerator {
|
|||
|
||||
};
|
||||
|
||||
SSLContext sc = SSLContext.getInstance("TLS");
|
||||
SSLContext sc = SSLContext.getInstance("SSLv3");
|
||||
sc.init(null, new TrustManager[] { trustManager }, null);
|
||||
return sc;
|
||||
}
|
||||
|
|
|
@ -28,6 +28,7 @@ public abstract class HttpConstant {
|
|||
public static abstract class StatusCode {
|
||||
|
||||
public static final int CODE_200 = 200;
|
||||
public static final int FORBIDDEN = 403;
|
||||
|
||||
}
|
||||
|
||||
|
|
|
@ -57,6 +57,11 @@ public class SpiderTest {
|
|||
return Site.me().setSleepTime(0);
|
||||
}
|
||||
}).setDownloader(new Downloader() {
|
||||
@Override
|
||||
public void refreshComponent(Task task) {
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public Page download(Request request, Task task) {
|
||||
return new Page().setRawText("");
|
||||
|
|
|
@ -28,6 +28,11 @@ public class MockGithubDownloader implements Downloader {
|
|||
return page;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void refreshComponent(Task task) {
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setThread(int threadNum) {
|
||||
}
|
||||
|
|
|
@ -43,6 +43,11 @@ public class PhantomJSDownloader extends AbstractDownloader {
|
|||
PhantomJSDownloader.phantomJsCommand = phantomJsCommand;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void refreshComponent(Task task) {
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
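* New constructor that supports a custom crawl.js path, because when another project depends on this jar, the phantomjs command run via runtime.exec() cannot use the crawl.js packaged inside the jar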
|
||||
* <pre>
|
||||
|
|
|
@ -9,6 +9,10 @@ import us.codecraft.webmagic.selector.PlainText;
|
|||
* @author code4crafter@gmail.com
|
||||
*/
|
||||
public class MockGithubDownloader implements Downloader{
|
||||
@Override
|
||||
public void refreshComponent(Task task) {
|
||||
|
||||
}
|
||||
|
||||
private String html = "\n" +
|
||||
"\n" +
|
||||
|
|
|
@ -59,6 +59,11 @@ public class SeleniumDownloader implements Downloader, Closeable {
|
|||
// "/Users/Bingo/Downloads/phantomjs-1.9.7-macosx/bin/phantomjs");
|
||||
}
|
||||
|
||||
@Override
|
||||
public void refreshComponent(Task task) {
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* set sleep time to wait until load success
|
||||
*
|
||||
|
|
Loading…
Reference in New Issue