Downloader 提供刷新组件的api,方便在spider中操作

master
yao 2020-12-21 18:08:55 +08:00
parent 19465089c3
commit 2e2a0fdf3e
11 changed files with 74 additions and 10 deletions

View File

@ -1,5 +1,6 @@
package us.codecraft.webmagic;
import com.sun.org.apache.regexp.internal.RE;
import us.codecraft.webmagic.utils.HttpConstant;
import java.util.*;
@ -35,8 +36,12 @@ public class Site {
private static final Set<Integer> DEFAULT_STATUS_CODE_SET = new HashSet<Integer>();
private static final Set<Integer> DEFAULT_REFRESH_CODE_SET = new HashSet<>();
private Set<Integer> refreshCode = DEFAULT_REFRESH_CODE_SET;
private Set<Integer> acceptStatCode = DEFAULT_STATUS_CODE_SET;
private Map<String, String> headers = new HashMap<String, String>();
private boolean useGzip = true;
@ -44,6 +49,7 @@ public class Site {
private boolean disableCookieManagement = false;
static {
    // HTTP 200 is the only status code accepted by default.
    DEFAULT_STATUS_CODE_SET.add(HttpConstant.StatusCode.CODE_200);
    // HTTP 403 is the only status code that triggers a downloader refresh by default.
    DEFAULT_REFRESH_CODE_SET.add(HttpConstant.StatusCode.FORBIDDEN);
}
@ -192,6 +198,15 @@ public class Site {
return this;
}
/**
 * Sets the HTTP status codes that make the spider ask its downloader to
 * refresh its components.
 * <p>
 * The given set is stored by reference, not copied.
 *
 * @param refreshCode status codes that trigger a refresh; {@code null} is
 *                    treated as "no status code triggers a refresh"
 * @return this
 */
public Site setRefreshCode(Set<Integer> refreshCode) {
    // Spider calls getRefreshCode().contains(...) without a null check,
    // so a null set must never be stored here.
    if (refreshCode == null) {
        this.refreshCode = new HashSet<Integer>();
    } else {
        this.refreshCode = refreshCode;
    }
    return this;
}
/**
 * Returns the HTTP status codes that trigger a downloader refresh.
 * <p>
 * The returned set is the live instance held by this site: either the one
 * passed to {@code setRefreshCode} or, if none was set, the default set
 * shared by all sites.
 *
 * @return the refresh status codes
 */
public Set<Integer> getRefreshCode() {
    return this.refreshCode;
}
/**
* get acceptStatCode
*

View File

@ -419,7 +419,10 @@ public class Spider implements Runnable, Task {
pipeline.process(page.getResultItems(), this);
}
}
} else {
} else if(site.getRefreshCode().contains(page.getStatusCode())) {
logger.info("page status code error, page {} , code: {}, start refresh downloader", request.getUrl(), page.getStatusCode());
downloader.refreshComponent(this);
}else {
logger.info("page status code error, page {} , code: {}", request.getUrl(), page.getStatusCode());
}
sleep(site.getSleepTime());

View File

@ -18,14 +18,18 @@ public interface Downloader {
* Downloads web pages and store in Page object.
*
* @param request request
* @param task task
* @param task task
* @return page
*/
public Page download(Request request, Task task);
Page download(Request request, Task task);
/**
* Tell the downloader how many threads the spider used.
*
* @param threadNum number of threads
*/
public void setThread(int threadNum);
void setThread(int threadNum);
/**
 * Asks the downloader to refresh its internal components for a task.
 * <p>
 * The spider calls this when a downloaded page has a status code that is
 * contained in the site's refresh codes.
 *
 * @param task the task whose downloader components should be refreshed
 */
void refreshComponent(Task task);
}

View File

@ -111,6 +111,17 @@ public class HttpClientDownloader extends AbstractDownloader {
}
}
/**
 * Refreshes the proxy (when a proxy provider is configured) and drops the
 * cached HTTP client registered under the task's site domain.
 * <p>
 * NOTE(review): {@code httpClients} is modified here without any locking
 * visible in this method; confirm how its other accessors guard it.
 * NOTE(review): the removed client is not closed here; verify whether that
 * is intentional.
 *
 * @param task the task whose proxy and cached client are refreshed
 */
@Override
public void refreshComponent(Task task) {
    // Refresh the proxy first; skipped entirely when no provider is set.
    if (null != proxyProvider) {
        proxyProvider.refreshProxy(task);
    }
    // Forget the client cached for this site's domain.
    httpClients.remove(task.getSite().getDomain());
}
@Override
public void setThread(int thread) {
httpClientGenerator.setPoolSize(thread);

View File

@ -1,13 +1,17 @@
package us.codecraft.webmagic.downloader;
import java.io.File;
import java.io.IOException;
import java.security.KeyManagementException;
import java.security.KeyStore;
import java.security.KeyStoreException;
import java.security.NoSuchAlgorithmException;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import java.util.Map;
import javax.net.ssl.SSLContext;
import javax.net.ssl.SSLContextSpi;
import javax.net.ssl.TrustManager;
import javax.net.ssl.X509TrustManager;
@ -24,6 +28,7 @@ import org.apache.http.conn.socket.ConnectionSocketFactory;
import org.apache.http.conn.socket.PlainConnectionSocketFactory;
import org.apache.http.conn.ssl.DefaultHostnameVerifier;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.conn.ssl.TrustSelfSignedStrategy;
import org.apache.http.impl.client.BasicCookieStore;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.DefaultHttpRequestRetryHandler;
@ -32,6 +37,7 @@ import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.impl.cookie.BasicClientCookie;
import org.apache.http.protocol.HttpContext;
import org.apache.http.ssl.SSLContexts;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -69,7 +75,7 @@ public class HttpClientGenerator {
return new SSLConnectionSocketFactory(sslContext, supportedProtocols,
null,
new DefaultHostnameVerifier()); // 优先绕过安全证书
} catch (KeyManagementException e) {
} catch (KeyManagementException | CertificateException | KeyStoreException | IOException e) {
logger.error("ssl connection fail", e);
} catch (NoSuchAlgorithmException e) {
logger.error("ssl connection fail", e);
@ -77,8 +83,8 @@ public class HttpClientGenerator {
return SSLConnectionSocketFactory.getSocketFactory();
}
private SSLContext createIgnoreVerifySSL() throws NoSuchAlgorithmException, KeyManagementException {
// 实现一个X509TrustManager接口用于绕过验证不用修改里面的方法
private SSLContext createIgnoreVerifySSL() throws NoSuchAlgorithmException, KeyManagementException, CertificateException, KeyStoreException, IOException {
// 实现一个X509TrustManager接口用于绕过验证不用修改里面的方法
X509TrustManager trustManager = new X509TrustManager() {
@Override
@ -96,10 +102,10 @@ public class HttpClientGenerator {
};
SSLContext sc = SSLContext.getInstance("TLS");
SSLContext sc = SSLContext.getInstance("SSLv3");
sc.init(null, new TrustManager[] { trustManager }, null);
return sc;
}
}
public HttpClientGenerator setPoolSize(int poolSize) {
connectionManager.setMaxTotal(poolSize);

View File

@ -28,6 +28,7 @@ public abstract class HttpConstant {
/**
 * HTTP status code constants.
 */
public abstract static class StatusCode {

    /** HTTP 200. */
    public static final int CODE_200 = 200;

    /** HTTP 403. */
    public static final int FORBIDDEN = 403;

}

View File

@ -57,6 +57,11 @@ public class SpiderTest {
return Site.me().setSleepTime(0);
}
}).setDownloader(new Downloader() {
/**
 * Empty implementation: this test downloader ignores refresh requests.
 *
 * @param task unused
 */
@Override
public void refreshComponent(Task task) {
    // no-op
}
@Override
public Page download(Request request, Task task) {
return new Page().setRawText("");

View File

@ -28,6 +28,11 @@ public class MockGithubDownloader implements Downloader {
return page;
}
/**
 * Empty implementation: this mock downloader ignores refresh requests.
 *
 * @param task unused
 */
@Override
public void refreshComponent(Task task) {
    // no-op
}
/**
 * Empty implementation: the thread count is ignored by this mock downloader.
 *
 * @param threadNum unused
 */
@Override
public void setThread(int threadNum) {
    // no-op
}

View File

@ -42,7 +42,12 @@ public class PhantomJSDownloader extends AbstractDownloader {
this.initPhantomjsCrawlPath();
PhantomJSDownloader.phantomJsCommand = phantomJsCommand;
}
/**
 * Empty implementation: refresh requests are ignored by this downloader.
 *
 * @param task unused
 */
@Override
public void refreshComponent(Task task) {
    // no-op
}
/**
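* Supports a custom crawl.js path: when another project depends on this jar, the phantomjs command run through runtime.exec() cannot use the crawl.js packaged inside the jar.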
* <pre>

View File

@ -9,6 +9,10 @@ import us.codecraft.webmagic.selector.PlainText;
* @author code4crafter@gmail.com
*/
public class MockGithubDownloader implements Downloader{
/**
 * Empty implementation: this mock downloader ignores refresh requests.
 *
 * @param task unused
 */
@Override
public void refreshComponent(Task task) {
    // no-op
}
private String html = "\n" +
"\n" +

View File

@ -59,6 +59,11 @@ public class SeleniumDownloader implements Downloader, Closeable {
// "/Users/Bingo/Downloads/phantomjs-1.9.7-macosx/bin/phantomjs");
}
/**
 * Empty implementation: refresh requests are ignored by this downloader.
 *
 * @param task unused
 */
@Override
public void refreshComponent(Task task) {
    // no-op
}
/**
* set sleep time to wait until load success
*