Revert " Downloader 提供刷新组件的api,方便在spider中操作"

This reverts commit 2e2a0fdf3e.
master
Sutra Zhou 2021-01-02 20:15:10 +08:00
parent 4bedd97267
commit c489647c4b
11 changed files with 10 additions and 73 deletions

View File

@ -40,12 +40,8 @@ public class Site {
private static final Set<Integer> DEFAULT_STATUS_CODE_SET = new HashSet<Integer>(); private static final Set<Integer> DEFAULT_STATUS_CODE_SET = new HashSet<Integer>();
private static final Set<Integer> DEFAULT_REFRESH_CODE_SET = new HashSet<>();
private Set<Integer> refreshCode = DEFAULT_REFRESH_CODE_SET;
private Set<Integer> acceptStatCode = DEFAULT_STATUS_CODE_SET; private Set<Integer> acceptStatCode = DEFAULT_STATUS_CODE_SET;
private Map<String, String> headers = new HashMap<String, String>(); private Map<String, String> headers = new HashMap<String, String>();
private boolean useGzip = true; private boolean useGzip = true;
@ -53,7 +49,6 @@ public class Site {
private boolean disableCookieManagement = false; private boolean disableCookieManagement = false;
static { static {
DEFAULT_REFRESH_CODE_SET.add(HttpConstant.StatusCode.FORBIDDEN);
DEFAULT_STATUS_CODE_SET.add(HttpConstant.StatusCode.CODE_200); DEFAULT_STATUS_CODE_SET.add(HttpConstant.StatusCode.CODE_200);
} }
@ -202,15 +197,6 @@ public class Site {
return this; return this;
} }
public Site setRefreshCode(Set<Integer> refreshCode){
this.refreshCode = refreshCode;
return this;
}
public Set<Integer> getRefreshCode(){
return refreshCode;
}
/** /**
* get acceptStatCode * get acceptStatCode
* *

View File

@ -424,10 +424,7 @@ public class Spider implements Runnable, Task {
pipeline.process(page.getResultItems(), this); pipeline.process(page.getResultItems(), this);
} }
} }
} else if(site.getRefreshCode().contains(page.getStatusCode())) { } else {
logger.info("page status code error, page {} , code: {}, start refresh downloader", request.getUrl(), page.getStatusCode());
downloader.refreshComponent(this);
}else {
logger.info("page status code error, page {} , code: {}", request.getUrl(), page.getStatusCode()); logger.info("page status code error, page {} , code: {}", request.getUrl(), page.getStatusCode());
} }
sleep(site.getSleepTime()); sleep(site.getSleepTime());

View File

@ -21,15 +21,11 @@ public interface Downloader {
* @param task task * @param task task
* @return page * @return page
*/ */
Page download(Request request, Task task); public Page download(Request request, Task task);
/** /**
* Tell the downloader how many threads the spider used. * Tell the downloader how many threads the spider used.
*
* @param threadNum number of threads * @param threadNum number of threads
*/ */
void setThread(int threadNum); public void setThread(int threadNum);
void refreshComponent(Task task);
} }

View File

@ -111,17 +111,6 @@ public class HttpClientDownloader extends AbstractDownloader {
} }
} }
@Override
public void refreshComponent(Task task) {
if (proxyProvider != null ) {
proxyProvider.refreshProxy(task);
}
httpClients.remove(task.getSite().getDomain());
}
@Override @Override
public void setThread(int thread) { public void setThread(int thread) {
httpClientGenerator.setPoolSize(thread); httpClientGenerator.setPoolSize(thread);

View File

@ -1,17 +1,13 @@
package us.codecraft.webmagic.downloader; package us.codecraft.webmagic.downloader;
import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.security.KeyManagementException; import java.security.KeyManagementException;
import java.security.KeyStore;
import java.security.KeyStoreException;
import java.security.NoSuchAlgorithmException; import java.security.NoSuchAlgorithmException;
import java.security.cert.CertificateException; import java.security.cert.CertificateException;
import java.security.cert.X509Certificate; import java.security.cert.X509Certificate;
import java.util.Map; import java.util.Map;
import javax.net.ssl.SSLContext; import javax.net.ssl.SSLContext;
import javax.net.ssl.SSLContextSpi;
import javax.net.ssl.TrustManager; import javax.net.ssl.TrustManager;
import javax.net.ssl.X509TrustManager; import javax.net.ssl.X509TrustManager;
@ -28,7 +24,6 @@ import org.apache.http.conn.socket.ConnectionSocketFactory;
import org.apache.http.conn.socket.PlainConnectionSocketFactory; import org.apache.http.conn.socket.PlainConnectionSocketFactory;
import org.apache.http.conn.ssl.DefaultHostnameVerifier; import org.apache.http.conn.ssl.DefaultHostnameVerifier;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory; import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.conn.ssl.TrustSelfSignedStrategy;
import org.apache.http.impl.client.BasicCookieStore; import org.apache.http.impl.client.BasicCookieStore;
import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.DefaultHttpRequestRetryHandler; import org.apache.http.impl.client.DefaultHttpRequestRetryHandler;
@ -37,7 +32,6 @@ import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager; import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.impl.cookie.BasicClientCookie; import org.apache.http.impl.cookie.BasicClientCookie;
import org.apache.http.protocol.HttpContext; import org.apache.http.protocol.HttpContext;
import org.apache.http.ssl.SSLContexts;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@ -75,7 +69,7 @@ public class HttpClientGenerator {
return new SSLConnectionSocketFactory(sslContext, supportedProtocols, return new SSLConnectionSocketFactory(sslContext, supportedProtocols,
null, null,
new DefaultHostnameVerifier()); // 优先绕过安全证书 new DefaultHostnameVerifier()); // 优先绕过安全证书
} catch (KeyManagementException | CertificateException | KeyStoreException | IOException e) { } catch (KeyManagementException e) {
logger.error("ssl connection fail", e); logger.error("ssl connection fail", e);
} catch (NoSuchAlgorithmException e) { } catch (NoSuchAlgorithmException e) {
logger.error("ssl connection fail", e); logger.error("ssl connection fail", e);
@ -83,8 +77,8 @@ public class HttpClientGenerator {
return SSLConnectionSocketFactory.getSocketFactory(); return SSLConnectionSocketFactory.getSocketFactory();
} }
private SSLContext createIgnoreVerifySSL() throws NoSuchAlgorithmException, KeyManagementException, CertificateException, KeyStoreException, IOException { private SSLContext createIgnoreVerifySSL() throws NoSuchAlgorithmException, KeyManagementException {
// 实现一个X509TrustManager接口用于绕过验证不用修改里面的方法 // 实现一个X509TrustManager接口用于绕过验证不用修改里面的方法
X509TrustManager trustManager = new X509TrustManager() { X509TrustManager trustManager = new X509TrustManager() {
@Override @Override
@ -102,7 +96,7 @@ public class HttpClientGenerator {
}; };
SSLContext sc = SSLContext.getInstance("SSLv3"); SSLContext sc = SSLContext.getInstance("TLS");
sc.init(null, new TrustManager[] { trustManager }, null); sc.init(null, new TrustManager[] { trustManager }, null);
return sc; return sc;
} }

View File

@ -28,7 +28,6 @@ public abstract class HttpConstant {
public static abstract class StatusCode { public static abstract class StatusCode {
public static final int CODE_200 = 200; public static final int CODE_200 = 200;
public static final int FORBIDDEN = 403;
} }

View File

@ -57,11 +57,6 @@ public class SpiderTest {
return Site.me().setSleepTime(0); return Site.me().setSleepTime(0);
} }
}).setDownloader(new Downloader() { }).setDownloader(new Downloader() {
@Override
public void refreshComponent(Task task) {
}
@Override @Override
public Page download(Request request, Task task) { public Page download(Request request, Task task) {
return new Page().setRawText(""); return new Page().setRawText("");

View File

@ -28,11 +28,6 @@ public class MockGithubDownloader implements Downloader {
return page; return page;
} }
@Override
public void refreshComponent(Task task) {
}
@Override @Override
public void setThread(int threadNum) { public void setThread(int threadNum) {
} }

View File

@ -43,11 +43,6 @@ public class PhantomJSDownloader extends AbstractDownloader {
PhantomJSDownloader.phantomJsCommand = phantomJsCommand; PhantomJSDownloader.phantomJsCommand = phantomJsCommand;
} }
@Override
public void refreshComponent(Task task) {
}
/** /**
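* Constructor that supports a custom crawl.js path, because when another project depends on this jar, runtime.exec() running the phantomjs command cannot use the crawl.js inside the jar * Constructor that supports a custom crawl.js path, because when another project depends on this jar, runtime.exec() running the phantomjs command cannot use the crawl.js inside the jar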
* <pre> * <pre>

View File

@ -9,10 +9,6 @@ import us.codecraft.webmagic.selector.PlainText;
* @author code4crafter@gmail.com * @author code4crafter@gmail.com
*/ */
public class MockGithubDownloader implements Downloader{ public class MockGithubDownloader implements Downloader{
@Override
public void refreshComponent(Task task) {
}
private String html = "\n" + private String html = "\n" +
"\n" + "\n" +

View File

@ -59,11 +59,6 @@ public class SeleniumDownloader implements Downloader, Closeable {
// "/Users/Bingo/Downloads/phantomjs-1.9.7-macosx/bin/phantomjs"); // "/Users/Bingo/Downloads/phantomjs-1.9.7-macosx/bin/phantomjs");
} }
@Override
public void refreshComponent(Task task) {
}
/** /**
* set sleep time to wait until load success * set sleep time to wait until load success
* *