parent
4bedd97267
commit
c489647c4b
|
@ -40,12 +40,8 @@ public class Site {
|
|||
|
||||
private static final Set<Integer> DEFAULT_STATUS_CODE_SET = new HashSet<Integer>();
|
||||
|
||||
private static final Set<Integer> DEFAULT_REFRESH_CODE_SET = new HashSet<>();
|
||||
|
||||
private Set<Integer> refreshCode = DEFAULT_REFRESH_CODE_SET;
|
||||
private Set<Integer> acceptStatCode = DEFAULT_STATUS_CODE_SET;
|
||||
|
||||
|
||||
private Map<String, String> headers = new HashMap<String, String>();
|
||||
|
||||
private boolean useGzip = true;
|
||||
|
@ -53,7 +49,6 @@ public class Site {
|
|||
private boolean disableCookieManagement = false;
|
||||
|
||||
// Seed the shared default sets: CODE_200 is the default accepted status code,
// FORBIDDEN is the default refresh status code.
static {
    DEFAULT_STATUS_CODE_SET.add(HttpConstant.StatusCode.CODE_200);
    DEFAULT_REFRESH_CODE_SET.add(HttpConstant.StatusCode.FORBIDDEN);
}
|
||||
|
||||
|
@ -202,15 +197,6 @@ public class Site {
|
|||
return this;
|
||||
}
|
||||
|
||||
public Site setRefreshCode(Set<Integer> refreshCode){
|
||||
this.refreshCode = refreshCode;
|
||||
return this;
|
||||
}
|
||||
public Set<Integer> getRefreshCode(){
|
||||
return refreshCode;
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* get acceptStatCode
|
||||
*
|
||||
|
|
|
@ -424,10 +424,7 @@ public class Spider implements Runnable, Task {
|
|||
pipeline.process(page.getResultItems(), this);
|
||||
}
|
||||
}
|
||||
} else if(site.getRefreshCode().contains(page.getStatusCode())) {
|
||||
logger.info("page status code error, page {} , code: {}, start refresh downloader", request.getUrl(), page.getStatusCode());
|
||||
downloader.refreshComponent(this);
|
||||
}else {
|
||||
} else {
|
||||
logger.info("page status code error, page {} , code: {}", request.getUrl(), page.getStatusCode());
|
||||
}
|
||||
sleep(site.getSleepTime());
|
||||
|
|
|
@ -18,18 +18,14 @@ public interface Downloader {
|
|||
* Downloads web pages and store in Page object.
|
||||
*
|
||||
* @param request request
|
||||
* @param task task
|
||||
* @param task task
|
||||
* @return page
|
||||
*/
|
||||
Page download(Request request, Task task);
|
||||
public Page download(Request request, Task task);
|
||||
|
||||
/**
|
||||
* Tell the downloader how many threads the spider used.
|
||||
*
|
||||
* @param threadNum number of threads
|
||||
*/
|
||||
void setThread(int threadNum);
|
||||
|
||||
|
||||
void refreshComponent(Task task);
|
||||
public void setThread(int threadNum);
|
||||
}
|
||||
|
|
|
@ -111,17 +111,6 @@ public class HttpClientDownloader extends AbstractDownloader {
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void refreshComponent(Task task) {
|
||||
if (proxyProvider != null ) {
|
||||
proxyProvider.refreshProxy(task);
|
||||
}
|
||||
|
||||
httpClients.remove(task.getSite().getDomain());
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setThread(int thread) {
|
||||
httpClientGenerator.setPoolSize(thread);
|
||||
|
|
|
@ -1,17 +1,13 @@
|
|||
package us.codecraft.webmagic.downloader;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.security.KeyManagementException;
|
||||
import java.security.KeyStore;
|
||||
import java.security.KeyStoreException;
|
||||
import java.security.NoSuchAlgorithmException;
|
||||
import java.security.cert.CertificateException;
|
||||
import java.security.cert.X509Certificate;
|
||||
import java.util.Map;
|
||||
|
||||
import javax.net.ssl.SSLContext;
|
||||
import javax.net.ssl.SSLContextSpi;
|
||||
import javax.net.ssl.TrustManager;
|
||||
import javax.net.ssl.X509TrustManager;
|
||||
|
||||
|
@ -28,7 +24,6 @@ import org.apache.http.conn.socket.ConnectionSocketFactory;
|
|||
import org.apache.http.conn.socket.PlainConnectionSocketFactory;
|
||||
import org.apache.http.conn.ssl.DefaultHostnameVerifier;
|
||||
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
|
||||
import org.apache.http.conn.ssl.TrustSelfSignedStrategy;
|
||||
import org.apache.http.impl.client.BasicCookieStore;
|
||||
import org.apache.http.impl.client.CloseableHttpClient;
|
||||
import org.apache.http.impl.client.DefaultHttpRequestRetryHandler;
|
||||
|
@ -37,7 +32,6 @@ import org.apache.http.impl.client.HttpClients;
|
|||
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
|
||||
import org.apache.http.impl.cookie.BasicClientCookie;
|
||||
import org.apache.http.protocol.HttpContext;
|
||||
import org.apache.http.ssl.SSLContexts;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
|
@ -75,7 +69,7 @@ public class HttpClientGenerator {
|
|||
return new SSLConnectionSocketFactory(sslContext, supportedProtocols,
|
||||
null,
|
||||
new DefaultHostnameVerifier()); // 优先绕过安全证书
|
||||
} catch (KeyManagementException | CertificateException | KeyStoreException | IOException e) {
|
||||
} catch (KeyManagementException e) {
|
||||
logger.error("ssl connection fail", e);
|
||||
} catch (NoSuchAlgorithmException e) {
|
||||
logger.error("ssl connection fail", e);
|
||||
|
@ -83,8 +77,8 @@ public class HttpClientGenerator {
|
|||
return SSLConnectionSocketFactory.getSocketFactory();
|
||||
}
|
||||
|
||||
private SSLContext createIgnoreVerifySSL() throws NoSuchAlgorithmException, KeyManagementException, CertificateException, KeyStoreException, IOException {
|
||||
// Implement an X509TrustManager that bypasses certificate verification; its methods need no changes
|
||||
private SSLContext createIgnoreVerifySSL() throws NoSuchAlgorithmException, KeyManagementException {
|
||||
// Implement an X509TrustManager that bypasses certificate verification; its methods need no changes
|
||||
X509TrustManager trustManager = new X509TrustManager() {
|
||||
|
||||
@Override
|
||||
|
@ -102,10 +96,10 @@ public class HttpClientGenerator {
|
|||
|
||||
};
|
||||
|
||||
SSLContext sc = SSLContext.getInstance("SSLv3");
|
||||
SSLContext sc = SSLContext.getInstance("TLS");
|
||||
sc.init(null, new TrustManager[] { trustManager }, null);
|
||||
return sc;
|
||||
}
|
||||
}
|
||||
|
||||
public HttpClientGenerator setPoolSize(int poolSize) {
|
||||
connectionManager.setMaxTotal(poolSize);
|
||||
|
|
|
@ -28,7 +28,6 @@ public abstract class HttpConstant {
|
|||
public static abstract class StatusCode {
|
||||
|
||||
public static final int CODE_200 = 200;
|
||||
public static final int FORBIDDEN = 403;
|
||||
|
||||
}
|
||||
|
||||
|
|
|
@ -57,11 +57,6 @@ public class SpiderTest {
|
|||
return Site.me().setSleepTime(0);
|
||||
}
|
||||
}).setDownloader(new Downloader() {
|
||||
@Override
|
||||
public void refreshComponent(Task task) {
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public Page download(Request request, Task task) {
|
||||
return new Page().setRawText("");
|
||||
|
|
|
@ -28,11 +28,6 @@ public class MockGithubDownloader implements Downloader {
|
|||
return page;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void refreshComponent(Task task) {
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setThread(int threadNum) {
|
||||
}
|
||||
|
|
|
@ -43,11 +43,6 @@ public class PhantomJSDownloader extends AbstractDownloader {
|
|||
PhantomJSDownloader.phantomJsCommand = phantomJsCommand;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void refreshComponent(Task task) {
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
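* Added constructor that supports a custom crawl.js path, because when another project depends on this jar, runtime.exec() cannot use the crawl.js packaged inside the jar when running the phantomjs command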
|
||||
* <pre>
|
||||
|
|
|
@ -9,10 +9,6 @@ import us.codecraft.webmagic.selector.PlainText;
|
|||
* @author code4crafter@gmail.com
|
||||
*/
|
||||
public class MockGithubDownloader implements Downloader{
|
||||
@Override
|
||||
public void refreshComponent(Task task) {
|
||||
|
||||
}
|
||||
|
||||
private String html = "\n" +
|
||||
"\n" +
|
||||
|
|
|
@ -59,11 +59,6 @@ public class SeleniumDownloader implements Downloader, Closeable {
|
|||
// "/Users/Bingo/Downloads/phantomjs-1.9.7-macosx/bin/phantomjs");
|
||||
}
|
||||
|
||||
@Override
|
||||
public void refreshComponent(Task task) {
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* set sleep time to wait until load success
|
||||
*
|
||||
|
|
Loading…
Reference in New Issue