parent
4bedd97267
commit
c489647c4b
|
@ -40,12 +40,8 @@ public class Site {
|
||||||
|
|
||||||
private static final Set<Integer> DEFAULT_STATUS_CODE_SET = new HashSet<Integer>();
|
private static final Set<Integer> DEFAULT_STATUS_CODE_SET = new HashSet<Integer>();
|
||||||
|
|
||||||
private static final Set<Integer> DEFAULT_REFRESH_CODE_SET = new HashSet<>();
|
|
||||||
|
|
||||||
private Set<Integer> refreshCode = DEFAULT_REFRESH_CODE_SET;
|
|
||||||
private Set<Integer> acceptStatCode = DEFAULT_STATUS_CODE_SET;
|
private Set<Integer> acceptStatCode = DEFAULT_STATUS_CODE_SET;
|
||||||
|
|
||||||
|
|
||||||
private Map<String, String> headers = new HashMap<String, String>();
|
private Map<String, String> headers = new HashMap<String, String>();
|
||||||
|
|
||||||
private boolean useGzip = true;
|
private boolean useGzip = true;
|
||||||
|
@ -53,7 +49,6 @@ public class Site {
|
||||||
private boolean disableCookieManagement = false;
|
private boolean disableCookieManagement = false;
|
||||||
|
|
||||||
static {
|
static {
|
||||||
DEFAULT_REFRESH_CODE_SET.add(HttpConstant.StatusCode.FORBIDDEN);
|
|
||||||
DEFAULT_STATUS_CODE_SET.add(HttpConstant.StatusCode.CODE_200);
|
DEFAULT_STATUS_CODE_SET.add(HttpConstant.StatusCode.CODE_200);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -202,15 +197,6 @@ public class Site {
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
public Site setRefreshCode(Set<Integer> refreshCode){
|
|
||||||
this.refreshCode = refreshCode;
|
|
||||||
return this;
|
|
||||||
}
|
|
||||||
public Set<Integer> getRefreshCode(){
|
|
||||||
return refreshCode;
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* get acceptStatCode
|
* get acceptStatCode
|
||||||
*
|
*
|
||||||
|
|
|
@ -424,10 +424,7 @@ public class Spider implements Runnable, Task {
|
||||||
pipeline.process(page.getResultItems(), this);
|
pipeline.process(page.getResultItems(), this);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else if(site.getRefreshCode().contains(page.getStatusCode())) {
|
} else {
|
||||||
logger.info("page status code error, page {} , code: {}, start refresh downloader", request.getUrl(), page.getStatusCode());
|
|
||||||
downloader.refreshComponent(this);
|
|
||||||
}else {
|
|
||||||
logger.info("page status code error, page {} , code: {}", request.getUrl(), page.getStatusCode());
|
logger.info("page status code error, page {} , code: {}", request.getUrl(), page.getStatusCode());
|
||||||
}
|
}
|
||||||
sleep(site.getSleepTime());
|
sleep(site.getSleepTime());
|
||||||
|
|
|
@ -18,18 +18,14 @@ public interface Downloader {
|
||||||
* Downloads web pages and store in Page object.
|
* Downloads web pages and store in Page object.
|
||||||
*
|
*
|
||||||
* @param request request
|
* @param request request
|
||||||
* @param task task
|
* @param task task
|
||||||
* @return page
|
* @return page
|
||||||
*/
|
*/
|
||||||
Page download(Request request, Task task);
|
public Page download(Request request, Task task);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Tell the downloader how many threads the spider used.
|
* Tell the downloader how many threads the spider used.
|
||||||
*
|
|
||||||
* @param threadNum number of threads
|
* @param threadNum number of threads
|
||||||
*/
|
*/
|
||||||
void setThread(int threadNum);
|
public void setThread(int threadNum);
|
||||||
|
|
||||||
|
|
||||||
void refreshComponent(Task task);
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -111,17 +111,6 @@ public class HttpClientDownloader extends AbstractDownloader {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void refreshComponent(Task task) {
|
|
||||||
if (proxyProvider != null ) {
|
|
||||||
proxyProvider.refreshProxy(task);
|
|
||||||
}
|
|
||||||
|
|
||||||
httpClients.remove(task.getSite().getDomain());
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void setThread(int thread) {
|
public void setThread(int thread) {
|
||||||
httpClientGenerator.setPoolSize(thread);
|
httpClientGenerator.setPoolSize(thread);
|
||||||
|
|
|
@ -1,17 +1,13 @@
|
||||||
package us.codecraft.webmagic.downloader;
|
package us.codecraft.webmagic.downloader;
|
||||||
|
|
||||||
import java.io.File;
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.security.KeyManagementException;
|
import java.security.KeyManagementException;
|
||||||
import java.security.KeyStore;
|
|
||||||
import java.security.KeyStoreException;
|
|
||||||
import java.security.NoSuchAlgorithmException;
|
import java.security.NoSuchAlgorithmException;
|
||||||
import java.security.cert.CertificateException;
|
import java.security.cert.CertificateException;
|
||||||
import java.security.cert.X509Certificate;
|
import java.security.cert.X509Certificate;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
import javax.net.ssl.SSLContext;
|
import javax.net.ssl.SSLContext;
|
||||||
import javax.net.ssl.SSLContextSpi;
|
|
||||||
import javax.net.ssl.TrustManager;
|
import javax.net.ssl.TrustManager;
|
||||||
import javax.net.ssl.X509TrustManager;
|
import javax.net.ssl.X509TrustManager;
|
||||||
|
|
||||||
|
@ -28,7 +24,6 @@ import org.apache.http.conn.socket.ConnectionSocketFactory;
|
||||||
import org.apache.http.conn.socket.PlainConnectionSocketFactory;
|
import org.apache.http.conn.socket.PlainConnectionSocketFactory;
|
||||||
import org.apache.http.conn.ssl.DefaultHostnameVerifier;
|
import org.apache.http.conn.ssl.DefaultHostnameVerifier;
|
||||||
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
|
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
|
||||||
import org.apache.http.conn.ssl.TrustSelfSignedStrategy;
|
|
||||||
import org.apache.http.impl.client.BasicCookieStore;
|
import org.apache.http.impl.client.BasicCookieStore;
|
||||||
import org.apache.http.impl.client.CloseableHttpClient;
|
import org.apache.http.impl.client.CloseableHttpClient;
|
||||||
import org.apache.http.impl.client.DefaultHttpRequestRetryHandler;
|
import org.apache.http.impl.client.DefaultHttpRequestRetryHandler;
|
||||||
|
@ -37,7 +32,6 @@ import org.apache.http.impl.client.HttpClients;
|
||||||
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
|
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
|
||||||
import org.apache.http.impl.cookie.BasicClientCookie;
|
import org.apache.http.impl.cookie.BasicClientCookie;
|
||||||
import org.apache.http.protocol.HttpContext;
|
import org.apache.http.protocol.HttpContext;
|
||||||
import org.apache.http.ssl.SSLContexts;
|
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
@ -75,7 +69,7 @@ public class HttpClientGenerator {
|
||||||
return new SSLConnectionSocketFactory(sslContext, supportedProtocols,
|
return new SSLConnectionSocketFactory(sslContext, supportedProtocols,
|
||||||
null,
|
null,
|
||||||
new DefaultHostnameVerifier()); // 优先绕过安全证书
|
new DefaultHostnameVerifier()); // 优先绕过安全证书
|
||||||
} catch (KeyManagementException | CertificateException | KeyStoreException | IOException e) {
|
} catch (KeyManagementException e) {
|
||||||
logger.error("ssl connection fail", e);
|
logger.error("ssl connection fail", e);
|
||||||
} catch (NoSuchAlgorithmException e) {
|
} catch (NoSuchAlgorithmException e) {
|
||||||
logger.error("ssl connection fail", e);
|
logger.error("ssl connection fail", e);
|
||||||
|
@ -83,8 +77,8 @@ public class HttpClientGenerator {
|
||||||
return SSLConnectionSocketFactory.getSocketFactory();
|
return SSLConnectionSocketFactory.getSocketFactory();
|
||||||
}
|
}
|
||||||
|
|
||||||
private SSLContext createIgnoreVerifySSL() throws NoSuchAlgorithmException, KeyManagementException, CertificateException, KeyStoreException, IOException {
|
private SSLContext createIgnoreVerifySSL() throws NoSuchAlgorithmException, KeyManagementException {
|
||||||
// 实现一个X509TrustManager接口,用于绕过验证,不用修改里面的方法
|
// 实现一个X509TrustManager接口,用于绕过验证,不用修改里面的方法
|
||||||
X509TrustManager trustManager = new X509TrustManager() {
|
X509TrustManager trustManager = new X509TrustManager() {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -102,10 +96,10 @@ public class HttpClientGenerator {
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
SSLContext sc = SSLContext.getInstance("SSLv3");
|
SSLContext sc = SSLContext.getInstance("TLS");
|
||||||
sc.init(null, new TrustManager[] { trustManager }, null);
|
sc.init(null, new TrustManager[] { trustManager }, null);
|
||||||
return sc;
|
return sc;
|
||||||
}
|
}
|
||||||
|
|
||||||
public HttpClientGenerator setPoolSize(int poolSize) {
|
public HttpClientGenerator setPoolSize(int poolSize) {
|
||||||
connectionManager.setMaxTotal(poolSize);
|
connectionManager.setMaxTotal(poolSize);
|
||||||
|
|
|
@ -28,7 +28,6 @@ public abstract class HttpConstant {
|
||||||
public static abstract class StatusCode {
|
public static abstract class StatusCode {
|
||||||
|
|
||||||
public static final int CODE_200 = 200;
|
public static final int CODE_200 = 200;
|
||||||
public static final int FORBIDDEN = 403;
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -57,11 +57,6 @@ public class SpiderTest {
|
||||||
return Site.me().setSleepTime(0);
|
return Site.me().setSleepTime(0);
|
||||||
}
|
}
|
||||||
}).setDownloader(new Downloader() {
|
}).setDownloader(new Downloader() {
|
||||||
@Override
|
|
||||||
public void refreshComponent(Task task) {
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Page download(Request request, Task task) {
|
public Page download(Request request, Task task) {
|
||||||
return new Page().setRawText("");
|
return new Page().setRawText("");
|
||||||
|
|
|
@ -28,11 +28,6 @@ public class MockGithubDownloader implements Downloader {
|
||||||
return page;
|
return page;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
|
||||||
public void refreshComponent(Task task) {
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void setThread(int threadNum) {
|
public void setThread(int threadNum) {
|
||||||
}
|
}
|
||||||
|
|
|
@ -42,12 +42,7 @@ public class PhantomJSDownloader extends AbstractDownloader {
|
||||||
this.initPhantomjsCrawlPath();
|
this.initPhantomjsCrawlPath();
|
||||||
PhantomJSDownloader.phantomJsCommand = phantomJsCommand;
|
PhantomJSDownloader.phantomJsCommand = phantomJsCommand;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
|
||||||
public void refreshComponent(Task task) {
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 新增构造函数,支持crawl.js路径自定义,因为当其他项目依赖此jar包时,runtime.exec()执行phantomjs命令时无使用法jar包中的crawl.js
|
* 新增构造函数,支持crawl.js路径自定义,因为当其他项目依赖此jar包时,runtime.exec()执行phantomjs命令时无使用法jar包中的crawl.js
|
||||||
* <pre>
|
* <pre>
|
||||||
|
|
|
@ -9,10 +9,6 @@ import us.codecraft.webmagic.selector.PlainText;
|
||||||
* @author code4crafter@gmail.com
|
* @author code4crafter@gmail.com
|
||||||
*/
|
*/
|
||||||
public class MockGithubDownloader implements Downloader{
|
public class MockGithubDownloader implements Downloader{
|
||||||
@Override
|
|
||||||
public void refreshComponent(Task task) {
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
private String html = "\n" +
|
private String html = "\n" +
|
||||||
"\n" +
|
"\n" +
|
||||||
|
|
|
@ -59,11 +59,6 @@ public class SeleniumDownloader implements Downloader, Closeable {
|
||||||
// "/Users/Bingo/Downloads/phantomjs-1.9.7-macosx/bin/phantomjs");
|
// "/Users/Bingo/Downloads/phantomjs-1.9.7-macosx/bin/phantomjs");
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
|
||||||
public void refreshComponent(Task task) {
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* set sleep time to wait until load success
|
* set sleep time to wait until load success
|
||||||
*
|
*
|
||||||
|
|
Loading…
Reference in New Issue