commit
1f85674ae1
|
@ -38,12 +38,12 @@ webmagic使用maven管理依赖,在项目中添加对应的依赖即可使用w
|
|||
<dependency>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>webmagic-core</artifactId>
|
||||
<version>0.6.0</version>
|
||||
<version>0.6.1</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>webmagic-extension</artifactId>
|
||||
<version>0.6.0</version>
|
||||
<version>0.6.1</version>
|
||||
</dependency>
|
||||
```
|
||||
|
||||
|
|
|
@ -23,12 +23,12 @@ Add dependencies to your pom.xml:
|
|||
<dependency>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>webmagic-core</artifactId>
|
||||
<version>0.6.0</version>
|
||||
<version>0.6.1</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>webmagic-extension</artifactId>
|
||||
<version>0.6.0</version>
|
||||
<version>0.6.1</version>
|
||||
</dependency>
|
||||
```
|
||||
|
||||
|
|
4
pom.xml
4
pom.xml
|
@ -6,7 +6,7 @@
|
|||
<version>7</version>
|
||||
</parent>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<version>0.6.1-SNAPSHOT</version>
|
||||
<version>0.6.2-SNAPSHOT</version>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
<packaging>pom</packaging>
|
||||
<properties>
|
||||
|
@ -38,7 +38,7 @@
|
|||
<connection>scm:git:git@github.com:code4craft/webmagic.git</connection>
|
||||
<developerConnection>scm:git:git@github.com:code4craft/webmagic.git</developerConnection>
|
||||
<url>git@github.com:code4craft/webmagic.git</url>
|
||||
<tag>webmagic-parent-0.6.0</tag>
|
||||
<tag>webmagic-parent-0.6.1</tag>
|
||||
</scm>
|
||||
<licenses>
|
||||
<license>
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
<parent>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>webmagic-parent</artifactId>
|
||||
<version>0.6.1-SNAPSHOT</version>
|
||||
<version>0.6.2-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
|
|
@ -292,7 +292,7 @@ public class Spider implements Runnable, Task {
|
|||
}
|
||||
if (startRequests != null) {
|
||||
for (Request request : startRequests) {
|
||||
scheduler.push(request, this);
|
||||
addRequest(request);
|
||||
}
|
||||
startRequests.clear();
|
||||
}
|
||||
|
|
|
@ -18,10 +18,19 @@ import org.apache.http.impl.client.*;
|
|||
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
|
||||
import org.apache.http.impl.cookie.BasicClientCookie;
|
||||
import org.apache.http.protocol.HttpContext;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.proxy.Proxy;
|
||||
|
||||
import javax.net.ssl.SSLContext;
|
||||
import javax.net.ssl.TrustManager;
|
||||
import javax.net.ssl.X509TrustManager;
|
||||
import java.io.IOException;
|
||||
import java.security.KeyManagementException;
|
||||
import java.security.NoSuchAlgorithmException;
|
||||
import java.security.cert.CertificateException;
|
||||
import java.security.cert.X509Certificate;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
|
@ -29,18 +38,55 @@ import java.util.Map;
|
|||
* @since 0.4.0
|
||||
*/
|
||||
public class HttpClientGenerator {
|
||||
|
||||
|
||||
private transient Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
private PoolingHttpClientConnectionManager connectionManager;
|
||||
|
||||
/**
 * Creates a generator backed by a pooled connection manager that handles
 * the "http" and "https" schemes, allowing up to 100 connections per route.
 */
public HttpClientGenerator() {
    // Register "https" exactly once: a second register() call for the same
    // scheme only replaces the earlier entry, so registering the stock
    // SSLConnectionSocketFactory.getSocketFactory() first was dead work.
    Registry<ConnectionSocketFactory> reg = RegistryBuilder.<ConnectionSocketFactory>create()
            .register("http", PlainConnectionSocketFactory.INSTANCE)
            .register("https", buildSSLConnectionSocketFactory())
            .build();
    connectionManager = new PoolingHttpClientConnectionManager(reg);
    connectionManager.setDefaultMaxPerRoute(100);
}
|
||||
|
||||
private SSLConnectionSocketFactory buildSSLConnectionSocketFactory() {
|
||||
try {
|
||||
return new SSLConnectionSocketFactory(createIgnoreVerifySSL()); // 优先绕过安全证书
|
||||
} catch (KeyManagementException e) {
|
||||
logger.error("ssl connection fail", e);
|
||||
} catch (NoSuchAlgorithmException e) {
|
||||
logger.error("ssl connection fail", e);
|
||||
}
|
||||
return SSLConnectionSocketFactory.getSocketFactory();
|
||||
}
|
||||
|
||||
private SSLContext createIgnoreVerifySSL() throws NoSuchAlgorithmException, KeyManagementException {
|
||||
// 实现一个X509TrustManager接口,用于绕过验证,不用修改里面的方法
|
||||
X509TrustManager trustManager = new X509TrustManager() {
|
||||
|
||||
@Override
|
||||
public void checkClientTrusted(X509Certificate[] chain, String authType) throws CertificateException {
|
||||
}
|
||||
|
||||
@Override
|
||||
public void checkServerTrusted(X509Certificate[] chain, String authType) throws CertificateException {
|
||||
}
|
||||
|
||||
@Override
|
||||
public X509Certificate[] getAcceptedIssuers() {
|
||||
return null;
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
SSLContext sc = SSLContext.getInstance("SSLv3");
|
||||
sc.init(null, new TrustManager[] { trustManager }, null);
|
||||
return sc;
|
||||
}
|
||||
|
||||
public HttpClientGenerator setPoolSize(int poolSize) {
|
||||
connectionManager.setMaxTotal(poolSize);
|
||||
return this;
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
<parent>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>webmagic-parent</artifactId>
|
||||
<version>0.6.1-SNAPSHOT</version>
|
||||
<version>0.6.2-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
|
|
@ -29,21 +29,60 @@ public class PhantomJSDownloader extends AbstractDownloader {
|
|||
public PhantomJSDownloader() {
    // Compute the default crawl.js location before the downloader is used.
    initPhantomjsCrawlPath();
}
|
||||
|
||||
/**
|
||||
* 添加新的构造函数,支持phantomjs自定义命令
|
||||
*
|
||||
* example:
|
||||
* phantomjs.exe 支持windows环境
|
||||
* phantomjs --ignore-ssl-errors=yes 忽略抓取地址是https时的一些错误
|
||||
* /usr/local/bin/phantomjs 命令的绝对路径,避免因系统环境变量引起的IOException
|
||||
*
|
||||
* @param phantomJsCommand phantomJsCommand
|
||||
*/
|
||||
* 添加新的构造函数,支持phantomjs自定义命令
|
||||
*
|
||||
* example:
|
||||
* phantomjs.exe 支持windows环境
|
||||
* phantomjs --ignore-ssl-errors=yes 忽略抓取地址是https时的一些错误
|
||||
* /usr/local/bin/phantomjs 命令的绝对路径,避免因系统环境变量引起的IOException
|
||||
*
|
||||
* @param phantomJsCommand
|
||||
*/
|
||||
public PhantomJSDownloader(String phantomJsCommand) {
    // Compute the default crawl.js location first; this call comes before
    // the assignment below, exactly as in the original ordering.
    initPhantomjsCrawlPath();
    // The command is kept in a class-level field (assigned via the class name),
    // so the parameter of the same name does not shadow it.
    PhantomJSDownloader.phantomJsCommand = phantomJsCommand;
}
|
||||
|
||||
/**
|
||||
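* New constructor that lets the path to crawl.js be customized: when another project depends on this jar, the phantomjs command run through runtime.exec() cannot use the crawl.js packaged inside the jar.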
|
||||
* <pre>
|
||||
* crawl.js start --
|
||||
*
|
||||
* var system = require('system');
|
||||
* var url = system.args[1];
|
||||
*
|
||||
* var page = require('webpage').create();
|
||||
* page.settings.loadImages = false;
|
||||
* page.settings.resourceTimeout = 5000;
|
||||
*
|
||||
* page.open(url, function (status) {
|
||||
* if (status != 'success') {
|
||||
* console.log("HTTP request failed!");
|
||||
* } else {
|
||||
* console.log(page.content);
|
||||
* }
|
||||
*
|
||||
* page.close();
|
||||
* phantom.exit();
|
||||
* });
|
||||
*
|
||||
* -- crawl.js end
|
||||
* </pre>
|
||||
* 具体项目时可以将以上js代码复制下来使用
|
||||
*
|
||||
* example:
|
||||
* new PhantomJSDownloader("/your/path/phantomjs", "/your/path/crawl.js");
|
||||
*
|
||||
* @param phantomJsCommand phantomJsCommand
|
||||
* @param crawlJsPath crawlJsPath
|
||||
*/
|
||||
public PhantomJSDownloader(String phantomJsCommand, String crawlJsPath) {
    // Both values are taken as given; no default crawl.js path is computed here.
    PhantomJSDownloader.crawlJsPath = crawlJsPath;
    PhantomJSDownloader.phantomJsCommand = phantomJsCommand;
}
|
||||
|
||||
private void initPhantomjsCrawlPath() {
|
||||
PhantomJSDownloader.crawlJsPath = new File(this.getClass().getResource("/").getPath()).getPath() + System.getProperty("file.separator") + "crawl.js ";
|
||||
}
|
||||
|
@ -86,7 +125,7 @@ public class PhantomJSDownloader extends AbstractDownloader {
|
|||
try {
|
||||
String url = request.getUrl();
|
||||
Runtime runtime = Runtime.getRuntime();
|
||||
Process process = runtime.exec(phantomJsCommand + " " + crawlJsPath + url);
|
||||
Process process = runtime.exec(phantomJsCommand + " " + crawlJsPath + " " + url);
|
||||
InputStream is = process.getInputStream();
|
||||
BufferedReader br = new BufferedReader(new InputStreamReader(is));
|
||||
StringBuffer stringBuffer = new StringBuffer();
|
||||
|
|
|
@ -45,7 +45,7 @@ public class SpiderMonitor {
|
|||
*
|
||||
* @param spiders spiders
|
||||
* @return this
|
||||
* @throws JMException
|
||||
* @throws JMException JMException
|
||||
*/
|
||||
public synchronized SpiderMonitor register(Spider... spiders) throws JMException {
|
||||
for (Spider spider : spiders) {
|
||||
|
|
|
@ -102,7 +102,7 @@ public class DoubleKeyMap<K1, K2, V> extends MultiKeyMapBase {
|
|||
|
||||
/**
|
||||
* @param key1 key1
|
||||
* @return
|
||||
* @return map
|
||||
*/
|
||||
public Map<K2, V> remove(K1 key1) {
|
||||
Map<K2, V> remove = map.remove(key1);
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
<parent>
|
||||
<artifactId>webmagic-parent</artifactId>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<version>0.6.1-SNAPSHOT</version>
|
||||
<version>0.6.2-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
<parent>
|
||||
<artifactId>webmagic-parent</artifactId>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<version>0.6.1-SNAPSHOT</version>
|
||||
<version>0.6.2-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
<parent>
|
||||
<artifactId>webmagic-parent</artifactId>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<version>0.6.1-SNAPSHOT</version>
|
||||
<version>0.6.2-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
<parent>
|
||||
<artifactId>webmagic-parent</artifactId>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<version>0.6.1-SNAPSHOT</version>
|
||||
<version>0.6.2-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
|
Loading…
Reference in New Issue