diff --git a/README-zh.md b/README-zh.md index 36059a4..d69dd63 100644 --- a/README-zh.md +++ b/README-zh.md @@ -38,12 +38,12 @@ webmagic使用maven管理依赖,在项目中添加对应的依赖即可使用w us.codecraft webmagic-core - 0.6.0 + 0.6.1 us.codecraft webmagic-extension - 0.6.0 + 0.6.1 ``` diff --git a/README.md b/README.md index 0dc200a..285eb60 100644 --- a/README.md +++ b/README.md @@ -23,12 +23,12 @@ Add dependencies to your pom.xml: us.codecraft webmagic-core - 0.6.0 + 0.6.1 us.codecraft webmagic-extension - 0.6.0 + 0.6.1 ``` diff --git a/pom.xml b/pom.xml index 46b60e8..0743c02 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ 7 us.codecraft - 0.6.1-SNAPSHOT + 0.6.2-SNAPSHOT 4.0.0 pom @@ -38,7 +38,7 @@ scm:git:git@github.com:code4craft/webmagic.git scm:git:git@github.com:code4craft/webmagic.git git@github.com:code4craft/webmagic.git - webmagic-parent-0.6.0 + webmagic-parent-0.6.1 diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index d789539..fbd5034 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.6.1-SNAPSHOT + 0.6.2-SNAPSHOT 4.0.0 diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index bb7cb3f..b1afb66 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -292,7 +292,7 @@ public class Spider implements Runnable, Task { } if (startRequests != null) { for (Request request : startRequests) { - scheduler.push(request, this); + addRequest(request); } startRequests.clear(); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java index ef98a47..1a0b2bd 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java +++ 
b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java @@ -18,10 +18,19 @@ import org.apache.http.impl.client.*; import org.apache.http.impl.conn.PoolingHttpClientConnectionManager; import org.apache.http.impl.cookie.BasicClientCookie; import org.apache.http.protocol.HttpContext; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.proxy.Proxy; +import javax.net.ssl.SSLContext; +import javax.net.ssl.TrustManager; +import javax.net.ssl.X509TrustManager; import java.io.IOException; +import java.security.KeyManagementException; +import java.security.NoSuchAlgorithmException; +import java.security.cert.CertificateException; +import java.security.cert.X509Certificate; import java.util.Map; /** @@ -29,18 +38,55 @@ import java.util.Map; * @since 0.4.0 */ public class HttpClientGenerator { - + + private transient Logger logger = LoggerFactory.getLogger(getClass()); + private PoolingHttpClientConnectionManager connectionManager; public HttpClientGenerator() { Registry reg = RegistryBuilder.create() .register("http", PlainConnectionSocketFactory.INSTANCE) - .register("https", SSLConnectionSocketFactory.getSocketFactory()) + .register("https", buildSSLConnectionSocketFactory()) .build(); connectionManager = new PoolingHttpClientConnectionManager(reg); connectionManager.setDefaultMaxPerRoute(100); } + private SSLConnectionSocketFactory buildSSLConnectionSocketFactory() { + try { + return new SSLConnectionSocketFactory(createIgnoreVerifySSL()); // 优先绕过安全证书 + } catch (KeyManagementException e) { + logger.error("ssl connection fail", e); + } catch (NoSuchAlgorithmException e) { + logger.error("ssl connection fail", e); + } + return SSLConnectionSocketFactory.getSocketFactory(); + } + + private SSLContext createIgnoreVerifySSL() throws NoSuchAlgorithmException, KeyManagementException { + // 实现一个X509TrustManager接口,用于绕过验证,不用修改里面的方法 + X509TrustManager trustManager = new 
X509TrustManager() { + + @Override + public void checkClientTrusted(X509Certificate[] chain, String authType) throws CertificateException { + } + + @Override + public void checkServerTrusted(X509Certificate[] chain, String authType) throws CertificateException { + } + + @Override + public X509Certificate[] getAcceptedIssuers() { + return null; + } + + }; + + SSLContext sc = SSLContext.getInstance("SSLv3"); + sc.init(null, new TrustManager[] { trustManager }, null); + return sc; + } + public HttpClientGenerator setPoolSize(int poolSize) { connectionManager.setMaxTotal(poolSize); return this; diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index 76ee2d9..a48bdd0 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.6.1-SNAPSHOT + 0.6.2-SNAPSHOT 4.0.0 diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java index 88fa7c0..fd0cc47 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java @@ -29,21 +29,60 @@ public class PhantomJSDownloader extends AbstractDownloader { public PhantomJSDownloader() { this.initPhantomjsCrawlPath(); } + /** - * 添加新的构造函数,支持phantomjs自定义命令 - * - * example: - * phantomjs.exe 支持windows环境 - * phantomjs --ignore-ssl-errors=yes 忽略抓取地址是https时的一些错误 - * /usr/local/bin/phantomjs 命令的绝对路径,避免因系统环境变量引起的IOException - * - * @param phantomJsCommand phantomJsCommand - */ + * 添加新的构造函数,支持phantomjs自定义命令 + * + * example: + * phantomjs.exe 支持windows环境 + * phantomjs --ignore-ssl-errors=yes 忽略抓取地址是https时的一些错误 + * /usr/local/bin/phantomjs 命令的绝对路径,避免因系统环境变量引起的IOException + * + * @param phantomJsCommand + */ public PhantomJSDownloader(String phantomJsCommand) { this.initPhantomjsCrawlPath(); 
PhantomJSDownloader.phantomJsCommand = phantomJsCommand; } + /** + * 新增构造函数,支持crawl.js路径自定义,因为当其他项目依赖此jar包时,runtime.exec()执行phantomjs命令时无法使用jar包中的crawl.js + *
+     * crawl.js start --
+     * 
+     *   var system = require('system');
+     *   var url = system.args[1];
+     *   
+     *   var page = require('webpage').create();
+     *   page.settings.loadImages = false;
+     *   page.settings.resourceTimeout = 5000;
+     *   
+     *   page.open(url, function (status) {
+     *       if (status != 'success') {
+     *           console.log("HTTP request failed!");
+     *       } else {
+     *           console.log(page.content);
+     *       }
+     *   
+     *       page.close();
+     *       phantom.exit();
+     *   });
+     *   
+     * -- crawl.js end
+     * 
+ * 具体项目时可以将以上js代码复制下来使用 + * + * example: + * new PhantomJSDownloader("/your/path/phantomjs", "/your/path/crawl.js"); + * + * @param phantomJsCommand phantomJsCommand + * @param crawlJsPath crawlJsPath + */ + public PhantomJSDownloader(String phantomJsCommand, String crawlJsPath) { + PhantomJSDownloader.phantomJsCommand = phantomJsCommand; + PhantomJSDownloader.crawlJsPath = crawlJsPath; + } + private void initPhantomjsCrawlPath() { PhantomJSDownloader.crawlJsPath = new File(this.getClass().getResource("/").getPath()).getPath() + System.getProperty("file.separator") + "crawl.js "; } @@ -86,7 +125,7 @@ public class PhantomJSDownloader extends AbstractDownloader { try { String url = request.getUrl(); Runtime runtime = Runtime.getRuntime(); - Process process = runtime.exec(phantomJsCommand + " " + crawlJsPath + url); + Process process = runtime.exec(phantomJsCommand + " " + crawlJsPath + " " + url); InputStream is = process.getInputStream(); BufferedReader br = new BufferedReader(new InputStreamReader(is)); StringBuffer stringBuffer = new StringBuffer(); diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java index a8aaecf..cfb4a82 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java @@ -45,7 +45,7 @@ public class SpiderMonitor { * * @param spiders spiders * @return this - * @throws JMException + * @throws JMException JMException */ public synchronized SpiderMonitor register(Spider... 
spiders) throws JMException { for (Spider spider : spiders) { diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java index 9e83b6d..7695c66 100755 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java @@ -102,7 +102,7 @@ public class DoubleKeyMap extends MultiKeyMapBase { /** * @param key1 key1 - * @return + * @return map */ public Map remove(K1 key1) { Map remove = map.remove(key1); diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index 8ac4d5b..eed2b77 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.6.1-SNAPSHOT + 0.6.2-SNAPSHOT 4.0.0 diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index bfbe4e3..9b8b732 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.6.1-SNAPSHOT + 0.6.2-SNAPSHOT 4.0.0 diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index 6095a57..3c6f673 100755 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.6.1-SNAPSHOT + 0.6.2-SNAPSHOT 4.0.0 diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index fdd5dfb..6ddc61c 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.6.1-SNAPSHOT + 0.6.2-SNAPSHOT 4.0.0