From 42f1018010d858dc3ad77bb960c6de1d044b3df8 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Tue, 21 Feb 2017 14:08:05 +0800 Subject: [PATCH 01/27] remove messy code --- .../codecraft/webmagic/downloader/HttpClientDownloader.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 2cb49b6..e57d5cd 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -98,8 +98,8 @@ public class HttpClientDownloader extends AbstractDownloader { proxyHost = site.getHttpProxy(); } - HttpUriRequest httpUriRequest = getHttpUriRequest(request, site, headers, proxyHost);//���������˴��� - httpResponse = getHttpClient(site, proxy).execute(httpUriRequest);//getHttpClient�������˴�����֤ + HttpUriRequest httpUriRequest = getHttpUriRequest(request, site, headers, proxyHost); + httpResponse = getHttpClient(site, proxy).execute(httpUriRequest); statusCode = httpResponse.getStatusLine().getStatusCode(); request.putExtra(Request.STATUS_CODE, statusCode); if (statusAccept(acceptStatCode, statusCode)) { From 3a796b9413b1166bd9fa3181e5f4f6f259d6393c Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 25 Feb 2017 12:01:12 +0800 Subject: [PATCH 02/27] remove duplicate code #421 --- .../src/main/java/us/codecraft/webmagic/Spider.java | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index b1afb66..9045ad8 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -305,7 +305,7 @@ public class Spider implements Runnable, Task 
{ initComponent(); logger.info("Spider " + getUUID() + " started!"); while (!Thread.currentThread().isInterrupted() && stat.get() == STAT_RUNNING) { - Request request = scheduler.poll(this); + final Request request = scheduler.poll(this); if (request == null) { if (threadPool.getThreadAlive() == 0 && exitWhenComplete) { break; @@ -313,16 +313,15 @@ public class Spider implements Runnable, Task { // wait until new url added waitNewUrl(); } else { - final Request requestFinal = request; threadPool.execute(new Runnable() { @Override public void run() { try { - processRequest(requestFinal); - onSuccess(requestFinal); + processRequest(request); + onSuccess(request); } catch (Exception e) { - onError(requestFinal); - logger.error("process request " + requestFinal + " error", e); + onError(request); + logger.error("process request " + request + " error", e); } finally { pageCount.incrementAndGet(); signalNewUrl(); From ad6996300531f4f81f4a615469f00e1e9fcf936b Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 25 Feb 2017 19:42:12 +0800 Subject: [PATCH 03/27] remove synchronize in Page #411 --- .../main/java/us/codecraft/webmagic/Page.java | 34 +++++++------------ 1 file changed, 13 insertions(+), 21 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java index 62f21f8..7c0064d 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java @@ -107,14 +107,12 @@ public class Page { * @param requests requests */ public void addTargetRequests(List requests) { - synchronized (targetRequests) { - for (String s : requests) { - if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) { - continue; - } - s = UrlUtils.canonicalizeUrl(s, url.toString()); - targetRequests.add(new Request(s)); + for (String s : requests) { + if (StringUtils.isBlank(s) || s.equals("#") || 
s.startsWith("javascript:")) { + continue; } + s = UrlUtils.canonicalizeUrl(s, url.toString()); + targetRequests.add(new Request(s)); } } @@ -125,14 +123,12 @@ public class Page { * @param priority priority */ public void addTargetRequests(List requests, long priority) { - synchronized (targetRequests) { - for (String s : requests) { - if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) { - continue; - } - s = UrlUtils.canonicalizeUrl(s, url.toString()); - targetRequests.add(new Request(s).setPriority(priority)); + for (String s : requests) { + if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) { + continue; } + s = UrlUtils.canonicalizeUrl(s, url.toString()); + targetRequests.add(new Request(s).setPriority(priority)); } } @@ -145,10 +141,8 @@ public class Page { if (StringUtils.isBlank(requestString) || requestString.equals("#")) { return; } - synchronized (targetRequests) { - requestString = UrlUtils.canonicalizeUrl(requestString, url.toString()); - targetRequests.add(new Request(requestString)); - } + requestString = UrlUtils.canonicalizeUrl(requestString, url.toString()); + targetRequests.add(new Request(requestString)); } /** @@ -157,9 +151,7 @@ public class Page { * @param request request */ public void addTargetRequest(Request request) { - synchronized (targetRequests) { - targetRequests.add(request); - } + targetRequests.add(request); } /** From 00e81bd650c0fc16adaf13b290ddc158ceae4a3c Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 25 Feb 2017 19:51:03 +0800 Subject: [PATCH 04/27] update common-collections to 3.2.2 #456 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 0743c02..0914e26 100644 --- a/pom.xml +++ b/pom.xml @@ -130,7 +130,7 @@ commons-collections commons-collections - 3.2.1 + 3.2.2 org.apache.commons From bbe0b52dddb7b32b82cd913a99f18672ef1df6a8 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 25 Feb 2017 19:55:45 +0800 Subject: 
[PATCH 05/27] remove synchronized in QueueScheduler #410 --- .../java/us/codecraft/webmagic/scheduler/QueueScheduler.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java index c38311f..078506c 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java @@ -26,7 +26,7 @@ public class QueueScheduler extends DuplicateRemovedScheduler implements Monitor } @Override - public synchronized Request poll(Task task) { + public Request poll(Task task) { return queue.poll(); } From 1d2171805fbc7447c1f10b15f8033e406b7add0f Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 25 Feb 2017 22:30:48 +0800 Subject: [PATCH 06/27] add test for #228 --- .../test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java index 86b9db3..a90304d 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java @@ -20,6 +20,9 @@ public class UrlUtilsTest { absoluteUrl = UrlUtils.canonicalizeUrl("../aa", "http://www.dianping.com/sh/ss/com"); assertThat(absoluteUrl).isEqualTo("http://www.dianping.com/sh/aa"); + absoluteUrl = UrlUtils.canonicalizeUrl("../mshz", "http://www.court.gov.cn/zgcpwsw/zgrmfy/"); + assertThat(absoluteUrl).isEqualTo("http://www.court.gov.cn/zgcpwsw/mshz"); + absoluteUrl = UrlUtils.canonicalizeUrl("..aa", "http://www.dianping.com/sh/ss/com"); assertThat(absoluteUrl).isEqualTo("http://www.dianping.com/sh/ss/..aa"); From a872a6480e3b1a7ce904b5e23e38652451b97111 Mon Sep 17 00:00:00 2001 
From: "yihua.huang" Date: Sat, 25 Feb 2017 22:46:29 +0800 Subject: [PATCH 07/27] fix code sample for github #348 --- README.md | 4 ++-- .../webmagic/processor/example/GithubRepoPageProcessor.java | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 285eb60..f1ddd27 100644 --- a/README.md +++ b/README.md @@ -59,7 +59,7 @@ public class GithubRepoPageProcessor implements PageProcessor { public void process(Page page) { page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all()); page.putField("author", page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString()); - page.putField("name", page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString()); + page.putField("name", page.getHtml().xpath("//h1[@class='public']/strong/a/text()").toString()); if (page.getResultItems().get("name")==null){ //skip this page page.setSkip(true); @@ -89,7 +89,7 @@ You can also use annotation way: @HelpUrl("https://github.com/\\w+") public class GithubRepo { - @ExtractBy(value = "//h1[@class='entry-title public']/strong/a/text()", notNull = true) + @ExtractBy(value = "//h1[@class='public']/strong/a/text()", notNull = true) private String name; @ExtractByUrl("https://github\\.com/(\\w+)/.*") diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/GithubRepoPageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/GithubRepoPageProcessor.java index 955bd5a..e93ab4c 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/GithubRepoPageProcessor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/GithubRepoPageProcessor.java @@ -18,7 +18,7 @@ public class GithubRepoPageProcessor implements PageProcessor { page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-]+/[\\w\\-]+)").all()); 
page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-])").all()); page.putField("author", page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString()); - page.putField("name", page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString()); + page.putField("name", page.getHtml().xpath("//h1[@class='public']/strong/a/text()").toString()); if (page.getResultItems().get("name")==null){ //skip this page page.setSkip(true); From d6cd92b1a889e0d53b71d7a7b3d81eeff28c3a56 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Mon, 27 Feb 2017 10:26:26 +0800 Subject: [PATCH 08/27] LICENSE file --- LICENSE | 191 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 191 insertions(+) create mode 100644 LICENSE diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..0cecd85 --- /dev/null +++ b/LICENSE @@ -0,0 +1,191 @@ +Apache License +Version 2.0, January 2004 +http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + +"License" shall mean the terms and conditions for use, reproduction, and +distribution as defined by Sections 1 through 9 of this document. + +"Licensor" shall mean the copyright owner or entity authorized by the copyright +owner that is granting the License. + +"Legal Entity" shall mean the union of the acting entity and all other entities +that control, are controlled by, or are under common control with that entity. +For the purposes of this definition, "control" means (i) the power, direct or +indirect, to cause the direction or management of such entity, whether by +contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the +outstanding shares, or (iii) beneficial ownership of such entity. + +"You" (or "Your") shall mean an individual or Legal Entity exercising +permissions granted by this License. 
+ +"Source" form shall mean the preferred form for making modifications, including +but not limited to software source code, documentation source, and configuration +files. + +"Object" form shall mean any form resulting from mechanical transformation or +translation of a Source form, including but not limited to compiled object code, +generated documentation, and conversions to other media types. + +"Work" shall mean the work of authorship, whether in Source or Object form, made +available under the License, as indicated by a copyright notice that is included +in or attached to the work (an example is provided in the Appendix below). + +"Derivative Works" shall mean any work, whether in Source or Object form, that +is based on (or derived from) the Work and for which the editorial revisions, +annotations, elaborations, or other modifications represent, as a whole, an +original work of authorship. For the purposes of this License, Derivative Works +shall not include works that remain separable from, or merely link (or bind by +name) to the interfaces of, the Work and Derivative Works thereof. + +"Contribution" shall mean any work of authorship, including the original version +of the Work and any modifications or additions to that Work or Derivative Works +thereof, that is intentionally submitted to Licensor for inclusion in the Work +by the copyright owner or by an individual or Legal Entity authorized to submit +on behalf of the copyright owner. 
For the purposes of this definition, +"submitted" means any form of electronic, verbal, or written communication sent +to the Licensor or its representatives, including but not limited to +communication on electronic mailing lists, source code control systems, and +issue tracking systems that are managed by, or on behalf of, the Licensor for +the purpose of discussing and improving the Work, but excluding communication +that is conspicuously marked or otherwise designated in writing by the copyright +owner as "Not a Contribution." + +"Contributor" shall mean Licensor and any individual or Legal Entity on behalf +of whom a Contribution has been received by Licensor and subsequently +incorporated within the Work. + +2. Grant of Copyright License. + +Subject to the terms and conditions of this License, each Contributor hereby +grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, +irrevocable copyright license to reproduce, prepare Derivative Works of, +publicly display, publicly perform, sublicense, and distribute the Work and such +Derivative Works in Source or Object form. + +3. Grant of Patent License. + +Subject to the terms and conditions of this License, each Contributor hereby +grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, +irrevocable (except as stated in this section) patent license to make, have +made, use, offer to sell, sell, import, and otherwise transfer the Work, where +such license applies only to those patent claims licensable by such Contributor +that are necessarily infringed by their Contribution(s) alone or by combination +of their Contribution(s) with the Work to which such Contribution(s) was +submitted. 
If You institute patent litigation against any entity (including a +cross-claim or counterclaim in a lawsuit) alleging that the Work or a +Contribution incorporated within the Work constitutes direct or contributory +patent infringement, then any patent licenses granted to You under this License +for that Work shall terminate as of the date such litigation is filed. + +4. Redistribution. + +You may reproduce and distribute copies of the Work or Derivative Works thereof +in any medium, with or without modifications, and in Source or Object form, +provided that You meet the following conditions: + +You must give any other recipients of the Work or Derivative Works a copy of +this License; and +You must cause any modified files to carry prominent notices stating that You +changed the files; and +You must retain, in the Source form of any Derivative Works that You distribute, +all copyright, patent, trademark, and attribution notices from the Source form +of the Work, excluding those notices that do not pertain to any part of the +Derivative Works; and +If the Work includes a "NOTICE" text file as part of its distribution, then any +Derivative Works that You distribute must include a readable copy of the +attribution notices contained within such NOTICE file, excluding those notices +that do not pertain to any part of the Derivative Works, in at least one of the +following places: within a NOTICE text file distributed as part of the +Derivative Works; within the Source form or documentation, if provided along +with the Derivative Works; or, within a display generated by the Derivative +Works, if and wherever such third-party notices normally appear. The contents of +the NOTICE file are for informational purposes only and do not modify the +License. 
You may add Your own attribution notices within Derivative Works that +You distribute, alongside or as an addendum to the NOTICE text from the Work, +provided that such additional attribution notices cannot be construed as +modifying the License. +You may add Your own copyright statement to Your modifications and may provide +additional or different license terms and conditions for use, reproduction, or +distribution of Your modifications, or for any such Derivative Works as a whole, +provided Your use, reproduction, and distribution of the Work otherwise complies +with the conditions stated in this License. + +5. Submission of Contributions. + +Unless You explicitly state otherwise, any Contribution intentionally submitted +for inclusion in the Work by You to the Licensor shall be under the terms and +conditions of this License, without any additional terms or conditions. +Notwithstanding the above, nothing herein shall supersede or modify the terms of +any separate license agreement you may have executed with Licensor regarding +such Contributions. + +6. Trademarks. + +This License does not grant permission to use the trade names, trademarks, +service marks, or product names of the Licensor, except as required for +reasonable and customary use in describing the origin of the Work and +reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. + +Unless required by applicable law or agreed to in writing, Licensor provides the +Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, +including, without limitation, any warranties or conditions of TITLE, +NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are +solely responsible for determining the appropriateness of using or +redistributing the Work and assume any risks associated with Your exercise of +permissions under this License. + +8. Limitation of Liability. 
+ +In no event and under no legal theory, whether in tort (including negligence), +contract, or otherwise, unless required by applicable law (such as deliberate +and grossly negligent acts) or agreed to in writing, shall any Contributor be +liable to You for damages, including any direct, indirect, special, incidental, +or consequential damages of any character arising as a result of this License or +out of the use or inability to use the Work (including but not limited to +damages for loss of goodwill, work stoppage, computer failure or malfunction, or +any and all other commercial damages or losses), even if such Contributor has +been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. + +While redistributing the Work or Derivative Works thereof, You may choose to +offer, and charge a fee for, acceptance of support, warranty, indemnity, or +other liability obligations and/or rights consistent with this License. However, +in accepting such obligations, You may act only on Your own behalf and on Your +sole responsibility, not on behalf of any other Contributor, and only if You +agree to indemnify, defend, and hold each Contributor harmless for any liability +incurred by, or claims asserted against, such Contributor by reason of your +accepting any such warranty or additional liability. + +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work + +To apply the Apache License to your work, attach the following boilerplate +notice, with the fields enclosed by brackets "{}" replaced with your own +identifying information. (Don't include the brackets!) The text should be +enclosed in the appropriate comment syntax for the file format. We also +recommend that a file or class name and description of purpose be included on +the same "printed page" as the copyright notice for easier identification within +third-party archives. 
+ + Copyright 2013 code4craft + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. \ No newline at end of file From d87c73b472061f17a92920fe5a0d1f76e9426d67 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 1 Mar 2017 22:24:34 +0800 Subject: [PATCH 09/27] change check-and-set to atomic sadd for redis DuplicateRemover #368 --- .../us/codecraft/webmagic/scheduler/RedisScheduler.java | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java index 61551b1..59f4b3f 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java @@ -48,11 +48,7 @@ public class RedisScheduler extends DuplicateRemovedScheduler implements Monitor public boolean isDuplicate(Request request, Task task) { Jedis jedis = pool.getResource(); try { - boolean isDuplicate = jedis.sismember(getSetKey(task), request.getUrl()); - if (!isDuplicate) { - jedis.sadd(getSetKey(task), request.getUrl()); - } - return isDuplicate; + return jedis.sadd(getSetKey(task), request.getUrl()) == 0; } finally { pool.returnResource(jedis); } From 895fca9fd711990dd8a04c6ed94738a7f82dbcfb Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 4 Mar 2017 11:34:06 +0800 Subject: [PATCH 10/27] 
=?UTF-8?q?=E4=BF=AE=E5=A4=8DseleniumDownloader?= =?UTF-8?q?=E9=85=8D=E7=BD=AE=E6=96=87=E4=BB=B6=E5=86=99=E6=AD=BB=E7=9A=84?= =?UTF-8?q?=E9=97=AE=E9=A2=98=20#475?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- webmagic-selenium/pom.xml | 2 +- .../webmagic/downloader/selenium/WebDriverPool.java | 8 ++++++-- webmagic-selenium/src/test/resources/config.ini | 11 +++++++++++ 3 files changed, 18 insertions(+), 3 deletions(-) create mode 100644 webmagic-selenium/src/test/resources/config.ini diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index 6ddc61c..b66ca0c 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -13,7 +13,7 @@ org.seleniumhq.selenium selenium-java - 2.46.0 + 2.41.0 us.codecraft diff --git a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java index 59f83ea..1472cb3 100644 --- a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java +++ b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java @@ -45,7 +45,7 @@ class WebDriverPool { private WebDriver mDriver = null; private boolean mAutoQuitDriver = true; - private static final String CONFIG_FILE = "/Users/Bingo/Documents/workspace/webmagic/webmagic-selenium/config.ini"; + private static final String DEFAULT_CONFIG_FILE = "/data/webmagic/webmagic-selenium/config.ini"; private static final String DRIVER_FIREFOX = "firefox"; private static final String DRIVER_CHROME = "chrome"; private static final String DRIVER_PHANTOMJS = "phantomjs"; @@ -64,7 +64,11 @@ class WebDriverPool { public void configure() throws IOException { // Read config file sConfig = new Properties(); - sConfig.load(new FileReader(CONFIG_FILE)); + String configFile = DEFAULT_CONFIG_FILE; + if (System.getProperty("selenuim_config")!=null){ + 
configFile = System.getProperty("selenuim_config"); + } + sConfig.load(new FileReader(configFile)); // Prepare capabilities sCaps = new DesiredCapabilities(); diff --git a/webmagic-selenium/src/test/resources/config.ini b/webmagic-selenium/src/test/resources/config.ini new file mode 100644 index 0000000..40c8b46 --- /dev/null +++ b/webmagic-selenium/src/test/resources/config.ini @@ -0,0 +1,11 @@ +#driver=phantomjs +driver=firefox +driver=chrome +#driver=http://localhost:8910 +driver=http://localhost:4444/wd/hub + +# PhantomJS specific config (change according to your installation) +#phantomjs_exec_path=/Users/detro/bin/phantomjs-qt5 +phantomjs_exec_path=/Users/detro/bin/phantomjs-upstream +phantomjs_driver_path=../../src/main.js +phantomjs_driver_loglevel=DEBUG \ No newline at end of file From 11904a4d41d15deab7ad0f8811f9a13ffd366bd0 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 4 Mar 2017 11:42:25 +0800 Subject: [PATCH 11/27] fix huaban demo #475 --- .../java/us/codecraft/webmagic/samples/HuabanProcessor.java | 2 +- webmagic-selenium/src/test/resources/config.ini | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/webmagic-selenium/src/test/java/us/codecraft/webmagic/samples/HuabanProcessor.java b/webmagic-selenium/src/test/java/us/codecraft/webmagic/samples/HuabanProcessor.java index 2854a76..ad3a3e5 100644 --- a/webmagic-selenium/src/test/java/us/codecraft/webmagic/samples/HuabanProcessor.java +++ b/webmagic-selenium/src/test/java/us/codecraft/webmagic/samples/HuabanProcessor.java @@ -22,7 +22,7 @@ public class HuabanProcessor implements PageProcessor { public void process(Page page) { page.addTargetRequests(page.getHtml().links().regex("http://huaban\\.com/.*").all()); if (page.getUrl().toString().contains("pins")) { - page.putField("img", page.getHtml().xpath("//div[@id='pin_img']/a/img/@src").toString()); + page.putField("img", page.getHtml().xpath("//div[@class='image-holder']/a/img/@src").toString()); } else { 
page.getResultItems().setSkip(true); } diff --git a/webmagic-selenium/src/test/resources/config.ini b/webmagic-selenium/src/test/resources/config.ini index 40c8b46..6bd19af 100644 --- a/webmagic-selenium/src/test/resources/config.ini +++ b/webmagic-selenium/src/test/resources/config.ini @@ -1,5 +1,5 @@ #driver=phantomjs -driver=firefox +#driver=firefox driver=chrome #driver=http://localhost:8910 driver=http://localhost:4444/wd/hub From e645524ad2166ac9155032ae808c2afa4f7b7905 Mon Sep 17 00:00:00 2001 From: "Ckex.zha" Date: Sat, 4 Mar 2017 20:57:29 +0800 Subject: [PATCH 12/27] fix bug,set ExecutorService --- webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java | 1 + 1 file changed, 1 insertion(+) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 9045ad8..49734b7 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -586,6 +586,7 @@ public class Spider implements Runnable, Task { if (threadNum <= 0) { throw new IllegalArgumentException("threadNum should be more than one!"); } + this.executorService = executorService; return this; } From 8b8f535c309658c3e33c1a2e53b61d1fce13651e Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 11 Mar 2017 10:43:10 +0800 Subject: [PATCH 13/27] refactor:extract charset detect to utils --- .../downloader/HttpClientDownloader.java | 43 +------------ .../webmagic/utils/CharsetUtils.java | 61 +++++++++++++++++++ 2 files changed, 63 insertions(+), 41 deletions(-) create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/utils/CharsetUtils.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index e57d5cd..ca35867 100644 --- 
a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -1,7 +1,6 @@ package us.codecraft.webmagic.downloader; import org.apache.commons.io.IOUtils; -import org.apache.commons.lang3.StringUtils; import org.apache.http.HttpHost; import org.apache.http.HttpResponse; import org.apache.http.NameValuePair; @@ -13,10 +12,6 @@ import org.apache.http.client.methods.HttpUriRequest; import org.apache.http.client.methods.RequestBuilder; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.util.EntityUtils; -import org.jsoup.Jsoup; -import org.jsoup.nodes.Document; -import org.jsoup.nodes.Element; -import org.jsoup.select.Elements; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import us.codecraft.webmagic.Page; @@ -25,8 +20,8 @@ import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.proxy.Proxy; import us.codecraft.webmagic.selector.PlainText; +import us.codecraft.webmagic.utils.CharsetUtils; import us.codecraft.webmagic.utils.HttpConstant; -import us.codecraft.webmagic.utils.UrlUtils; import us.codecraft.webmagic.utils.WMCollections; import java.io.IOException; @@ -213,40 +208,6 @@ public class HttpClientDownloader extends AbstractDownloader { } protected String getHtmlCharset(HttpResponse httpResponse, byte[] contentBytes) throws IOException { - String charset; - // charset - // 1、encoding in http header Content-Type - String value = httpResponse.getEntity().getContentType().getValue(); - charset = UrlUtils.getCharset(value); - if (StringUtils.isNotBlank(charset)) { - logger.debug("Auto get charset: {}", charset); - return charset; - } - // use default charset to decode first time - Charset defaultCharset = Charset.defaultCharset(); - String content = new String(contentBytes, defaultCharset.name()); - // 2、charset in meta - if (StringUtils.isNotEmpty(content)) { - 
Document document = Jsoup.parse(content); - Elements links = document.select("meta"); - for (Element link : links) { - // 2.1、html4.01 - String metaContent = link.attr("content"); - String metaCharset = link.attr("charset"); - if (metaContent.indexOf("charset") != -1) { - metaContent = metaContent.substring(metaContent.indexOf("charset"), metaContent.length()); - charset = metaContent.split("=")[1]; - break; - } - // 2.2、html5 - else if (StringUtils.isNotEmpty(metaCharset)) { - charset = metaCharset; - break; - } - } - } - logger.debug("Auto get charset: {}", charset); - // 3、todo use tools as cpdetector for content decode - return charset; + return CharsetUtils.detectCharset(httpResponse.getEntity().getContentType().getValue(), contentBytes); } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/CharsetUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/CharsetUtils.java new file mode 100644 index 0000000..50b4f1b --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/CharsetUtils.java @@ -0,0 +1,61 @@ +package us.codecraft.webmagic.utils; + +import org.apache.commons.lang3.StringUtils; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.nio.charset.Charset; + +/** + * @author code4crafter@gmail.com + * Date: 17/3/11 + * Time: 10:36 + * @since 0.6.2 + */ +public abstract class CharsetUtils { + + private static Logger logger = LoggerFactory.getLogger(CharsetUtils.class); + + public static String detectCharset(String contentType, byte[] contentBytes) throws IOException { + String charset; + // charset + // 1、encoding in http header Content-Type + charset = UrlUtils.getCharset(contentType); + if (StringUtils.isNotBlank(charset)) { + logger.debug("Auto get charset: {}", charset); + return charset; + } + // use default charset to 
decode first time + Charset defaultCharset = Charset.defaultCharset(); + String content = new String(contentBytes, defaultCharset); + // 2、charset in meta + if (StringUtils.isNotEmpty(content)) { + Document document = Jsoup.parse(content); + Elements links = document.select("meta"); + for (Element link : links) { + // 2.1、html4.01 + String metaContent = link.attr("content"); + String metaCharset = link.attr("charset"); + if (metaContent.indexOf("charset") != -1) { + metaContent = metaContent.substring(metaContent.indexOf("charset"), metaContent.length()); + charset = metaContent.split("=")[1]; + break; + } + // 2.2、html5 + else if (StringUtils.isNotEmpty(metaCharset)) { + charset = metaCharset; + break; + } + } + } + logger.debug("Auto get charset: {}", charset); + // 3、todo use tools as cpdetector for content decode + return charset; + } + +} From ef325718219ecb3cef56911dd72ea04c40dd8673 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 11 Mar 2017 10:52:39 +0800 Subject: [PATCH 14/27] rewrite Request.equals and hashCode, add Method to check #483 --- .../java/us/codecraft/webmagic/Request.java | 35 ++++++++++--------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java index 9ecb172..3649d32 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java @@ -81,27 +81,10 @@ public class Request implements Serializable { return url; } - @Override - public boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - - Request request = (Request) o; - - if (!url.equals(request.url)) return false; - - return true; - } - public Map getExtras() { return extras; } - @Override - public int hashCode() { - return url.hashCode(); - } - public void setExtras(Map extras) { this.extras = extras; } 
@@ -133,4 +116,22 @@ public class Request implements Serializable { ", priority=" + priority + '}'; } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + Request request = (Request) o; + + if (url != null ? !url.equals(request.url) : request.url != null) return false; + return method != null ? method.equals(request.method) : request.method == null; + } + + @Override + public int hashCode() { + int result = url != null ? url.hashCode() : 0; + result = 31 * result + (method != null ? method.hashCode() : 0); + return result; + } } From 0a1fb190526283e70963d4433780b626808eee0c Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 11 Mar 2017 10:56:31 +0800 Subject: [PATCH 15/27] add tests #483 --- .../us/codecraft/webmagic/RequestTest.java | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 webmagic-core/src/test/java/us/codecraft/webmagic/RequestTest.java diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/RequestTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/RequestTest.java new file mode 100644 index 0000000..c7e4943 --- /dev/null +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/RequestTest.java @@ -0,0 +1,25 @@ +package us.codecraft.webmagic; + +import org.junit.Test; +import us.codecraft.webmagic.utils.HttpConstant; + +import static org.assertj.core.api.Assertions.assertThat; + +/** + * @author code4crafter@gmail.com + * Date: 17/3/11 + */ +public class RequestTest { + + @Test + public void testEqualsAndHashCode() throws Exception { + Request requestA = new Request("http://www.google.com/"); + Request requestB = new Request("http://www.google.com/"); + assertThat(requestA.hashCode()).isEqualTo(requestB.hashCode()); + assertThat(requestA).isEqualTo(requestB); + requestA.setMethod(HttpConstant.Method.GET); + requestA.setMethod(HttpConstant.Method.POST); + assertThat(requestA).isNotEqualTo(requestB); + 
assertThat(requestA.hashCode()).isNotEqualTo(requestB.hashCode()); + } +} From 2a35bb468813bfdf46d00c4a1087b32d08dce7d9 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 11 Mar 2017 10:59:55 +0800 Subject: [PATCH 16/27] remove contributors because it's hard to maintain the list: see https://github.com/code4craft/webmagic/graphs/contributors instead --- README-zh.md | 24 ------------------------ README.md | 25 ------------------------- 2 files changed, 49 deletions(-) diff --git a/README-zh.md b/README-zh.md index d69dd63..e8f0735 100644 --- a/README-zh.md +++ b/README-zh.md @@ -168,30 +168,6 @@ webmagic的使用可以参考:[oschina openapi 应用:博客搬家](http://m webmagic遵循[Apache 2.0协议](http://opensource.org/licenses/Apache-2.0) -### 贡献者: - -以下是为WebMagic提交过代码或者issue的朋友: - -* [ccliangbo](https://github.com/ccliangbo) -* [yuany](https://github.com/yuany) -* [yxssfxwzy](https://github.com/yxssfxwzy) -* [linkerlin](https://github.com/linkerlin) -* [d0ngw](https://github.com/d0ngw) -* [xuchaoo](https://github.com/xuchaoo) -* [supermicah](https://github.com/supermicah) -* [SimpleExpress](https://github.com/SimpleExpress) -* [aruanruan](https://github.com/aruanruan) -* [l1z2g9](https://github.com/l1z2g9) -* [zhegexiaohuozi](https://github.com/zhegexiaohuozi) -* [ywooer](https://github.com/ywooer) -* [yyw258520](https://github.com/yyw258520) -* [perfecking](https://github.com/perfecking) -* [lidongyang](http://my.oschina.net/lidongyang) -* [seveniu](https://github.com/seveniu) -* [sebastian1118](https://github.com/sebastian1118) -* [codev777](https://github.com/codev777) -* [fengwuze](https://github.com/fengwuze) - ### 邮件组: Gmail: diff --git a/README.md b/README.md index f1ddd27..5572b0c 100644 --- a/README.md +++ b/README.md @@ -122,31 +122,6 @@ There are some samples in `webmagic-samples` package. 
Lisenced under [Apache 2.0 lisence](http://opensource.org/licenses/Apache-2.0) -### Contributors: - -Thanks these people for commiting source code, reporting bugs or suggesting for new feature: - -* [ccliangbo](https://github.com/ccliangbo) -* [yuany](https://github.com/yuany) -* [yxssfxwzy](https://github.com/yxssfxwzy) -* [linkerlin](https://github.com/linkerlin) -* [d0ngw](https://github.com/d0ngw) -* [xuchaoo](https://github.com/xuchaoo) -* [supermicah](https://github.com/supermicah) -* [SimpleExpress](https://github.com/SimpleExpress) -* [aruanruan](https://github.com/aruanruan) -* [l1z2g9](https://github.com/l1z2g9) -* [zhegexiaohuozi](https://github.com/zhegexiaohuozi) -* [ywooer](https://github.com/ywooer) -* [yyw258520](https://github.com/yyw258520) -* [perfecking](https://github.com/perfecking) -* [lidongyang](http://my.oschina.net/lidongyang) -* [seveniu](https://github.com/seveniu) -* [sebastian1118](https://github.com/sebastian1118) -* [codev777](https://github.com/codev777) -* [fengwuze](https://github.com/fengwuze) - - ### Thanks: To write webmagic, I refered to the projects below : From 45bf2b6fd7433bb1b8e85617e761f83bc4978e98 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 11 Mar 2017 11:01:25 +0800 Subject: [PATCH 17/27] remove javadoc link because it's out of date --- README.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/README.md b/README.md index 5572b0c..8785844 100644 --- a/README.md +++ b/README.md @@ -114,9 +114,7 @@ The architecture of webmagic (refered to [Scrapy](http://scrapy.org/)) ![image](http://code4craft.github.io/images/posts/webmagic.png) -Javadocs: [http://code4craft.github.io/webmagic/docs/en/](http://code4craft.github.io/webmagic/docs/en/) - -There are some samples in `webmagic-samples` package. +There are more examples in `webmagic-samples` package. 
### Lisence: From 5215a492ccfe69bc83b6cba31f76211c5fef3dae Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 11 Mar 2017 11:26:13 +0800 Subject: [PATCH 18/27] remove duplicate check for POST request #484 --- .../webmagic/scheduler/DuplicateRemovedScheduler.java | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/DuplicateRemovedScheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/DuplicateRemovedScheduler.java index 9be7adb..6b7ebae 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/DuplicateRemovedScheduler.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/DuplicateRemovedScheduler.java @@ -6,6 +6,7 @@ import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.scheduler.component.DuplicateRemover; import us.codecraft.webmagic.scheduler.component.HashSetDuplicateRemover; +import us.codecraft.webmagic.utils.HttpConstant; /** * Remove duplicate urls and only push urls which are not duplicate.

@@ -31,7 +32,7 @@ public abstract class DuplicateRemovedScheduler implements Scheduler { @Override public void push(Request request, Task task) { logger.trace("get a candidate url {}", request.getUrl()); - if (!duplicatedRemover.isDuplicate(request, task) || shouldReserved(request)) { + if (!duplicatedRemover.isDuplicate(request, task) || shouldReserved(request) || noNeedToRemoveDuplicate(request)) { logger.debug("push to queue {}", request.getUrl()); pushWhenNoDuplicate(request, task); } @@ -41,6 +42,10 @@ public abstract class DuplicateRemovedScheduler implements Scheduler { return request.getExtra(Request.CYCLE_TRIED_TIMES) != null; } + protected boolean noNeedToRemoveDuplicate(Request request) { + return HttpConstant.Method.POST.equalsIgnoreCase(request.getMethod()); + } + protected void pushWhenNoDuplicate(Request request, Task task) { } From fc702fd3b659f156a738057b5066c59e0d542dc2 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 11 Mar 2017 11:31:15 +0800 Subject: [PATCH 19/27] introduce mockito for test --- pom.xml | 6 ++++++ webmagic-core/pom.xml | 5 +++++ .../DuplicateRemovedSchedulerTest.java | 17 +++++++++++++++++ 3 files changed, 28 insertions(+) create mode 100644 webmagic-samples/src/test/java/us/codecraft/webmagic/samples/scheduler/DuplicateRemovedSchedulerTest.java diff --git a/pom.xml b/pom.xml index 0914e26..7fcdd06 100644 --- a/pom.xml +++ b/pom.xml @@ -64,6 +64,12 @@ 4.11 test
+ + org.mockito + mockito-all + 1.10.19 + test + org.apache.httpcomponents httpclient diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index fbd5034..ad96961 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -40,6 +40,11 @@ slf4j-api + + org.mockito + mockito-all + + org.slf4j slf4j-log4j12 diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/samples/scheduler/DuplicateRemovedSchedulerTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/samples/scheduler/DuplicateRemovedSchedulerTest.java new file mode 100644 index 0000000..6f7a5d1 --- /dev/null +++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/samples/scheduler/DuplicateRemovedSchedulerTest.java @@ -0,0 +1,17 @@ +package us.codecraft.webmagic.samples.scheduler; + +import org.junit.Test; + +/** + * @author code4crafter@gmail.com + * Date: 17/3/11 + * Time: 上午11:26 + */ +public class DuplicateRemovedSchedulerTest { + + @Test + public void testDuplicateRemoved() throws Exception { + + + } +} From 9b964c0a9968f3c4bb6837b5c584f45ed87851c2 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 11 Mar 2017 11:41:01 +0800 Subject: [PATCH 20/27] test for #484 --- .../scheduler/DuplicateRemovedScheduler.java | 2 +- .../DuplicateRemovedSchedulerTest.java | 39 +++++++++++++++++++ .../DuplicateRemovedSchedulerTest.java | 17 -------- 3 files changed, 40 insertions(+), 18 deletions(-) create mode 100644 webmagic-core/src/test/java/us/codecraft/webmagic/scheduler/DuplicateRemovedSchedulerTest.java delete mode 100644 webmagic-samples/src/test/java/us/codecraft/webmagic/samples/scheduler/DuplicateRemovedSchedulerTest.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/DuplicateRemovedScheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/DuplicateRemovedScheduler.java index 6b7ebae..ecbeecb 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/DuplicateRemovedScheduler.java +++ 
b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/DuplicateRemovedScheduler.java @@ -32,7 +32,7 @@ public abstract class DuplicateRemovedScheduler implements Scheduler { @Override public void push(Request request, Task task) { logger.trace("get a candidate url {}", request.getUrl()); - if (!duplicatedRemover.isDuplicate(request, task) || shouldReserved(request) || noNeedToRemoveDuplicate(request)) { + if (shouldReserved(request) || noNeedToRemoveDuplicate(request) || !duplicatedRemover.isDuplicate(request, task)) { logger.debug("push to queue {}", request.getUrl()); pushWhenNoDuplicate(request, task); } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/scheduler/DuplicateRemovedSchedulerTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/scheduler/DuplicateRemovedSchedulerTest.java new file mode 100644 index 0000000..da69129 --- /dev/null +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/scheduler/DuplicateRemovedSchedulerTest.java @@ -0,0 +1,39 @@ +package us.codecraft.webmagic.scheduler; + +import org.junit.Test; +import org.junit.runner.RunWith; +import org.mockito.Mockito; +import org.mockito.runners.MockitoJUnitRunner; +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.scheduler.component.DuplicateRemover; +import us.codecraft.webmagic.utils.HttpConstant; + +import static org.mockito.Matchers.any; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; + +/** + * @author code4crafter@gmail.com + * Date: 17/3/11 + * Time: 上午11:26 + */ +@RunWith(MockitoJUnitRunner.class) +public class DuplicateRemovedSchedulerTest { + + @Test + public void test_no_duplicate_removed_for_post_request() throws Exception { + DuplicateRemovedScheduler duplicateRemovedScheduler = new DuplicateRemovedScheduler() { + @Override + public Request poll(Task task) { + return null; + } + }; + DuplicateRemover duplicateRemover = Mockito.mock(DuplicateRemover.class); + 
duplicateRemovedScheduler.setDuplicateRemover(duplicateRemover); + Request request = new Request("https://www.google.com/"); + request.setMethod(HttpConstant.Method.POST); + duplicateRemovedScheduler.push(request, null); + verify(duplicateRemover,times(0)).isDuplicate(any(Request.class),any(Task.class)); + } +} diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/samples/scheduler/DuplicateRemovedSchedulerTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/samples/scheduler/DuplicateRemovedSchedulerTest.java deleted file mode 100644 index 6f7a5d1..0000000 --- a/webmagic-samples/src/test/java/us/codecraft/webmagic/samples/scheduler/DuplicateRemovedSchedulerTest.java +++ /dev/null @@ -1,17 +0,0 @@ -package us.codecraft.webmagic.samples.scheduler; - -import org.junit.Test; - -/** - * @author code4crafter@gmail.com - * Date: 17/3/11 - * Time: 上午11:26 - */ -public class DuplicateRemovedSchedulerTest { - - @Test - public void testDuplicateRemoved() throws Exception { - - - } -} From c175ea88c0bcc2a35055f7d851fc98bbd37e4838 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 11 Mar 2017 11:43:18 +0800 Subject: [PATCH 21/27] #more test #484 --- .../DuplicateRemovedSchedulerTest.java | 23 ++++++++++++++----- 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/scheduler/DuplicateRemovedSchedulerTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/scheduler/DuplicateRemovedSchedulerTest.java index da69129..a098049 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/scheduler/DuplicateRemovedSchedulerTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/scheduler/DuplicateRemovedSchedulerTest.java @@ -21,14 +21,15 @@ import static org.mockito.Mockito.verify; @RunWith(MockitoJUnitRunner.class) public class DuplicateRemovedSchedulerTest { + private DuplicateRemovedScheduler duplicateRemovedScheduler = new DuplicateRemovedScheduler() { + @Override + public 
Request poll(Task task) { + return null; + } + }; + @Test public void test_no_duplicate_removed_for_post_request() throws Exception { - DuplicateRemovedScheduler duplicateRemovedScheduler = new DuplicateRemovedScheduler() { - @Override - public Request poll(Task task) { - return null; - } - }; DuplicateRemover duplicateRemover = Mockito.mock(DuplicateRemover.class); duplicateRemovedScheduler.setDuplicateRemover(duplicateRemover); Request request = new Request("https://www.google.com/"); @@ -36,4 +37,14 @@ public class DuplicateRemovedSchedulerTest { duplicateRemovedScheduler.push(request, null); verify(duplicateRemover,times(0)).isDuplicate(any(Request.class),any(Task.class)); } + + @Test + public void test_duplicate_removed_for_get_request() throws Exception { + DuplicateRemover duplicateRemover = Mockito.mock(DuplicateRemover.class); + duplicateRemovedScheduler.setDuplicateRemover(duplicateRemover); + Request request = new Request("https://www.google.com/"); + request.setMethod(HttpConstant.Method.GET); + duplicateRemovedScheduler.push(request, null); + verify(duplicateRemover,times(1)).isDuplicate(any(Request.class),any(Task.class)); + } } From 791520e6a0a0315f24c6e5030795658c38a0c125 Mon Sep 17 00:00:00 2001 From: mei Date: Fri, 17 Mar 2017 00:06:15 +0800 Subject: [PATCH 22/27] fix a bug of RegexSelector when regex has zero-width assertions. 
--- .../webmagic/selector/RegexSelector.java | 27 +++++++++++++++++-- .../webmagic/selector/RegexSelectorTest.java | 16 +++++++++++ 2 files changed, 41 insertions(+), 2 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java index 4381896..584cf90 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java @@ -28,8 +28,7 @@ public class RegexSelector implements Selector { } // Check bracket for regex group. Add default group 1 if there is no group. // Only check if there exists the valid left parenthesis, leave regexp validation for Pattern. - if (StringUtils.countMatches(regexStr, "(") - StringUtils.countMatches(regexStr, "\\(") == - StringUtils.countMatches(regexStr, "(?:") - StringUtils.countMatches(regexStr, "\\(?:")) { + if ( ! hasGroup(regexStr) ){ regexStr = "(" + regexStr + ")"; } this.regexStr = regexStr; @@ -45,6 +44,30 @@ public class RegexSelector implements Selector { this(regexStr, 1); } + private boolean hasGroup(String regexStr) { + if (StringUtils.countMatches(regexStr, "(") - StringUtils.countMatches(regexStr, "\\(") == + StringUtils.countMatches(regexStr, "(?:") - StringUtils.countMatches(regexStr, "\\(?:")){ + return false; + } + if (StringUtils.countMatches(regexStr, "(") - StringUtils.countMatches(regexStr, "\\(") == + StringUtils.countMatches(regexStr, "(?=") - StringUtils.countMatches(regexStr, "\\(?=") ) { + return false; + } + if (StringUtils.countMatches(regexStr, "(") - StringUtils.countMatches(regexStr, "\\(") == + StringUtils.countMatches(regexStr, "(?<") - StringUtils.countMatches(regexStr, "\\(?<") ) { + return false; + } + if (StringUtils.countMatches(regexStr, "(") - StringUtils.countMatches(regexStr, "\\(") == + StringUtils.countMatches(regexStr, "(?!") - StringUtils.countMatches(regexStr, 
"\\(?!") ) { + return false; + } + if (StringUtils.countMatches(regexStr, "(") - StringUtils.countMatches(regexStr, "\\(") == + StringUtils.countMatches(regexStr, "(?#") - StringUtils.countMatches(regexStr, "\\(?#") ) { + return false; + } + return true; + } + @Override public String select(String text) { return selectGroup(text).get(group); diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/RegexSelectorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/RegexSelectorTest.java index 63e8e43..144e6fe 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/RegexSelectorTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/RegexSelectorTest.java @@ -22,4 +22,20 @@ public class RegexSelectorTest { String select = regexSelector.select(source); Assertions.assertThat(select).isEqualTo(source); } + + @Test + public void testRegexWithZeroWidthAssertions() { + String regex = "^.*(?=\\?)"; + String source = "hello world?xxxx"; + RegexSelector regexSelector = new RegexSelector(regex); + String select = regexSelector.select(source); + Assertions.assertThat(select).isEqualTo("hello world"); + + + regex = "\\d{3}(?!\\d)"; + source = "123456asdf"; + regexSelector = new RegexSelector(regex); + select = regexSelector.select(source); + Assertions.assertThat(select).isEqualTo("456"); + } } From 0fbf657d86a14d281f05ce367f9910c5df9612bd Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 17 Mar 2017 06:59:28 +0800 Subject: [PATCH 23/27] update fastjson to 1.2.28 #489 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 7fcdd06..04b6dec 100644 --- a/pom.xml +++ b/pom.xml @@ -103,7 +103,7 @@ com.alibaba fastjson - 1.2.21 + 1.2.28 com.github.dreamhead From aa01e27779a8148f2372abe25645169b4d56f5b0 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 17 Mar 2017 07:02:02 +0800 Subject: [PATCH 24/27] change constructor for Proxy to public #490 --- 
.../src/main/java/us/codecraft/webmagic/proxy/Proxy.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java index 2609671..dbe3a18 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java @@ -79,14 +79,14 @@ public class Proxy implements Delayed, Serializable { private List failedErrorType = new ArrayList(); - Proxy(HttpHost httpHost, String user, String password) { + public Proxy(HttpHost httpHost, String user, String password) { this.httpHost = httpHost; this.user = user; this.password = password; this.canReuseTime = System.nanoTime() + TimeUnit.NANOSECONDS.convert(reuseTimeInterval, TimeUnit.MILLISECONDS); } - Proxy(HttpHost httpHost, int reuseInterval, String user, String password) { + public Proxy(HttpHost httpHost, int reuseInterval, String user, String password) { this.httpHost = httpHost; this.user = user; this.password = password; From 75bad591d74a9f9fb99c528821ccb2277357651e Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 17 Mar 2017 07:10:14 +0800 Subject: [PATCH 25/27] rewrite hashCode and equals for params #447 --- .../java/us/codecraft/webmagic/Request.java | 27 ++++++++++++------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java index bb0d485..1978792 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java @@ -116,25 +116,19 @@ public class Request implements Serializable { } /** * POST/GET参数设置 + * @param params params * */ public void setParams(Map params) { this.params = params; } /** * POST/GET参数设置 + * @param key key + * @param value value * */ public void 
putParams(String key,String value) { params.put(key,value); } - @Override - public String toString() { - return "Request{" + - "url='" + url + '\'' + - ", method='" + method + '\'' + - ", extras=" + extras + - ", priority=" + priority + - '}'; - } @Override public boolean equals(Object o) { @@ -144,13 +138,26 @@ public class Request implements Serializable { Request request = (Request) o; if (url != null ? !url.equals(request.url) : request.url != null) return false; - return method != null ? method.equals(request.method) : request.method == null; + if (method != null ? !method.equals(request.method) : request.method != null) return false; + return params != null ? params.equals(request.params) : request.params == null; } @Override public int hashCode() { int result = url != null ? url.hashCode() : 0; result = 31 * result + (method != null ? method.hashCode() : 0); + result = 31 * result + (params != null ? params.hashCode() : 0); return result; } + + @Override + public String toString() { + return "Request{" + + "url='" + url + '\'' + + ", method='" + method + '\'' + + ", extras=" + extras + + ", params=" + params + + ", priority=" + priority + + '}'; + } } From e7d35c4846e723958ac9409ec02288001dd2403f Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 17 Mar 2017 07:18:05 +0800 Subject: [PATCH 26/27] add params to all method of request #447 --- .../java/us/codecraft/webmagic/Request.java | 8 ++++-- .../downloader/HttpClientDownloader.java | 25 +++++++++++-------- 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java index 1978792..c8c5978 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java @@ -115,14 +115,18 @@ public class Request implements Serializable { return params; } /** - * POST/GET参数设置 + * set params for request + *
+ * DO NOT set this for a request that already has params, like 'https://github.com/search?q=webmagic' * @param params params * */ public void setParams(Map params) { this.params = params; } /** - * POST/GET参数设置 + * set params for request + *
+ * DO NOT set this for a request that already has params, like 'https://github.com/search?q=webmagic' * @param key key * @param value value * */ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 26a7288..0c11149 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -162,13 +162,7 @@ public class HttpClientDownloader extends AbstractDownloader { String method = request.getMethod(); if (method == null || method.equalsIgnoreCase(HttpConstant.Method.GET)) { //default get - RequestBuilder requestBuilder=RequestBuilder.get(); - if (request.getParams() != null) { - for (Map.Entry entry : request.getParams().entrySet()) { - requestBuilder.addParameter(entry.getKey(), entry.getValue()); - } - } - return requestBuilder; + return addParams(RequestBuilder.get(),request.getParams()); } else if (method.equalsIgnoreCase(HttpConstant.Method.POST)) { RequestBuilder requestBuilder = RequestBuilder.post(); NameValuePair[] nameValuePair = (NameValuePair[]) request.getExtra("nameValuePair"); @@ -184,17 +178,26 @@ public class HttpClientDownloader extends AbstractDownloader { requestBuilder.setEntity(new UrlEncodedFormEntity(allNameValuePair, Charset.forName("utf8"))); return requestBuilder; } else if (method.equalsIgnoreCase(HttpConstant.Method.HEAD)) { - return RequestBuilder.head(); + return addParams(RequestBuilder.head(),request.getParams()); } else if (method.equalsIgnoreCase(HttpConstant.Method.PUT)) { - return RequestBuilder.put(); + return addParams(RequestBuilder.put(),request.getParams()); } else if (method.equalsIgnoreCase(HttpConstant.Method.DELETE)) { - return RequestBuilder.delete(); + return addParams(RequestBuilder.delete(),request.getParams()); } else if
(method.equalsIgnoreCase(HttpConstant.Method.TRACE)) { - return RequestBuilder.trace(); + return addParams(RequestBuilder.trace(),request.getParams()); } throw new IllegalArgumentException("Illegal HTTP Method " + method); } + private RequestBuilder addParams(RequestBuilder requestBuilder, Map params) { + if (params != null) { + for (Map.Entry entry : params.entrySet()) { + requestBuilder.addParameter(entry.getKey(), entry.getValue()); + } + } + return requestBuilder; + } + protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException { String content = getContent(charset, httpResponse); Page page = new Page(); From e9341d0291101f24af259f62a15d26427a2df853 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 17 Mar 2017 07:54:28 +0800 Subject: [PATCH 27/27] complete test #447 --- .../downloader/HttpClientDownloader.java | 40 +++++++++--------- .../downloader/HttpClientDownloaderTest.java | 42 +++++++++++++++++++ 2 files changed, 63 insertions(+), 19 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 0c11149..9e77ef5 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -162,34 +162,36 @@ public class HttpClientDownloader extends AbstractDownloader { String method = request.getMethod(); if (method == null || method.equalsIgnoreCase(HttpConstant.Method.GET)) { //default get - return addParams(RequestBuilder.get(),request.getParams()); + return addQueryParams(RequestBuilder.get(),request.getParams()); } else if (method.equalsIgnoreCase(HttpConstant.Method.POST)) { - RequestBuilder requestBuilder = RequestBuilder.post(); - NameValuePair[] nameValuePair = (NameValuePair[]) request.getExtra("nameValuePair"); - List 
allNameValuePair=new ArrayList(); - if (nameValuePair != null && nameValuePair.length > 0) { - allNameValuePair= Arrays.asList(nameValuePair); - } - if (request.getParams() != null) { - for (String key : request.getParams().keySet()) { - allNameValuePair.add(new BasicNameValuePair(key, request.getParams().get(key))); - } - } - requestBuilder.setEntity(new UrlEncodedFormEntity(allNameValuePair, Charset.forName("utf8"))); - return requestBuilder; + return addFormParams(RequestBuilder.post(), (NameValuePair[]) request.getExtra("nameValuePair"), request.getParams()); } else if (method.equalsIgnoreCase(HttpConstant.Method.HEAD)) { - return addParams(RequestBuilder.head(),request.getParams()); + return addQueryParams(RequestBuilder.head(),request.getParams()); } else if (method.equalsIgnoreCase(HttpConstant.Method.PUT)) { - return addParams(RequestBuilder.put(),request.getParams()); + return addFormParams(RequestBuilder.put(), (NameValuePair[]) request.getExtra("nameValuePair"), request.getParams()); } else if (method.equalsIgnoreCase(HttpConstant.Method.DELETE)) { - return addParams(RequestBuilder.delete(),request.getParams()); + return addQueryParams(RequestBuilder.delete(),request.getParams()); } else if (method.equalsIgnoreCase(HttpConstant.Method.TRACE)) { - return addParams(RequestBuilder.trace(),request.getParams()); + return addQueryParams(RequestBuilder.trace(),request.getParams()); } throw new IllegalArgumentException("Illegal HTTP Method " + method); } - private RequestBuilder addParams(RequestBuilder requestBuilder, Map params) { + private RequestBuilder addFormParams(RequestBuilder requestBuilder, NameValuePair[] nameValuePair, Map params) { + List allNameValuePair=new ArrayList(); + if (nameValuePair != null && nameValuePair.length > 0) { + allNameValuePair= Arrays.asList(nameValuePair); + } + if (params != null) { + for (String key : params.keySet()) { + allNameValuePair.add(new BasicNameValuePair(key, params.get(key))); + } + } + 
requestBuilder.setEntity(new UrlEncodedFormEntity(allNameValuePair, Charset.forName("utf8"))); + return requestBuilder; + } + + private RequestBuilder addQueryParams(RequestBuilder requestBuilder, Map params) { if (params != null) { for (Map.Entry entry : params.entrySet()) { requestBuilder.addParameter(entry.getKey(), entry.getValue()); diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java index 1735e00..0e442a8 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java @@ -5,13 +5,17 @@ import com.github.dreamhead.moco.Runnable; import com.github.dreamhead.moco.Runner; import org.apache.commons.io.IOUtils; import org.apache.http.client.methods.CloseableHttpResponse; +import org.apache.http.client.methods.RequestBuilder; import org.apache.http.impl.client.CloseableHttpClient; +import org.apache.http.impl.client.HttpClients; +import org.apache.http.util.EntityUtils; import org.junit.Test; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.selector.Html; +import us.codecraft.webmagic.utils.HttpConstant; import java.io.IOException; import java.io.UnsupportedEncodingException; @@ -103,4 +107,42 @@ public class HttpClientDownloaderTest { } }); } + + @Test + public void test_selectRequestMethod() throws Exception { + HttpServer server = httpserver(12306); + server.get(eq(query("q"), "webmagic")).response("get"); + server.post(eq(form("q"), "webmagic")).response("post"); + server.put(eq(form("q"), "webmagic")).response("put"); + server.delete(eq(query("q"), "webmagic")).response("delete"); + server.request(and(by(method("HEAD")),eq(query("q"), 
"webmagic"))).response(header("method","head")); + server.request(and(by(method("TRACE")),eq(query("q"), "webmagic"))).response("trace"); + Runner.running(server, new Runnable() { + @Override + public void run() throws Exception { + HttpClientDownloader httpClientDownloader = new HttpClientDownloader(); + Request request = new Request(); + request.setUrl("http://127.0.0.1:12306/search"); + request.putParams("q", "webmagic"); + request.setMethod(HttpConstant.Method.GET); + RequestBuilder requestBuilder = httpClientDownloader.selectRequestMethod(request).setUri(request.getUrl()); + assertThat(EntityUtils.toString(HttpClients.custom().build().execute(requestBuilder.build()).getEntity())).isEqualTo("get"); + request.setMethod(HttpConstant.Method.POST); + requestBuilder = httpClientDownloader.selectRequestMethod(request).setUri(request.getUrl()); + assertThat(EntityUtils.toString(HttpClients.custom().build().execute(requestBuilder.build()).getEntity())).isEqualTo("post"); + request.setMethod(HttpConstant.Method.PUT); + requestBuilder = httpClientDownloader.selectRequestMethod(request).setUri(request.getUrl()); + assertThat(EntityUtils.toString(HttpClients.custom().build().execute(requestBuilder.build()).getEntity())).isEqualTo("put"); + request.setMethod(HttpConstant.Method.DELETE); + requestBuilder = httpClientDownloader.selectRequestMethod(request).setUri(request.getUrl()); + assertThat(EntityUtils.toString(HttpClients.custom().build().execute(requestBuilder.build()).getEntity())).isEqualTo("delete"); + request.setMethod(HttpConstant.Method.HEAD); + requestBuilder = httpClientDownloader.selectRequestMethod(request).setUri(request.getUrl()); + assertThat(HttpClients.custom().build().execute(requestBuilder.build()).getFirstHeader("method").getValue()).isEqualTo("head"); + request.setMethod(HttpConstant.Method.TRACE); + requestBuilder = httpClientDownloader.selectRequestMethod(request).setUri(request.getUrl()); + 
assertThat(EntityUtils.toString(HttpClients.custom().build().execute(requestBuilder.build()).getEntity())).isEqualTo("trace"); + } + }); + } }