commit
76729c9302
|
@ -0,0 +1,191 @@
|
|||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction, and
|
||||
distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by the copyright
|
||||
owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all other entities
|
||||
that control, are controlled by, or are under common control with that entity.
|
||||
For the purposes of this definition, "control" means (i) the power, direct or
|
||||
indirect, to cause the direction or management of such entity, whether by
|
||||
contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity exercising
|
||||
permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications, including
|
||||
but not limited to software source code, documentation source, and configuration
|
||||
files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical transformation or
|
||||
translation of a Source form, including but not limited to compiled object code,
|
||||
generated documentation, and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or Object form, made
|
||||
available under the License, as indicated by a copyright notice that is included
|
||||
in or attached to the work (an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object form, that
|
||||
is based on (or derived from) the Work and for which the editorial revisions,
|
||||
annotations, elaborations, or other modifications represent, as a whole, an
|
||||
original work of authorship. For the purposes of this License, Derivative Works
|
||||
shall not include works that remain separable from, or merely link (or bind by
|
||||
name) to the interfaces of, the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including the original version
|
||||
of the Work and any modifications or additions to that Work or Derivative Works
|
||||
thereof, that is intentionally submitted to Licensor for inclusion in the Work
|
||||
by the copyright owner or by an individual or Legal Entity authorized to submit
|
||||
on behalf of the copyright owner. For the purposes of this definition,
|
||||
"submitted" means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems, and
|
||||
issue tracking systems that are managed by, or on behalf of, the Licensor for
|
||||
the purpose of discussing and improving the Work, but excluding communication
|
||||
that is conspicuously marked or otherwise designated in writing by the copyright
|
||||
owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity on behalf
|
||||
of whom a Contribution has been received by Licensor and subsequently
|
||||
incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License.
|
||||
|
||||
Subject to the terms and conditions of this License, each Contributor hereby
|
||||
grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free,
|
||||
irrevocable copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the Work and such
|
||||
Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License.
|
||||
|
||||
Subject to the terms and conditions of this License, each Contributor hereby
|
||||
grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free,
|
||||
irrevocable (except as stated in this section) patent license to make, have
|
||||
made, use, offer to sell, sell, import, and otherwise transfer the Work, where
|
||||
such license applies only to those patent claims licensable by such Contributor
|
||||
that are necessarily infringed by their Contribution(s) alone or by combination
|
||||
of their Contribution(s) with the Work to which such Contribution(s) was
|
||||
submitted. If You institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work or a
|
||||
Contribution incorporated within the Work constitutes direct or contributory
|
||||
patent infringement, then any patent licenses granted to You under this License
|
||||
for that Work shall terminate as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution.
|
||||
|
||||
You may reproduce and distribute copies of the Work or Derivative Works thereof
|
||||
in any medium, with or without modifications, and in Source or Object form,
|
||||
provided that You meet the following conditions:
|
||||
|
||||
You must give any other recipients of the Work or Derivative Works a copy of
|
||||
this License; and
|
||||
You must cause any modified files to carry prominent notices stating that You
|
||||
changed the files; and
|
||||
You must retain, in the Source form of any Derivative Works that You distribute,
|
||||
all copyright, patent, trademark, and attribution notices from the Source form
|
||||
of the Work, excluding those notices that do not pertain to any part of the
|
||||
Derivative Works; and
|
||||
If the Work includes a "NOTICE" text file as part of its distribution, then any
|
||||
Derivative Works that You distribute must include a readable copy of the
|
||||
attribution notices contained within such NOTICE file, excluding those notices
|
||||
that do not pertain to any part of the Derivative Works, in at least one of the
|
||||
following places: within a NOTICE text file distributed as part of the
|
||||
Derivative Works; within the Source form or documentation, if provided along
|
||||
with the Derivative Works; or, within a display generated by the Derivative
|
||||
Works, if and wherever such third-party notices normally appear. The contents of
|
||||
the NOTICE file are for informational purposes only and do not modify the
|
||||
License. You may add Your own attribution notices within Derivative Works that
|
||||
You distribute, alongside or as an addendum to the NOTICE text from the Work,
|
||||
provided that such additional attribution notices cannot be construed as
|
||||
modifying the License.
|
||||
You may add Your own copyright statement to Your modifications and may provide
|
||||
additional or different license terms and conditions for use, reproduction, or
|
||||
distribution of Your modifications, or for any such Derivative Works as a whole,
|
||||
provided Your use, reproduction, and distribution of the Work otherwise complies
|
||||
with the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions.
|
||||
|
||||
Unless You explicitly state otherwise, any Contribution intentionally submitted
|
||||
for inclusion in the Work by You to the Licensor shall be under the terms and
|
||||
conditions of this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify the terms of
|
||||
any separate license agreement you may have executed with Licensor regarding
|
||||
such Contributions.
|
||||
|
||||
6. Trademarks.
|
||||
|
||||
This License does not grant permission to use the trade names, trademarks,
|
||||
service marks, or product names of the Licensor, except as required for
|
||||
reasonable and customary use in describing the origin of the Work and
|
||||
reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty.
|
||||
|
||||
Unless required by applicable law or agreed to in writing, Licensor provides the
|
||||
Work (and each Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied,
|
||||
including, without limitation, any warranties or conditions of TITLE,
|
||||
NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are
|
||||
solely responsible for determining the appropriateness of using or
|
||||
redistributing the Work and assume any risks associated with Your exercise of
|
||||
permissions under this License.
|
||||
|
||||
8. Limitation of Liability.
|
||||
|
||||
In no event and under no legal theory, whether in tort (including negligence),
|
||||
contract, or otherwise, unless required by applicable law (such as deliberate
|
||||
and grossly negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special, incidental,
|
||||
or consequential damages of any character arising as a result of this License or
|
||||
out of the use or inability to use the Work (including but not limited to
|
||||
damages for loss of goodwill, work stoppage, computer failure or malfunction, or
|
||||
any and all other commercial damages or losses), even if such Contributor has
|
||||
been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability.
|
||||
|
||||
While redistributing the Work or Derivative Works thereof, You may choose to
|
||||
offer, and charge a fee for, acceptance of support, warranty, indemnity, or
|
||||
other liability obligations and/or rights consistent with this License. However,
|
||||
in accepting such obligations, You may act only on Your own behalf and on Your
|
||||
sole responsibility, not on behalf of any other Contributor, and only if You
|
||||
agree to indemnify, defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason of your
|
||||
accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
APPENDIX: How to apply the Apache License to your work
|
||||
|
||||
To apply the Apache License to your work, attach the following boilerplate
|
||||
notice, with the fields enclosed by brackets "{}" replaced with your own
|
||||
identifying information. (Don't include the brackets!) The text should be
|
||||
enclosed in the appropriate comment syntax for the file format. We also
|
||||
recommend that a file or class name and description of purpose be included on
|
||||
the same "printed page" as the copyright notice for easier identification within
|
||||
third-party archives.
|
||||
|
||||
Copyright 2013 code4craft
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
24
README-zh.md
24
README-zh.md
|
@ -168,30 +168,6 @@ webmagic的使用可以参考:[oschina openapi 应用:博客搬家](http://m
|
|||
|
||||
webmagic遵循[Apache 2.0协议](http://opensource.org/licenses/Apache-2.0)
|
||||
|
||||
### 贡献者:
|
||||
|
||||
以下是为WebMagic提交过代码或者issue的朋友:
|
||||
|
||||
* [ccliangbo](https://github.com/ccliangbo)
|
||||
* [yuany](https://github.com/yuany)
|
||||
* [yxssfxwzy](https://github.com/yxssfxwzy)
|
||||
* [linkerlin](https://github.com/linkerlin)
|
||||
* [d0ngw](https://github.com/d0ngw)
|
||||
* [xuchaoo](https://github.com/xuchaoo)
|
||||
* [supermicah](https://github.com/supermicah)
|
||||
* [SimpleExpress](https://github.com/SimpleExpress)
|
||||
* [aruanruan](https://github.com/aruanruan)
|
||||
* [l1z2g9](https://github.com/l1z2g9)
|
||||
* [zhegexiaohuozi](https://github.com/zhegexiaohuozi)
|
||||
* [ywooer](https://github.com/ywooer)
|
||||
* [yyw258520](https://github.com/yyw258520)
|
||||
* [perfecking](https://github.com/perfecking)
|
||||
* [lidongyang](http://my.oschina.net/lidongyang)
|
||||
* [seveniu](https://github.com/seveniu)
|
||||
* [sebastian1118](https://github.com/sebastian1118)
|
||||
* [codev777](https://github.com/codev777)
|
||||
* [fengwuze](https://github.com/fengwuze)
|
||||
|
||||
### 邮件组:
|
||||
|
||||
Gmail:
|
||||
|
|
33
README.md
33
README.md
|
@ -59,7 +59,7 @@ public class GithubRepoPageProcessor implements PageProcessor {
|
|||
public void process(Page page) {
|
||||
page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all());
|
||||
page.putField("author", page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString());
|
||||
page.putField("name", page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString());
|
||||
page.putField("name", page.getHtml().xpath("//h1[@class='public']/strong/a/text()").toString());
|
||||
if (page.getResultItems().get("name")==null){
|
||||
//skip this page
|
||||
page.setSkip(true);
|
||||
|
@ -89,7 +89,7 @@ You can also use annotation way:
|
|||
@HelpUrl("https://github.com/\\w+")
|
||||
public class GithubRepo {
|
||||
|
||||
@ExtractBy(value = "//h1[@class='entry-title public']/strong/a/text()", notNull = true)
|
||||
@ExtractBy(value = "//h1[@class='public']/strong/a/text()", notNull = true)
|
||||
private String name;
|
||||
|
||||
@ExtractByUrl("https://github\\.com/(\\w+)/.*")
|
||||
|
@ -114,39 +114,12 @@ The architecture of webmagic (refered to [Scrapy](http://scrapy.org/))
|
|||
|
||||

|
||||
|
||||
Javadocs: [http://code4craft.github.io/webmagic/docs/en/](http://code4craft.github.io/webmagic/docs/en/)
|
||||
|
||||
There are some samples in `webmagic-samples` package.
|
||||
There are more examples in `webmagic-samples` package.
|
||||
|
||||
### License:
|
||||
|
||||
Licensed under the [Apache 2.0 license](http://opensource.org/licenses/Apache-2.0)
|
||||
|
||||
### Contributors:
|
||||
|
||||
Thanks to these people for committing source code, reporting bugs, or suggesting new features:
|
||||
|
||||
* [ccliangbo](https://github.com/ccliangbo)
|
||||
* [yuany](https://github.com/yuany)
|
||||
* [yxssfxwzy](https://github.com/yxssfxwzy)
|
||||
* [linkerlin](https://github.com/linkerlin)
|
||||
* [d0ngw](https://github.com/d0ngw)
|
||||
* [xuchaoo](https://github.com/xuchaoo)
|
||||
* [supermicah](https://github.com/supermicah)
|
||||
* [SimpleExpress](https://github.com/SimpleExpress)
|
||||
* [aruanruan](https://github.com/aruanruan)
|
||||
* [l1z2g9](https://github.com/l1z2g9)
|
||||
* [zhegexiaohuozi](https://github.com/zhegexiaohuozi)
|
||||
* [ywooer](https://github.com/ywooer)
|
||||
* [yyw258520](https://github.com/yyw258520)
|
||||
* [perfecking](https://github.com/perfecking)
|
||||
* [lidongyang](http://my.oschina.net/lidongyang)
|
||||
* [seveniu](https://github.com/seveniu)
|
||||
* [sebastian1118](https://github.com/sebastian1118)
|
||||
* [codev777](https://github.com/codev777)
|
||||
* [fengwuze](https://github.com/fengwuze)
|
||||
|
||||
|
||||
### Thanks:
|
||||
|
||||
To write webmagic, I referred to the projects below:
|
||||
|
|
10
pom.xml
10
pom.xml
|
@ -64,6 +64,12 @@
|
|||
<version>4.11</version>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.mockito</groupId>
|
||||
<artifactId>mockito-all</artifactId>
|
||||
<version>1.10.19</version>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.httpcomponents</groupId>
|
||||
<artifactId>httpclient</artifactId>
|
||||
|
@ -97,7 +103,7 @@
|
|||
<dependency>
|
||||
<groupId>com.alibaba</groupId>
|
||||
<artifactId>fastjson</artifactId>
|
||||
<version>1.2.21</version>
|
||||
<version>1.2.28</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.github.dreamhead</groupId>
|
||||
|
@ -130,7 +136,7 @@
|
|||
<dependency>
|
||||
<groupId>commons-collections</groupId>
|
||||
<artifactId>commons-collections</artifactId>
|
||||
<version>3.2.1</version>
|
||||
<version>3.2.2</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.commons</groupId>
|
||||
|
|
|
@ -40,6 +40,11 @@
|
|||
<artifactId>slf4j-api</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.mockito</groupId>
|
||||
<artifactId>mockito-all</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.slf4j</groupId>
|
||||
<artifactId>slf4j-log4j12</artifactId>
|
||||
|
|
|
@ -107,14 +107,12 @@ public class Page {
|
|||
* @param requests requests
|
||||
*/
|
||||
public void addTargetRequests(List<String> requests) {
|
||||
synchronized (targetRequests) {
|
||||
for (String s : requests) {
|
||||
if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) {
|
||||
continue;
|
||||
}
|
||||
s = UrlUtils.canonicalizeUrl(s, url.toString());
|
||||
targetRequests.add(new Request(s));
|
||||
for (String s : requests) {
|
||||
if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) {
|
||||
continue;
|
||||
}
|
||||
s = UrlUtils.canonicalizeUrl(s, url.toString());
|
||||
targetRequests.add(new Request(s));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -125,14 +123,12 @@ public class Page {
|
|||
* @param priority priority
|
||||
*/
|
||||
public void addTargetRequests(List<String> requests, long priority) {
|
||||
synchronized (targetRequests) {
|
||||
for (String s : requests) {
|
||||
if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) {
|
||||
continue;
|
||||
}
|
||||
s = UrlUtils.canonicalizeUrl(s, url.toString());
|
||||
targetRequests.add(new Request(s).setPriority(priority));
|
||||
for (String s : requests) {
|
||||
if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) {
|
||||
continue;
|
||||
}
|
||||
s = UrlUtils.canonicalizeUrl(s, url.toString());
|
||||
targetRequests.add(new Request(s).setPriority(priority));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -145,10 +141,8 @@ public class Page {
|
|||
if (StringUtils.isBlank(requestString) || requestString.equals("#")) {
|
||||
return;
|
||||
}
|
||||
synchronized (targetRequests) {
|
||||
requestString = UrlUtils.canonicalizeUrl(requestString, url.toString());
|
||||
targetRequests.add(new Request(requestString));
|
||||
}
|
||||
requestString = UrlUtils.canonicalizeUrl(requestString, url.toString());
|
||||
targetRequests.add(new Request(requestString));
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -157,9 +151,7 @@ public class Page {
|
|||
* @param request request
|
||||
*/
|
||||
public void addTargetRequest(Request request) {
|
||||
synchronized (targetRequests) {
|
||||
targetRequests.add(request);
|
||||
}
|
||||
targetRequests.add(request);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -85,27 +85,10 @@ public class Request implements Serializable {
|
|||
return url;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
if (this == o) return true;
|
||||
if (o == null || getClass() != o.getClass()) return false;
|
||||
|
||||
Request request = (Request) o;
|
||||
|
||||
if (!url.equals(request.url)) return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
public Map<String, Object> getExtras() {
|
||||
return extras;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return url.hashCode();
|
||||
}
|
||||
|
||||
public void setExtras(Map<String, Object> extras) {
|
||||
this.extras = extras;
|
||||
}
|
||||
|
@ -132,23 +115,52 @@ public class Request implements Serializable {
|
|||
return params;
|
||||
}
|
||||
/**
|
||||
* POST/GET参数设置
|
||||
* set params for request
|
||||
* <br>
|
||||
* DO NOT set this for request already has params, like 'https://github.com/search?q=webmagic'
|
||||
* @param params params
|
||||
* */
|
||||
public void setParams(Map<String, String> params) {
|
||||
this.params = params;
|
||||
}
|
||||
/**
|
||||
* POST/GET参数设置
|
||||
* set params for request
|
||||
* <br>
|
||||
* DO NOT set this for request already has params, like 'https://github.com/search?q=webmagic'
|
||||
* @param key key
|
||||
* @param value value
|
||||
* */
|
||||
public void putParams(String key,String value) {
|
||||
params.put(key,value);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
if (this == o) return true;
|
||||
if (o == null || getClass() != o.getClass()) return false;
|
||||
|
||||
Request request = (Request) o;
|
||||
|
||||
if (url != null ? !url.equals(request.url) : request.url != null) return false;
|
||||
if (method != null ? !method.equals(request.method) : request.method != null) return false;
|
||||
return params != null ? params.equals(request.params) : request.params == null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
int result = url != null ? url.hashCode() : 0;
|
||||
result = 31 * result + (method != null ? method.hashCode() : 0);
|
||||
result = 31 * result + (params != null ? params.hashCode() : 0);
|
||||
return result;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "Request{" +
|
||||
"url='" + url + '\'' +
|
||||
", method='" + method + '\'' +
|
||||
", extras=" + extras +
|
||||
", params=" + params +
|
||||
", priority=" + priority +
|
||||
'}';
|
||||
}
|
||||
|
|
|
@ -305,7 +305,7 @@ public class Spider implements Runnable, Task {
|
|||
initComponent();
|
||||
logger.info("Spider " + getUUID() + " started!");
|
||||
while (!Thread.currentThread().isInterrupted() && stat.get() == STAT_RUNNING) {
|
||||
Request request = scheduler.poll(this);
|
||||
final Request request = scheduler.poll(this);
|
||||
if (request == null) {
|
||||
if (threadPool.getThreadAlive() == 0 && exitWhenComplete) {
|
||||
break;
|
||||
|
@ -313,16 +313,15 @@ public class Spider implements Runnable, Task {
|
|||
// wait until new url added
|
||||
waitNewUrl();
|
||||
} else {
|
||||
final Request requestFinal = request;
|
||||
threadPool.execute(new Runnable() {
|
||||
@Override
|
||||
public void run() {
|
||||
try {
|
||||
processRequest(requestFinal);
|
||||
onSuccess(requestFinal);
|
||||
processRequest(request);
|
||||
onSuccess(request);
|
||||
} catch (Exception e) {
|
||||
onError(requestFinal);
|
||||
logger.error("process request " + requestFinal + " error", e);
|
||||
onError(request);
|
||||
logger.error("process request " + request + " error", e);
|
||||
} finally {
|
||||
pageCount.incrementAndGet();
|
||||
signalNewUrl();
|
||||
|
@ -587,6 +586,7 @@ public class Spider implements Runnable, Task {
|
|||
if (threadNum <= 0) {
|
||||
throw new IllegalArgumentException("threadNum should be more than one!");
|
||||
}
|
||||
this.executorService = executorService;
|
||||
return this;
|
||||
}
|
||||
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
package us.codecraft.webmagic.downloader;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.http.HttpHost;
|
||||
import org.apache.http.HttpResponse;
|
||||
import org.apache.http.NameValuePair;
|
||||
|
@ -15,10 +14,6 @@ import org.apache.http.client.methods.RequestBuilder;
|
|||
import org.apache.http.impl.client.CloseableHttpClient;
|
||||
import org.apache.http.message.BasicNameValuePair;
|
||||
import org.apache.http.util.EntityUtils;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.jsoup.nodes.Element;
|
||||
import org.jsoup.select.Elements;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import us.codecraft.webmagic.Page;
|
||||
|
@ -27,8 +22,8 @@ import us.codecraft.webmagic.Site;
|
|||
import us.codecraft.webmagic.Task;
|
||||
import us.codecraft.webmagic.proxy.Proxy;
|
||||
import us.codecraft.webmagic.selector.PlainText;
|
||||
import us.codecraft.webmagic.utils.CharsetUtils;
|
||||
import us.codecraft.webmagic.utils.HttpConstant;
|
||||
import us.codecraft.webmagic.utils.UrlUtils;
|
||||
import us.codecraft.webmagic.utils.WMCollections;
|
||||
|
||||
import java.io.IOException;
|
||||
|
@ -98,8 +93,8 @@ public class HttpClientDownloader extends AbstractDownloader {
|
|||
proxyHost = site.getHttpProxy();
|
||||
}
|
||||
|
||||
HttpUriRequest httpUriRequest = getHttpUriRequest(request, site, headers, proxyHost);//<2F><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>˴<EFBFBD><CBB4><EFBFBD>
|
||||
httpResponse = getHttpClient(site, proxy).execute(httpUriRequest);//getHttpClient<6E><74><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>˴<EFBFBD><CBB4><EFBFBD><EFBFBD><EFBFBD>֤
|
||||
HttpUriRequest httpUriRequest = getHttpUriRequest(request, site, headers, proxyHost);
|
||||
httpResponse = getHttpClient(site, proxy).execute(httpUriRequest);
|
||||
statusCode = httpResponse.getStatusLine().getStatusCode();
|
||||
request.putExtra(Request.STATUS_CODE, statusCode);
|
||||
if (statusAccept(acceptStatCode, statusCode)) {
|
||||
|
@ -167,39 +162,44 @@ public class HttpClientDownloader extends AbstractDownloader {
|
|||
String method = request.getMethod();
|
||||
if (method == null || method.equalsIgnoreCase(HttpConstant.Method.GET)) {
|
||||
//default get
|
||||
RequestBuilder requestBuilder=RequestBuilder.get();
|
||||
if (request.getParams() != null) {
|
||||
for (Map.Entry<String, String> entry : request.getParams().entrySet()) {
|
||||
requestBuilder.addParameter(entry.getKey(), entry.getValue());
|
||||
}
|
||||
}
|
||||
return requestBuilder;
|
||||
return addQueryParams(RequestBuilder.get(),request.getParams());
|
||||
} else if (method.equalsIgnoreCase(HttpConstant.Method.POST)) {
|
||||
RequestBuilder requestBuilder = RequestBuilder.post();
|
||||
NameValuePair[] nameValuePair = (NameValuePair[]) request.getExtra("nameValuePair");
|
||||
List<NameValuePair> allNameValuePair=new ArrayList<NameValuePair>();
|
||||
if (nameValuePair != null && nameValuePair.length > 0) {
|
||||
allNameValuePair= Arrays.asList(nameValuePair);
|
||||
}
|
||||
if (request.getParams() != null) {
|
||||
for (String key : request.getParams().keySet()) {
|
||||
allNameValuePair.add(new BasicNameValuePair(key, request.getParams().get(key)));
|
||||
}
|
||||
}
|
||||
requestBuilder.setEntity(new UrlEncodedFormEntity(allNameValuePair, Charset.forName("utf8")));
|
||||
return requestBuilder;
|
||||
return addFormParams(RequestBuilder.post(), (NameValuePair[]) request.getExtra("nameValuePair"), request.getParams());
|
||||
} else if (method.equalsIgnoreCase(HttpConstant.Method.HEAD)) {
|
||||
return RequestBuilder.head();
|
||||
return addQueryParams(RequestBuilder.head(),request.getParams());
|
||||
} else if (method.equalsIgnoreCase(HttpConstant.Method.PUT)) {
|
||||
return RequestBuilder.put();
|
||||
return addFormParams(RequestBuilder.put(), (NameValuePair[]) request.getExtra("nameValuePair"), request.getParams());
|
||||
} else if (method.equalsIgnoreCase(HttpConstant.Method.DELETE)) {
|
||||
return RequestBuilder.delete();
|
||||
return addQueryParams(RequestBuilder.delete(),request.getParams());
|
||||
} else if (method.equalsIgnoreCase(HttpConstant.Method.TRACE)) {
|
||||
return RequestBuilder.trace();
|
||||
return addQueryParams(RequestBuilder.trace(),request.getParams());
|
||||
}
|
||||
throw new IllegalArgumentException("Illegal HTTP Method " + method);
|
||||
}
|
||||
|
||||
private RequestBuilder addFormParams(RequestBuilder requestBuilder, NameValuePair[] nameValuePair, Map<String, String> params) {
|
||||
List<NameValuePair> allNameValuePair=new ArrayList<NameValuePair>();
|
||||
if (nameValuePair != null && nameValuePair.length > 0) {
|
||||
allNameValuePair= Arrays.asList(nameValuePair);
|
||||
}
|
||||
if (params != null) {
|
||||
for (String key : params.keySet()) {
|
||||
allNameValuePair.add(new BasicNameValuePair(key, params.get(key)));
|
||||
}
|
||||
}
|
||||
requestBuilder.setEntity(new UrlEncodedFormEntity(allNameValuePair, Charset.forName("utf8")));
|
||||
return requestBuilder;
|
||||
}
|
||||
|
||||
private RequestBuilder addQueryParams(RequestBuilder requestBuilder, Map<String, String> params) {
|
||||
if (params != null) {
|
||||
for (Map.Entry<String, String> entry : params.entrySet()) {
|
||||
requestBuilder.addParameter(entry.getKey(), entry.getValue());
|
||||
}
|
||||
}
|
||||
return requestBuilder;
|
||||
}
|
||||
|
||||
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
|
||||
String content = getContent(charset, httpResponse);
|
||||
Page page = new Page();
|
||||
|
@ -226,40 +226,6 @@ public class HttpClientDownloader extends AbstractDownloader {
|
|||
}
|
||||
|
||||
protected String getHtmlCharset(HttpResponse httpResponse, byte[] contentBytes) throws IOException {
|
||||
String charset;
|
||||
// charset
|
||||
// 1、encoding in http header Content-Type
|
||||
String value = httpResponse.getEntity().getContentType().getValue();
|
||||
charset = UrlUtils.getCharset(value);
|
||||
if (StringUtils.isNotBlank(charset)) {
|
||||
logger.debug("Auto get charset: {}", charset);
|
||||
return charset;
|
||||
}
|
||||
// use default charset to decode first time
|
||||
Charset defaultCharset = Charset.defaultCharset();
|
||||
String content = new String(contentBytes, defaultCharset.name());
|
||||
// 2、charset in meta
|
||||
if (StringUtils.isNotEmpty(content)) {
|
||||
Document document = Jsoup.parse(content);
|
||||
Elements links = document.select("meta");
|
||||
for (Element link : links) {
|
||||
// 2.1、html4.01 <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
|
||||
String metaContent = link.attr("content");
|
||||
String metaCharset = link.attr("charset");
|
||||
if (metaContent.indexOf("charset") != -1) {
|
||||
metaContent = metaContent.substring(metaContent.indexOf("charset"), metaContent.length());
|
||||
charset = metaContent.split("=")[1];
|
||||
break;
|
||||
}
|
||||
// 2.2、html5 <meta charset="UTF-8" />
|
||||
else if (StringUtils.isNotEmpty(metaCharset)) {
|
||||
charset = metaCharset;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
logger.debug("Auto get charset: {}", charset);
|
||||
// 3、todo use tools as cpdetector for content decode
|
||||
return charset;
|
||||
return CharsetUtils.detectCharset(httpResponse.getEntity().getContentType().getValue(), contentBytes);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -18,7 +18,7 @@ public class GithubRepoPageProcessor implements PageProcessor {
|
|||
page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-]+/[\\w\\-]+)").all());
|
||||
page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-])").all());
|
||||
page.putField("author", page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString());
|
||||
page.putField("name", page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString());
|
||||
page.putField("name", page.getHtml().xpath("//h1[@class='public']/strong/a/text()").toString());
|
||||
if (page.getResultItems().get("name")==null){
|
||||
//skip this page
|
||||
page.setSkip(true);
|
||||
|
|
|
@ -79,14 +79,14 @@ public class Proxy implements Delayed, Serializable {
|
|||
|
||||
private List<Integer> failedErrorType = new ArrayList<Integer>();
|
||||
|
||||
Proxy(HttpHost httpHost, String user, String password) {
|
||||
public Proxy(HttpHost httpHost, String user, String password) {
|
||||
this.httpHost = httpHost;
|
||||
this.user = user;
|
||||
this.password = password;
|
||||
this.canReuseTime = System.nanoTime() + TimeUnit.NANOSECONDS.convert(reuseTimeInterval, TimeUnit.MILLISECONDS);
|
||||
}
|
||||
|
||||
Proxy(HttpHost httpHost, int reuseInterval, String user, String password) {
|
||||
public Proxy(HttpHost httpHost, int reuseInterval, String user, String password) {
|
||||
this.httpHost = httpHost;
|
||||
this.user = user;
|
||||
this.password = password;
|
||||
|
|
|
@ -6,6 +6,7 @@ import us.codecraft.webmagic.Request;
|
|||
import us.codecraft.webmagic.Task;
|
||||
import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
|
||||
import us.codecraft.webmagic.scheduler.component.HashSetDuplicateRemover;
|
||||
import us.codecraft.webmagic.utils.HttpConstant;
|
||||
|
||||
/**
|
||||
* Remove duplicate urls and only push urls which are not duplicate.<br><br>
|
||||
|
@ -31,7 +32,7 @@ public abstract class DuplicateRemovedScheduler implements Scheduler {
|
|||
@Override
|
||||
public void push(Request request, Task task) {
|
||||
logger.trace("get a candidate url {}", request.getUrl());
|
||||
if (!duplicatedRemover.isDuplicate(request, task) || shouldReserved(request)) {
|
||||
if (shouldReserved(request) || noNeedToRemoveDuplicate(request) || !duplicatedRemover.isDuplicate(request, task)) {
|
||||
logger.debug("push to queue {}", request.getUrl());
|
||||
pushWhenNoDuplicate(request, task);
|
||||
}
|
||||
|
@ -41,6 +42,10 @@ public abstract class DuplicateRemovedScheduler implements Scheduler {
|
|||
return request.getExtra(Request.CYCLE_TRIED_TIMES) != null;
|
||||
}
|
||||
|
||||
protected boolean noNeedToRemoveDuplicate(Request request) {
|
||||
return HttpConstant.Method.POST.equalsIgnoreCase(request.getMethod());
|
||||
}
|
||||
|
||||
/**
 * Hook called by {@code push} for every request that passed its checks
 * (reserved, exempt from de-duplication, or not a duplicate).
 * This base implementation intentionally does nothing; presumably concrete
 * schedulers override it to enqueue the request -- confirm against subclasses.
 *
 * @param request the request accepted by {@code push}
 * @param task    the task the request belongs to
 */
protected void pushWhenNoDuplicate(Request request, Task task) {
|
||||
|
||||
}
|
||||
|
|
|
@ -26,7 +26,7 @@ public class QueueScheduler extends DuplicateRemovedScheduler implements Monitor
|
|||
}
|
||||
|
||||
@Override
|
||||
public synchronized Request poll(Task task) {
|
||||
public Request poll(Task task) {
|
||||
return queue.poll();
|
||||
}
|
||||
|
||||
|
|
|
@ -28,8 +28,7 @@ public class RegexSelector implements Selector {
|
|||
}
|
||||
// Check bracket for regex group. Add default group 1 if there is no group.
|
||||
// Only check if there exists the valid left parenthesis, leave regexp validation for Pattern.
|
||||
if (StringUtils.countMatches(regexStr, "(") - StringUtils.countMatches(regexStr, "\\(") ==
|
||||
StringUtils.countMatches(regexStr, "(?:") - StringUtils.countMatches(regexStr, "\\(?:")) {
|
||||
if ( ! hasGroup(regexStr) ){
|
||||
regexStr = "(" + regexStr + ")";
|
||||
}
|
||||
this.regexStr = regexStr;
|
||||
|
@ -45,6 +44,30 @@ public class RegexSelector implements Selector {
|
|||
this(regexStr, 1);
|
||||
}
|
||||
|
||||
private boolean hasGroup(String regexStr) {
|
||||
if (StringUtils.countMatches(regexStr, "(") - StringUtils.countMatches(regexStr, "\\(") ==
|
||||
StringUtils.countMatches(regexStr, "(?:") - StringUtils.countMatches(regexStr, "\\(?:")){
|
||||
return false;
|
||||
}
|
||||
if (StringUtils.countMatches(regexStr, "(") - StringUtils.countMatches(regexStr, "\\(") ==
|
||||
StringUtils.countMatches(regexStr, "(?=") - StringUtils.countMatches(regexStr, "\\(?=") ) {
|
||||
return false;
|
||||
}
|
||||
if (StringUtils.countMatches(regexStr, "(") - StringUtils.countMatches(regexStr, "\\(") ==
|
||||
StringUtils.countMatches(regexStr, "(?<") - StringUtils.countMatches(regexStr, "\\(?<") ) {
|
||||
return false;
|
||||
}
|
||||
if (StringUtils.countMatches(regexStr, "(") - StringUtils.countMatches(regexStr, "\\(") ==
|
||||
StringUtils.countMatches(regexStr, "(?!") - StringUtils.countMatches(regexStr, "\\(?!") ) {
|
||||
return false;
|
||||
}
|
||||
if (StringUtils.countMatches(regexStr, "(") - StringUtils.countMatches(regexStr, "\\(") ==
|
||||
StringUtils.countMatches(regexStr, "(?#") - StringUtils.countMatches(regexStr, "\\(?#") ) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String select(String text) {
|
||||
return selectGroup(text).get(group);
|
||||
|
|
|
@ -0,0 +1,61 @@
|
|||
package us.codecraft.webmagic.utils;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.jsoup.nodes.Element;
|
||||
import org.jsoup.select.Elements;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.charset.Charset;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com
|
||||
* Date: 17/3/11
|
||||
* Time: 10:36
|
||||
* @since 0.6.2
|
||||
*/
|
||||
public abstract class CharsetUtils {
|
||||
|
||||
private static Logger logger = LoggerFactory.getLogger(CharsetUtils.class);
|
||||
|
||||
public static String detectCharset(String contentType, byte[] contentBytes) throws IOException {
|
||||
String charset;
|
||||
// charset
|
||||
// 1、encoding in http header Content-Type
|
||||
charset = UrlUtils.getCharset(contentType);
|
||||
if (StringUtils.isNotBlank(contentType)) {
|
||||
logger.debug("Auto get charset: {}", charset);
|
||||
return charset;
|
||||
}
|
||||
// use default charset to decode first time
|
||||
Charset defaultCharset = Charset.defaultCharset();
|
||||
String content = new String(contentBytes, defaultCharset);
|
||||
// 2、charset in meta
|
||||
if (StringUtils.isNotEmpty(content)) {
|
||||
Document document = Jsoup.parse(content);
|
||||
Elements links = document.select("meta");
|
||||
for (Element link : links) {
|
||||
// 2.1、html4.01 <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
|
||||
String metaContent = link.attr("content");
|
||||
String metaCharset = link.attr("charset");
|
||||
if (metaContent.indexOf("charset") != -1) {
|
||||
metaContent = metaContent.substring(metaContent.indexOf("charset"), metaContent.length());
|
||||
charset = metaContent.split("=")[1];
|
||||
break;
|
||||
}
|
||||
// 2.2、html5 <meta charset="UTF-8" />
|
||||
else if (StringUtils.isNotEmpty(metaCharset)) {
|
||||
charset = metaCharset;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
logger.debug("Auto get charset: {}", charset);
|
||||
// 3、todo use tools as cpdetector for content decode
|
||||
return charset;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,25 @@
|
|||
package us.codecraft.webmagic;
|
||||
|
||||
import org.junit.Test;
|
||||
import us.codecraft.webmagic.utils.HttpConstant;
|
||||
|
||||
import static org.assertj.core.api.Assertions.assertThat;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com
|
||||
* Date: 17/3/11
|
||||
*/
|
||||
public class RequestTest {
|
||||
|
||||
@Test
|
||||
public void testEqualsAndHashCode() throws Exception {
|
||||
Request requestA = new Request("http://www.google.com/");
|
||||
Request requestB = new Request("http://www.google.com/");
|
||||
assertThat(requestA.hashCode()).isEqualTo(requestB.hashCode());
|
||||
assertThat(requestA).isEqualTo(requestB);
|
||||
requestA.setMethod(HttpConstant.Method.GET);
|
||||
requestA.setMethod(HttpConstant.Method.POST);
|
||||
assertThat(requestA).isNotEqualTo(requestB);
|
||||
assertThat(requestA.hashCode()).isNotEqualTo(requestB.hashCode());
|
||||
}
|
||||
}
|
|
@ -5,13 +5,17 @@ import com.github.dreamhead.moco.Runnable;
|
|||
import com.github.dreamhead.moco.Runner;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.http.client.methods.CloseableHttpResponse;
|
||||
import org.apache.http.client.methods.RequestBuilder;
|
||||
import org.apache.http.impl.client.CloseableHttpClient;
|
||||
import org.apache.http.impl.client.HttpClients;
|
||||
import org.apache.http.util.EntityUtils;
|
||||
import org.junit.Test;
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.Request;
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.Task;
|
||||
import us.codecraft.webmagic.selector.Html;
|
||||
import us.codecraft.webmagic.utils.HttpConstant;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.UnsupportedEncodingException;
|
||||
|
@ -103,4 +107,42 @@ public class HttpClientDownloaderTest {
|
|||
}
|
||||
});
|
||||
}
|
||||
|
||||
@Test
|
||||
public void test_selectRequestMethod() throws Exception {
|
||||
HttpServer server = httpserver(12306);
|
||||
server.get(eq(query("q"), "webmagic")).response("get");
|
||||
server.post(eq(form("q"), "webmagic")).response("post");
|
||||
server.put(eq(form("q"), "webmagic")).response("put");
|
||||
server.delete(eq(query("q"), "webmagic")).response("delete");
|
||||
server.request(and(by(method("HEAD")),eq(query("q"), "webmagic"))).response(header("method","head"));
|
||||
server.request(and(by(method("TRACE")),eq(query("q"), "webmagic"))).response("trace");
|
||||
Runner.running(server, new Runnable() {
|
||||
@Override
|
||||
public void run() throws Exception {
|
||||
HttpClientDownloader httpClientDownloader = new HttpClientDownloader();
|
||||
Request request = new Request();
|
||||
request.setUrl("http://127.0.0.1:12306/search");
|
||||
request.putParams("q", "webmagic");
|
||||
request.setMethod(HttpConstant.Method.GET);
|
||||
RequestBuilder requestBuilder = httpClientDownloader.selectRequestMethod(request).setUri(request.getUrl());
|
||||
assertThat(EntityUtils.toString(HttpClients.custom().build().execute(requestBuilder.build()).getEntity())).isEqualTo("get");
|
||||
request.setMethod(HttpConstant.Method.POST);
|
||||
requestBuilder = httpClientDownloader.selectRequestMethod(request).setUri(request.getUrl());
|
||||
assertThat(EntityUtils.toString(HttpClients.custom().build().execute(requestBuilder.build()).getEntity())).isEqualTo("post");
|
||||
request.setMethod(HttpConstant.Method.PUT);
|
||||
requestBuilder = httpClientDownloader.selectRequestMethod(request).setUri(request.getUrl());
|
||||
assertThat(EntityUtils.toString(HttpClients.custom().build().execute(requestBuilder.build()).getEntity())).isEqualTo("put");
|
||||
request.setMethod(HttpConstant.Method.DELETE);
|
||||
requestBuilder = httpClientDownloader.selectRequestMethod(request).setUri(request.getUrl());
|
||||
assertThat(EntityUtils.toString(HttpClients.custom().build().execute(requestBuilder.build()).getEntity())).isEqualTo("delete");
|
||||
request.setMethod(HttpConstant.Method.HEAD);
|
||||
requestBuilder = httpClientDownloader.selectRequestMethod(request).setUri(request.getUrl());
|
||||
assertThat(HttpClients.custom().build().execute(requestBuilder.build()).getFirstHeader("method").getValue()).isEqualTo("head");
|
||||
request.setMethod(HttpConstant.Method.TRACE);
|
||||
requestBuilder = httpClientDownloader.selectRequestMethod(request).setUri(request.getUrl());
|
||||
assertThat(EntityUtils.toString(HttpClients.custom().build().execute(requestBuilder.build()).getEntity())).isEqualTo("trace");
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,50 @@
|
|||
package us.codecraft.webmagic.scheduler;
|
||||
|
||||
import org.junit.Test;
|
||||
import org.junit.runner.RunWith;
|
||||
import org.mockito.Mockito;
|
||||
import org.mockito.runners.MockitoJUnitRunner;
|
||||
import us.codecraft.webmagic.Request;
|
||||
import us.codecraft.webmagic.Task;
|
||||
import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
|
||||
import us.codecraft.webmagic.utils.HttpConstant;
|
||||
|
||||
import static org.mockito.Matchers.any;
|
||||
import static org.mockito.Mockito.times;
|
||||
import static org.mockito.Mockito.verify;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com
|
||||
* Date: 17/3/11
|
||||
* Time: 上午11:26
|
||||
*/
|
||||
@RunWith(MockitoJUnitRunner.class)
|
||||
public class DuplicateRemovedSchedulerTest {
|
||||
|
||||
private DuplicateRemovedScheduler duplicateRemovedScheduler = new DuplicateRemovedScheduler() {
|
||||
@Override
|
||||
public Request poll(Task task) {
|
||||
return null;
|
||||
}
|
||||
};
|
||||
|
||||
@Test
|
||||
public void test_no_duplicate_removed_for_post_request() throws Exception {
|
||||
DuplicateRemover duplicateRemover = Mockito.mock(DuplicateRemover.class);
|
||||
duplicateRemovedScheduler.setDuplicateRemover(duplicateRemover);
|
||||
Request request = new Request("https://www.google.com/");
|
||||
request.setMethod(HttpConstant.Method.POST);
|
||||
duplicateRemovedScheduler.push(request, null);
|
||||
verify(duplicateRemover,times(0)).isDuplicate(any(Request.class),any(Task.class));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void test_duplicate_removed_for_get_request() throws Exception {
|
||||
DuplicateRemover duplicateRemover = Mockito.mock(DuplicateRemover.class);
|
||||
duplicateRemovedScheduler.setDuplicateRemover(duplicateRemover);
|
||||
Request request = new Request("https://www.google.com/");
|
||||
request.setMethod(HttpConstant.Method.GET);
|
||||
duplicateRemovedScheduler.push(request, null);
|
||||
verify(duplicateRemover,times(1)).isDuplicate(any(Request.class),any(Task.class));
|
||||
}
|
||||
}
|
|
@ -22,4 +22,20 @@ public class RegexSelectorTest {
|
|||
String select = regexSelector.select(source);
|
||||
Assertions.assertThat(select).isEqualTo(source);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testRegexWithZeroWidthAssertions() {
|
||||
String regex = "^.*(?=\\?)";
|
||||
String source = "hello world?xxxx";
|
||||
RegexSelector regexSelector = new RegexSelector(regex);
|
||||
String select = regexSelector.select(source);
|
||||
Assertions.assertThat(select).isEqualTo("hello world");
|
||||
|
||||
|
||||
regex = "\\d{3}(?!\\d)";
|
||||
source = "123456asdf";
|
||||
regexSelector = new RegexSelector(regex);
|
||||
select = regexSelector.select(source);
|
||||
Assertions.assertThat(select).isEqualTo("456");
|
||||
}
|
||||
}
|
||||
|
|
|
@ -20,6 +20,9 @@ public class UrlUtilsTest {
|
|||
absoluteUrl = UrlUtils.canonicalizeUrl("../aa", "http://www.dianping.com/sh/ss/com");
|
||||
assertThat(absoluteUrl).isEqualTo("http://www.dianping.com/sh/aa");
|
||||
|
||||
absoluteUrl = UrlUtils.canonicalizeUrl("../mshz", "http://www.court.gov.cn/zgcpwsw/zgrmfy/");
|
||||
assertThat(absoluteUrl).isEqualTo("http://www.court.gov.cn/zgcpwsw/mshz");
|
||||
|
||||
absoluteUrl = UrlUtils.canonicalizeUrl("..aa", "http://www.dianping.com/sh/ss/com");
|
||||
assertThat(absoluteUrl).isEqualTo("http://www.dianping.com/sh/ss/..aa");
|
||||
|
||||
|
|
|
@ -48,11 +48,7 @@ public class RedisScheduler extends DuplicateRemovedScheduler implements Monitor
|
|||
public boolean isDuplicate(Request request, Task task) {
|
||||
Jedis jedis = pool.getResource();
|
||||
try {
|
||||
boolean isDuplicate = jedis.sismember(getSetKey(task), request.getUrl());
|
||||
if (!isDuplicate) {
|
||||
jedis.sadd(getSetKey(task), request.getUrl());
|
||||
}
|
||||
return isDuplicate;
|
||||
return jedis.sadd(getSetKey(task), request.getUrl()) > 0;
|
||||
} finally {
|
||||
pool.returnResource(jedis);
|
||||
}
|
||||
|
|
|
@ -13,7 +13,7 @@
|
|||
<dependency>
|
||||
<groupId>org.seleniumhq.selenium</groupId>
|
||||
<artifactId>selenium-java</artifactId>
|
||||
<version>2.46.0</version>
|
||||
<version>2.41.0</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>us.codecraft</groupId>
|
||||
|
|
|
@ -45,7 +45,7 @@ class WebDriverPool {
|
|||
private WebDriver mDriver = null;
|
||||
private boolean mAutoQuitDriver = true;
|
||||
|
||||
private static final String CONFIG_FILE = "/Users/Bingo/Documents/workspace/webmagic/webmagic-selenium/config.ini";
|
||||
private static final String DEFAULT_CONFIG_FILE = "/data/webmagic/webmagic-selenium/config.ini";
|
||||
private static final String DRIVER_FIREFOX = "firefox";
|
||||
private static final String DRIVER_CHROME = "chrome";
|
||||
private static final String DRIVER_PHANTOMJS = "phantomjs";
|
||||
|
@ -64,7 +64,11 @@ class WebDriverPool {
|
|||
public void configure() throws IOException {
|
||||
// Read config file
|
||||
sConfig = new Properties();
|
||||
sConfig.load(new FileReader(CONFIG_FILE));
|
||||
String configFile = DEFAULT_CONFIG_FILE;
|
||||
if (System.getProperty("selenuim_config")!=null){
|
||||
configFile = System.getProperty("selenuim_config");
|
||||
}
|
||||
sConfig.load(new FileReader(configFile));
|
||||
|
||||
// Prepare capabilities
|
||||
sCaps = new DesiredCapabilities();
|
||||
|
|
|
@ -22,7 +22,7 @@ public class HuabanProcessor implements PageProcessor {
|
|||
public void process(Page page) {
|
||||
page.addTargetRequests(page.getHtml().links().regex("http://huaban\\.com/.*").all());
|
||||
if (page.getUrl().toString().contains("pins")) {
|
||||
page.putField("img", page.getHtml().xpath("//div[@id='pin_img']/a/img/@src").toString());
|
||||
page.putField("img", page.getHtml().xpath("//div[@class='image-holder']/a/img/@src").toString());
|
||||
} else {
|
||||
page.getResultItems().setSkip(true);
|
||||
}
|
||||
|
|
|
@ -0,0 +1,11 @@
|
|||
#driver=phantomjs
|
||||
#driver=firefox
|
||||
driver=chrome
|
||||
#driver=http://localhost:8910
|
||||
driver=http://localhost:4444/wd/hub
|
||||
|
||||
# PhantomJS specific config (change according to your installation)
|
||||
#phantomjs_exec_path=/Users/detro/bin/phantomjs-qt5
|
||||
phantomjs_exec_path=/Users/detro/bin/phantomjs-upstream
|
||||
phantomjs_driver_path=../../src/main.js
|
||||
phantomjs_driver_loglevel=DEBUG
|
Loading…
Reference in New Issue