commit
76729c9302
|
@ -0,0 +1,191 @@
|
||||||
|
Apache License
|
||||||
|
Version 2.0, January 2004
|
||||||
|
http://www.apache.org/licenses/
|
||||||
|
|
||||||
|
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||||
|
|
||||||
|
1. Definitions.
|
||||||
|
|
||||||
|
"License" shall mean the terms and conditions for use, reproduction, and
|
||||||
|
distribution as defined by Sections 1 through 9 of this document.
|
||||||
|
|
||||||
|
"Licensor" shall mean the copyright owner or entity authorized by the copyright
|
||||||
|
owner that is granting the License.
|
||||||
|
|
||||||
|
"Legal Entity" shall mean the union of the acting entity and all other entities
|
||||||
|
that control, are controlled by, or are under common control with that entity.
|
||||||
|
For the purposes of this definition, "control" means (i) the power, direct or
|
||||||
|
indirect, to cause the direction or management of such entity, whether by
|
||||||
|
contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||||
|
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||||
|
|
||||||
|
"You" (or "Your") shall mean an individual or Legal Entity exercising
|
||||||
|
permissions granted by this License.
|
||||||
|
|
||||||
|
"Source" form shall mean the preferred form for making modifications, including
|
||||||
|
but not limited to software source code, documentation source, and configuration
|
||||||
|
files.
|
||||||
|
|
||||||
|
"Object" form shall mean any form resulting from mechanical transformation or
|
||||||
|
translation of a Source form, including but not limited to compiled object code,
|
||||||
|
generated documentation, and conversions to other media types.
|
||||||
|
|
||||||
|
"Work" shall mean the work of authorship, whether in Source or Object form, made
|
||||||
|
available under the License, as indicated by a copyright notice that is included
|
||||||
|
in or attached to the work (an example is provided in the Appendix below).
|
||||||
|
|
||||||
|
"Derivative Works" shall mean any work, whether in Source or Object form, that
|
||||||
|
is based on (or derived from) the Work and for which the editorial revisions,
|
||||||
|
annotations, elaborations, or other modifications represent, as a whole, an
|
||||||
|
original work of authorship. For the purposes of this License, Derivative Works
|
||||||
|
shall not include works that remain separable from, or merely link (or bind by
|
||||||
|
name) to the interfaces of, the Work and Derivative Works thereof.
|
||||||
|
|
||||||
|
"Contribution" shall mean any work of authorship, including the original version
|
||||||
|
of the Work and any modifications or additions to that Work or Derivative Works
|
||||||
|
thereof, that is intentionally submitted to Licensor for inclusion in the Work
|
||||||
|
by the copyright owner or by an individual or Legal Entity authorized to submit
|
||||||
|
on behalf of the copyright owner. For the purposes of this definition,
|
||||||
|
"submitted" means any form of electronic, verbal, or written communication sent
|
||||||
|
to the Licensor or its representatives, including but not limited to
|
||||||
|
communication on electronic mailing lists, source code control systems, and
|
||||||
|
issue tracking systems that are managed by, or on behalf of, the Licensor for
|
||||||
|
the purpose of discussing and improving the Work, but excluding communication
|
||||||
|
that is conspicuously marked or otherwise designated in writing by the copyright
|
||||||
|
owner as "Not a Contribution."
|
||||||
|
|
||||||
|
"Contributor" shall mean Licensor and any individual or Legal Entity on behalf
|
||||||
|
of whom a Contribution has been received by Licensor and subsequently
|
||||||
|
incorporated within the Work.
|
||||||
|
|
||||||
|
2. Grant of Copyright License.
|
||||||
|
|
||||||
|
Subject to the terms and conditions of this License, each Contributor hereby
|
||||||
|
grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free,
|
||||||
|
irrevocable copyright license to reproduce, prepare Derivative Works of,
|
||||||
|
publicly display, publicly perform, sublicense, and distribute the Work and such
|
||||||
|
Derivative Works in Source or Object form.
|
||||||
|
|
||||||
|
3. Grant of Patent License.
|
||||||
|
|
||||||
|
Subject to the terms and conditions of this License, each Contributor hereby
|
||||||
|
grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free,
|
||||||
|
irrevocable (except as stated in this section) patent license to make, have
|
||||||
|
made, use, offer to sell, sell, import, and otherwise transfer the Work, where
|
||||||
|
such license applies only to those patent claims licensable by such Contributor
|
||||||
|
that are necessarily infringed by their Contribution(s) alone or by combination
|
||||||
|
of their Contribution(s) with the Work to which such Contribution(s) was
|
||||||
|
submitted. If You institute patent litigation against any entity (including a
|
||||||
|
cross-claim or counterclaim in a lawsuit) alleging that the Work or a
|
||||||
|
Contribution incorporated within the Work constitutes direct or contributory
|
||||||
|
patent infringement, then any patent licenses granted to You under this License
|
||||||
|
for that Work shall terminate as of the date such litigation is filed.
|
||||||
|
|
||||||
|
4. Redistribution.
|
||||||
|
|
||||||
|
You may reproduce and distribute copies of the Work or Derivative Works thereof
|
||||||
|
in any medium, with or without modifications, and in Source or Object form,
|
||||||
|
provided that You meet the following conditions:
|
||||||
|
|
||||||
|
You must give any other recipients of the Work or Derivative Works a copy of
|
||||||
|
this License; and
|
||||||
|
You must cause any modified files to carry prominent notices stating that You
|
||||||
|
changed the files; and
|
||||||
|
You must retain, in the Source form of any Derivative Works that You distribute,
|
||||||
|
all copyright, patent, trademark, and attribution notices from the Source form
|
||||||
|
of the Work, excluding those notices that do not pertain to any part of the
|
||||||
|
Derivative Works; and
|
||||||
|
If the Work includes a "NOTICE" text file as part of its distribution, then any
|
||||||
|
Derivative Works that You distribute must include a readable copy of the
|
||||||
|
attribution notices contained within such NOTICE file, excluding those notices
|
||||||
|
that do not pertain to any part of the Derivative Works, in at least one of the
|
||||||
|
following places: within a NOTICE text file distributed as part of the
|
||||||
|
Derivative Works; within the Source form or documentation, if provided along
|
||||||
|
with the Derivative Works; or, within a display generated by the Derivative
|
||||||
|
Works, if and wherever such third-party notices normally appear. The contents of
|
||||||
|
the NOTICE file are for informational purposes only and do not modify the
|
||||||
|
License. You may add Your own attribution notices within Derivative Works that
|
||||||
|
You distribute, alongside or as an addendum to the NOTICE text from the Work,
|
||||||
|
provided that such additional attribution notices cannot be construed as
|
||||||
|
modifying the License.
|
||||||
|
You may add Your own copyright statement to Your modifications and may provide
|
||||||
|
additional or different license terms and conditions for use, reproduction, or
|
||||||
|
distribution of Your modifications, or for any such Derivative Works as a whole,
|
||||||
|
provided Your use, reproduction, and distribution of the Work otherwise complies
|
||||||
|
with the conditions stated in this License.
|
||||||
|
|
||||||
|
5. Submission of Contributions.
|
||||||
|
|
||||||
|
Unless You explicitly state otherwise, any Contribution intentionally submitted
|
||||||
|
for inclusion in the Work by You to the Licensor shall be under the terms and
|
||||||
|
conditions of this License, without any additional terms or conditions.
|
||||||
|
Notwithstanding the above, nothing herein shall supersede or modify the terms of
|
||||||
|
any separate license agreement you may have executed with Licensor regarding
|
||||||
|
such Contributions.
|
||||||
|
|
||||||
|
6. Trademarks.
|
||||||
|
|
||||||
|
This License does not grant permission to use the trade names, trademarks,
|
||||||
|
service marks, or product names of the Licensor, except as required for
|
||||||
|
reasonable and customary use in describing the origin of the Work and
|
||||||
|
reproducing the content of the NOTICE file.
|
||||||
|
|
||||||
|
7. Disclaimer of Warranty.
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, Licensor provides the
|
||||||
|
Work (and each Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied,
|
||||||
|
including, without limitation, any warranties or conditions of TITLE,
|
||||||
|
NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are
|
||||||
|
solely responsible for determining the appropriateness of using or
|
||||||
|
redistributing the Work and assume any risks associated with Your exercise of
|
||||||
|
permissions under this License.
|
||||||
|
|
||||||
|
8. Limitation of Liability.
|
||||||
|
|
||||||
|
In no event and under no legal theory, whether in tort (including negligence),
|
||||||
|
contract, or otherwise, unless required by applicable law (such as deliberate
|
||||||
|
and grossly negligent acts) or agreed to in writing, shall any Contributor be
|
||||||
|
liable to You for damages, including any direct, indirect, special, incidental,
|
||||||
|
or consequential damages of any character arising as a result of this License or
|
||||||
|
out of the use or inability to use the Work (including but not limited to
|
||||||
|
damages for loss of goodwill, work stoppage, computer failure or malfunction, or
|
||||||
|
any and all other commercial damages or losses), even if such Contributor has
|
||||||
|
been advised of the possibility of such damages.
|
||||||
|
|
||||||
|
9. Accepting Warranty or Additional Liability.
|
||||||
|
|
||||||
|
While redistributing the Work or Derivative Works thereof, You may choose to
|
||||||
|
offer, and charge a fee for, acceptance of support, warranty, indemnity, or
|
||||||
|
other liability obligations and/or rights consistent with this License. However,
|
||||||
|
in accepting such obligations, You may act only on Your own behalf and on Your
|
||||||
|
sole responsibility, not on behalf of any other Contributor, and only if You
|
||||||
|
agree to indemnify, defend, and hold each Contributor harmless for any liability
|
||||||
|
incurred by, or claims asserted against, such Contributor by reason of your
|
||||||
|
accepting any such warranty or additional liability.
|
||||||
|
|
||||||
|
END OF TERMS AND CONDITIONS
|
||||||
|
|
||||||
|
APPENDIX: How to apply the Apache License to your work
|
||||||
|
|
||||||
|
To apply the Apache License to your work, attach the following boilerplate
|
||||||
|
notice, with the fields enclosed by brackets "{}" replaced with your own
|
||||||
|
identifying information. (Don't include the brackets!) The text should be
|
||||||
|
enclosed in the appropriate comment syntax for the file format. We also
|
||||||
|
recommend that a file or class name and description of purpose be included on
|
||||||
|
the same "printed page" as the copyright notice for easier identification within
|
||||||
|
third-party archives.
|
||||||
|
|
||||||
|
Copyright 2013 code4craft
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
24
README-zh.md
24
README-zh.md
|
@ -168,30 +168,6 @@ webmagic的使用可以参考:[oschina openapi 应用:博客搬家](http://m
|
||||||
|
|
||||||
webmagic遵循[Apache 2.0协议](http://opensource.org/licenses/Apache-2.0)
|
webmagic遵循[Apache 2.0协议](http://opensource.org/licenses/Apache-2.0)
|
||||||
|
|
||||||
### 贡献者:
|
|
||||||
|
|
||||||
以下是为WebMagic提交过代码或者issue的朋友:
|
|
||||||
|
|
||||||
* [ccliangbo](https://github.com/ccliangbo)
|
|
||||||
* [yuany](https://github.com/yuany)
|
|
||||||
* [yxssfxwzy](https://github.com/yxssfxwzy)
|
|
||||||
* [linkerlin](https://github.com/linkerlin)
|
|
||||||
* [d0ngw](https://github.com/d0ngw)
|
|
||||||
* [xuchaoo](https://github.com/xuchaoo)
|
|
||||||
* [supermicah](https://github.com/supermicah)
|
|
||||||
* [SimpleExpress](https://github.com/SimpleExpress)
|
|
||||||
* [aruanruan](https://github.com/aruanruan)
|
|
||||||
* [l1z2g9](https://github.com/l1z2g9)
|
|
||||||
* [zhegexiaohuozi](https://github.com/zhegexiaohuozi)
|
|
||||||
* [ywooer](https://github.com/ywooer)
|
|
||||||
* [yyw258520](https://github.com/yyw258520)
|
|
||||||
* [perfecking](https://github.com/perfecking)
|
|
||||||
* [lidongyang](http://my.oschina.net/lidongyang)
|
|
||||||
* [seveniu](https://github.com/seveniu)
|
|
||||||
* [sebastian1118](https://github.com/sebastian1118)
|
|
||||||
* [codev777](https://github.com/codev777)
|
|
||||||
* [fengwuze](https://github.com/fengwuze)
|
|
||||||
|
|
||||||
### 邮件组:
|
### 邮件组:
|
||||||
|
|
||||||
Gmail:
|
Gmail:
|
||||||
|
|
33
README.md
33
README.md
|
@ -59,7 +59,7 @@ public class GithubRepoPageProcessor implements PageProcessor {
|
||||||
public void process(Page page) {
|
public void process(Page page) {
|
||||||
page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all());
|
page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all());
|
||||||
page.putField("author", page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString());
|
page.putField("author", page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString());
|
||||||
page.putField("name", page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString());
|
page.putField("name", page.getHtml().xpath("//h1[@class='public']/strong/a/text()").toString());
|
||||||
if (page.getResultItems().get("name")==null){
|
if (page.getResultItems().get("name")==null){
|
||||||
//skip this page
|
//skip this page
|
||||||
page.setSkip(true);
|
page.setSkip(true);
|
||||||
|
@ -89,7 +89,7 @@ You can also use annotation way:
|
||||||
@HelpUrl("https://github.com/\\w+")
|
@HelpUrl("https://github.com/\\w+")
|
||||||
public class GithubRepo {
|
public class GithubRepo {
|
||||||
|
|
||||||
@ExtractBy(value = "//h1[@class='entry-title public']/strong/a/text()", notNull = true)
|
@ExtractBy(value = "//h1[@class='public']/strong/a/text()", notNull = true)
|
||||||
private String name;
|
private String name;
|
||||||
|
|
||||||
@ExtractByUrl("https://github\\.com/(\\w+)/.*")
|
@ExtractByUrl("https://github\\.com/(\\w+)/.*")
|
||||||
|
@ -114,39 +114,12 @@ The architecture of webmagic (refered to [Scrapy](http://scrapy.org/))
|
||||||
|
|
||||||

|

|
||||||
|
|
||||||
Javadocs: [http://code4craft.github.io/webmagic/docs/en/](http://code4craft.github.io/webmagic/docs/en/)
|
There are more examples in `webmagic-samples` package.
|
||||||
|
|
||||||
There are some samples in `webmagic-samples` package.
|
|
||||||
|
|
||||||
### License:
|
### License:
|
||||||
|
|
||||||
Licensed under [Apache 2.0 license](http://opensource.org/licenses/Apache-2.0)
|
Licensed under [Apache 2.0 license](http://opensource.org/licenses/Apache-2.0)
|
||||||
|
|
||||||
### Contributors:
|
|
||||||
|
|
||||||
Thanks to these people for committing source code, reporting bugs, or suggesting new features:
|
|
||||||
|
|
||||||
* [ccliangbo](https://github.com/ccliangbo)
|
|
||||||
* [yuany](https://github.com/yuany)
|
|
||||||
* [yxssfxwzy](https://github.com/yxssfxwzy)
|
|
||||||
* [linkerlin](https://github.com/linkerlin)
|
|
||||||
* [d0ngw](https://github.com/d0ngw)
|
|
||||||
* [xuchaoo](https://github.com/xuchaoo)
|
|
||||||
* [supermicah](https://github.com/supermicah)
|
|
||||||
* [SimpleExpress](https://github.com/SimpleExpress)
|
|
||||||
* [aruanruan](https://github.com/aruanruan)
|
|
||||||
* [l1z2g9](https://github.com/l1z2g9)
|
|
||||||
* [zhegexiaohuozi](https://github.com/zhegexiaohuozi)
|
|
||||||
* [ywooer](https://github.com/ywooer)
|
|
||||||
* [yyw258520](https://github.com/yyw258520)
|
|
||||||
* [perfecking](https://github.com/perfecking)
|
|
||||||
* [lidongyang](http://my.oschina.net/lidongyang)
|
|
||||||
* [seveniu](https://github.com/seveniu)
|
|
||||||
* [sebastian1118](https://github.com/sebastian1118)
|
|
||||||
* [codev777](https://github.com/codev777)
|
|
||||||
* [fengwuze](https://github.com/fengwuze)
|
|
||||||
|
|
||||||
|
|
||||||
### Thanks:
|
### Thanks:
|
||||||
|
|
||||||
To write webmagic, I referred to the projects below:
|
To write webmagic, I referred to the projects below:
|
||||||
|
|
10
pom.xml
10
pom.xml
|
@ -64,6 +64,12 @@
|
||||||
<version>4.11</version>
|
<version>4.11</version>
|
||||||
<scope>test</scope>
|
<scope>test</scope>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.mockito</groupId>
|
||||||
|
<artifactId>mockito-all</artifactId>
|
||||||
|
<version>1.10.19</version>
|
||||||
|
<scope>test</scope>
|
||||||
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.httpcomponents</groupId>
|
<groupId>org.apache.httpcomponents</groupId>
|
||||||
<artifactId>httpclient</artifactId>
|
<artifactId>httpclient</artifactId>
|
||||||
|
@ -97,7 +103,7 @@
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>com.alibaba</groupId>
|
<groupId>com.alibaba</groupId>
|
||||||
<artifactId>fastjson</artifactId>
|
<artifactId>fastjson</artifactId>
|
||||||
<version>1.2.21</version>
|
<version>1.2.28</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>com.github.dreamhead</groupId>
|
<groupId>com.github.dreamhead</groupId>
|
||||||
|
@ -130,7 +136,7 @@
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>commons-collections</groupId>
|
<groupId>commons-collections</groupId>
|
||||||
<artifactId>commons-collections</artifactId>
|
<artifactId>commons-collections</artifactId>
|
||||||
<version>3.2.1</version>
|
<version>3.2.2</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.commons</groupId>
|
<groupId>org.apache.commons</groupId>
|
||||||
|
|
|
@ -40,6 +40,11 @@
|
||||||
<artifactId>slf4j-api</artifactId>
|
<artifactId>slf4j-api</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.mockito</groupId>
|
||||||
|
<artifactId>mockito-all</artifactId>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.slf4j</groupId>
|
<groupId>org.slf4j</groupId>
|
||||||
<artifactId>slf4j-log4j12</artifactId>
|
<artifactId>slf4j-log4j12</artifactId>
|
||||||
|
|
|
@ -107,14 +107,12 @@ public class Page {
|
||||||
* @param requests requests
|
* @param requests requests
|
||||||
*/
|
*/
|
||||||
public void addTargetRequests(List<String> requests) {
|
public void addTargetRequests(List<String> requests) {
|
||||||
synchronized (targetRequests) {
|
for (String s : requests) {
|
||||||
for (String s : requests) {
|
if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) {
|
||||||
if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) {
|
continue;
|
||||||
continue;
|
|
||||||
}
|
|
||||||
s = UrlUtils.canonicalizeUrl(s, url.toString());
|
|
||||||
targetRequests.add(new Request(s));
|
|
||||||
}
|
}
|
||||||
|
s = UrlUtils.canonicalizeUrl(s, url.toString());
|
||||||
|
targetRequests.add(new Request(s));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -125,14 +123,12 @@ public class Page {
|
||||||
* @param priority priority
|
* @param priority priority
|
||||||
*/
|
*/
|
||||||
public void addTargetRequests(List<String> requests, long priority) {
|
public void addTargetRequests(List<String> requests, long priority) {
|
||||||
synchronized (targetRequests) {
|
for (String s : requests) {
|
||||||
for (String s : requests) {
|
if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) {
|
||||||
if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) {
|
continue;
|
||||||
continue;
|
|
||||||
}
|
|
||||||
s = UrlUtils.canonicalizeUrl(s, url.toString());
|
|
||||||
targetRequests.add(new Request(s).setPriority(priority));
|
|
||||||
}
|
}
|
||||||
|
s = UrlUtils.canonicalizeUrl(s, url.toString());
|
||||||
|
targetRequests.add(new Request(s).setPriority(priority));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -145,10 +141,8 @@ public class Page {
|
||||||
if (StringUtils.isBlank(requestString) || requestString.equals("#")) {
|
if (StringUtils.isBlank(requestString) || requestString.equals("#")) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
synchronized (targetRequests) {
|
requestString = UrlUtils.canonicalizeUrl(requestString, url.toString());
|
||||||
requestString = UrlUtils.canonicalizeUrl(requestString, url.toString());
|
targetRequests.add(new Request(requestString));
|
||||||
targetRequests.add(new Request(requestString));
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -157,9 +151,7 @@ public class Page {
|
||||||
* @param request request
|
* @param request request
|
||||||
*/
|
*/
|
||||||
public void addTargetRequest(Request request) {
|
public void addTargetRequest(Request request) {
|
||||||
synchronized (targetRequests) {
|
targetRequests.add(request);
|
||||||
targetRequests.add(request);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -85,27 +85,10 @@ public class Request implements Serializable {
|
||||||
return url;
|
return url;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
|
||||||
public boolean equals(Object o) {
|
|
||||||
if (this == o) return true;
|
|
||||||
if (o == null || getClass() != o.getClass()) return false;
|
|
||||||
|
|
||||||
Request request = (Request) o;
|
|
||||||
|
|
||||||
if (!url.equals(request.url)) return false;
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
public Map<String, Object> getExtras() {
|
public Map<String, Object> getExtras() {
|
||||||
return extras;
|
return extras;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
|
||||||
public int hashCode() {
|
|
||||||
return url.hashCode();
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setExtras(Map<String, Object> extras) {
|
public void setExtras(Map<String, Object> extras) {
|
||||||
this.extras = extras;
|
this.extras = extras;
|
||||||
}
|
}
|
||||||
|
@ -132,23 +115,52 @@ public class Request implements Serializable {
|
||||||
return params;
|
return params;
|
||||||
}
|
}
|
||||||
/**
|
/**
|
||||||
* POST/GET参数设置
|
* set params for request
|
||||||
|
* <br>
|
||||||
|
* DO NOT set this for a request whose URL already has params, like 'https://github.com/search?q=webmagic'
|
||||||
|
* @param params params
|
||||||
* */
|
* */
|
||||||
public void setParams(Map<String, String> params) {
|
public void setParams(Map<String, String> params) {
|
||||||
this.params = params;
|
this.params = params;
|
||||||
}
|
}
|
||||||
/**
|
/**
|
||||||
* POST/GET参数设置
|
* set params for request
|
||||||
|
* <br>
|
||||||
|
* DO NOT set this for a request whose URL already has params, like 'https://github.com/search?q=webmagic'
|
||||||
|
* @param key key
|
||||||
|
* @param value value
|
||||||
* */
|
* */
|
||||||
public void putParams(String key,String value) {
|
public void putParams(String key,String value) {
|
||||||
params.put(key,value);
|
params.put(key,value);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean equals(Object o) {
|
||||||
|
if (this == o) return true;
|
||||||
|
if (o == null || getClass() != o.getClass()) return false;
|
||||||
|
|
||||||
|
Request request = (Request) o;
|
||||||
|
|
||||||
|
if (url != null ? !url.equals(request.url) : request.url != null) return false;
|
||||||
|
if (method != null ? !method.equals(request.method) : request.method != null) return false;
|
||||||
|
return params != null ? params.equals(request.params) : request.params == null;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int hashCode() {
|
||||||
|
int result = url != null ? url.hashCode() : 0;
|
||||||
|
result = 31 * result + (method != null ? method.hashCode() : 0);
|
||||||
|
result = 31 * result + (params != null ? params.hashCode() : 0);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String toString() {
|
public String toString() {
|
||||||
return "Request{" +
|
return "Request{" +
|
||||||
"url='" + url + '\'' +
|
"url='" + url + '\'' +
|
||||||
", method='" + method + '\'' +
|
", method='" + method + '\'' +
|
||||||
", extras=" + extras +
|
", extras=" + extras +
|
||||||
|
", params=" + params +
|
||||||
", priority=" + priority +
|
", priority=" + priority +
|
||||||
'}';
|
'}';
|
||||||
}
|
}
|
||||||
|
|
|
@ -305,7 +305,7 @@ public class Spider implements Runnable, Task {
|
||||||
initComponent();
|
initComponent();
|
||||||
logger.info("Spider " + getUUID() + " started!");
|
logger.info("Spider " + getUUID() + " started!");
|
||||||
while (!Thread.currentThread().isInterrupted() && stat.get() == STAT_RUNNING) {
|
while (!Thread.currentThread().isInterrupted() && stat.get() == STAT_RUNNING) {
|
||||||
Request request = scheduler.poll(this);
|
final Request request = scheduler.poll(this);
|
||||||
if (request == null) {
|
if (request == null) {
|
||||||
if (threadPool.getThreadAlive() == 0 && exitWhenComplete) {
|
if (threadPool.getThreadAlive() == 0 && exitWhenComplete) {
|
||||||
break;
|
break;
|
||||||
|
@ -313,16 +313,15 @@ public class Spider implements Runnable, Task {
|
||||||
// wait until new url added
|
// wait until new url added
|
||||||
waitNewUrl();
|
waitNewUrl();
|
||||||
} else {
|
} else {
|
||||||
final Request requestFinal = request;
|
|
||||||
threadPool.execute(new Runnable() {
|
threadPool.execute(new Runnable() {
|
||||||
@Override
|
@Override
|
||||||
public void run() {
|
public void run() {
|
||||||
try {
|
try {
|
||||||
processRequest(requestFinal);
|
processRequest(request);
|
||||||
onSuccess(requestFinal);
|
onSuccess(request);
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
onError(requestFinal);
|
onError(request);
|
||||||
logger.error("process request " + requestFinal + " error", e);
|
logger.error("process request " + request + " error", e);
|
||||||
} finally {
|
} finally {
|
||||||
pageCount.incrementAndGet();
|
pageCount.incrementAndGet();
|
||||||
signalNewUrl();
|
signalNewUrl();
|
||||||
|
@ -587,6 +586,7 @@ public class Spider implements Runnable, Task {
|
||||||
if (threadNum <= 0) {
|
if (threadNum <= 0) {
|
||||||
throw new IllegalArgumentException("threadNum should be more than one!");
|
throw new IllegalArgumentException("threadNum should be more than one!");
|
||||||
}
|
}
|
||||||
|
this.executorService = executorService;
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1,7 +1,6 @@
|
||||||
package us.codecraft.webmagic.downloader;
|
package us.codecraft.webmagic.downloader;
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.commons.lang3.StringUtils;
|
|
||||||
import org.apache.http.HttpHost;
|
import org.apache.http.HttpHost;
|
||||||
import org.apache.http.HttpResponse;
|
import org.apache.http.HttpResponse;
|
||||||
import org.apache.http.NameValuePair;
|
import org.apache.http.NameValuePair;
|
||||||
|
@ -15,10 +14,6 @@ import org.apache.http.client.methods.RequestBuilder;
|
||||||
import org.apache.http.impl.client.CloseableHttpClient;
|
import org.apache.http.impl.client.CloseableHttpClient;
|
||||||
import org.apache.http.message.BasicNameValuePair;
|
import org.apache.http.message.BasicNameValuePair;
|
||||||
import org.apache.http.util.EntityUtils;
|
import org.apache.http.util.EntityUtils;
|
||||||
import org.jsoup.Jsoup;
|
|
||||||
import org.jsoup.nodes.Document;
|
|
||||||
import org.jsoup.nodes.Element;
|
|
||||||
import org.jsoup.select.Elements;
|
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
import us.codecraft.webmagic.Page;
|
import us.codecraft.webmagic.Page;
|
||||||
|
@ -27,8 +22,8 @@ import us.codecraft.webmagic.Site;
|
||||||
import us.codecraft.webmagic.Task;
|
import us.codecraft.webmagic.Task;
|
||||||
import us.codecraft.webmagic.proxy.Proxy;
|
import us.codecraft.webmagic.proxy.Proxy;
|
||||||
import us.codecraft.webmagic.selector.PlainText;
|
import us.codecraft.webmagic.selector.PlainText;
|
||||||
|
import us.codecraft.webmagic.utils.CharsetUtils;
|
||||||
import us.codecraft.webmagic.utils.HttpConstant;
|
import us.codecraft.webmagic.utils.HttpConstant;
|
||||||
import us.codecraft.webmagic.utils.UrlUtils;
|
|
||||||
import us.codecraft.webmagic.utils.WMCollections;
|
import us.codecraft.webmagic.utils.WMCollections;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
@ -98,8 +93,8 @@ public class HttpClientDownloader extends AbstractDownloader {
|
||||||
proxyHost = site.getHttpProxy();
|
proxyHost = site.getHttpProxy();
|
||||||
}
|
}
|
||||||
|
|
||||||
HttpUriRequest httpUriRequest = getHttpUriRequest(request, site, headers, proxyHost);//<2F><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>˴<EFBFBD><CBB4><EFBFBD>
|
HttpUriRequest httpUriRequest = getHttpUriRequest(request, site, headers, proxyHost);
|
||||||
httpResponse = getHttpClient(site, proxy).execute(httpUriRequest);//getHttpClient<6E><74><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>˴<EFBFBD><CBB4><EFBFBD><EFBFBD><EFBFBD>֤
|
httpResponse = getHttpClient(site, proxy).execute(httpUriRequest);
|
||||||
statusCode = httpResponse.getStatusLine().getStatusCode();
|
statusCode = httpResponse.getStatusLine().getStatusCode();
|
||||||
request.putExtra(Request.STATUS_CODE, statusCode);
|
request.putExtra(Request.STATUS_CODE, statusCode);
|
||||||
if (statusAccept(acceptStatCode, statusCode)) {
|
if (statusAccept(acceptStatCode, statusCode)) {
|
||||||
|
@ -167,39 +162,44 @@ public class HttpClientDownloader extends AbstractDownloader {
|
||||||
String method = request.getMethod();
|
String method = request.getMethod();
|
||||||
if (method == null || method.equalsIgnoreCase(HttpConstant.Method.GET)) {
|
if (method == null || method.equalsIgnoreCase(HttpConstant.Method.GET)) {
|
||||||
//default get
|
//default get
|
||||||
RequestBuilder requestBuilder=RequestBuilder.get();
|
return addQueryParams(RequestBuilder.get(),request.getParams());
|
||||||
if (request.getParams() != null) {
|
|
||||||
for (Map.Entry<String, String> entry : request.getParams().entrySet()) {
|
|
||||||
requestBuilder.addParameter(entry.getKey(), entry.getValue());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return requestBuilder;
|
|
||||||
} else if (method.equalsIgnoreCase(HttpConstant.Method.POST)) {
|
} else if (method.equalsIgnoreCase(HttpConstant.Method.POST)) {
|
||||||
RequestBuilder requestBuilder = RequestBuilder.post();
|
return addFormParams(RequestBuilder.post(), (NameValuePair[]) request.getExtra("nameValuePair"), request.getParams());
|
||||||
NameValuePair[] nameValuePair = (NameValuePair[]) request.getExtra("nameValuePair");
|
|
||||||
List<NameValuePair> allNameValuePair=new ArrayList<NameValuePair>();
|
|
||||||
if (nameValuePair != null && nameValuePair.length > 0) {
|
|
||||||
allNameValuePair= Arrays.asList(nameValuePair);
|
|
||||||
}
|
|
||||||
if (request.getParams() != null) {
|
|
||||||
for (String key : request.getParams().keySet()) {
|
|
||||||
allNameValuePair.add(new BasicNameValuePair(key, request.getParams().get(key)));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
requestBuilder.setEntity(new UrlEncodedFormEntity(allNameValuePair, Charset.forName("utf8")));
|
|
||||||
return requestBuilder;
|
|
||||||
} else if (method.equalsIgnoreCase(HttpConstant.Method.HEAD)) {
|
} else if (method.equalsIgnoreCase(HttpConstant.Method.HEAD)) {
|
||||||
return RequestBuilder.head();
|
return addQueryParams(RequestBuilder.head(),request.getParams());
|
||||||
} else if (method.equalsIgnoreCase(HttpConstant.Method.PUT)) {
|
} else if (method.equalsIgnoreCase(HttpConstant.Method.PUT)) {
|
||||||
return RequestBuilder.put();
|
return addFormParams(RequestBuilder.put(), (NameValuePair[]) request.getExtra("nameValuePair"), request.getParams());
|
||||||
} else if (method.equalsIgnoreCase(HttpConstant.Method.DELETE)) {
|
} else if (method.equalsIgnoreCase(HttpConstant.Method.DELETE)) {
|
||||||
return RequestBuilder.delete();
|
return addQueryParams(RequestBuilder.delete(),request.getParams());
|
||||||
} else if (method.equalsIgnoreCase(HttpConstant.Method.TRACE)) {
|
} else if (method.equalsIgnoreCase(HttpConstant.Method.TRACE)) {
|
||||||
return RequestBuilder.trace();
|
return addQueryParams(RequestBuilder.trace(),request.getParams());
|
||||||
}
|
}
|
||||||
throw new IllegalArgumentException("Illegal HTTP Method " + method);
|
throw new IllegalArgumentException("Illegal HTTP Method " + method);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private RequestBuilder addFormParams(RequestBuilder requestBuilder, NameValuePair[] nameValuePair, Map<String, String> params) {
|
||||||
|
List<NameValuePair> allNameValuePair=new ArrayList<NameValuePair>();
|
||||||
|
if (nameValuePair != null && nameValuePair.length > 0) {
|
||||||
|
allNameValuePair= Arrays.asList(nameValuePair);
|
||||||
|
}
|
||||||
|
if (params != null) {
|
||||||
|
for (String key : params.keySet()) {
|
||||||
|
allNameValuePair.add(new BasicNameValuePair(key, params.get(key)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
requestBuilder.setEntity(new UrlEncodedFormEntity(allNameValuePair, Charset.forName("utf8")));
|
||||||
|
return requestBuilder;
|
||||||
|
}
|
||||||
|
|
||||||
|
private RequestBuilder addQueryParams(RequestBuilder requestBuilder, Map<String, String> params) {
|
||||||
|
if (params != null) {
|
||||||
|
for (Map.Entry<String, String> entry : params.entrySet()) {
|
||||||
|
requestBuilder.addParameter(entry.getKey(), entry.getValue());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return requestBuilder;
|
||||||
|
}
|
||||||
|
|
||||||
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
|
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
|
||||||
String content = getContent(charset, httpResponse);
|
String content = getContent(charset, httpResponse);
|
||||||
Page page = new Page();
|
Page page = new Page();
|
||||||
|
@ -226,40 +226,6 @@ public class HttpClientDownloader extends AbstractDownloader {
|
||||||
}
|
}
|
||||||
|
|
||||||
protected String getHtmlCharset(HttpResponse httpResponse, byte[] contentBytes) throws IOException {
|
protected String getHtmlCharset(HttpResponse httpResponse, byte[] contentBytes) throws IOException {
|
||||||
String charset;
|
return CharsetUtils.detectCharset(httpResponse.getEntity().getContentType().getValue(), contentBytes);
|
||||||
// charset
|
|
||||||
// 1、encoding in http header Content-Type
|
|
||||||
String value = httpResponse.getEntity().getContentType().getValue();
|
|
||||||
charset = UrlUtils.getCharset(value);
|
|
||||||
if (StringUtils.isNotBlank(charset)) {
|
|
||||||
logger.debug("Auto get charset: {}", charset);
|
|
||||||
return charset;
|
|
||||||
}
|
|
||||||
// use default charset to decode first time
|
|
||||||
Charset defaultCharset = Charset.defaultCharset();
|
|
||||||
String content = new String(contentBytes, defaultCharset.name());
|
|
||||||
// 2、charset in meta
|
|
||||||
if (StringUtils.isNotEmpty(content)) {
|
|
||||||
Document document = Jsoup.parse(content);
|
|
||||||
Elements links = document.select("meta");
|
|
||||||
for (Element link : links) {
|
|
||||||
// 2.1、html4.01 <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
|
|
||||||
String metaContent = link.attr("content");
|
|
||||||
String metaCharset = link.attr("charset");
|
|
||||||
if (metaContent.indexOf("charset") != -1) {
|
|
||||||
metaContent = metaContent.substring(metaContent.indexOf("charset"), metaContent.length());
|
|
||||||
charset = metaContent.split("=")[1];
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
// 2.2、html5 <meta charset="UTF-8" />
|
|
||||||
else if (StringUtils.isNotEmpty(metaCharset)) {
|
|
||||||
charset = metaCharset;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
logger.debug("Auto get charset: {}", charset);
|
|
||||||
// 3、todo use tools as cpdetector for content decode
|
|
||||||
return charset;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -18,7 +18,7 @@ public class GithubRepoPageProcessor implements PageProcessor {
|
||||||
page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-]+/[\\w\\-]+)").all());
|
page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-]+/[\\w\\-]+)").all());
|
||||||
page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-])").all());
|
page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-])").all());
|
||||||
page.putField("author", page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString());
|
page.putField("author", page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString());
|
||||||
page.putField("name", page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString());
|
page.putField("name", page.getHtml().xpath("//h1[@class='public']/strong/a/text()").toString());
|
||||||
if (page.getResultItems().get("name")==null){
|
if (page.getResultItems().get("name")==null){
|
||||||
//skip this page
|
//skip this page
|
||||||
page.setSkip(true);
|
page.setSkip(true);
|
||||||
|
|
|
@ -79,14 +79,14 @@ public class Proxy implements Delayed, Serializable {
|
||||||
|
|
||||||
private List<Integer> failedErrorType = new ArrayList<Integer>();
|
private List<Integer> failedErrorType = new ArrayList<Integer>();
|
||||||
|
|
||||||
Proxy(HttpHost httpHost, String user, String password) {
|
public Proxy(HttpHost httpHost, String user, String password) {
|
||||||
this.httpHost = httpHost;
|
this.httpHost = httpHost;
|
||||||
this.user = user;
|
this.user = user;
|
||||||
this.password = password;
|
this.password = password;
|
||||||
this.canReuseTime = System.nanoTime() + TimeUnit.NANOSECONDS.convert(reuseTimeInterval, TimeUnit.MILLISECONDS);
|
this.canReuseTime = System.nanoTime() + TimeUnit.NANOSECONDS.convert(reuseTimeInterval, TimeUnit.MILLISECONDS);
|
||||||
}
|
}
|
||||||
|
|
||||||
Proxy(HttpHost httpHost, int reuseInterval, String user, String password) {
|
public Proxy(HttpHost httpHost, int reuseInterval, String user, String password) {
|
||||||
this.httpHost = httpHost;
|
this.httpHost = httpHost;
|
||||||
this.user = user;
|
this.user = user;
|
||||||
this.password = password;
|
this.password = password;
|
||||||
|
|
|
@ -6,6 +6,7 @@ import us.codecraft.webmagic.Request;
|
||||||
import us.codecraft.webmagic.Task;
|
import us.codecraft.webmagic.Task;
|
||||||
import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
|
import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
|
||||||
import us.codecraft.webmagic.scheduler.component.HashSetDuplicateRemover;
|
import us.codecraft.webmagic.scheduler.component.HashSetDuplicateRemover;
|
||||||
|
import us.codecraft.webmagic.utils.HttpConstant;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Remove duplicate urls and only push urls which are not duplicate.<br><br>
|
* Remove duplicate urls and only push urls which are not duplicate.<br><br>
|
||||||
|
@ -31,7 +32,7 @@ public abstract class DuplicateRemovedScheduler implements Scheduler {
|
||||||
@Override
|
@Override
|
||||||
public void push(Request request, Task task) {
|
public void push(Request request, Task task) {
|
||||||
logger.trace("get a candidate url {}", request.getUrl());
|
logger.trace("get a candidate url {}", request.getUrl());
|
||||||
if (!duplicatedRemover.isDuplicate(request, task) || shouldReserved(request)) {
|
if (shouldReserved(request) || noNeedToRemoveDuplicate(request) || !duplicatedRemover.isDuplicate(request, task)) {
|
||||||
logger.debug("push to queue {}", request.getUrl());
|
logger.debug("push to queue {}", request.getUrl());
|
||||||
pushWhenNoDuplicate(request, task);
|
pushWhenNoDuplicate(request, task);
|
||||||
}
|
}
|
||||||
|
@ -41,6 +42,10 @@ public abstract class DuplicateRemovedScheduler implements Scheduler {
|
||||||
return request.getExtra(Request.CYCLE_TRIED_TIMES) != null;
|
return request.getExtra(Request.CYCLE_TRIED_TIMES) != null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
protected boolean noNeedToRemoveDuplicate(Request request) {
|
||||||
|
return HttpConstant.Method.POST.equalsIgnoreCase(request.getMethod());
|
||||||
|
}
|
||||||
|
|
||||||
protected void pushWhenNoDuplicate(Request request, Task task) {
|
protected void pushWhenNoDuplicate(Request request, Task task) {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -26,7 +26,7 @@ public class QueueScheduler extends DuplicateRemovedScheduler implements Monitor
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public synchronized Request poll(Task task) {
|
public Request poll(Task task) {
|
||||||
return queue.poll();
|
return queue.poll();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -28,8 +28,7 @@ public class RegexSelector implements Selector {
|
||||||
}
|
}
|
||||||
// Check bracket for regex group. Add default group 1 if there is no group.
|
// Check bracket for regex group. Add default group 1 if there is no group.
|
||||||
// Only check if there exists the valid left parenthesis, leave regexp validation for Pattern.
|
// Only check if there exists the valid left parenthesis, leave regexp validation for Pattern.
|
||||||
if (StringUtils.countMatches(regexStr, "(") - StringUtils.countMatches(regexStr, "\\(") ==
|
if ( ! hasGroup(regexStr) ){
|
||||||
StringUtils.countMatches(regexStr, "(?:") - StringUtils.countMatches(regexStr, "\\(?:")) {
|
|
||||||
regexStr = "(" + regexStr + ")";
|
regexStr = "(" + regexStr + ")";
|
||||||
}
|
}
|
||||||
this.regexStr = regexStr;
|
this.regexStr = regexStr;
|
||||||
|
@ -45,6 +44,30 @@ public class RegexSelector implements Selector {
|
||||||
this(regexStr, 1);
|
this(regexStr, 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private boolean hasGroup(String regexStr) {
|
||||||
|
if (StringUtils.countMatches(regexStr, "(") - StringUtils.countMatches(regexStr, "\\(") ==
|
||||||
|
StringUtils.countMatches(regexStr, "(?:") - StringUtils.countMatches(regexStr, "\\(?:")){
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if (StringUtils.countMatches(regexStr, "(") - StringUtils.countMatches(regexStr, "\\(") ==
|
||||||
|
StringUtils.countMatches(regexStr, "(?=") - StringUtils.countMatches(regexStr, "\\(?=") ) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if (StringUtils.countMatches(regexStr, "(") - StringUtils.countMatches(regexStr, "\\(") ==
|
||||||
|
StringUtils.countMatches(regexStr, "(?<") - StringUtils.countMatches(regexStr, "\\(?<") ) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if (StringUtils.countMatches(regexStr, "(") - StringUtils.countMatches(regexStr, "\\(") ==
|
||||||
|
StringUtils.countMatches(regexStr, "(?!") - StringUtils.countMatches(regexStr, "\\(?!") ) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if (StringUtils.countMatches(regexStr, "(") - StringUtils.countMatches(regexStr, "\\(") ==
|
||||||
|
StringUtils.countMatches(regexStr, "(?#") - StringUtils.countMatches(regexStr, "\\(?#") ) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String select(String text) {
|
public String select(String text) {
|
||||||
return selectGroup(text).get(group);
|
return selectGroup(text).get(group);
|
||||||
|
|
|
@ -0,0 +1,61 @@
|
||||||
|
package us.codecraft.webmagic.utils;
|
||||||
|
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
import org.jsoup.Jsoup;
|
||||||
|
import org.jsoup.nodes.Document;
|
||||||
|
import org.jsoup.nodes.Element;
|
||||||
|
import org.jsoup.select.Elements;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.nio.charset.Charset;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author code4crafter@gmail.com
|
||||||
|
* Date: 17/3/11
|
||||||
|
* Time: 10:36
|
||||||
|
* @since 0.6.2
|
||||||
|
*/
|
||||||
|
public abstract class CharsetUtils {
|
||||||
|
|
||||||
|
private static Logger logger = LoggerFactory.getLogger(CharsetUtils.class);
|
||||||
|
|
||||||
|
public static String detectCharset(String contentType, byte[] contentBytes) throws IOException {
|
||||||
|
String charset;
|
||||||
|
// charset
|
||||||
|
// 1、encoding in http header Content-Type
|
||||||
|
charset = UrlUtils.getCharset(contentType);
|
||||||
|
if (StringUtils.isNotBlank(contentType)) {
|
||||||
|
logger.debug("Auto get charset: {}", charset);
|
||||||
|
return charset;
|
||||||
|
}
|
||||||
|
// use default charset to decode first time
|
||||||
|
Charset defaultCharset = Charset.defaultCharset();
|
||||||
|
String content = new String(contentBytes, defaultCharset);
|
||||||
|
// 2、charset in meta
|
||||||
|
if (StringUtils.isNotEmpty(content)) {
|
||||||
|
Document document = Jsoup.parse(content);
|
||||||
|
Elements links = document.select("meta");
|
||||||
|
for (Element link : links) {
|
||||||
|
// 2.1、html4.01 <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
|
||||||
|
String metaContent = link.attr("content");
|
||||||
|
String metaCharset = link.attr("charset");
|
||||||
|
if (metaContent.indexOf("charset") != -1) {
|
||||||
|
metaContent = metaContent.substring(metaContent.indexOf("charset"), metaContent.length());
|
||||||
|
charset = metaContent.split("=")[1];
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
// 2.2、html5 <meta charset="UTF-8" />
|
||||||
|
else if (StringUtils.isNotEmpty(metaCharset)) {
|
||||||
|
charset = metaCharset;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
logger.debug("Auto get charset: {}", charset);
|
||||||
|
// 3、todo use tools as cpdetector for content decode
|
||||||
|
return charset;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,25 @@
|
||||||
|
package us.codecraft.webmagic;
|
||||||
|
|
||||||
|
import org.junit.Test;
|
||||||
|
import us.codecraft.webmagic.utils.HttpConstant;
|
||||||
|
|
||||||
|
import static org.assertj.core.api.Assertions.assertThat;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author code4crafter@gmail.com
|
||||||
|
* Date: 17/3/11
|
||||||
|
*/
|
||||||
|
public class RequestTest {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testEqualsAndHashCode() throws Exception {
|
||||||
|
Request requestA = new Request("http://www.google.com/");
|
||||||
|
Request requestB = new Request("http://www.google.com/");
|
||||||
|
assertThat(requestA.hashCode()).isEqualTo(requestB.hashCode());
|
||||||
|
assertThat(requestA).isEqualTo(requestB);
|
||||||
|
requestA.setMethod(HttpConstant.Method.GET);
|
||||||
|
requestA.setMethod(HttpConstant.Method.POST);
|
||||||
|
assertThat(requestA).isNotEqualTo(requestB);
|
||||||
|
assertThat(requestA.hashCode()).isNotEqualTo(requestB.hashCode());
|
||||||
|
}
|
||||||
|
}
|
|
@ -5,13 +5,17 @@ import com.github.dreamhead.moco.Runnable;
|
||||||
import com.github.dreamhead.moco.Runner;
|
import com.github.dreamhead.moco.Runner;
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.http.client.methods.CloseableHttpResponse;
|
import org.apache.http.client.methods.CloseableHttpResponse;
|
||||||
|
import org.apache.http.client.methods.RequestBuilder;
|
||||||
import org.apache.http.impl.client.CloseableHttpClient;
|
import org.apache.http.impl.client.CloseableHttpClient;
|
||||||
|
import org.apache.http.impl.client.HttpClients;
|
||||||
|
import org.apache.http.util.EntityUtils;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
import us.codecraft.webmagic.Page;
|
import us.codecraft.webmagic.Page;
|
||||||
import us.codecraft.webmagic.Request;
|
import us.codecraft.webmagic.Request;
|
||||||
import us.codecraft.webmagic.Site;
|
import us.codecraft.webmagic.Site;
|
||||||
import us.codecraft.webmagic.Task;
|
import us.codecraft.webmagic.Task;
|
||||||
import us.codecraft.webmagic.selector.Html;
|
import us.codecraft.webmagic.selector.Html;
|
||||||
|
import us.codecraft.webmagic.utils.HttpConstant;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.UnsupportedEncodingException;
|
import java.io.UnsupportedEncodingException;
|
||||||
|
@ -103,4 +107,42 @@ public class HttpClientDownloaderTest {
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void test_selectRequestMethod() throws Exception {
|
||||||
|
HttpServer server = httpserver(12306);
|
||||||
|
server.get(eq(query("q"), "webmagic")).response("get");
|
||||||
|
server.post(eq(form("q"), "webmagic")).response("post");
|
||||||
|
server.put(eq(form("q"), "webmagic")).response("put");
|
||||||
|
server.delete(eq(query("q"), "webmagic")).response("delete");
|
||||||
|
server.request(and(by(method("HEAD")),eq(query("q"), "webmagic"))).response(header("method","head"));
|
||||||
|
server.request(and(by(method("TRACE")),eq(query("q"), "webmagic"))).response("trace");
|
||||||
|
Runner.running(server, new Runnable() {
|
||||||
|
@Override
|
||||||
|
public void run() throws Exception {
|
||||||
|
HttpClientDownloader httpClientDownloader = new HttpClientDownloader();
|
||||||
|
Request request = new Request();
|
||||||
|
request.setUrl("http://127.0.0.1:12306/search");
|
||||||
|
request.putParams("q", "webmagic");
|
||||||
|
request.setMethod(HttpConstant.Method.GET);
|
||||||
|
RequestBuilder requestBuilder = httpClientDownloader.selectRequestMethod(request).setUri(request.getUrl());
|
||||||
|
assertThat(EntityUtils.toString(HttpClients.custom().build().execute(requestBuilder.build()).getEntity())).isEqualTo("get");
|
||||||
|
request.setMethod(HttpConstant.Method.POST);
|
||||||
|
requestBuilder = httpClientDownloader.selectRequestMethod(request).setUri(request.getUrl());
|
||||||
|
assertThat(EntityUtils.toString(HttpClients.custom().build().execute(requestBuilder.build()).getEntity())).isEqualTo("post");
|
||||||
|
request.setMethod(HttpConstant.Method.PUT);
|
||||||
|
requestBuilder = httpClientDownloader.selectRequestMethod(request).setUri(request.getUrl());
|
||||||
|
assertThat(EntityUtils.toString(HttpClients.custom().build().execute(requestBuilder.build()).getEntity())).isEqualTo("put");
|
||||||
|
request.setMethod(HttpConstant.Method.DELETE);
|
||||||
|
requestBuilder = httpClientDownloader.selectRequestMethod(request).setUri(request.getUrl());
|
||||||
|
assertThat(EntityUtils.toString(HttpClients.custom().build().execute(requestBuilder.build()).getEntity())).isEqualTo("delete");
|
||||||
|
request.setMethod(HttpConstant.Method.HEAD);
|
||||||
|
requestBuilder = httpClientDownloader.selectRequestMethod(request).setUri(request.getUrl());
|
||||||
|
assertThat(HttpClients.custom().build().execute(requestBuilder.build()).getFirstHeader("method").getValue()).isEqualTo("head");
|
||||||
|
request.setMethod(HttpConstant.Method.TRACE);
|
||||||
|
requestBuilder = httpClientDownloader.selectRequestMethod(request).setUri(request.getUrl());
|
||||||
|
assertThat(EntityUtils.toString(HttpClients.custom().build().execute(requestBuilder.build()).getEntity())).isEqualTo("trace");
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,50 @@
|
||||||
|
package us.codecraft.webmagic.scheduler;
|
||||||
|
|
||||||
|
import org.junit.Test;
|
||||||
|
import org.junit.runner.RunWith;
|
||||||
|
import org.mockito.Mockito;
|
||||||
|
import org.mockito.runners.MockitoJUnitRunner;
|
||||||
|
import us.codecraft.webmagic.Request;
|
||||||
|
import us.codecraft.webmagic.Task;
|
||||||
|
import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
|
||||||
|
import us.codecraft.webmagic.utils.HttpConstant;
|
||||||
|
|
||||||
|
import static org.mockito.Matchers.any;
|
||||||
|
import static org.mockito.Mockito.times;
|
||||||
|
import static org.mockito.Mockito.verify;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author code4crafter@gmail.com
|
||||||
|
* Date: 17/3/11
|
||||||
|
* Time: 上午11:26
|
||||||
|
*/
|
||||||
|
@RunWith(MockitoJUnitRunner.class)
|
||||||
|
public class DuplicateRemovedSchedulerTest {
|
||||||
|
|
||||||
|
private DuplicateRemovedScheduler duplicateRemovedScheduler = new DuplicateRemovedScheduler() {
|
||||||
|
@Override
|
||||||
|
public Request poll(Task task) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void test_no_duplicate_removed_for_post_request() throws Exception {
|
||||||
|
DuplicateRemover duplicateRemover = Mockito.mock(DuplicateRemover.class);
|
||||||
|
duplicateRemovedScheduler.setDuplicateRemover(duplicateRemover);
|
||||||
|
Request request = new Request("https://www.google.com/");
|
||||||
|
request.setMethod(HttpConstant.Method.POST);
|
||||||
|
duplicateRemovedScheduler.push(request, null);
|
||||||
|
verify(duplicateRemover,times(0)).isDuplicate(any(Request.class),any(Task.class));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void test_duplicate_removed_for_get_request() throws Exception {
|
||||||
|
DuplicateRemover duplicateRemover = Mockito.mock(DuplicateRemover.class);
|
||||||
|
duplicateRemovedScheduler.setDuplicateRemover(duplicateRemover);
|
||||||
|
Request request = new Request("https://www.google.com/");
|
||||||
|
request.setMethod(HttpConstant.Method.GET);
|
||||||
|
duplicateRemovedScheduler.push(request, null);
|
||||||
|
verify(duplicateRemover,times(1)).isDuplicate(any(Request.class),any(Task.class));
|
||||||
|
}
|
||||||
|
}
|
|
@ -22,4 +22,20 @@ public class RegexSelectorTest {
|
||||||
String select = regexSelector.select(source);
|
String select = regexSelector.select(source);
|
||||||
Assertions.assertThat(select).isEqualTo(source);
|
Assertions.assertThat(select).isEqualTo(source);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testRegexWithZeroWidthAssertions() {
|
||||||
|
String regex = "^.*(?=\\?)";
|
||||||
|
String source = "hello world?xxxx";
|
||||||
|
RegexSelector regexSelector = new RegexSelector(regex);
|
||||||
|
String select = regexSelector.select(source);
|
||||||
|
Assertions.assertThat(select).isEqualTo("hello world");
|
||||||
|
|
||||||
|
|
||||||
|
regex = "\\d{3}(?!\\d)";
|
||||||
|
source = "123456asdf";
|
||||||
|
regexSelector = new RegexSelector(regex);
|
||||||
|
select = regexSelector.select(source);
|
||||||
|
Assertions.assertThat(select).isEqualTo("456");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -20,6 +20,9 @@ public class UrlUtilsTest {
|
||||||
absoluteUrl = UrlUtils.canonicalizeUrl("../aa", "http://www.dianping.com/sh/ss/com");
|
absoluteUrl = UrlUtils.canonicalizeUrl("../aa", "http://www.dianping.com/sh/ss/com");
|
||||||
assertThat(absoluteUrl).isEqualTo("http://www.dianping.com/sh/aa");
|
assertThat(absoluteUrl).isEqualTo("http://www.dianping.com/sh/aa");
|
||||||
|
|
||||||
|
absoluteUrl = UrlUtils.canonicalizeUrl("../mshz", "http://www.court.gov.cn/zgcpwsw/zgrmfy/");
|
||||||
|
assertThat(absoluteUrl).isEqualTo("http://www.court.gov.cn/zgcpwsw/mshz");
|
||||||
|
|
||||||
absoluteUrl = UrlUtils.canonicalizeUrl("..aa", "http://www.dianping.com/sh/ss/com");
|
absoluteUrl = UrlUtils.canonicalizeUrl("..aa", "http://www.dianping.com/sh/ss/com");
|
||||||
assertThat(absoluteUrl).isEqualTo("http://www.dianping.com/sh/ss/..aa");
|
assertThat(absoluteUrl).isEqualTo("http://www.dianping.com/sh/ss/..aa");
|
||||||
|
|
||||||
|
|
|
@ -48,11 +48,7 @@ public class RedisScheduler extends DuplicateRemovedScheduler implements Monitor
|
||||||
public boolean isDuplicate(Request request, Task task) {
|
public boolean isDuplicate(Request request, Task task) {
|
||||||
Jedis jedis = pool.getResource();
|
Jedis jedis = pool.getResource();
|
||||||
try {
|
try {
|
||||||
boolean isDuplicate = jedis.sismember(getSetKey(task), request.getUrl());
|
return jedis.sadd(getSetKey(task), request.getUrl()) > 0;
|
||||||
if (!isDuplicate) {
|
|
||||||
jedis.sadd(getSetKey(task), request.getUrl());
|
|
||||||
}
|
|
||||||
return isDuplicate;
|
|
||||||
} finally {
|
} finally {
|
||||||
pool.returnResource(jedis);
|
pool.returnResource(jedis);
|
||||||
}
|
}
|
||||||
|
|
|
@ -13,7 +13,7 @@
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.seleniumhq.selenium</groupId>
|
<groupId>org.seleniumhq.selenium</groupId>
|
||||||
<artifactId>selenium-java</artifactId>
|
<artifactId>selenium-java</artifactId>
|
||||||
<version>2.46.0</version>
|
<version>2.41.0</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
|
|
|
@ -45,7 +45,7 @@ class WebDriverPool {
|
||||||
private WebDriver mDriver = null;
|
private WebDriver mDriver = null;
|
||||||
private boolean mAutoQuitDriver = true;
|
private boolean mAutoQuitDriver = true;
|
||||||
|
|
||||||
private static final String CONFIG_FILE = "/Users/Bingo/Documents/workspace/webmagic/webmagic-selenium/config.ini";
|
private static final String DEFAULT_CONFIG_FILE = "/data/webmagic/webmagic-selenium/config.ini";
|
||||||
private static final String DRIVER_FIREFOX = "firefox";
|
private static final String DRIVER_FIREFOX = "firefox";
|
||||||
private static final String DRIVER_CHROME = "chrome";
|
private static final String DRIVER_CHROME = "chrome";
|
||||||
private static final String DRIVER_PHANTOMJS = "phantomjs";
|
private static final String DRIVER_PHANTOMJS = "phantomjs";
|
||||||
|
@ -64,7 +64,11 @@ class WebDriverPool {
|
||||||
public void configure() throws IOException {
|
public void configure() throws IOException {
|
||||||
// Read config file
|
// Read config file
|
||||||
sConfig = new Properties();
|
sConfig = new Properties();
|
||||||
sConfig.load(new FileReader(CONFIG_FILE));
|
String configFile = DEFAULT_CONFIG_FILE;
|
||||||
|
if (System.getProperty("selenuim_config")!=null){
|
||||||
|
configFile = System.getProperty("selenuim_config");
|
||||||
|
}
|
||||||
|
sConfig.load(new FileReader(configFile));
|
||||||
|
|
||||||
// Prepare capabilities
|
// Prepare capabilities
|
||||||
sCaps = new DesiredCapabilities();
|
sCaps = new DesiredCapabilities();
|
||||||
|
|
|
@ -22,7 +22,7 @@ public class HuabanProcessor implements PageProcessor {
|
||||||
public void process(Page page) {
|
public void process(Page page) {
|
||||||
page.addTargetRequests(page.getHtml().links().regex("http://huaban\\.com/.*").all());
|
page.addTargetRequests(page.getHtml().links().regex("http://huaban\\.com/.*").all());
|
||||||
if (page.getUrl().toString().contains("pins")) {
|
if (page.getUrl().toString().contains("pins")) {
|
||||||
page.putField("img", page.getHtml().xpath("//div[@id='pin_img']/a/img/@src").toString());
|
page.putField("img", page.getHtml().xpath("//div[@class='image-holder']/a/img/@src").toString());
|
||||||
} else {
|
} else {
|
||||||
page.getResultItems().setSkip(true);
|
page.getResultItems().setSkip(true);
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,11 @@
|
||||||
|
#driver=phantomjs
|
||||||
|
#driver=firefox
|
||||||
|
driver=chrome
|
||||||
|
#driver=http://localhost:8910
|
||||||
|
driver=http://localhost:4444/wd/hub
|
||||||
|
|
||||||
|
# PhantomJS specific config (change according to your installation)
|
||||||
|
#phantomjs_exec_path=/Users/detro/bin/phantomjs-qt5
|
||||||
|
phantomjs_exec_path=/Users/detro/bin/phantomjs-upstream
|
||||||
|
phantomjs_driver_path=../../src/main.js
|
||||||
|
phantomjs_driver_loglevel=DEBUG
|
Loading…
Reference in New Issue