Merge branch 'release/0.10.0'
commit
5d55bf33d2
|
@ -118,9 +118,9 @@ The architecture of webmagic (referred to [Scrapy](http://scrapy.org/))
|
||||||
|
|
||||||
There are more examples in `webmagic-samples` package.
|
There are more examples in `webmagic-samples` package.
|
||||||
|
|
||||||
### Lisence:
|
### License:
|
||||||
|
|
||||||
Lisenced under [Apache 2.0 lisence](http://opensource.org/licenses/Apache-2.0)
|
Licensed under [Apache 2.0 license](http://opensource.org/licenses/Apache-2.0)
|
||||||
|
|
||||||
### Thanks:
|
### Thanks:
|
||||||
|
|
||||||
|
|
2
pom.xml
2
pom.xml
|
@ -1,7 +1,7 @@
|
||||||
<?xml version="1.0" encoding="UTF-8"?>
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<version>0.9.1</version>
|
<version>0.10.0</version>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
<packaging>pom</packaging>
|
<packaging>pom</packaging>
|
||||||
<properties>
|
<properties>
|
||||||
|
|
|
@ -5,7 +5,7 @@
|
||||||
<skin>
|
<skin>
|
||||||
<groupId>org.apache.maven.skins</groupId>
|
<groupId>org.apache.maven.skins</groupId>
|
||||||
<artifactId>maven-fluido-skin</artifactId>
|
<artifactId>maven-fluido-skin</artifactId>
|
||||||
<version>1.9</version>
|
<version>1.11.1</version>
|
||||||
</skin>
|
</skin>
|
||||||
<body>
|
<body>
|
||||||
<menu ref="parent" inherit="top" />
|
<menu ref="parent" inherit="top" />
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<artifactId>webmagic-parent</artifactId>
|
<artifactId>webmagic-parent</artifactId>
|
||||||
<version>0.9.1</version>
|
<version>0.10.0</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
|
|
@ -49,15 +49,34 @@ public class Page {
|
||||||
|
|
||||||
private byte[] bytes;
|
private byte[] bytes;
|
||||||
|
|
||||||
private List<Request> targetRequests = new ArrayList<Request>();
|
private List<Request> targetRequests = new ArrayList<>();
|
||||||
|
|
||||||
private String charset;
|
private String charset;
|
||||||
|
|
||||||
public Page() {
|
public Page() {
|
||||||
}
|
}
|
||||||
|
|
||||||
public static Page fail(){
|
/**
|
||||||
|
* Returns a {@link Page} whose {@link #downloadSuccess} is {@code false}.
|
||||||
|
*
|
||||||
|
* @return the page.
|
||||||
|
* @deprecated Use {@link #fail(Request)} instead.
|
||||||
|
*/
|
||||||
|
@Deprecated
|
||||||
|
public static Page fail() {
|
||||||
|
return fail(null);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns a {@link Page} whose {@link #downloadSuccess} is {@code false}
|
||||||
|
* and whose {@link #request} is the given request.
|
||||||
|
*
|
||||||
|
* @return the page.
|
||||||
|
* @since 0.10.0
|
||||||
|
*/
|
||||||
|
public static Page fail(Request request){
|
||||||
Page page = new Page();
|
Page page = new Page();
|
||||||
|
page.setRequest(request);
|
||||||
page.setDownloadSuccess(false);
|
page.setDownloadSuccess(false);
|
||||||
return page;
|
return page;
|
||||||
}
|
}
|
||||||
|
@ -123,13 +142,7 @@ public class Page {
|
||||||
* @param requests requests
|
* @param requests requests
|
||||||
*/
|
*/
|
||||||
public void addTargetRequests(Iterable<String> requests) {
|
public void addTargetRequests(Iterable<String> requests) {
|
||||||
for (String s : requests) {
|
addTargetRequests(requests, 0); // Default priority is 0
|
||||||
if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
s = UrlUtils.canonicalizeUrl(s, url.toString());
|
|
||||||
targetRequests.add(new Request(s));
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -139,15 +152,34 @@ public class Page {
|
||||||
* @param priority priority
|
* @param priority priority
|
||||||
*/
|
*/
|
||||||
public void addTargetRequests(Iterable<String> requests, long priority) {
|
public void addTargetRequests(Iterable<String> requests, long priority) {
|
||||||
for (String s : requests) {
|
if(requests == null) {
|
||||||
if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) {
|
return;
|
||||||
continue;
|
}
|
||||||
}
|
|
||||||
s = UrlUtils.canonicalizeUrl(s, url.toString());
|
for (String req : requests) {
|
||||||
targetRequests.add(new Request(s).setPriority(priority));
|
addRequestIfValid(req, priority);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Helper method to add a request if it's valid.
|
||||||
|
*
|
||||||
|
* @param url URL to add
|
||||||
|
* @param priority Priority for the URL
|
||||||
|
*/
|
||||||
|
private void addRequestIfValid(String url, long priority) {
|
||||||
|
if (StringUtils.isBlank(url) || url.equals("#") || url.startsWith("javascript:")) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
String canonicalizedUrl = UrlUtils.canonicalizeUrl(url, this.url.toString());
|
||||||
|
Request req = new Request(canonicalizedUrl);
|
||||||
|
if(priority > 0) {
|
||||||
|
req.setPriority(priority);
|
||||||
|
}
|
||||||
|
targetRequests.add(req);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* add url to fetch
|
* add url to fetch
|
||||||
*
|
*
|
||||||
|
|
|
@ -36,26 +36,62 @@ public abstract class AbstractDownloader implements Downloader {
|
||||||
return (Html) page.getHtml();
|
return (Html) page.getHtml();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @param request the {@link Request}.
|
||||||
|
* @deprecated Use {@link #onSuccess(Page, Task)} instead.
|
||||||
|
*/
|
||||||
@Deprecated
|
@Deprecated
|
||||||
protected void onSuccess(Request request) {
|
protected void onSuccess(Request request) {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
* @param request the {@link Request}.
|
||||||
|
* @param task the {@link Task}.
|
||||||
* @since 0.7.6
|
* @since 0.7.6
|
||||||
|
* @deprecated Use {@link #onSuccess(Page, Task)} instead.
|
||||||
*/
|
*/
|
||||||
|
@Deprecated
|
||||||
protected void onSuccess(Request request, Task task) {
|
protected void onSuccess(Request request, Task task) {
|
||||||
this.onSuccess(request);
|
this.onSuccess(request);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @param page the {@link Page}.
|
||||||
|
* @param task the {@link Task}.
|
||||||
|
* @since 0.10.0
|
||||||
|
*/
|
||||||
|
protected void onSuccess(Page page, Task task) {
|
||||||
|
this.onSuccess(page.getRequest(), task);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @param request the {@link Request}.
|
||||||
|
* @deprecated Use {@link #onError(Page, Task, Throwable)} instead.
|
||||||
|
*/
|
||||||
@Deprecated
|
@Deprecated
|
||||||
protected void onError(Request request) {
|
protected void onError(Request request) {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
* @param request the {@link Request}.
|
||||||
|
* @param task the {@link Task}.
|
||||||
|
* @param e the exception.
|
||||||
* @since 0.7.6
|
* @since 0.7.6
|
||||||
|
* @deprecated Use {@link #onError(Page, Task, Throwable)} instead.
|
||||||
*/
|
*/
|
||||||
|
@Deprecated
|
||||||
protected void onError(Request request, Task task, Throwable e) {
|
protected void onError(Request request, Task task, Throwable e) {
|
||||||
this.onError(request);
|
this.onError(request);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @param page the {@link Page}.
|
||||||
|
* @param task the {@link Task}.
|
||||||
|
* @param e the exception.
|
||||||
|
* @since 0.10.0
|
||||||
|
*/
|
||||||
|
protected void onError(Page page, Task task, Throwable e) {
|
||||||
|
this.onError(page.getRequest(), task, e);
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -79,18 +79,18 @@ public class HttpClientDownloader extends AbstractDownloader {
|
||||||
CloseableHttpClient httpClient = getHttpClient(task.getSite());
|
CloseableHttpClient httpClient = getHttpClient(task.getSite());
|
||||||
Proxy proxy = proxyProvider != null ? proxyProvider.getProxy(request, task) : null;
|
Proxy proxy = proxyProvider != null ? proxyProvider.getProxy(request, task) : null;
|
||||||
HttpClientRequestContext requestContext = httpUriRequestConverter.convert(request, task.getSite(), proxy);
|
HttpClientRequestContext requestContext = httpUriRequestConverter.convert(request, task.getSite(), proxy);
|
||||||
Page page = Page.fail();
|
Page page = Page.fail(request);
|
||||||
try {
|
try {
|
||||||
httpResponse = httpClient.execute(requestContext.getHttpUriRequest(), requestContext.getHttpClientContext());
|
httpResponse = httpClient.execute(requestContext.getHttpUriRequest(), requestContext.getHttpClientContext());
|
||||||
page = handleResponse(request, request.getCharset() != null ? request.getCharset() : task.getSite().getCharset(), httpResponse, task);
|
page = handleResponse(request, request.getCharset() != null ? request.getCharset() : task.getSite().getCharset(), httpResponse, task);
|
||||||
|
|
||||||
onSuccess(request, task);
|
onSuccess(page, task);
|
||||||
logger.info("downloading page success {}", request.getUrl());
|
logger.info("downloading page success {}", request.getUrl());
|
||||||
|
|
||||||
return page;
|
return page;
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
|
|
||||||
onError(request, task, e);
|
onError(page, task, e);
|
||||||
logger.info("download page {} error", request.getUrl(), e);
|
logger.info("download page {} error", request.getUrl(), e);
|
||||||
|
|
||||||
return page;
|
return page;
|
||||||
|
|
|
@ -3,6 +3,7 @@ package us.codecraft.webmagic.selector;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
import org.apache.commons.collections4.CollectionUtils;
|
import org.apache.commons.collections4.CollectionUtils;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -55,11 +56,12 @@ public abstract class AbstractSelectable implements Selectable {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String get() {
|
public String get() {
|
||||||
if (CollectionUtils.isNotEmpty(all())) {
|
List<String> sourceTexts = all();
|
||||||
return all().get(0);
|
if (CollectionUtils.isNotEmpty(sourceTexts)) {
|
||||||
} else {
|
return sourceTexts.get(0);
|
||||||
return null;
|
|
||||||
}
|
}
|
||||||
|
return null;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -91,8 +93,9 @@ public abstract class AbstractSelectable implements Selectable {
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getFirstSourceText() {
|
public String getFirstSourceText() {
|
||||||
if (getSourceTexts() != null && getSourceTexts().size() > 0) {
|
List<String> sourceTexts = getSourceTexts();
|
||||||
return getSourceTexts().get(0);
|
if (CollectionUtils.isNotEmpty(sourceTexts)) {
|
||||||
|
return sourceTexts.get(0);
|
||||||
}
|
}
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
@ -104,6 +107,6 @@ public abstract class AbstractSelectable implements Selectable {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean match() {
|
public boolean match() {
|
||||||
return getSourceTexts() != null && getSourceTexts().size() > 0;
|
return CollectionUtils.isNotEmpty(getSourceTexts());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -6,12 +6,6 @@ package us.codecraft.webmagic.utils;
|
||||||
public abstract class NumberUtils {
|
public abstract class NumberUtils {
|
||||||
|
|
||||||
public static int compareLong(long o1, long o2) {
|
public static int compareLong(long o1, long o2) {
|
||||||
if (o1 < o2) {
|
return Long.compare(o1, o2);
|
||||||
return -1;
|
|
||||||
} else if (o1 == o2) {
|
|
||||||
return 0;
|
|
||||||
} else {
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -21,10 +21,10 @@ public class WMCollections {
|
||||||
}
|
}
|
||||||
|
|
||||||
public static <T> List<T> newArrayList(T... t){
|
public static <T> List<T> newArrayList(T... t){
|
||||||
List<T> set = new ArrayList<T>(t.length);
|
List<T> list = new ArrayList<T>(t.length);
|
||||||
for (T t1 : t) {
|
for (T t1 : t) {
|
||||||
set.add(t1);
|
list.add(t1);
|
||||||
}
|
}
|
||||||
return set;
|
return list;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -8,7 +8,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<artifactId>webmagic-parent</artifactId>
|
<artifactId>webmagic-parent</artifactId>
|
||||||
<version>0.9.1</version>
|
<version>0.10.0</version>
|
||||||
</parent>
|
</parent>
|
||||||
|
|
||||||
<artifactId>webmagic-coverage</artifactId>
|
<artifactId>webmagic-coverage</artifactId>
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<artifactId>webmagic-parent</artifactId>
|
<artifactId>webmagic-parent</artifactId>
|
||||||
<version>0.9.1</version>
|
<version>0.10.0</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
|
|
@ -88,7 +88,7 @@ public class PhantomJSDownloader extends AbstractDownloader {
|
||||||
logger.info("downloading page: " + request.getUrl());
|
logger.info("downloading page: " + request.getUrl());
|
||||||
}
|
}
|
||||||
|
|
||||||
Page page = Page.fail();
|
Page page = Page.fail(request);
|
||||||
try {
|
try {
|
||||||
String content = getPage(request);
|
String content = getPage(request);
|
||||||
if (!content.contains("HTTP request failed")) {
|
if (!content.contains("HTTP request failed")) {
|
||||||
|
@ -98,9 +98,9 @@ public class PhantomJSDownloader extends AbstractDownloader {
|
||||||
page.setRequest(request);
|
page.setRequest(request);
|
||||||
page.setStatusCode(200);
|
page.setStatusCode(200);
|
||||||
}
|
}
|
||||||
onSuccess(request, task);
|
onSuccess(page, task);
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
onError(request, task, e);
|
onError(page, task, e);
|
||||||
logger.warn("download page {} error", request.getUrl(), e);
|
logger.warn("download page {} error", request.getUrl(), e);
|
||||||
}
|
}
|
||||||
return page;
|
return page;
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<artifactId>webmagic-parent</artifactId>
|
<artifactId>webmagic-parent</artifactId>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<version>0.9.1</version>
|
<version>0.10.0</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<artifactId>webmagic-parent</artifactId>
|
<artifactId>webmagic-parent</artifactId>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<version>0.9.1</version>
|
<version>0.10.0</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<artifactId>webmagic-parent</artifactId>
|
<artifactId>webmagic-parent</artifactId>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<version>0.9.1</version>
|
<version>0.10.0</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<artifactId>webmagic-parent</artifactId>
|
<artifactId>webmagic-parent</artifactId>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<version>0.9.1</version>
|
<version>0.10.0</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
|
|
@ -74,7 +74,7 @@ public class SeleniumDownloader extends AbstractDownloader implements Closeable
|
||||||
public Page download(Request request, Task task) {
|
public Page download(Request request, Task task) {
|
||||||
checkInit();
|
checkInit();
|
||||||
WebDriver webDriver = null;
|
WebDriver webDriver = null;
|
||||||
Page page = Page.fail();
|
Page page = Page.fail(request);
|
||||||
try {
|
try {
|
||||||
webDriver = webDriverPool.get();
|
webDriver = webDriverPool.get();
|
||||||
|
|
||||||
|
@ -111,10 +111,10 @@ public class SeleniumDownloader extends AbstractDownloader implements Closeable
|
||||||
page.setHtml(new Html(content, request.getUrl()));
|
page.setHtml(new Html(content, request.getUrl()));
|
||||||
page.setUrl(new PlainText(request.getUrl()));
|
page.setUrl(new PlainText(request.getUrl()));
|
||||||
page.setRequest(request);
|
page.setRequest(request);
|
||||||
onSuccess(request, task);
|
onSuccess(page, task);
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
logger.warn("download page {} error", request.getUrl(), e);
|
logger.warn("download page {} error", request.getUrl(), e);
|
||||||
onError(request, task, e);
|
onError(page, task, e);
|
||||||
} finally {
|
} finally {
|
||||||
if (webDriver != null) {
|
if (webDriver != null) {
|
||||||
webDriverPool.returnToPool(webDriver);
|
webDriverPool.returnToPool(webDriver);
|
||||||
|
|
Loading…
Reference in New Issue