Merge branch 'release/0.10.0'
commit
5d55bf33d2
|
@ -118,9 +118,9 @@ The architecture of webmagic (refered to [Scrapy](http://scrapy.org/))
|
|||
|
||||
There are more examples in `webmagic-samples` package.
|
||||
|
||||
### Lisence:
|
||||
### License:
|
||||
|
||||
Lisenced under [Apache 2.0 lisence](http://opensource.org/licenses/Apache-2.0)
|
||||
Licensed under [Apache 2.0 license](http://opensource.org/licenses/Apache-2.0)
|
||||
|
||||
### Thanks:
|
||||
|
||||
|
|
2
pom.xml
2
pom.xml
|
@ -1,7 +1,7 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<groupId>us.codecraft</groupId>
|
||||
<version>0.9.1</version>
|
||||
<version>0.10.0</version>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
<packaging>pom</packaging>
|
||||
<properties>
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
<skin>
|
||||
<groupId>org.apache.maven.skins</groupId>
|
||||
<artifactId>maven-fluido-skin</artifactId>
|
||||
<version>1.9</version>
|
||||
<version>1.11.1</version>
|
||||
</skin>
|
||||
<body>
|
||||
<menu ref="parent" inherit="top" />
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
<parent>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>webmagic-parent</artifactId>
|
||||
<version>0.9.1</version>
|
||||
<version>0.10.0</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
|
|
@ -49,15 +49,34 @@ public class Page {
|
|||
|
||||
private byte[] bytes;
|
||||
|
||||
private List<Request> targetRequests = new ArrayList<Request>();
|
||||
private List<Request> targetRequests = new ArrayList<>();
|
||||
|
||||
private String charset;
|
||||
|
||||
public Page() {
|
||||
}
|
||||
|
||||
public static Page fail(){
|
||||
/**
|
||||
* Returns a {@link Page} whose {@link #downloadSuccess} is {@code false}.
|
||||
* <p>
|
||||
* This overload simply calls {@link #fail(Request)} with {@code null}, so the
|
||||
* returned page has no request attached.
|
||||
*
|
||||
* @return the page.
|
||||
* @deprecated Use {@link #fail(Request)} instead.
|
||||
*/
|
||||
@Deprecated
|
||||
public static Page fail() {
|
||||
return fail(null);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a new {@link Page} whose {@link #downloadSuccess} is {@code false}
|
||||
* and whose {@link #request} is the given request.
|
||||
*
|
||||
* @param request the {@link Request} handed to {@code setRequest}; the deprecated
|
||||
*                {@link #fail()} overload passes {@code null} here.
|
||||
* @return the page.
|
||||
* @since 0.10.0
|
||||
*/
|
||||
public static Page fail(Request request){
|
||||
Page page = new Page();
|
||||
page.setRequest(request);
|
||||
page.setDownloadSuccess(false);
|
||||
return page;
|
||||
}
|
||||
|
@ -123,13 +142,7 @@ public class Page {
|
|||
* @param requests requests
|
||||
*/
|
||||
public void addTargetRequests(Iterable<String> requests) {
|
||||
for (String s : requests) {
|
||||
if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) {
|
||||
continue;
|
||||
}
|
||||
s = UrlUtils.canonicalizeUrl(s, url.toString());
|
||||
targetRequests.add(new Request(s));
|
||||
}
|
||||
addTargetRequests(requests, 0); // Default priority is 0
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -139,14 +152,33 @@ public class Page {
|
|||
* @param priority priority
|
||||
*/
|
||||
public void addTargetRequests(Iterable<String> requests, long priority) {
|
||||
for (String s : requests) {
|
||||
if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) {
|
||||
continue;
|
||||
}
|
||||
s = UrlUtils.canonicalizeUrl(s, url.toString());
|
||||
targetRequests.add(new Request(s).setPriority(priority));
|
||||
if(requests == null) {
|
||||
return;
|
||||
}
|
||||
|
||||
for (String req : requests) {
|
||||
addRequestIfValid(req, priority);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Helper method to add a request if it's valid.
|
||||
*
|
||||
* @param url URL to add
|
||||
* @param priority Priority for the URL
|
||||
*/
|
||||
private void addRequestIfValid(String url, long priority) {
|
||||
if (StringUtils.isBlank(url) || url.equals("#") || url.startsWith("javascript:")) {
|
||||
return;
|
||||
}
|
||||
|
||||
String canonicalizedUrl = UrlUtils.canonicalizeUrl(url, this.url.toString());
|
||||
Request req = new Request(canonicalizedUrl);
|
||||
if(priority > 0) {
|
||||
req.setPriority(priority);
|
||||
}
|
||||
targetRequests.add(req);
|
||||
}
|
||||
|
||||
/**
|
||||
* add url to fetch
|
||||
|
|
|
@ -36,26 +36,62 @@ public abstract class AbstractDownloader implements Downloader {
|
|||
return (Html) page.getHtml();
|
||||
}
|
||||
|
||||
/**
|
||||
* Success callback whose default body is empty; it is invoked by
|
||||
* {@link #onSuccess(Request, Task)}. Being {@code protected}, it can be
|
||||
* overridden by subclasses.
|
||||
*
|
||||
* @param request the {@link Request}.
|
||||
* @deprecated Use {@link #onSuccess(Page, Task)} instead.
|
||||
*/
|
||||
@Deprecated
|
||||
protected void onSuccess(Request request) {
|
||||
}
|
||||
|
||||
/**
|
||||
* Success callback that forwards to {@link #onSuccess(Request)}; the
|
||||
* {@code task} argument is not used by this default implementation.
|
||||
*
|
||||
* @param request the {@link Request}.
|
||||
* @param task the {@link Task}.
|
||||
* @since 0.7.6
|
||||
* @deprecated Use {@link #onSuccess(Page, Task)} instead.
|
||||
*/
|
||||
@Deprecated
|
||||
protected void onSuccess(Request request, Task task) {
|
||||
this.onSuccess(request);
|
||||
}
|
||||
|
||||
/**
|
||||
* Success callback that receives the downloaded page.
|
||||
* <p>
|
||||
* The default implementation forwards {@code page.getRequest()} and {@code task}
|
||||
* to the deprecated {@link #onSuccess(Request, Task)}, so overrides of the older
|
||||
* callbacks are still reached. {@code page} is dereferenced and must not be
|
||||
* {@code null}.
|
||||
*
|
||||
* @param page the {@link Page}.
|
||||
* @param task the {@link Task}.
|
||||
* @since 0.10.0
|
||||
*/
|
||||
protected void onSuccess(Page page, Task task) {
|
||||
this.onSuccess(page.getRequest(), task);
|
||||
}
|
||||
|
||||
/**
|
||||
* Error callback whose default body is empty; it is invoked by
|
||||
* {@link #onError(Request, Task, Throwable)}. Being {@code protected}, it can be
|
||||
* overridden by subclasses.
|
||||
*
|
||||
* @param request the {@link Request}.
|
||||
* @deprecated Use {@link #onError(Page, Task, Throwable)} instead.
|
||||
*/
|
||||
@Deprecated
|
||||
protected void onError(Request request) {
|
||||
}
|
||||
|
||||
/**
|
||||
* Error callback that forwards to {@link #onError(Request)}; the {@code task}
|
||||
* and {@code e} arguments are not used by this default implementation.
|
||||
*
|
||||
* @param request the {@link Request}.
|
||||
* @param task the {@link Task}.
|
||||
* @param e the exception.
|
||||
* @since 0.7.6
|
||||
* @deprecated Use {@link #onError(Page, Task, Throwable)} instead.
|
||||
*/
|
||||
@Deprecated
|
||||
protected void onError(Request request, Task task, Throwable e) {
|
||||
this.onError(request);
|
||||
}
|
||||
|
||||
/**
|
||||
* Error callback that receives the page associated with the failed download.
|
||||
* <p>
|
||||
* The default implementation forwards {@code page.getRequest()}, {@code task}
|
||||
* and {@code e} to the deprecated {@link #onError(Request, Task, Throwable)},
|
||||
* so overrides of the older callbacks are still reached. {@code page} is
|
||||
* dereferenced and must not be {@code null}.
|
||||
*
|
||||
* @param page the {@link Page}.
|
||||
* @param task the {@link Task}.
|
||||
* @param e the exception.
|
||||
* @since 0.10.0
|
||||
*/
|
||||
protected void onError(Page page, Task task, Throwable e) {
|
||||
this.onError(page.getRequest(), task, e);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -79,18 +79,18 @@ public class HttpClientDownloader extends AbstractDownloader {
|
|||
CloseableHttpClient httpClient = getHttpClient(task.getSite());
|
||||
Proxy proxy = proxyProvider != null ? proxyProvider.getProxy(request, task) : null;
|
||||
HttpClientRequestContext requestContext = httpUriRequestConverter.convert(request, task.getSite(), proxy);
|
||||
Page page = Page.fail();
|
||||
Page page = Page.fail(request);
|
||||
try {
|
||||
httpResponse = httpClient.execute(requestContext.getHttpUriRequest(), requestContext.getHttpClientContext());
|
||||
page = handleResponse(request, request.getCharset() != null ? request.getCharset() : task.getSite().getCharset(), httpResponse, task);
|
||||
|
||||
onSuccess(request, task);
|
||||
onSuccess(page, task);
|
||||
logger.info("downloading page success {}", request.getUrl());
|
||||
|
||||
return page;
|
||||
} catch (IOException e) {
|
||||
|
||||
onError(request, task, e);
|
||||
onError(page, task, e);
|
||||
logger.info("download page {} error", request.getUrl(), e);
|
||||
|
||||
return page;
|
||||
|
|
|
@ -3,6 +3,7 @@ package us.codecraft.webmagic.selector;
|
|||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.commons.collections4.CollectionUtils;
|
||||
|
||||
/**
|
||||
|
@ -55,11 +56,12 @@ public abstract class AbstractSelectable implements Selectable {
|
|||
|
||||
@Override
|
||||
public String get() {
|
||||
if (CollectionUtils.isNotEmpty(all())) {
|
||||
return all().get(0);
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
List<String> sourceTexts = all();
|
||||
if (CollectionUtils.isNotEmpty(sourceTexts)) {
|
||||
return sourceTexts.get(0);
|
||||
}
|
||||
return null;
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -91,8 +93,9 @@ public abstract class AbstractSelectable implements Selectable {
|
|||
}
|
||||
|
||||
public String getFirstSourceText() {
|
||||
if (getSourceTexts() != null && getSourceTexts().size() > 0) {
|
||||
return getSourceTexts().get(0);
|
||||
List<String> sourceTexts = getSourceTexts();
|
||||
if (CollectionUtils.isNotEmpty(sourceTexts)) {
|
||||
return sourceTexts.get(0);
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
@ -104,6 +107,6 @@ public abstract class AbstractSelectable implements Selectable {
|
|||
|
||||
@Override
|
||||
public boolean match() {
|
||||
return getSourceTexts() != null && getSourceTexts().size() > 0;
|
||||
return CollectionUtils.isNotEmpty(getSourceTexts());
|
||||
}
|
||||
}
|
||||
|
|
|
@ -6,12 +6,6 @@ package us.codecraft.webmagic.utils;
|
|||
public abstract class NumberUtils {
|
||||
|
||||
public static int compareLong(long o1, long o2) {
|
||||
if (o1 < o2) {
|
||||
return -1;
|
||||
} else if (o1 == o2) {
|
||||
return 0;
|
||||
} else {
|
||||
return 1;
|
||||
}
|
||||
return Long.compare(o1, o2);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -21,10 +21,10 @@ public class WMCollections {
|
|||
}
|
||||
|
||||
public static <T> List<T> newArrayList(T... t){
|
||||
List<T> set = new ArrayList<T>(t.length);
|
||||
List<T> list = new ArrayList<T>(t.length);
|
||||
for (T t1 : t) {
|
||||
set.add(t1);
|
||||
list.add(t1);
|
||||
}
|
||||
return set;
|
||||
return list;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -8,7 +8,7 @@
|
|||
<parent>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>webmagic-parent</artifactId>
|
||||
<version>0.9.1</version>
|
||||
<version>0.10.0</version>
|
||||
</parent>
|
||||
|
||||
<artifactId>webmagic-coverage</artifactId>
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
<parent>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>webmagic-parent</artifactId>
|
||||
<version>0.9.1</version>
|
||||
<version>0.10.0</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
|
|
@ -88,7 +88,7 @@ public class PhantomJSDownloader extends AbstractDownloader {
|
|||
logger.info("downloading page: " + request.getUrl());
|
||||
}
|
||||
|
||||
Page page = Page.fail();
|
||||
Page page = Page.fail(request);
|
||||
try {
|
||||
String content = getPage(request);
|
||||
if (!content.contains("HTTP request failed")) {
|
||||
|
@ -98,9 +98,9 @@ public class PhantomJSDownloader extends AbstractDownloader {
|
|||
page.setRequest(request);
|
||||
page.setStatusCode(200);
|
||||
}
|
||||
onSuccess(request, task);
|
||||
onSuccess(page, task);
|
||||
} catch (Exception e) {
|
||||
onError(request, task, e);
|
||||
onError(page, task, e);
|
||||
logger.warn("download page {} error", request.getUrl(), e);
|
||||
}
|
||||
return page;
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
<parent>
|
||||
<artifactId>webmagic-parent</artifactId>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<version>0.9.1</version>
|
||||
<version>0.10.0</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
<parent>
|
||||
<artifactId>webmagic-parent</artifactId>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<version>0.9.1</version>
|
||||
<version>0.10.0</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
<parent>
|
||||
<artifactId>webmagic-parent</artifactId>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<version>0.9.1</version>
|
||||
<version>0.10.0</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
<parent>
|
||||
<artifactId>webmagic-parent</artifactId>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<version>0.9.1</version>
|
||||
<version>0.10.0</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
|
|
@ -74,7 +74,7 @@ public class SeleniumDownloader extends AbstractDownloader implements Closeable
|
|||
public Page download(Request request, Task task) {
|
||||
checkInit();
|
||||
WebDriver webDriver = null;
|
||||
Page page = Page.fail();
|
||||
Page page = Page.fail(request);
|
||||
try {
|
||||
webDriver = webDriverPool.get();
|
||||
|
||||
|
@ -111,10 +111,10 @@ public class SeleniumDownloader extends AbstractDownloader implements Closeable
|
|||
page.setHtml(new Html(content, request.getUrl()));
|
||||
page.setUrl(new PlainText(request.getUrl()));
|
||||
page.setRequest(request);
|
||||
onSuccess(request, task);
|
||||
onSuccess(page, task);
|
||||
} catch (Exception e) {
|
||||
logger.warn("download page {} error", request.getUrl(), e);
|
||||
onError(request, task, e);
|
||||
onError(page, task, e);
|
||||
} finally {
|
||||
if (webDriver != null) {
|
||||
webDriverPool.returnToPool(webDriver);
|
||||
|
|
Loading…
Reference in New Issue