Merge branch 'release/0.9.0'
commit
fd4a136f9a
|
@ -39,12 +39,12 @@ webmagic使用maven管理依赖,在项目中添加对应的依赖即可使用w
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<artifactId>webmagic-core</artifactId>
|
<artifactId>webmagic-core</artifactId>
|
||||||
<version>0.7.5</version>
|
<version>${webmagic.version}</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<artifactId>webmagic-extension</artifactId>
|
<artifactId>webmagic-extension</artifactId>
|
||||||
<version>0.7.5</version>
|
<version>${webmagic.version}</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
|
@ -25,12 +25,12 @@ Add dependencies to your pom.xml:
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<artifactId>webmagic-core</artifactId>
|
<artifactId>webmagic-core</artifactId>
|
||||||
<version>0.7.5</version>
|
<version>${webmagic.version}</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<artifactId>webmagic-extension</artifactId>
|
<artifactId>webmagic-extension</artifactId>
|
||||||
<version>0.7.5</version>
|
<version>${webmagic.version}</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
4
pom.xml
4
pom.xml
|
@ -1,7 +1,7 @@
|
||||||
<?xml version="1.0" encoding="UTF-8"?>
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<version>0.8.0</version>
|
<version>0.9.0</version>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
<packaging>pom</packaging>
|
<packaging>pom</packaging>
|
||||||
<properties>
|
<properties>
|
||||||
|
@ -124,7 +124,7 @@
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<artifactId>xsoup</artifactId>
|
<artifactId>xsoup</artifactId>
|
||||||
<version>0.3.6</version>
|
<version>0.3.7</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>com.alibaba</groupId>
|
<groupId>com.alibaba</groupId>
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<artifactId>webmagic-parent</artifactId>
|
<artifactId>webmagic-parent</artifactId>
|
||||||
<version>0.8.0</version>
|
<version>0.9.0</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
|
|
@ -20,7 +20,7 @@ import java.util.Map;
|
||||||
* {@link #getHtml()} get content of current page <br>
|
* {@link #getHtml()} get content of current page <br>
|
||||||
* {@link #putField(String, Object)} save extracted result <br>
|
* {@link #putField(String, Object)} save extracted result <br>
|
||||||
* {@link #getResultItems()} get extract results to be used in {@link us.codecraft.webmagic.pipeline.Pipeline}<br>
|
* {@link #getResultItems()} get extract results to be used in {@link us.codecraft.webmagic.pipeline.Pipeline}<br>
|
||||||
* {@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} add urls to fetch <br>
|
* {@link #addTargetRequests(Iterable)} {@link #addTargetRequest(String)} add urls to fetch <br>
|
||||||
*
|
*
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
* @see us.codecraft.webmagic.downloader.Downloader
|
* @see us.codecraft.webmagic.downloader.Downloader
|
||||||
|
@ -52,7 +52,7 @@ public class Page {
|
||||||
private List<Request> targetRequests = new ArrayList<Request>();
|
private List<Request> targetRequests = new ArrayList<Request>();
|
||||||
|
|
||||||
private String charset;
|
private String charset;
|
||||||
|
|
||||||
public Page() {
|
public Page() {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -108,7 +108,8 @@ public class Page {
|
||||||
* @deprecated since 0.4.0
|
* @deprecated since 0.4.0
|
||||||
* The html is parse just when first time of calling {@link #getHtml()}, so use {@link #setRawText(String)} instead.
|
* The html is parsed lazily on the first call to {@link #getHtml()}, so use {@link #setRawText(String)} instead.
|
||||||
*/
|
*/
|
||||||
public void setHtml(Html html) {
|
@Deprecated
|
||||||
|
public void setHtml(Html html) {
|
||||||
this.html = html;
|
this.html = html;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -121,7 +122,7 @@ public class Page {
|
||||||
*
|
*
|
||||||
* @param requests requests
|
* @param requests requests
|
||||||
*/
|
*/
|
||||||
public void addTargetRequests(List<String> requests) {
|
public void addTargetRequests(Iterable<String> requests) {
|
||||||
for (String s : requests) {
|
for (String s : requests) {
|
||||||
if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) {
|
if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) {
|
||||||
continue;
|
continue;
|
||||||
|
@ -137,7 +138,7 @@ public class Page {
|
||||||
* @param requests requests
|
* @param requests requests
|
||||||
* @param priority priority
|
* @param priority priority
|
||||||
*/
|
*/
|
||||||
public void addTargetRequests(List<String> requests, long priority) {
|
public void addTargetRequests(Iterable<String> requests, long priority) {
|
||||||
for (String s : requests) {
|
for (String s : requests) {
|
||||||
if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) {
|
if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) {
|
||||||
continue;
|
continue;
|
||||||
|
|
|
@ -28,6 +28,8 @@ public class Site {
|
||||||
|
|
||||||
private String charset;
|
private String charset;
|
||||||
|
|
||||||
|
private String defaultCharset;
|
||||||
|
|
||||||
private int sleepTime = 5000;
|
private int sleepTime = 5000;
|
||||||
|
|
||||||
private int retryTimes = 0;
|
private int retryTimes = 0;
|
||||||
|
@ -168,6 +170,30 @@ public class Site {
|
||||||
return charset;
|
return charset;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Set default charset of page.
|
||||||
|
*
|
||||||
|
* When charset detection fails, this default charset is used.
|
||||||
|
*
|
||||||
|
* @param defaultCharset the default charset
|
||||||
|
* @return this
|
||||||
|
* @since 0.9.0
|
||||||
|
*/
|
||||||
|
public Site setDefaultCharset(String defaultCharset) {
|
||||||
|
this.defaultCharset = defaultCharset;
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The default charset used when charset detection fails.
|
||||||
|
*
|
||||||
|
* @return the default charset
|
||||||
|
* @since 0.9.0
|
||||||
|
*/
|
||||||
|
public String getDefaultCharset() {
|
||||||
|
return defaultCharset;
|
||||||
|
}
|
||||||
|
|
||||||
public int getTimeOut() {
|
public int getTimeOut() {
|
||||||
return timeOut;
|
return timeOut;
|
||||||
}
|
}
|
||||||
|
|
|
@ -4,6 +4,7 @@ import java.io.IOException;
|
||||||
import java.nio.charset.Charset;
|
import java.nio.charset.Charset;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
import java.util.Optional;
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.http.HttpResponse;
|
import org.apache.http.HttpResponse;
|
||||||
|
@ -76,7 +77,7 @@ public class HttpClientDownloader extends AbstractDownloader {
|
||||||
}
|
}
|
||||||
CloseableHttpResponse httpResponse = null;
|
CloseableHttpResponse httpResponse = null;
|
||||||
CloseableHttpClient httpClient = getHttpClient(task.getSite());
|
CloseableHttpClient httpClient = getHttpClient(task.getSite());
|
||||||
Proxy proxy = proxyProvider != null ? proxyProvider.getProxy(task) : null;
|
Proxy proxy = proxyProvider != null ? proxyProvider.getProxy(request, task) : null;
|
||||||
HttpClientRequestContext requestContext = httpUriRequestConverter.convert(request, task.getSite(), proxy);
|
HttpClientRequestContext requestContext = httpUriRequestConverter.convert(request, task.getSite(), proxy);
|
||||||
Page page = Page.fail();
|
Page page = Page.fail();
|
||||||
try {
|
try {
|
||||||
|
@ -116,7 +117,7 @@ public class HttpClientDownloader extends AbstractDownloader {
|
||||||
page.setBytes(bytes);
|
page.setBytes(bytes);
|
||||||
if (!request.isBinaryContent()) {
|
if (!request.isBinaryContent()) {
|
||||||
if (charset == null) {
|
if (charset == null) {
|
||||||
charset = getHtmlCharset(contentType, bytes);
|
charset = getHtmlCharset(contentType, bytes, task);
|
||||||
}
|
}
|
||||||
page.setCharset(charset);
|
page.setCharset(charset);
|
||||||
page.setRawText(new String(bytes, charset));
|
page.setRawText(new String(bytes, charset));
|
||||||
|
@ -131,11 +132,11 @@ public class HttpClientDownloader extends AbstractDownloader {
|
||||||
return page;
|
return page;
|
||||||
}
|
}
|
||||||
|
|
||||||
private String getHtmlCharset(String contentType, byte[] contentBytes) throws IOException {
|
private String getHtmlCharset(String contentType, byte[] contentBytes, Task task) throws IOException {
|
||||||
String charset = CharsetUtils.detectCharset(contentType, contentBytes);
|
String charset = CharsetUtils.detectCharset(contentType, contentBytes);
|
||||||
if (charset == null) {
|
if (charset == null) {
|
||||||
charset = Charset.defaultCharset().name();
|
charset = Optional.ofNullable(task.getSite().getDefaultCharset()).orElseGet(Charset.defaultCharset()::name);
|
||||||
logger.warn("Charset autodetect failed, use {} as charset. Please specify charset in Site.setCharset()", Charset.defaultCharset());
|
logger.info("Charset autodetect failed, use {} as charset.", task.getSite().getDefaultCharset());
|
||||||
}
|
}
|
||||||
return charset;
|
return charset;
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,16 +1,5 @@
|
||||||
package us.codecraft.webmagic.downloader;
|
package us.codecraft.webmagic.downloader;
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.security.KeyManagementException;
|
|
||||||
import java.security.NoSuchAlgorithmException;
|
|
||||||
import java.security.cert.CertificateException;
|
|
||||||
import java.security.cert.X509Certificate;
|
|
||||||
import java.util.Map;
|
|
||||||
|
|
||||||
import javax.net.ssl.SSLContext;
|
|
||||||
import javax.net.ssl.TrustManager;
|
|
||||||
import javax.net.ssl.X509TrustManager;
|
|
||||||
|
|
||||||
import org.apache.commons.lang3.JavaVersion;
|
import org.apache.commons.lang3.JavaVersion;
|
||||||
import org.apache.commons.lang3.SystemUtils;
|
import org.apache.commons.lang3.SystemUtils;
|
||||||
import org.apache.http.HttpException;
|
import org.apache.http.HttpException;
|
||||||
|
@ -22,28 +11,32 @@ import org.apache.http.config.RegistryBuilder;
|
||||||
import org.apache.http.config.SocketConfig;
|
import org.apache.http.config.SocketConfig;
|
||||||
import org.apache.http.conn.socket.ConnectionSocketFactory;
|
import org.apache.http.conn.socket.ConnectionSocketFactory;
|
||||||
import org.apache.http.conn.socket.PlainConnectionSocketFactory;
|
import org.apache.http.conn.socket.PlainConnectionSocketFactory;
|
||||||
import org.apache.http.conn.ssl.DefaultHostnameVerifier;
|
|
||||||
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
|
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
|
||||||
import org.apache.http.impl.client.BasicCookieStore;
|
import org.apache.http.impl.client.*;
|
||||||
import org.apache.http.impl.client.CloseableHttpClient;
|
|
||||||
import org.apache.http.impl.client.DefaultHttpRequestRetryHandler;
|
|
||||||
import org.apache.http.impl.client.HttpClientBuilder;
|
|
||||||
import org.apache.http.impl.client.HttpClients;
|
|
||||||
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
|
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
|
||||||
import org.apache.http.impl.cookie.BasicClientCookie;
|
import org.apache.http.impl.cookie.BasicClientCookie;
|
||||||
import org.apache.http.protocol.HttpContext;
|
import org.apache.http.protocol.HttpContext;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import us.codecraft.webmagic.Site;
|
import us.codecraft.webmagic.Site;
|
||||||
|
|
||||||
|
import javax.net.ssl.SSLContext;
|
||||||
|
import javax.net.ssl.TrustManager;
|
||||||
|
import javax.net.ssl.X509TrustManager;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.security.KeyManagementException;
|
||||||
|
import java.security.NoSuchAlgorithmException;
|
||||||
|
import java.security.cert.CertificateException;
|
||||||
|
import java.security.cert.X509Certificate;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
* @since 0.4.0
|
* @since 0.4.0
|
||||||
*/
|
*/
|
||||||
public class HttpClientGenerator {
|
public class HttpClientGenerator {
|
||||||
|
|
||||||
private transient Logger logger = LoggerFactory.getLogger(getClass());
|
private transient Logger logger = LoggerFactory.getLogger(getClass());
|
||||||
|
|
||||||
private PoolingHttpClientConnectionManager connectionManager;
|
private PoolingHttpClientConnectionManager connectionManager;
|
||||||
|
|
||||||
|
@ -61,21 +54,20 @@ public class HttpClientGenerator {
|
||||||
SSLContext sslContext = createIgnoreVerifySSL();
|
SSLContext sslContext = createIgnoreVerifySSL();
|
||||||
String[] supportedProtocols;
|
String[] supportedProtocols;
|
||||||
if (SystemUtils.isJavaVersionAtLeast(JavaVersion.JAVA_11)) {
|
if (SystemUtils.isJavaVersionAtLeast(JavaVersion.JAVA_11)) {
|
||||||
supportedProtocols = new String[] { "SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2", "TLSv1.3" };
|
supportedProtocols = new String[]{"SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2", "TLSv1.3"};
|
||||||
} else {
|
} else {
|
||||||
supportedProtocols = new String[] { "SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2" };
|
supportedProtocols = new String[]{"SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2"};
|
||||||
}
|
}
|
||||||
logger.debug("supportedProtocols: {}", String.join(", ", supportedProtocols));
|
logger.debug("supportedProtocols: {}", String.join(", ", supportedProtocols));
|
||||||
return new SSLConnectionSocketFactory(sslContext, supportedProtocols,
|
return new SSLConnectionSocketFactory(sslContext, supportedProtocols,
|
||||||
null,
|
null,
|
||||||
new DefaultHostnameVerifier()); // 优先绕过安全证书
|
// Do not verify the hostname
|
||||||
} catch (KeyManagementException e) {
|
(host, sslSession) -> true); // 优先绕过安全证书
|
||||||
logger.error("ssl connection fail", e);
|
} catch (KeyManagementException | NoSuchAlgorithmException e) {
|
||||||
} catch (NoSuchAlgorithmException e) {
|
|
||||||
logger.error("ssl connection fail", e);
|
logger.error("ssl connection fail", e);
|
||||||
}
|
}
|
||||||
return SSLConnectionSocketFactory.getSocketFactory();
|
return SSLConnectionSocketFactory.getSocketFactory();
|
||||||
}
|
}
|
||||||
|
|
||||||
private SSLContext createIgnoreVerifySSL() throws NoSuchAlgorithmException, KeyManagementException {
|
private SSLContext createIgnoreVerifySSL() throws NoSuchAlgorithmException, KeyManagementException {
|
||||||
// 实现一个X509TrustManager接口,用于绕过验证,不用修改里面的方法
|
// Implement an X509TrustManager that bypasses certificate verification; its methods need no changes
|
||||||
|
@ -97,9 +89,9 @@ public class HttpClientGenerator {
|
||||||
};
|
};
|
||||||
|
|
||||||
SSLContext sc = SSLContext.getInstance("TLS");
|
SSLContext sc = SSLContext.getInstance("TLS");
|
||||||
sc.init(null, new TrustManager[] { trustManager }, null);
|
sc.init(null, new TrustManager[]{trustManager}, null);
|
||||||
return sc;
|
return sc;
|
||||||
}
|
}
|
||||||
|
|
||||||
public HttpClientGenerator setPoolSize(int poolSize) {
|
public HttpClientGenerator setPoolSize(int poolSize) {
|
||||||
connectionManager.setMaxTotal(poolSize);
|
connectionManager.setMaxTotal(poolSize);
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
package us.codecraft.webmagic.proxy;
|
package us.codecraft.webmagic.proxy;
|
||||||
|
|
||||||
import us.codecraft.webmagic.Page;
|
import us.codecraft.webmagic.Page;
|
||||||
|
import us.codecraft.webmagic.Request;
|
||||||
import us.codecraft.webmagic.Task;
|
import us.codecraft.webmagic.Task;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -23,7 +24,23 @@ public interface ProxyProvider {
|
||||||
* Get a proxy for task by some strategy.
|
* Get a proxy for task by some strategy.
|
||||||
* @param task the download task
|
* @param task the download task
|
||||||
* @return proxy
|
* @return proxy
|
||||||
|
* @deprecated Use {@link #getProxy(Request, Task)} instead.
|
||||||
*/
|
*/
|
||||||
Proxy getProxy(Task task);
|
@Deprecated
|
||||||
|
default Proxy getProxy(Task task) {
|
||||||
|
throw new UnsupportedOperationException();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns a proxy for the request.
|
||||||
|
*
|
||||||
|
* @param request the request
|
||||||
|
* @param task the download task
|
||||||
|
* @return proxy
|
||||||
|
* @since 0.9.0
|
||||||
|
*/
|
||||||
|
default Proxy getProxy(Request request, Task task) {
|
||||||
|
return this.getProxy(task);
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
package us.codecraft.webmagic.proxy;
|
package us.codecraft.webmagic.proxy;
|
||||||
|
|
||||||
import us.codecraft.webmagic.Page;
|
import us.codecraft.webmagic.Page;
|
||||||
|
import us.codecraft.webmagic.Request;
|
||||||
import us.codecraft.webmagic.Task;
|
import us.codecraft.webmagic.Task;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
@ -44,7 +45,7 @@ public class SimpleProxyProvider implements ProxyProvider {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Proxy getProxy(Task task) {
|
public Proxy getProxy(Request request, Task task) {
|
||||||
return proxies.get(incrForLoop());
|
return proxies.get(incrForLoop());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -3,6 +3,7 @@ package us.codecraft.webmagic.selector;
|
||||||
import org.jsoup.Jsoup;
|
import org.jsoup.Jsoup;
|
||||||
import org.jsoup.nodes.Document;
|
import org.jsoup.nodes.Document;
|
||||||
import org.jsoup.nodes.Element;
|
import org.jsoup.nodes.Element;
|
||||||
|
import us.codecraft.webmagic.utils.BaseSelectorUtils;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
@ -13,16 +14,9 @@ import java.util.List;
|
||||||
*/
|
*/
|
||||||
public abstract class BaseElementSelector implements Selector, ElementSelector {
|
public abstract class BaseElementSelector implements Selector, ElementSelector {
|
||||||
private Document parse(String text) {
|
private Document parse(String text) {
|
||||||
if (text == null) {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Jsoup could not parse <tr></tr> or <td></td> tag directly
|
// Jsoup could not parse <tr></tr> or <td></td> tag directly
|
||||||
// https://stackoverflow.com/questions/63607740/jsoup-couldnt-parse-tr-tag
|
// https://stackoverflow.com/questions/63607740/jsoup-couldnt-parse-tr-tag
|
||||||
if ((text.startsWith("<tr>") && text.endsWith("</tr>"))
|
text = BaseSelectorUtils.preParse(text);
|
||||||
|| (text.startsWith("<td>") && text.endsWith("</td>"))) {
|
|
||||||
text = "<table>" + text + "</table>";
|
|
||||||
}
|
|
||||||
return Jsoup.parse(text);
|
return Jsoup.parse(text);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,23 @@
|
||||||
|
package us.codecraft.webmagic.utils;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author hooy
|
||||||
|
*/
|
||||||
|
public class BaseSelectorUtils {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Jsoup/HtmlCleaner cannot parse a bare "tr" or "td" tag directly, so such fragments are wrapped in a "table" element.
|
||||||
|
* https://stackoverflow.com/questions/63607740/jsoup-couldnt-parse-tr-tag
|
||||||
|
*
|
||||||
|
* @param text - the html string
|
||||||
|
* @return text
|
||||||
|
*/
|
||||||
|
public static String preParse(String text) {
|
||||||
|
if (((text.startsWith("<tr>") || text.startsWith("<tr ")) && text.endsWith("</tr>"))
|
||||||
|
|| ((text.startsWith("<td>") || text.startsWith("<td ")) && text.endsWith("</td>"))) {
|
||||||
|
text = "<table>" + text + "</table>";
|
||||||
|
}
|
||||||
|
return text;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,17 @@
|
||||||
|
package us.codecraft.webmagic;
|
||||||
|
|
||||||
|
import static org.junit.Assert.assertEquals;
|
||||||
|
|
||||||
|
import java.nio.charset.StandardCharsets;
|
||||||
|
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
|
public class SiteTest {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void test() {
|
||||||
|
Site site = Site.me().setDefaultCharset(StandardCharsets.UTF_8.name());
|
||||||
|
assertEquals(StandardCharsets.UTF_8.name(), site.getDefaultCharset());
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -1,6 +1,9 @@
|
||||||
package us.codecraft.webmagic.proxy;
|
package us.codecraft.webmagic.proxy;
|
||||||
|
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
import org.mockito.Mockito;
|
||||||
|
|
||||||
|
import us.codecraft.webmagic.Request;
|
||||||
import us.codecraft.webmagic.Site;
|
import us.codecraft.webmagic.Site;
|
||||||
import us.codecraft.webmagic.Task;
|
import us.codecraft.webmagic.Task;
|
||||||
|
|
||||||
|
@ -20,11 +23,12 @@ public class SimpleProxyProviderTest {
|
||||||
Proxy originProxy1 = new Proxy("127.0.0.1", 1087);
|
Proxy originProxy1 = new Proxy("127.0.0.1", 1087);
|
||||||
Proxy originProxy2 = new Proxy("127.0.0.1", 1088);
|
Proxy originProxy2 = new Proxy("127.0.0.1", 1088);
|
||||||
SimpleProxyProvider proxyProvider = SimpleProxyProvider.from(originProxy1, originProxy2);
|
SimpleProxyProvider proxyProvider = SimpleProxyProvider.from(originProxy1, originProxy2);
|
||||||
Proxy proxy = proxyProvider.getProxy(TASK);
|
Request request = Mockito.mock(Request.class);
|
||||||
|
Proxy proxy = proxyProvider.getProxy(request, TASK);
|
||||||
assertThat(proxy).isEqualTo(originProxy1);
|
assertThat(proxy).isEqualTo(originProxy1);
|
||||||
proxy = proxyProvider.getProxy(TASK);
|
proxy = proxyProvider.getProxy(request, TASK);
|
||||||
assertThat(proxy).isEqualTo(originProxy2);
|
assertThat(proxy).isEqualTo(originProxy2);
|
||||||
proxy = proxyProvider.getProxy(TASK);
|
proxy = proxyProvider.getProxy(request, TASK);
|
||||||
assertThat(proxy).isEqualTo(originProxy1);
|
assertThat(proxy).isEqualTo(originProxy1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -8,7 +8,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<artifactId>webmagic-parent</artifactId>
|
<artifactId>webmagic-parent</artifactId>
|
||||||
<version>0.8.0</version>
|
<version>0.9.0</version>
|
||||||
</parent>
|
</parent>
|
||||||
|
|
||||||
<artifactId>webmagic-coverage</artifactId>
|
<artifactId>webmagic-coverage</artifactId>
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<artifactId>webmagic-parent</artifactId>
|
<artifactId>webmagic-parent</artifactId>
|
||||||
<version>0.8.0</version>
|
<version>0.9.0</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
@ -14,6 +14,11 @@
|
||||||
<groupId>redis.clients</groupId>
|
<groupId>redis.clients</groupId>
|
||||||
<artifactId>jedis</artifactId>
|
<artifactId>jedis</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.assertj</groupId>
|
||||||
|
<artifactId>assertj-core</artifactId>
|
||||||
|
<scope>test</scope>
|
||||||
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>com.google.guava</groupId>
|
<groupId>com.google.guava</groupId>
|
||||||
<artifactId>guava</artifactId>
|
<artifactId>guava</artifactId>
|
||||||
|
|
|
@ -1,21 +1,25 @@
|
||||||
package us.codecraft.webmagic.monitor;
|
package us.codecraft.webmagic.monitor;
|
||||||
|
|
||||||
import org.slf4j.Logger;
|
import java.lang.management.ManagementFactory;
|
||||||
import org.slf4j.LoggerFactory;
|
import java.util.ArrayList;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.concurrent.atomic.AtomicInteger;
|
||||||
|
|
||||||
|
import javax.management.InstanceAlreadyExistsException;
|
||||||
|
import javax.management.JMException;
|
||||||
|
import javax.management.MBeanRegistrationException;
|
||||||
|
import javax.management.MBeanServer;
|
||||||
|
import javax.management.MalformedObjectNameException;
|
||||||
|
import javax.management.NotCompliantMBeanException;
|
||||||
|
import javax.management.ObjectName;
|
||||||
|
|
||||||
import us.codecraft.webmagic.Request;
|
import us.codecraft.webmagic.Request;
|
||||||
import us.codecraft.webmagic.Spider;
|
import us.codecraft.webmagic.Spider;
|
||||||
import us.codecraft.webmagic.SpiderListener;
|
import us.codecraft.webmagic.SpiderListener;
|
||||||
import us.codecraft.webmagic.utils.Experimental;
|
import us.codecraft.webmagic.utils.Experimental;
|
||||||
import us.codecraft.webmagic.utils.UrlUtils;
|
import us.codecraft.webmagic.utils.UrlUtils;
|
||||||
|
|
||||||
import javax.management.*;
|
|
||||||
import java.lang.management.ManagementFactory;
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.Collections;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.concurrent.atomic.AtomicBoolean;
|
|
||||||
import java.util.concurrent.atomic.AtomicInteger;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @author code4crafer@gmail.com
|
* @author code4crafer@gmail.com
|
||||||
* @since 0.5.0
|
* @since 0.5.0
|
||||||
|
@ -23,17 +27,13 @@ import java.util.concurrent.atomic.AtomicInteger;
|
||||||
@Experimental
|
@Experimental
|
||||||
public class SpiderMonitor {
|
public class SpiderMonitor {
|
||||||
|
|
||||||
private static SpiderMonitor INSTANCE = new SpiderMonitor();
|
private static final SpiderMonitor INSTANCE = new SpiderMonitor();
|
||||||
|
|
||||||
private AtomicBoolean started = new AtomicBoolean(false);
|
|
||||||
|
|
||||||
private Logger logger = LoggerFactory.getLogger(getClass());
|
|
||||||
|
|
||||||
private MBeanServer mbeanServer;
|
private MBeanServer mbeanServer;
|
||||||
|
|
||||||
private String jmxServerName;
|
private String jmxServerName;
|
||||||
|
|
||||||
private List<SpiderStatusMXBean> spiderStatuses = new ArrayList<SpiderStatusMXBean>();
|
private List<SpiderStatusMXBean> spiderStatuses = new ArrayList<>();
|
||||||
|
|
||||||
protected SpiderMonitor() {
|
protected SpiderMonitor() {
|
||||||
jmxServerName = "WebMagic";
|
jmxServerName = "WebMagic";
|
||||||
|
@ -51,7 +51,7 @@ public class SpiderMonitor {
|
||||||
for (Spider spider : spiders) {
|
for (Spider spider : spiders) {
|
||||||
MonitorSpiderListener monitorSpiderListener = new MonitorSpiderListener();
|
MonitorSpiderListener monitorSpiderListener = new MonitorSpiderListener();
|
||||||
if (spider.getSpiderListeners() == null) {
|
if (spider.getSpiderListeners() == null) {
|
||||||
List<SpiderListener> spiderListeners = new ArrayList<SpiderListener>();
|
List<SpiderListener> spiderListeners = new ArrayList<>();
|
||||||
spiderListeners.add(monitorSpiderListener);
|
spiderListeners.add(monitorSpiderListener);
|
||||||
spider.setSpiderListeners(spiderListeners);
|
spider.setSpiderListeners(spiderListeners);
|
||||||
} else {
|
} else {
|
||||||
|
@ -90,7 +90,7 @@ public class SpiderMonitor {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void onError(Request request) {
|
public void onError(Request request, Exception e) {
|
||||||
errorUrls.add(request.getUrl());
|
errorUrls.add(request.getUrl());
|
||||||
errorCount.incrementAndGet();
|
errorCount.incrementAndGet();
|
||||||
}
|
}
|
||||||
|
@ -109,7 +109,6 @@ public class SpiderMonitor {
|
||||||
}
|
}
|
||||||
|
|
||||||
protected void registerMBean(SpiderStatusMXBean spiderStatus) throws MalformedObjectNameException, InstanceAlreadyExistsException, MBeanRegistrationException, NotCompliantMBeanException {
|
protected void registerMBean(SpiderStatusMXBean spiderStatus) throws MalformedObjectNameException, InstanceAlreadyExistsException, MBeanRegistrationException, NotCompliantMBeanException {
|
||||||
// ObjectName objName = new ObjectName(jmxServerName + ":name=" + spiderStatus.getName());
|
|
||||||
ObjectName objName = new ObjectName(jmxServerName + ":name=" + UrlUtils.removePort(spiderStatus.getName()));
|
ObjectName objName = new ObjectName(jmxServerName + ":name=" + UrlUtils.removePort(spiderStatus.getName()));
|
||||||
mbeanServer.registerMBean(spiderStatus, objName);
|
mbeanServer.registerMBean(spiderStatus, objName);
|
||||||
}
|
}
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<artifactId>webmagic-parent</artifactId>
|
<artifactId>webmagic-parent</artifactId>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<version>0.8.0</version>
|
<version>0.9.0</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
@ -27,22 +27,22 @@
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.mapdb</groupId>
|
<groupId>org.mapdb</groupId>
|
||||||
<artifactId>mapdb</artifactId>
|
<artifactId>mapdb</artifactId>
|
||||||
<version>3.0.8</version>
|
<version>3.0.9</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>com.fasterxml.jackson.core</groupId>
|
<groupId>com.fasterxml.jackson.core</groupId>
|
||||||
<artifactId>jackson-core</artifactId>
|
<artifactId>jackson-core</artifactId>
|
||||||
<version>2.13.0-rc1</version>
|
<version>2.15.2</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>com.fasterxml.jackson.core</groupId>
|
<groupId>com.fasterxml.jackson.core</groupId>
|
||||||
<artifactId>jackson-annotations</artifactId>
|
<artifactId>jackson-annotations</artifactId>
|
||||||
<version>2.13.0-rc1</version>
|
<version>2.15.2</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>com.fasterxml.jackson.core</groupId>
|
<groupId>com.fasterxml.jackson.core</groupId>
|
||||||
<artifactId>jackson-databind</artifactId>
|
<artifactId>jackson-databind</artifactId>
|
||||||
<version>2.13.4.2</version>
|
<version>2.15.2</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
</dependencies>
|
</dependencies>
|
||||||
|
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<artifactId>webmagic-parent</artifactId>
|
<artifactId>webmagic-parent</artifactId>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<version>0.8.0</version>
|
<version>0.9.0</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,61 @@
|
||||||
|
package us.codecraft.webmagic.selector;
|
||||||
|
|
||||||
|
import org.w3c.dom.Node;
|
||||||
|
import org.w3c.dom.NodeList;
|
||||||
|
|
||||||
|
import javax.xml.transform.OutputKeys;
|
||||||
|
import javax.xml.transform.Transformer;
|
||||||
|
import javax.xml.transform.TransformerException;
|
||||||
|
import javax.xml.transform.TransformerFactory;
|
||||||
|
import javax.xml.transform.dom.DOMSource;
|
||||||
|
import javax.xml.transform.stream.StreamResult;
|
||||||
|
import java.io.StringWriter;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author hooy
|
||||||
|
*/
|
||||||
|
public final class JaxpSelectorUtils {
|
||||||
|
|
||||||
|
private JaxpSelectorUtils() {
|
||||||
|
throw new RuntimeException("The util class cannot be instanced");
|
||||||
|
}
|
||||||
|
|
||||||
|
public static List<Node> NodeListToArrayList(NodeList nodes) {
|
||||||
|
List<Node> list = new ArrayList<>(nodes.getLength());
|
||||||
|
for (int i = 0; i < nodes.getLength(); i++) {
|
||||||
|
list.add(nodes.item(i));
|
||||||
|
}
|
||||||
|
return list;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static String nodeToString(Node node) throws TransformerException {
|
||||||
|
List<Node> before = Collections.singletonList(node);
|
||||||
|
List<String> after = nodesToStrings(before);
|
||||||
|
if (after.size() > 0) {
|
||||||
|
return after.get(0);
|
||||||
|
} else {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public static List<String> nodesToStrings(List<Node> nodes) throws TransformerException {
|
||||||
|
List<String> results = new ArrayList<>(nodes.size());
|
||||||
|
Transformer transformer = TransformerFactory.newInstance().newTransformer();
|
||||||
|
StreamResult xmlOutput = new StreamResult();
|
||||||
|
transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
|
||||||
|
for (Node node : nodes) {
|
||||||
|
if (node.getNodeType() == Node.ATTRIBUTE_NODE || node.getNodeType() == Node.TEXT_NODE) {
|
||||||
|
results.add(node.getTextContent());
|
||||||
|
} else {
|
||||||
|
xmlOutput.setWriter(new StringWriter());
|
||||||
|
transformer.transform(new DOMSource(node), xmlOutput);
|
||||||
|
results.add(xmlOutput.getWriter().toString());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return results;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,32 @@
|
||||||
|
package us.codecraft.webmagic.selector;
|
||||||
|
|
||||||
|
import org.w3c.dom.Node;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
 * Selector (extractor) that works on an html node.<br>
 *
 * @author hooy <br>
 * @since 0.8.0
 */
public interface NodeSelector {

    /**
     * Extracts one result from the node, as text.<br>
     * When more than one result matches, only the first is chosen.
     *
     * @param node node to extract from
     * @return the extracted result
     */
    String select(Node node);

    /**
     * Extracts every result from the node, as text.<br>
     *
     * @param node node to extract from
     * @return the extracted results
     */
    List<String> selectList(Node node);

}
|
|
@ -1,18 +1,10 @@
|
||||||
package us.codecraft.webmagic.selector;
|
package us.codecraft.webmagic.selector;
|
||||||
|
|
||||||
import java.io.StringWriter;
|
import java.util.*;
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.Iterator;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Map;
|
|
||||||
import java.util.concurrent.ConcurrentHashMap;
|
import java.util.concurrent.ConcurrentHashMap;
|
||||||
|
|
||||||
import javax.xml.namespace.NamespaceContext;
|
import javax.xml.namespace.NamespaceContext;
|
||||||
import javax.xml.transform.OutputKeys;
|
import javax.xml.parsers.ParserConfigurationException;
|
||||||
import javax.xml.transform.Transformer;
|
|
||||||
import javax.xml.transform.TransformerFactory;
|
|
||||||
import javax.xml.transform.dom.DOMSource;
|
|
||||||
import javax.xml.transform.stream.StreamResult;
|
|
||||||
import javax.xml.xpath.XPathConstants;
|
import javax.xml.xpath.XPathConstants;
|
||||||
import javax.xml.xpath.XPathExpression;
|
import javax.xml.xpath.XPathExpression;
|
||||||
import javax.xml.xpath.XPathExpressionException;
|
import javax.xml.xpath.XPathExpressionException;
|
||||||
|
@ -29,21 +21,24 @@ import org.w3c.dom.NodeList;
|
||||||
|
|
||||||
import net.sf.saxon.lib.NamespaceConstant;
|
import net.sf.saxon.lib.NamespaceConstant;
|
||||||
import net.sf.saxon.xpath.XPathEvaluator;
|
import net.sf.saxon.xpath.XPathEvaluator;
|
||||||
|
import us.codecraft.webmagic.utils.BaseSelectorUtils;
|
||||||
|
|
||||||
|
import static us.codecraft.webmagic.selector.JaxpSelectorUtils.*;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 支持xpath2.0的选择器。包装了HtmlCleaner和Saxon HE。<br>
|
* 支持xpath2.0的选择器。包装了HtmlCleaner和Saxon HE。<br>
|
||||||
*
|
*
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com, hooy <br>
|
||||||
* Date: 13-4-21
|
* Date: 13-4-21
|
||||||
* Time: 上午9:39
|
* Time: 上午9:39
|
||||||
*/
|
*/
|
||||||
public class Xpath2Selector implements Selector {
|
public class Xpath2Selector implements Selector, NodeSelector {
|
||||||
|
|
||||||
private String xpathStr;
|
private final String xpathStr;
|
||||||
|
|
||||||
private XPathExpression xPathExpression;
|
private XPathExpression xPathExpression;
|
||||||
|
|
||||||
private Logger logger = LoggerFactory.getLogger(getClass());
|
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||||
|
|
||||||
public Xpath2Selector(String xpathStr) {
|
public Xpath2Selector(String xpathStr) {
|
||||||
this.xpathStr = xpathStr;
|
this.xpathStr = xpathStr;
|
||||||
|
@ -54,25 +49,25 @@ public class Xpath2Selector implements Selector {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static Xpath2Selector newInstance(String xpathStr) {
|
||||||
|
return new Xpath2Selector(xpathStr);
|
||||||
|
}
|
||||||
|
|
||||||
enum XPath2NamespaceContext implements NamespaceContext {
|
enum XPath2NamespaceContext implements NamespaceContext {
|
||||||
|
|
||||||
INSTANCE;
|
INSTANCE;
|
||||||
|
|
||||||
private final Map<String, String> prefix2NamespaceMap = new ConcurrentHashMap<String, String>();
|
private final Map<String, String> prefix2NamespaceMap = new ConcurrentHashMap<>();
|
||||||
|
|
||||||
private final Map<String, List<String>> namespace2PrefixMap = new ConcurrentHashMap<String, List<String>>();
|
private final Map<String, List<String>> namespace2PrefixMap = new ConcurrentHashMap<>();
|
||||||
|
|
||||||
private void put(String prefix, String namespaceURI) {
|
private void put(String prefix, String namespaceURI) {
|
||||||
prefix2NamespaceMap.put(prefix, namespaceURI);
|
prefix2NamespaceMap.put(prefix, namespaceURI);
|
||||||
List<String> prefixes = namespace2PrefixMap.get(namespaceURI);
|
List<String> prefixes = namespace2PrefixMap.computeIfAbsent(namespaceURI, k -> new ArrayList<>());
|
||||||
if (prefixes == null) {
|
|
||||||
prefixes = new ArrayList<String>();
|
|
||||||
namespace2PrefixMap.put(namespaceURI, prefixes);
|
|
||||||
}
|
|
||||||
prefixes.add(prefix);
|
prefixes.add(prefix);
|
||||||
}
|
}
|
||||||
|
|
||||||
private XPath2NamespaceContext() {
|
XPath2NamespaceContext() {
|
||||||
put("fn", NamespaceConstant.FN);
|
put("fn", NamespaceConstant.FN);
|
||||||
put("xslt", NamespaceConstant.XSLT);
|
put("xslt", NamespaceConstant.XSLT);
|
||||||
put("xhtml", NamespaceConstant.XHTML);
|
put("xhtml", NamespaceConstant.XHTML);
|
||||||
|
@ -111,32 +106,18 @@ public class Xpath2Selector implements Selector {
|
||||||
@Override
|
@Override
|
||||||
public String select(String text) {
|
public String select(String text) {
|
||||||
try {
|
try {
|
||||||
HtmlCleaner htmlCleaner = new HtmlCleaner();
|
Document doc = parse(text);
|
||||||
TagNode tagNode = htmlCleaner.clean(text);
|
return select(doc);
|
||||||
Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode);
|
} catch (Exception e) {
|
||||||
Object result;
|
logger.error("select text error! " + xpathStr, e);
|
||||||
try {
|
}
|
||||||
result = xPathExpression.evaluate(document, XPathConstants.NODESET);
|
return null;
|
||||||
} catch (XPathExpressionException e) {
|
}
|
||||||
result = xPathExpression.evaluate(document, XPathConstants.STRING);
|
|
||||||
}
|
@Override
|
||||||
if (result instanceof NodeList) {
|
public String select(Node node) {
|
||||||
NodeList nodeList = (NodeList) result;
|
try {
|
||||||
if (nodeList.getLength() == 0) {
|
return (String) xPathExpression.evaluate(node, XPathConstants.STRING);
|
||||||
return null;
|
|
||||||
}
|
|
||||||
Node item = nodeList.item(0);
|
|
||||||
if (item.getNodeType() == Node.ATTRIBUTE_NODE || item.getNodeType() == Node.TEXT_NODE) {
|
|
||||||
return item.getTextContent();
|
|
||||||
} else {
|
|
||||||
StreamResult xmlOutput = new StreamResult(new StringWriter());
|
|
||||||
Transformer transformer = TransformerFactory.newInstance().newTransformer();
|
|
||||||
transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
|
|
||||||
transformer.transform(new DOMSource(item), xmlOutput);
|
|
||||||
return xmlOutput.getWriter().toString();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return result.toString();
|
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
logger.error("select text error! " + xpathStr, e);
|
logger.error("select text error! " + xpathStr, e);
|
||||||
}
|
}
|
||||||
|
@ -145,38 +126,72 @@ public class Xpath2Selector implements Selector {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public List<String> selectList(String text) {
|
public List<String> selectList(String text) {
|
||||||
List<String> results = new ArrayList<String>();
|
|
||||||
try {
|
try {
|
||||||
HtmlCleaner htmlCleaner = new HtmlCleaner();
|
Document doc = parse(text);
|
||||||
TagNode tagNode = htmlCleaner.clean(text);
|
return selectList(doc);
|
||||||
Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode);
|
|
||||||
Object result;
|
|
||||||
try {
|
|
||||||
result = xPathExpression.evaluate(document, XPathConstants.NODESET);
|
|
||||||
} catch (XPathExpressionException e) {
|
|
||||||
result = xPathExpression.evaluate(document, XPathConstants.STRING);
|
|
||||||
}
|
|
||||||
if (result instanceof NodeList) {
|
|
||||||
NodeList nodeList = (NodeList) result;
|
|
||||||
Transformer transformer = TransformerFactory.newInstance().newTransformer();
|
|
||||||
StreamResult xmlOutput = new StreamResult();
|
|
||||||
transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
|
|
||||||
for (int i = 0; i < nodeList.getLength(); i++) {
|
|
||||||
Node item = nodeList.item(i);
|
|
||||||
if (item.getNodeType() == Node.ATTRIBUTE_NODE || item.getNodeType() == Node.TEXT_NODE) {
|
|
||||||
results.add(item.getTextContent());
|
|
||||||
} else {
|
|
||||||
xmlOutput.setWriter(new StringWriter());
|
|
||||||
transformer.transform(new DOMSource(item), xmlOutput);
|
|
||||||
results.add(xmlOutput.getWriter().toString());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
results.add(result.toString());
|
|
||||||
}
|
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
logger.error("select text error! " + xpathStr, e);
|
logger.error("select text error! " + xpathStr, e);
|
||||||
}
|
}
|
||||||
return results;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public List<String> selectList(Node node) {
|
||||||
|
try {
|
||||||
|
NodeList result = (NodeList) xPathExpression.evaluate(node, XPathConstants.NODESET);
|
||||||
|
List<Node> nodes = NodeListToArrayList(result);
|
||||||
|
return nodesToStrings(nodes);
|
||||||
|
} catch (Exception e) {
|
||||||
|
logger.error("select text error! " + xpathStr, e);
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Node selectNode(String text) {
|
||||||
|
try {
|
||||||
|
Document doc = parse(text);
|
||||||
|
return selectNode(doc);
|
||||||
|
} catch (Exception e) {
|
||||||
|
logger.error("select text error! " + xpathStr, e);
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Node selectNode(Node node) {
|
||||||
|
try {
|
||||||
|
return (Node) xPathExpression.evaluate(node, XPathConstants.NODE);
|
||||||
|
} catch (Exception e) {
|
||||||
|
logger.error("select text error! " + xpathStr, e);
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<Node> selectNodes(String text) {
|
||||||
|
try {
|
||||||
|
Document doc = parse(text);
|
||||||
|
return selectNodes(doc);
|
||||||
|
} catch (Exception e) {
|
||||||
|
logger.error("select text error! " + xpathStr, e);
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<Node> selectNodes(Node node) {
|
||||||
|
try {
|
||||||
|
NodeList result = (NodeList) xPathExpression.evaluate(node, XPathConstants.NODESET);
|
||||||
|
return NodeListToArrayList(result);
|
||||||
|
} catch (Exception e) {
|
||||||
|
logger.error("select text error! " + xpathStr, e);
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
protected static Document parse(String text) throws ParserConfigurationException {
|
||||||
|
// HtmlCleaner could not parse <tr></tr> or <td></td> tag directly
|
||||||
|
text = BaseSelectorUtils.preParse(text);
|
||||||
|
HtmlCleaner htmlCleaner = new HtmlCleaner();
|
||||||
|
TagNode tagNode = htmlCleaner.clean(text);
|
||||||
|
return new DomSerializer(new CleanerProperties()).createDOM(tagNode);
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
File diff suppressed because one or more lines are too long
|
@ -3,7 +3,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<artifactId>webmagic-parent</artifactId>
|
<artifactId>webmagic-parent</artifactId>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<version>0.8.0</version>
|
<version>0.9.0</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<artifactId>webmagic-parent</artifactId>
|
<artifactId>webmagic-parent</artifactId>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<version>0.8.0</version>
|
<version>0.9.0</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue