Merge branch 'release/0.9.0'
commit
fd4a136f9a
|
@ -39,12 +39,12 @@ webmagic使用maven管理依赖,在项目中添加对应的依赖即可使用w
|
|||
<dependency>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>webmagic-core</artifactId>
|
||||
<version>0.7.5</version>
|
||||
<version>${webmagic.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>webmagic-extension</artifactId>
|
||||
<version>0.7.5</version>
|
||||
<version>${webmagic.version}</version>
|
||||
</dependency>
|
||||
```
|
||||
|
||||
|
|
|
@ -25,12 +25,12 @@ Add dependencies to your pom.xml:
|
|||
<dependency>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>webmagic-core</artifactId>
|
||||
<version>0.7.5</version>
|
||||
<version>${webmagic.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>webmagic-extension</artifactId>
|
||||
<version>0.7.5</version>
|
||||
<version>${webmagic.version}</version>
|
||||
</dependency>
|
||||
```
|
||||
|
||||
|
|
4
pom.xml
4
pom.xml
|
@ -1,7 +1,7 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<groupId>us.codecraft</groupId>
|
||||
<version>0.8.0</version>
|
||||
<version>0.9.0</version>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
<packaging>pom</packaging>
|
||||
<properties>
|
||||
|
@ -124,7 +124,7 @@
|
|||
<dependency>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>xsoup</artifactId>
|
||||
<version>0.3.6</version>
|
||||
<version>0.3.7</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.alibaba</groupId>
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
<parent>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>webmagic-parent</artifactId>
|
||||
<version>0.8.0</version>
|
||||
<version>0.9.0</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
|
|
@ -20,7 +20,7 @@ import java.util.Map;
|
|||
* {@link #getHtml()} get content of current page <br>
|
||||
* {@link #putField(String, Object)} save extracted result <br>
|
||||
* {@link #getResultItems()} get extract results to be used in {@link us.codecraft.webmagic.pipeline.Pipeline}<br>
|
||||
* {@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} add urls to fetch <br>
|
||||
* {@link #addTargetRequests(Iterable)} {@link #addTargetRequest(String)} add urls to fetch <br>
|
||||
*
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* @see us.codecraft.webmagic.downloader.Downloader
|
||||
|
@ -52,7 +52,7 @@ public class Page {
|
|||
private List<Request> targetRequests = new ArrayList<Request>();
|
||||
|
||||
private String charset;
|
||||
|
||||
|
||||
public Page() {
|
||||
}
|
||||
|
||||
|
@ -108,7 +108,8 @@ public class Page {
|
|||
* @deprecated since 0.4.0
|
||||
* The html is parsed lazily on the first call to {@link #getHtml()}, so use {@link #setRawText(String)} instead.
|
||||
*/
|
||||
public void setHtml(Html html) {
|
||||
@Deprecated
|
||||
public void setHtml(Html html) {
|
||||
this.html = html;
|
||||
}
|
||||
|
||||
|
@ -121,7 +122,7 @@ public class Page {
|
|||
*
|
||||
* @param requests requests
|
||||
*/
|
||||
public void addTargetRequests(List<String> requests) {
|
||||
public void addTargetRequests(Iterable<String> requests) {
|
||||
for (String s : requests) {
|
||||
if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) {
|
||||
continue;
|
||||
|
@ -137,7 +138,7 @@ public class Page {
|
|||
* @param requests requests
|
||||
* @param priority priority
|
||||
*/
|
||||
public void addTargetRequests(List<String> requests, long priority) {
|
||||
public void addTargetRequests(Iterable<String> requests, long priority) {
|
||||
for (String s : requests) {
|
||||
if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) {
|
||||
continue;
|
||||
|
|
|
@ -28,6 +28,8 @@ public class Site {
|
|||
|
||||
private String charset;
|
||||
|
||||
private String defaultCharset;
|
||||
|
||||
private int sleepTime = 5000;
|
||||
|
||||
private int retryTimes = 0;
|
||||
|
@ -168,6 +170,30 @@ public class Site {
|
|||
return charset;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set default charset of page.
|
||||
*
|
||||
* This charset is used when charset detection fails.
|
||||
*
|
||||
* @param defaultCharset the default charset
|
||||
* @return this
|
||||
* @since 0.9.0
|
||||
*/
|
||||
public Site setDefaultCharset(String defaultCharset) {
|
||||
this.defaultCharset = defaultCharset;
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the default charset that is used when charset detection fails.
|
||||
*
|
||||
* @return the default charset
|
||||
* @since 0.9.0
|
||||
*/
|
||||
public String getDefaultCharset() {
|
||||
return defaultCharset;
|
||||
}
|
||||
|
||||
public int getTimeOut() {
|
||||
return timeOut;
|
||||
}
|
||||
|
|
|
@ -4,6 +4,7 @@ import java.io.IOException;
|
|||
import java.nio.charset.Charset;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.http.HttpResponse;
|
||||
|
@ -76,7 +77,7 @@ public class HttpClientDownloader extends AbstractDownloader {
|
|||
}
|
||||
CloseableHttpResponse httpResponse = null;
|
||||
CloseableHttpClient httpClient = getHttpClient(task.getSite());
|
||||
Proxy proxy = proxyProvider != null ? proxyProvider.getProxy(task) : null;
|
||||
Proxy proxy = proxyProvider != null ? proxyProvider.getProxy(request, task) : null;
|
||||
HttpClientRequestContext requestContext = httpUriRequestConverter.convert(request, task.getSite(), proxy);
|
||||
Page page = Page.fail();
|
||||
try {
|
||||
|
@ -116,7 +117,7 @@ public class HttpClientDownloader extends AbstractDownloader {
|
|||
page.setBytes(bytes);
|
||||
if (!request.isBinaryContent()) {
|
||||
if (charset == null) {
|
||||
charset = getHtmlCharset(contentType, bytes);
|
||||
charset = getHtmlCharset(contentType, bytes, task);
|
||||
}
|
||||
page.setCharset(charset);
|
||||
page.setRawText(new String(bytes, charset));
|
||||
|
@ -131,11 +132,11 @@ public class HttpClientDownloader extends AbstractDownloader {
|
|||
return page;
|
||||
}
|
||||
|
||||
private String getHtmlCharset(String contentType, byte[] contentBytes) throws IOException {
|
||||
private String getHtmlCharset(String contentType, byte[] contentBytes, Task task) throws IOException {
|
||||
String charset = CharsetUtils.detectCharset(contentType, contentBytes);
|
||||
if (charset == null) {
|
||||
charset = Charset.defaultCharset().name();
|
||||
logger.warn("Charset autodetect failed, use {} as charset. Please specify charset in Site.setCharset()", Charset.defaultCharset());
|
||||
charset = Optional.ofNullable(task.getSite().getDefaultCharset()).orElseGet(Charset.defaultCharset()::name);
|
||||
logger.info("Charset autodetect failed, use {} as charset.", task.getSite().getDefaultCharset());
|
||||
}
|
||||
return charset;
|
||||
}
|
||||
|
|
|
@ -1,16 +1,5 @@
|
|||
package us.codecraft.webmagic.downloader;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.security.KeyManagementException;
|
||||
import java.security.NoSuchAlgorithmException;
|
||||
import java.security.cert.CertificateException;
|
||||
import java.security.cert.X509Certificate;
|
||||
import java.util.Map;
|
||||
|
||||
import javax.net.ssl.SSLContext;
|
||||
import javax.net.ssl.TrustManager;
|
||||
import javax.net.ssl.X509TrustManager;
|
||||
|
||||
import org.apache.commons.lang3.JavaVersion;
|
||||
import org.apache.commons.lang3.SystemUtils;
|
||||
import org.apache.http.HttpException;
|
||||
|
@ -22,28 +11,32 @@ import org.apache.http.config.RegistryBuilder;
|
|||
import org.apache.http.config.SocketConfig;
|
||||
import org.apache.http.conn.socket.ConnectionSocketFactory;
|
||||
import org.apache.http.conn.socket.PlainConnectionSocketFactory;
|
||||
import org.apache.http.conn.ssl.DefaultHostnameVerifier;
|
||||
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
|
||||
import org.apache.http.impl.client.BasicCookieStore;
|
||||
import org.apache.http.impl.client.CloseableHttpClient;
|
||||
import org.apache.http.impl.client.DefaultHttpRequestRetryHandler;
|
||||
import org.apache.http.impl.client.HttpClientBuilder;
|
||||
import org.apache.http.impl.client.HttpClients;
|
||||
import org.apache.http.impl.client.*;
|
||||
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
|
||||
import org.apache.http.impl.cookie.BasicClientCookie;
|
||||
import org.apache.http.protocol.HttpContext;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import us.codecraft.webmagic.Site;
|
||||
|
||||
import javax.net.ssl.SSLContext;
|
||||
import javax.net.ssl.TrustManager;
|
||||
import javax.net.ssl.X509TrustManager;
|
||||
import java.io.IOException;
|
||||
import java.security.KeyManagementException;
|
||||
import java.security.NoSuchAlgorithmException;
|
||||
import java.security.cert.CertificateException;
|
||||
import java.security.cert.X509Certificate;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* @since 0.4.0
|
||||
*/
|
||||
public class HttpClientGenerator {
|
||||
|
||||
private transient Logger logger = LoggerFactory.getLogger(getClass());
|
||||
private transient Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
private PoolingHttpClientConnectionManager connectionManager;
|
||||
|
||||
|
@ -61,21 +54,20 @@ public class HttpClientGenerator {
|
|||
SSLContext sslContext = createIgnoreVerifySSL();
|
||||
String[] supportedProtocols;
|
||||
if (SystemUtils.isJavaVersionAtLeast(JavaVersion.JAVA_11)) {
|
||||
supportedProtocols = new String[] { "SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2", "TLSv1.3" };
|
||||
supportedProtocols = new String[]{"SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2", "TLSv1.3"};
|
||||
} else {
|
||||
supportedProtocols = new String[] { "SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2" };
|
||||
supportedProtocols = new String[]{"SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2"};
|
||||
}
|
||||
logger.debug("supportedProtocols: {}", String.join(", ", supportedProtocols));
|
||||
return new SSLConnectionSocketFactory(sslContext, supportedProtocols,
|
||||
null,
|
||||
new DefaultHostnameVerifier()); // 优先绕过安全证书
|
||||
} catch (KeyManagementException e) {
|
||||
logger.error("ssl connection fail", e);
|
||||
} catch (NoSuchAlgorithmException e) {
|
||||
// Skip hostname verification
|
||||
(host, sslSession) -> true); // 优先绕过安全证书
|
||||
} catch (KeyManagementException | NoSuchAlgorithmException e) {
|
||||
logger.error("ssl connection fail", e);
|
||||
}
|
||||
return SSLConnectionSocketFactory.getSocketFactory();
|
||||
}
|
||||
}
|
||||
|
||||
private SSLContext createIgnoreVerifySSL() throws NoSuchAlgorithmException, KeyManagementException {
|
||||
// Implement the X509TrustManager interface to bypass certificate verification; its methods need no changes
|
||||
|
@ -97,9 +89,9 @@ public class HttpClientGenerator {
|
|||
};
|
||||
|
||||
SSLContext sc = SSLContext.getInstance("TLS");
|
||||
sc.init(null, new TrustManager[] { trustManager }, null);
|
||||
sc.init(null, new TrustManager[]{trustManager}, null);
|
||||
return sc;
|
||||
}
|
||||
}
|
||||
|
||||
public HttpClientGenerator setPoolSize(int poolSize) {
|
||||
connectionManager.setMaxTotal(poolSize);
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
package us.codecraft.webmagic.proxy;
|
||||
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.Request;
|
||||
import us.codecraft.webmagic.Task;
|
||||
|
||||
/**
|
||||
|
@ -23,7 +24,23 @@ public interface ProxyProvider {
|
|||
* Get a proxy for task by some strategy.
|
||||
* @param task the download task
|
||||
* @return proxy
|
||||
* @deprecated Use {@link #getProxy(Request, Task)} instead.
|
||||
*/
|
||||
Proxy getProxy(Task task);
|
||||
@Deprecated
|
||||
default Proxy getProxy(Task task) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a proxy for the request.
|
||||
*
|
||||
* @param request the request
|
||||
* @param task the download task
|
||||
* @return proxy
|
||||
* @since 0.9.0
|
||||
*/
|
||||
default Proxy getProxy(Request request, Task task) {
|
||||
return this.getProxy(task);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
package us.codecraft.webmagic.proxy;
|
||||
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.Request;
|
||||
import us.codecraft.webmagic.Task;
|
||||
|
||||
import java.util.ArrayList;
|
||||
|
@ -44,7 +45,7 @@ public class SimpleProxyProvider implements ProxyProvider {
|
|||
}
|
||||
|
||||
@Override
|
||||
public Proxy getProxy(Task task) {
|
||||
public Proxy getProxy(Request request, Task task) {
|
||||
return proxies.get(incrForLoop());
|
||||
}
|
||||
|
||||
|
|
|
@ -3,6 +3,7 @@ package us.codecraft.webmagic.selector;
|
|||
import org.jsoup.Jsoup;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.jsoup.nodes.Element;
|
||||
import us.codecraft.webmagic.utils.BaseSelectorUtils;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
@ -13,16 +14,9 @@ import java.util.List;
|
|||
*/
|
||||
public abstract class BaseElementSelector implements Selector, ElementSelector {
|
||||
private Document parse(String text) {
|
||||
if (text == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
// Jsoup could not parse <tr></tr> or <td></td> tag directly
|
||||
// https://stackoverflow.com/questions/63607740/jsoup-couldnt-parse-tr-tag
|
||||
if ((text.startsWith("<tr>") && text.endsWith("</tr>"))
|
||||
|| (text.startsWith("<td>") && text.endsWith("</td>"))) {
|
||||
text = "<table>" + text + "</table>";
|
||||
}
|
||||
text = BaseSelectorUtils.preParse(text);
|
||||
return Jsoup.parse(text);
|
||||
}
|
||||
|
||||
|
|
|
@ -0,0 +1,23 @@
|
|||
package us.codecraft.webmagic.utils;
|
||||
|
||||
/**
 * Helpers shared by the HTML selectors for preparing markup before it is parsed.
 *
 * @author hooy
 */
public class BaseSelectorUtils {

    /**
     * Wraps a bare table-row or table-cell fragment in a {@code <table>} element.
     * <p>
     * Jsoup/HtmlCleaner could not parse "tr" or "td" tag directly
     * https://stackoverflow.com/questions/63607740/jsoup-couldnt-parse-tr-tag
     *
     * @param text - the html string, may be {@code null}
     * @return the fragment wrapped in {@code <table>...</table>} when it starts with a
     *         {@code tr}/{@code td} opening tag and ends with the matching closing tag;
     *         otherwise the text unchanged ({@code null} stays {@code null})
     */
    public static String preParse(String text) {
        // Callers such as the XPath selector pass the text through without a null check.
        if (text == null) {
            return null;
        }
        if (isBareFragment(text, "tr") || isBareFragment(text, "td")) {
            return "<table>" + text + "</table>";
        }
        return text;
    }

    /**
     * Tells whether the text opens with the given tag (with or without attributes)
     * and closes with the matching end tag.
     */
    private static boolean isBareFragment(String text, String tag) {
        boolean opens = text.startsWith("<" + tag + ">") || text.startsWith("<" + tag + " ");
        return opens && text.endsWith("</" + tag + ">");
    }

}
|
|
@ -0,0 +1,17 @@
|
|||
package us.codecraft.webmagic;
|
||||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
|
||||
import java.nio.charset.StandardCharsets;
|
||||
|
||||
import org.junit.Test;
|
||||
|
||||
public class SiteTest {
|
||||
|
||||
@Test
|
||||
public void test() {
|
||||
Site site = Site.me().setDefaultCharset(StandardCharsets.UTF_8.name());
|
||||
assertEquals(StandardCharsets.UTF_8.name(), site.getDefaultCharset());
|
||||
}
|
||||
|
||||
}
|
|
@ -1,6 +1,9 @@
|
|||
package us.codecraft.webmagic.proxy;
|
||||
|
||||
import org.junit.Test;
|
||||
import org.mockito.Mockito;
|
||||
|
||||
import us.codecraft.webmagic.Request;
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.Task;
|
||||
|
||||
|
@ -20,11 +23,12 @@ public class SimpleProxyProviderTest {
|
|||
Proxy originProxy1 = new Proxy("127.0.0.1", 1087);
|
||||
Proxy originProxy2 = new Proxy("127.0.0.1", 1088);
|
||||
SimpleProxyProvider proxyProvider = SimpleProxyProvider.from(originProxy1, originProxy2);
|
||||
Proxy proxy = proxyProvider.getProxy(TASK);
|
||||
Request request = Mockito.mock(Request.class);
|
||||
Proxy proxy = proxyProvider.getProxy(request, TASK);
|
||||
assertThat(proxy).isEqualTo(originProxy1);
|
||||
proxy = proxyProvider.getProxy(TASK);
|
||||
proxy = proxyProvider.getProxy(request, TASK);
|
||||
assertThat(proxy).isEqualTo(originProxy2);
|
||||
proxy = proxyProvider.getProxy(TASK);
|
||||
proxy = proxyProvider.getProxy(request, TASK);
|
||||
assertThat(proxy).isEqualTo(originProxy1);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -8,7 +8,7 @@
|
|||
<parent>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>webmagic-parent</artifactId>
|
||||
<version>0.8.0</version>
|
||||
<version>0.9.0</version>
|
||||
</parent>
|
||||
|
||||
<artifactId>webmagic-coverage</artifactId>
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
<parent>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>webmagic-parent</artifactId>
|
||||
<version>0.8.0</version>
|
||||
<version>0.9.0</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
@ -14,6 +14,11 @@
|
|||
<groupId>redis.clients</groupId>
|
||||
<artifactId>jedis</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.assertj</groupId>
|
||||
<artifactId>assertj-core</artifactId>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.google.guava</groupId>
|
||||
<artifactId>guava</artifactId>
|
||||
|
|
|
@ -1,21 +1,25 @@
|
|||
package us.codecraft.webmagic.monitor;
|
||||
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import java.lang.management.ManagementFactory;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
|
||||
import javax.management.InstanceAlreadyExistsException;
|
||||
import javax.management.JMException;
|
||||
import javax.management.MBeanRegistrationException;
|
||||
import javax.management.MBeanServer;
|
||||
import javax.management.MalformedObjectNameException;
|
||||
import javax.management.NotCompliantMBeanException;
|
||||
import javax.management.ObjectName;
|
||||
|
||||
import us.codecraft.webmagic.Request;
|
||||
import us.codecraft.webmagic.Spider;
|
||||
import us.codecraft.webmagic.SpiderListener;
|
||||
import us.codecraft.webmagic.utils.Experimental;
|
||||
import us.codecraft.webmagic.utils.UrlUtils;
|
||||
|
||||
import javax.management.*;
|
||||
import java.lang.management.ManagementFactory;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.atomic.AtomicBoolean;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com
|
||||
* @since 0.5.0
|
||||
|
@ -23,17 +27,13 @@ import java.util.concurrent.atomic.AtomicInteger;
|
|||
@Experimental
|
||||
public class SpiderMonitor {
|
||||
|
||||
private static SpiderMonitor INSTANCE = new SpiderMonitor();
|
||||
|
||||
private AtomicBoolean started = new AtomicBoolean(false);
|
||||
|
||||
private Logger logger = LoggerFactory.getLogger(getClass());
|
||||
private static final SpiderMonitor INSTANCE = new SpiderMonitor();
|
||||
|
||||
private MBeanServer mbeanServer;
|
||||
|
||||
private String jmxServerName;
|
||||
|
||||
private List<SpiderStatusMXBean> spiderStatuses = new ArrayList<SpiderStatusMXBean>();
|
||||
private List<SpiderStatusMXBean> spiderStatuses = new ArrayList<>();
|
||||
|
||||
protected SpiderMonitor() {
|
||||
jmxServerName = "WebMagic";
|
||||
|
@ -51,7 +51,7 @@ public class SpiderMonitor {
|
|||
for (Spider spider : spiders) {
|
||||
MonitorSpiderListener monitorSpiderListener = new MonitorSpiderListener();
|
||||
if (spider.getSpiderListeners() == null) {
|
||||
List<SpiderListener> spiderListeners = new ArrayList<SpiderListener>();
|
||||
List<SpiderListener> spiderListeners = new ArrayList<>();
|
||||
spiderListeners.add(monitorSpiderListener);
|
||||
spider.setSpiderListeners(spiderListeners);
|
||||
} else {
|
||||
|
@ -90,7 +90,7 @@ public class SpiderMonitor {
|
|||
}
|
||||
|
||||
@Override
|
||||
public void onError(Request request) {
|
||||
public void onError(Request request, Exception e) {
|
||||
errorUrls.add(request.getUrl());
|
||||
errorCount.incrementAndGet();
|
||||
}
|
||||
|
@ -109,7 +109,6 @@ public class SpiderMonitor {
|
|||
}
|
||||
|
||||
protected void registerMBean(SpiderStatusMXBean spiderStatus) throws MalformedObjectNameException, InstanceAlreadyExistsException, MBeanRegistrationException, NotCompliantMBeanException {
|
||||
// ObjectName objName = new ObjectName(jmxServerName + ":name=" + spiderStatus.getName());
|
||||
ObjectName objName = new ObjectName(jmxServerName + ":name=" + UrlUtils.removePort(spiderStatus.getName()));
|
||||
mbeanServer.registerMBean(spiderStatus, objName);
|
||||
}
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
<parent>
|
||||
<artifactId>webmagic-parent</artifactId>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<version>0.8.0</version>
|
||||
<version>0.9.0</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
@ -27,22 +27,22 @@
|
|||
<dependency>
|
||||
<groupId>org.mapdb</groupId>
|
||||
<artifactId>mapdb</artifactId>
|
||||
<version>3.0.8</version>
|
||||
<version>3.0.9</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.fasterxml.jackson.core</groupId>
|
||||
<artifactId>jackson-core</artifactId>
|
||||
<version>2.13.0-rc1</version>
|
||||
<version>2.15.2</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.fasterxml.jackson.core</groupId>
|
||||
<artifactId>jackson-annotations</artifactId>
|
||||
<version>2.13.0-rc1</version>
|
||||
<version>2.15.2</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.fasterxml.jackson.core</groupId>
|
||||
<artifactId>jackson-databind</artifactId>
|
||||
<version>2.13.4.2</version>
|
||||
<version>2.15.2</version>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
<parent>
|
||||
<artifactId>webmagic-parent</artifactId>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<version>0.8.0</version>
|
||||
<version>0.9.0</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
|
|
@ -0,0 +1,61 @@
|
|||
package us.codecraft.webmagic.selector;
|
||||
|
||||
import org.w3c.dom.Node;
|
||||
import org.w3c.dom.NodeList;
|
||||
|
||||
import javax.xml.transform.OutputKeys;
|
||||
import javax.xml.transform.Transformer;
|
||||
import javax.xml.transform.TransformerException;
|
||||
import javax.xml.transform.TransformerFactory;
|
||||
import javax.xml.transform.dom.DOMSource;
|
||||
import javax.xml.transform.stream.StreamResult;
|
||||
import java.io.StringWriter;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
/**
 * Static helpers for turning W3C DOM (JAXP) nodes into lists and strings.
 *
 * @author hooy
 */
public final class JaxpSelectorUtils {

    private JaxpSelectorUtils() {
        throw new RuntimeException("The util class cannot be instanced");
    }

    /**
     * Copies the nodes of a {@link NodeList} into a new list, in order.
     *
     * @param nodes the node list, may be {@code null}
     * @return a new mutable list holding the nodes; empty when {@code nodes} is {@code null}
     */
    public static List<Node> NodeListToArrayList(NodeList nodes) {
        if (nodes == null) {
            return new ArrayList<>();
        }
        int length = nodes.getLength();
        List<Node> list = new ArrayList<>(length);
        for (int i = 0; i < length; i++) {
            list.add(nodes.item(i));
        }
        return list;
    }

    /**
     * Converts a single node to its string form.
     *
     * @param node the node, may be {@code null}
     * @return the text content for attribute and text nodes, the serialized markup for
     *         other nodes, or {@code null} when {@code node} is {@code null}
     * @throws TransformerException if the node cannot be serialized
     */
    public static String nodeToString(Node node) throws TransformerException {
        // A node lookup that matched nothing yields null; report that as "no result".
        if (node == null) {
            return null;
        }
        List<String> after = nodesToStrings(Collections.singletonList(node));
        return after.isEmpty() ? null : after.get(0);
    }

    /**
     * Converts each node to its string form.
     * <p>
     * Attribute and text nodes contribute their text content; every other node is
     * serialized through a {@link Transformer} with the XML declaration omitted.
     *
     * @param nodes the nodes to convert
     * @return one string per node, in the same order
     * @throws TransformerException if a node cannot be serialized
     */
    public static List<String> nodesToStrings(List<Node> nodes) throws TransformerException {
        List<String> results = new ArrayList<>(nodes.size());
        Transformer transformer = TransformerFactory.newInstance().newTransformer();
        transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
        for (Node node : nodes) {
            short type = node.getNodeType();
            if (type == Node.ATTRIBUTE_NODE || type == Node.TEXT_NODE) {
                results.add(node.getTextContent());
            } else {
                // A fresh writer per node keeps each serialized result separate.
                StringWriter writer = new StringWriter();
                transformer.transform(new DOMSource(node), new StreamResult(writer));
                results.add(writer.toString());
            }
        }
        return results;
    }

}
|
|
@ -0,0 +1,32 @@
|
|||
package us.codecraft.webmagic.selector;
|
||||
|
||||
import org.w3c.dom.Node;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Selector(extractor) for html node.<br>
|
||||
*
|
||||
* @author hooy <br>
|
||||
* @since 0.8.0
|
||||
*/
|
||||
public interface NodeSelector {
|
||||
|
||||
/**
|
||||
* Extract single result in text.<br>
|
||||
* If there are more than one result, only the first will be chosen.
|
||||
*
|
||||
* @param node node
|
||||
* @return result
|
||||
*/
|
||||
String select(Node node);
|
||||
|
||||
/**
|
||||
* Extract all results in text.<br>
|
||||
*
|
||||
* @param node node
|
||||
* @return results
|
||||
*/
|
||||
List<String> selectList(Node node);
|
||||
|
||||
}
|
|
@ -1,18 +1,10 @@
|
|||
package us.codecraft.webmagic.selector;
|
||||
|
||||
import java.io.StringWriter;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.*;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
|
||||
import javax.xml.namespace.NamespaceContext;
|
||||
import javax.xml.transform.OutputKeys;
|
||||
import javax.xml.transform.Transformer;
|
||||
import javax.xml.transform.TransformerFactory;
|
||||
import javax.xml.transform.dom.DOMSource;
|
||||
import javax.xml.transform.stream.StreamResult;
|
||||
import javax.xml.parsers.ParserConfigurationException;
|
||||
import javax.xml.xpath.XPathConstants;
|
||||
import javax.xml.xpath.XPathExpression;
|
||||
import javax.xml.xpath.XPathExpressionException;
|
||||
|
@ -29,21 +21,24 @@ import org.w3c.dom.NodeList;
|
|||
|
||||
import net.sf.saxon.lib.NamespaceConstant;
|
||||
import net.sf.saxon.xpath.XPathEvaluator;
|
||||
import us.codecraft.webmagic.utils.BaseSelectorUtils;
|
||||
|
||||
import static us.codecraft.webmagic.selector.JaxpSelectorUtils.*;
|
||||
|
||||
/**
|
||||
* Selector that supports XPath 2.0. Wraps HtmlCleaner and Saxon HE.<br>
|
||||
*
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-4-21
|
||||
* Time: 上午9:39
|
||||
* @author code4crafter@gmail.com, hooy <br>
|
||||
* Date: 13-4-21
|
||||
* Time: 上午9:39
|
||||
*/
|
||||
public class Xpath2Selector implements Selector {
|
||||
public class Xpath2Selector implements Selector, NodeSelector {
|
||||
|
||||
private String xpathStr;
|
||||
private final String xpathStr;
|
||||
|
||||
private XPathExpression xPathExpression;
|
||||
|
||||
private Logger logger = LoggerFactory.getLogger(getClass());
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
public Xpath2Selector(String xpathStr) {
|
||||
this.xpathStr = xpathStr;
|
||||
|
@ -54,25 +49,25 @@ public class Xpath2Selector implements Selector {
|
|||
}
|
||||
}
|
||||
|
||||
public static Xpath2Selector newInstance(String xpathStr) {
|
||||
return new Xpath2Selector(xpathStr);
|
||||
}
|
||||
|
||||
enum XPath2NamespaceContext implements NamespaceContext {
|
||||
|
||||
INSTANCE;
|
||||
|
||||
private final Map<String, String> prefix2NamespaceMap = new ConcurrentHashMap<String, String>();
|
||||
private final Map<String, String> prefix2NamespaceMap = new ConcurrentHashMap<>();
|
||||
|
||||
private final Map<String, List<String>> namespace2PrefixMap = new ConcurrentHashMap<String, List<String>>();
|
||||
private final Map<String, List<String>> namespace2PrefixMap = new ConcurrentHashMap<>();
|
||||
|
||||
private void put(String prefix, String namespaceURI) {
|
||||
prefix2NamespaceMap.put(prefix, namespaceURI);
|
||||
List<String> prefixes = namespace2PrefixMap.get(namespaceURI);
|
||||
if (prefixes == null) {
|
||||
prefixes = new ArrayList<String>();
|
||||
namespace2PrefixMap.put(namespaceURI, prefixes);
|
||||
}
|
||||
List<String> prefixes = namespace2PrefixMap.computeIfAbsent(namespaceURI, k -> new ArrayList<>());
|
||||
prefixes.add(prefix);
|
||||
}
|
||||
|
||||
private XPath2NamespaceContext() {
|
||||
XPath2NamespaceContext() {
|
||||
put("fn", NamespaceConstant.FN);
|
||||
put("xslt", NamespaceConstant.XSLT);
|
||||
put("xhtml", NamespaceConstant.XHTML);
|
||||
|
@ -111,32 +106,18 @@ public class Xpath2Selector implements Selector {
|
|||
@Override
|
||||
public String select(String text) {
|
||||
try {
|
||||
HtmlCleaner htmlCleaner = new HtmlCleaner();
|
||||
TagNode tagNode = htmlCleaner.clean(text);
|
||||
Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode);
|
||||
Object result;
|
||||
try {
|
||||
result = xPathExpression.evaluate(document, XPathConstants.NODESET);
|
||||
} catch (XPathExpressionException e) {
|
||||
result = xPathExpression.evaluate(document, XPathConstants.STRING);
|
||||
}
|
||||
if (result instanceof NodeList) {
|
||||
NodeList nodeList = (NodeList) result;
|
||||
if (nodeList.getLength() == 0) {
|
||||
return null;
|
||||
}
|
||||
Node item = nodeList.item(0);
|
||||
if (item.getNodeType() == Node.ATTRIBUTE_NODE || item.getNodeType() == Node.TEXT_NODE) {
|
||||
return item.getTextContent();
|
||||
} else {
|
||||
StreamResult xmlOutput = new StreamResult(new StringWriter());
|
||||
Transformer transformer = TransformerFactory.newInstance().newTransformer();
|
||||
transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
|
||||
transformer.transform(new DOMSource(item), xmlOutput);
|
||||
return xmlOutput.getWriter().toString();
|
||||
}
|
||||
}
|
||||
return result.toString();
|
||||
Document doc = parse(text);
|
||||
return select(doc);
|
||||
} catch (Exception e) {
|
||||
logger.error("select text error! " + xpathStr, e);
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String select(Node node) {
|
||||
try {
|
||||
return (String) xPathExpression.evaluate(node, XPathConstants.STRING);
|
||||
} catch (Exception e) {
|
||||
logger.error("select text error! " + xpathStr, e);
|
||||
}
|
||||
|
@ -145,38 +126,72 @@ public class Xpath2Selector implements Selector {
|
|||
|
||||
@Override
|
||||
public List<String> selectList(String text) {
|
||||
List<String> results = new ArrayList<String>();
|
||||
try {
|
||||
HtmlCleaner htmlCleaner = new HtmlCleaner();
|
||||
TagNode tagNode = htmlCleaner.clean(text);
|
||||
Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode);
|
||||
Object result;
|
||||
try {
|
||||
result = xPathExpression.evaluate(document, XPathConstants.NODESET);
|
||||
} catch (XPathExpressionException e) {
|
||||
result = xPathExpression.evaluate(document, XPathConstants.STRING);
|
||||
}
|
||||
if (result instanceof NodeList) {
|
||||
NodeList nodeList = (NodeList) result;
|
||||
Transformer transformer = TransformerFactory.newInstance().newTransformer();
|
||||
StreamResult xmlOutput = new StreamResult();
|
||||
transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
|
||||
for (int i = 0; i < nodeList.getLength(); i++) {
|
||||
Node item = nodeList.item(i);
|
||||
if (item.getNodeType() == Node.ATTRIBUTE_NODE || item.getNodeType() == Node.TEXT_NODE) {
|
||||
results.add(item.getTextContent());
|
||||
} else {
|
||||
xmlOutput.setWriter(new StringWriter());
|
||||
transformer.transform(new DOMSource(item), xmlOutput);
|
||||
results.add(xmlOutput.getWriter().toString());
|
||||
}
|
||||
}
|
||||
} else {
|
||||
results.add(result.toString());
|
||||
}
|
||||
Document doc = parse(text);
|
||||
return selectList(doc);
|
||||
} catch (Exception e) {
|
||||
logger.error("select text error! " + xpathStr, e);
|
||||
}
|
||||
return results;
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<String> selectList(Node node) {
|
||||
try {
|
||||
NodeList result = (NodeList) xPathExpression.evaluate(node, XPathConstants.NODESET);
|
||||
List<Node> nodes = NodeListToArrayList(result);
|
||||
return nodesToStrings(nodes);
|
||||
} catch (Exception e) {
|
||||
logger.error("select text error! " + xpathStr, e);
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
public Node selectNode(String text) {
|
||||
try {
|
||||
Document doc = parse(text);
|
||||
return selectNode(doc);
|
||||
} catch (Exception e) {
|
||||
logger.error("select text error! " + xpathStr, e);
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
public Node selectNode(Node node) {
|
||||
try {
|
||||
return (Node) xPathExpression.evaluate(node, XPathConstants.NODE);
|
||||
} catch (Exception e) {
|
||||
logger.error("select text error! " + xpathStr, e);
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
public List<Node> selectNodes(String text) {
|
||||
try {
|
||||
Document doc = parse(text);
|
||||
return selectNodes(doc);
|
||||
} catch (Exception e) {
|
||||
logger.error("select text error! " + xpathStr, e);
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
public List<Node> selectNodes(Node node) {
|
||||
try {
|
||||
NodeList result = (NodeList) xPathExpression.evaluate(node, XPathConstants.NODESET);
|
||||
return NodeListToArrayList(result);
|
||||
} catch (Exception e) {
|
||||
logger.error("select text error! " + xpathStr, e);
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
protected static Document parse(String text) throws ParserConfigurationException {
|
||||
// HtmlCleaner could not parse <tr></tr> or <td></td> tag directly
|
||||
text = BaseSelectorUtils.preParse(text);
|
||||
HtmlCleaner htmlCleaner = new HtmlCleaner();
|
||||
TagNode tagNode = htmlCleaner.clean(text);
|
||||
return new DomSerializer(new CleanerProperties()).createDOM(tagNode);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
File diff suppressed because one or more lines are too long
|
@ -3,7 +3,7 @@
|
|||
<parent>
|
||||
<artifactId>webmagic-parent</artifactId>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<version>0.8.0</version>
|
||||
<version>0.9.0</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
<parent>
|
||||
<artifactId>webmagic-parent</artifactId>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<version>0.8.0</version>
|
||||
<version>0.9.0</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
|
Loading…
Reference in New Issue