Merge branch 'release/0.9.0'

master
Joe Zhou 2023-06-22 11:23:41 +08:00
commit fd4a136f9a
25 changed files with 418 additions and 187 deletions

View File

@ -39,12 +39,12 @@ webmagic使用maven管理依赖在项目中添加对应的依赖即可使用w
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.7.5</version>
<version>${webmagic.version}</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.7.5</version>
<version>${webmagic.version}</version>
</dependency>
```

View File

@ -25,12 +25,12 @@ Add dependencies to your pom.xml:
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.7.5</version>
<version>${webmagic.version}</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.7.5</version>
<version>${webmagic.version}</version>
</dependency>
```

View File

@ -1,7 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<groupId>us.codecraft</groupId>
<version>0.8.0</version>
<version>0.9.0</version>
<modelVersion>4.0.0</modelVersion>
<packaging>pom</packaging>
<properties>
@ -124,7 +124,7 @@
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>xsoup</artifactId>
<version>0.3.6</version>
<version>0.3.7</version>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>

View File

@ -3,7 +3,7 @@
<parent>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-parent</artifactId>
<version>0.8.0</version>
<version>0.9.0</version>
</parent>
<modelVersion>4.0.0</modelVersion>

View File

@ -20,7 +20,7 @@ import java.util.Map;
* {@link #getHtml()} get content of current page <br>
* {@link #putField(String, Object)} save extracted result <br>
* {@link #getResultItems()} get extract results to be used in {@link us.codecraft.webmagic.pipeline.Pipeline}<br>
* {@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} add urls to fetch <br>
* {@link #addTargetRequests(Iterable)} {@link #addTargetRequest(String)} add urls to fetch <br>
*
* @author code4crafter@gmail.com <br>
* @see us.codecraft.webmagic.downloader.Downloader
@ -52,7 +52,7 @@ public class Page {
private List<Request> targetRequests = new ArrayList<Request>();
private String charset;
public Page() {
}
@ -108,7 +108,8 @@ public class Page {
* @deprecated since 0.4.0
* The html is parse just when first time of calling {@link #getHtml()}, so use {@link #setRawText(String)} instead.
*/
public void setHtml(Html html) {
@Deprecated
public void setHtml(Html html) {
this.html = html;
}
@ -121,7 +122,7 @@ public class Page {
*
* @param requests requests
*/
public void addTargetRequests(List<String> requests) {
public void addTargetRequests(Iterable<String> requests) {
for (String s : requests) {
if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) {
continue;
@ -137,7 +138,7 @@ public class Page {
* @param requests requests
* @param priority priority
*/
public void addTargetRequests(List<String> requests, long priority) {
public void addTargetRequests(Iterable<String> requests, long priority) {
for (String s : requests) {
if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) {
continue;

View File

@ -28,6 +28,8 @@ public class Site {
private String charset;
private String defaultCharset;
private int sleepTime = 5000;
private int retryTimes = 0;
@ -168,6 +170,30 @@ public class Site {
return charset;
}
/**
* Sets the default charset of pages for this site.
*
* When charset detection fails, this default charset is used.
*
* @param defaultCharset the default charset
* @return this
* @since 0.9.0
*/
public Site setDefaultCharset(String defaultCharset) {
this.defaultCharset = defaultCharset;
return this;
}
/**
* Returns the default charset to use if charset detection failed.
*
* @return the default charset; {@code null} if it has not been set
* @since 0.9.0
*/
public String getDefaultCharset() {
return defaultCharset;
}
public int getTimeOut() {
return timeOut;
}

View File

@ -4,6 +4,7 @@ import java.io.IOException;
import java.nio.charset.Charset;
import java.util.HashMap;
import java.util.Map;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.http.HttpResponse;
@ -76,7 +77,7 @@ public class HttpClientDownloader extends AbstractDownloader {
}
CloseableHttpResponse httpResponse = null;
CloseableHttpClient httpClient = getHttpClient(task.getSite());
Proxy proxy = proxyProvider != null ? proxyProvider.getProxy(task) : null;
Proxy proxy = proxyProvider != null ? proxyProvider.getProxy(request, task) : null;
HttpClientRequestContext requestContext = httpUriRequestConverter.convert(request, task.getSite(), proxy);
Page page = Page.fail();
try {
@ -116,7 +117,7 @@ public class HttpClientDownloader extends AbstractDownloader {
page.setBytes(bytes);
if (!request.isBinaryContent()) {
if (charset == null) {
charset = getHtmlCharset(contentType, bytes);
charset = getHtmlCharset(contentType, bytes, task);
}
page.setCharset(charset);
page.setRawText(new String(bytes, charset));
@ -131,11 +132,11 @@ public class HttpClientDownloader extends AbstractDownloader {
return page;
}
private String getHtmlCharset(String contentType, byte[] contentBytes) throws IOException {
private String getHtmlCharset(String contentType, byte[] contentBytes, Task task) throws IOException {
String charset = CharsetUtils.detectCharset(contentType, contentBytes);
if (charset == null) {
charset = Charset.defaultCharset().name();
logger.warn("Charset autodetect failed, use {} as charset. Please specify charset in Site.setCharset()", Charset.defaultCharset());
charset = Optional.ofNullable(task.getSite().getDefaultCharset()).orElseGet(Charset.defaultCharset()::name);
logger.info("Charset autodetect failed, use {} as charset.", task.getSite().getDefaultCharset());
}
return charset;
}

View File

@ -1,16 +1,5 @@
package us.codecraft.webmagic.downloader;
import java.io.IOException;
import java.security.KeyManagementException;
import java.security.NoSuchAlgorithmException;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import java.util.Map;
import javax.net.ssl.SSLContext;
import javax.net.ssl.TrustManager;
import javax.net.ssl.X509TrustManager;
import org.apache.commons.lang3.JavaVersion;
import org.apache.commons.lang3.SystemUtils;
import org.apache.http.HttpException;
@ -22,28 +11,32 @@ import org.apache.http.config.RegistryBuilder;
import org.apache.http.config.SocketConfig;
import org.apache.http.conn.socket.ConnectionSocketFactory;
import org.apache.http.conn.socket.PlainConnectionSocketFactory;
import org.apache.http.conn.ssl.DefaultHostnameVerifier;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.impl.client.BasicCookieStore;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.DefaultHttpRequestRetryHandler;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.client.*;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.impl.cookie.BasicClientCookie;
import org.apache.http.protocol.HttpContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Site;
import javax.net.ssl.SSLContext;
import javax.net.ssl.TrustManager;
import javax.net.ssl.X509TrustManager;
import java.io.IOException;
import java.security.KeyManagementException;
import java.security.NoSuchAlgorithmException;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import java.util.Map;
/**
* @author code4crafter@gmail.com <br>
* @since 0.4.0
*/
public class HttpClientGenerator {
private transient Logger logger = LoggerFactory.getLogger(getClass());
private transient Logger logger = LoggerFactory.getLogger(getClass());
private PoolingHttpClientConnectionManager connectionManager;
@ -61,21 +54,20 @@ public class HttpClientGenerator {
SSLContext sslContext = createIgnoreVerifySSL();
String[] supportedProtocols;
if (SystemUtils.isJavaVersionAtLeast(JavaVersion.JAVA_11)) {
supportedProtocols = new String[] { "SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2", "TLSv1.3" };
supportedProtocols = new String[]{"SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2", "TLSv1.3"};
} else {
supportedProtocols = new String[] { "SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2" };
supportedProtocols = new String[]{"SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2"};
}
logger.debug("supportedProtocols: {}", String.join(", ", supportedProtocols));
return new SSLConnectionSocketFactory(sslContext, supportedProtocols,
null,
new DefaultHostnameVerifier()); // 优先绕过安全证书
} catch (KeyManagementException e) {
logger.error("ssl connection fail", e);
} catch (NoSuchAlgorithmException e) {
//不进行主机校验
(host, sslSession) -> true); // 优先绕过安全证书
} catch (KeyManagementException | NoSuchAlgorithmException e) {
logger.error("ssl connection fail", e);
}
return SSLConnectionSocketFactory.getSocketFactory();
}
}
private SSLContext createIgnoreVerifySSL() throws NoSuchAlgorithmException, KeyManagementException {
// 实现一个X509TrustManager接口用于绕过验证不用修改里面的方法
@ -97,9 +89,9 @@ public class HttpClientGenerator {
};
SSLContext sc = SSLContext.getInstance("TLS");
sc.init(null, new TrustManager[] { trustManager }, null);
sc.init(null, new TrustManager[]{trustManager}, null);
return sc;
}
}
public HttpClientGenerator setPoolSize(int poolSize) {
connectionManager.setMaxTotal(poolSize);

View File

@ -1,6 +1,7 @@
package us.codecraft.webmagic.proxy;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
/**
@ -23,7 +24,23 @@ public interface ProxyProvider {
* Get a proxy for task by some strategy.
* @param task the download task
* @return proxy
* @deprecated Use {@link #getProxy(Request, Task)} instead.
*/
Proxy getProxy(Task task);
@Deprecated
default Proxy getProxy(Task task) {
throw new UnsupportedOperationException();
}
/**
* Returns a proxy for the request.
*
* @param request the request
* @param task the download task
* @return proxy
* @since 0.9.0
*/
default Proxy getProxy(Request request, Task task) {
return this.getProxy(task);
}
}

View File

@ -1,6 +1,7 @@
package us.codecraft.webmagic.proxy;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
import java.util.ArrayList;
@ -44,7 +45,7 @@ public class SimpleProxyProvider implements ProxyProvider {
}
@Override
public Proxy getProxy(Task task) {
public Proxy getProxy(Request request, Task task) {
return proxies.get(incrForLoop());
}

View File

@ -3,6 +3,7 @@ package us.codecraft.webmagic.selector;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import us.codecraft.webmagic.utils.BaseSelectorUtils;
import java.util.ArrayList;
import java.util.List;
@ -13,16 +14,9 @@ import java.util.List;
*/
public abstract class BaseElementSelector implements Selector, ElementSelector {
private Document parse(String text) {
if (text == null) {
return null;
}
// Jsoup could not parse <tr></tr> or <td></td> tag directly
// https://stackoverflow.com/questions/63607740/jsoup-couldnt-parse-tr-tag
if ((text.startsWith("<tr>") && text.endsWith("</tr>"))
|| (text.startsWith("<td>") && text.endsWith("</td>"))) {
text = "<table>" + text + "</table>";
}
text = BaseSelectorUtils.preParse(text);
return Jsoup.parse(text);
}

View File

@ -0,0 +1,23 @@
package us.codecraft.webmagic.utils;
/**
 * Static helpers shared by the HTML selectors.
 *
 * @author hooy
 */
public class BaseSelectorUtils {

    /**
     * Wraps a bare table-row or table-cell fragment in a {@code <table>} element.
     * <p>
     * Jsoup/HtmlCleaner could not parse "tr" or "td" tag directly
     * https://stackoverflow.com/questions/63607740/jsoup-couldnt-parse-tr-tag
     * <p>
     * A fragment is wrapped when it starts with {@code <tr>}/{@code <tr } and ends with
     * {@code </tr>}, or starts with {@code <td>}/{@code <td } and ends with {@code </td>}.
     * Any other input is returned unchanged.
     *
     * @param text - the html string, may be {@code null}
     * @return the (possibly wrapped) html string, or {@code null} when {@code text} is {@code null}
     */
    public static String preParse(String text) {
        // Tolerate null instead of failing with a NullPointerException on startsWith.
        if (text == null) {
            return null;
        }
        boolean isRow = (text.startsWith("<tr>") || text.startsWith("<tr ")) && text.endsWith("</tr>");
        boolean isCell = (text.startsWith("<td>") || text.startsWith("<td ")) && text.endsWith("</td>");
        if (isRow || isCell) {
            return "<table>" + text + "</table>";
        }
        return text;
    }
}

View File

@ -0,0 +1,17 @@
package us.codecraft.webmagic;
import static org.junit.Assert.assertEquals;
import java.nio.charset.StandardCharsets;
import org.junit.Test;
/**
 * Checks that a default charset set on a {@link Site} is returned by its getter.
 */
public class SiteTest {

    @Test
    public void test() {
        Site configured = Site.me().setDefaultCharset(StandardCharsets.UTF_8.name());
        String actual = configured.getDefaultCharset();
        assertEquals(StandardCharsets.UTF_8.name(), actual);
    }
}

View File

@ -1,6 +1,9 @@
package us.codecraft.webmagic.proxy;
import org.junit.Test;
import org.mockito.Mockito;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
@ -20,11 +23,12 @@ public class SimpleProxyProviderTest {
Proxy originProxy1 = new Proxy("127.0.0.1", 1087);
Proxy originProxy2 = new Proxy("127.0.0.1", 1088);
SimpleProxyProvider proxyProvider = SimpleProxyProvider.from(originProxy1, originProxy2);
Proxy proxy = proxyProvider.getProxy(TASK);
Request request = Mockito.mock(Request.class);
Proxy proxy = proxyProvider.getProxy(request, TASK);
assertThat(proxy).isEqualTo(originProxy1);
proxy = proxyProvider.getProxy(TASK);
proxy = proxyProvider.getProxy(request, TASK);
assertThat(proxy).isEqualTo(originProxy2);
proxy = proxyProvider.getProxy(TASK);
proxy = proxyProvider.getProxy(request, TASK);
assertThat(proxy).isEqualTo(originProxy1);
}
}

View File

@ -8,7 +8,7 @@
<parent>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-parent</artifactId>
<version>0.8.0</version>
<version>0.9.0</version>
</parent>
<artifactId>webmagic-coverage</artifactId>

View File

@ -3,7 +3,7 @@
<parent>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-parent</artifactId>
<version>0.8.0</version>
<version>0.9.0</version>
</parent>
<modelVersion>4.0.0</modelVersion>
@ -14,6 +14,11 @@
<groupId>redis.clients</groupId>
<artifactId>jedis</artifactId>
</dependency>
<dependency>
<groupId>org.assertj</groupId>
<artifactId>assertj-core</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>

View File

@ -1,21 +1,25 @@
package us.codecraft.webmagic.monitor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.lang.management.ManagementFactory;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
import javax.management.InstanceAlreadyExistsException;
import javax.management.JMException;
import javax.management.MBeanRegistrationException;
import javax.management.MBeanServer;
import javax.management.MalformedObjectNameException;
import javax.management.NotCompliantMBeanException;
import javax.management.ObjectName;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.SpiderListener;
import us.codecraft.webmagic.utils.Experimental;
import us.codecraft.webmagic.utils.UrlUtils;
import javax.management.*;
import java.lang.management.ManagementFactory;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
/**
* @author code4crafer@gmail.com
* @since 0.5.0
@ -23,17 +27,13 @@ import java.util.concurrent.atomic.AtomicInteger;
@Experimental
public class SpiderMonitor {
private static SpiderMonitor INSTANCE = new SpiderMonitor();
private AtomicBoolean started = new AtomicBoolean(false);
private Logger logger = LoggerFactory.getLogger(getClass());
private static final SpiderMonitor INSTANCE = new SpiderMonitor();
private MBeanServer mbeanServer;
private String jmxServerName;
private List<SpiderStatusMXBean> spiderStatuses = new ArrayList<SpiderStatusMXBean>();
private List<SpiderStatusMXBean> spiderStatuses = new ArrayList<>();
protected SpiderMonitor() {
jmxServerName = "WebMagic";
@ -51,7 +51,7 @@ public class SpiderMonitor {
for (Spider spider : spiders) {
MonitorSpiderListener monitorSpiderListener = new MonitorSpiderListener();
if (spider.getSpiderListeners() == null) {
List<SpiderListener> spiderListeners = new ArrayList<SpiderListener>();
List<SpiderListener> spiderListeners = new ArrayList<>();
spiderListeners.add(monitorSpiderListener);
spider.setSpiderListeners(spiderListeners);
} else {
@ -90,7 +90,7 @@ public class SpiderMonitor {
}
@Override
public void onError(Request request) {
public void onError(Request request, Exception e) {
errorUrls.add(request.getUrl());
errorCount.incrementAndGet();
}
@ -109,7 +109,6 @@ public class SpiderMonitor {
}
protected void registerMBean(SpiderStatusMXBean spiderStatus) throws MalformedObjectNameException, InstanceAlreadyExistsException, MBeanRegistrationException, NotCompliantMBeanException {
// ObjectName objName = new ObjectName(jmxServerName + ":name=" + spiderStatus.getName());
ObjectName objName = new ObjectName(jmxServerName + ":name=" + UrlUtils.removePort(spiderStatus.getName()));
mbeanServer.registerMBean(spiderStatus, objName);
}

View File

@ -3,7 +3,7 @@
<parent>
<artifactId>webmagic-parent</artifactId>
<groupId>us.codecraft</groupId>
<version>0.8.0</version>
<version>0.9.0</version>
</parent>
<modelVersion>4.0.0</modelVersion>
@ -27,22 +27,22 @@
<dependency>
<groupId>org.mapdb</groupId>
<artifactId>mapdb</artifactId>
<version>3.0.8</version>
<version>3.0.9</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-core</artifactId>
<version>2.13.0-rc1</version>
<version>2.15.2</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-annotations</artifactId>
<version>2.13.0-rc1</version>
<version>2.15.2</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
<version>2.13.4.2</version>
<version>2.15.2</version>
</dependency>
</dependencies>

View File

@ -3,7 +3,7 @@
<parent>
<artifactId>webmagic-parent</artifactId>
<groupId>us.codecraft</groupId>
<version>0.8.0</version>
<version>0.9.0</version>
</parent>
<modelVersion>4.0.0</modelVersion>

View File

@ -0,0 +1,61 @@
package us.codecraft.webmagic.selector;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
/**
 * Helpers for working with JAXP DOM nodes: copying a {@link NodeList} into a
 * {@link List} and rendering nodes as strings.
 *
 * @author hooy
 */
public final class JaxpSelectorUtils {

    private JaxpSelectorUtils() {
        throw new RuntimeException("The util class cannot be instanced");
    }

    /**
     * Copies every item of a {@link NodeList} into a new list, in order.
     *
     * @param nodes the node list to copy
     * @return a new list holding the same nodes
     */
    public static List<Node> NodeListToArrayList(NodeList nodes) {
        List<Node> copy = new ArrayList<>(nodes.getLength());
        for (int index = 0; index < nodes.getLength(); index++) {
            copy.add(nodes.item(index));
        }
        return copy;
    }

    /**
     * Renders a single node as a string by delegating to {@link #nodesToStrings(List)}.
     *
     * @param node the node to render
     * @return the rendered node, or {@code null} if no result was produced
     * @throws TransformerException if the node cannot be serialized
     */
    public static String nodeToString(Node node) throws TransformerException {
        List<String> rendered = nodesToStrings(Collections.singletonList(node));
        return rendered.isEmpty() ? null : rendered.get(0);
    }

    /**
     * Renders each node as a string. Attribute and text nodes contribute their text
     * content; every other node is serialized through a {@link Transformer} with the
     * XML declaration omitted.
     *
     * @param nodes the nodes to render
     * @return one string per node, in the same order
     * @throws TransformerException if the transformer cannot be created or a node cannot be serialized
     */
    public static List<String> nodesToStrings(List<Node> nodes) throws TransformerException {
        List<String> rendered = new ArrayList<>(nodes.size());
        Transformer serializer = TransformerFactory.newInstance().newTransformer();
        serializer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
        for (Node current : nodes) {
            short type = current.getNodeType();
            if (type == Node.ATTRIBUTE_NODE || type == Node.TEXT_NODE) {
                rendered.add(current.getTextContent());
                continue;
            }
            // A fresh writer per node so earlier output never leaks into later results.
            StringWriter buffer = new StringWriter();
            serializer.transform(new DOMSource(current), new StreamResult(buffer));
            rendered.add(buffer.toString());
        }
        return rendered;
    }
}

View File

@ -0,0 +1,32 @@
package us.codecraft.webmagic.selector;
import org.w3c.dom.Node;
import java.util.List;
/**
* Selector (extractor) for an html node.<br>
* Implementations take a DOM {@link Node} as input and return the extracted results as text.
*
* @author hooy <br>
* @since 0.8.0
*/
public interface NodeSelector {
/**
* Extracts a single result as text.<br>
* If there is more than one result, only the first will be chosen.
*
* @param node the node to select from
* @return the extracted result
*/
String select(Node node);
/**
* Extracts all results as text.<br>
*
* @param node the node to select from
* @return the extracted results
*/
List<String> selectList(Node node);
}

View File

@ -1,18 +1,10 @@
package us.codecraft.webmagic.selector;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import javax.xml.namespace.NamespaceContext;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
@ -29,21 +21,24 @@ import org.w3c.dom.NodeList;
import net.sf.saxon.lib.NamespaceConstant;
import net.sf.saxon.xpath.XPathEvaluator;
import us.codecraft.webmagic.utils.BaseSelectorUtils;
import static us.codecraft.webmagic.selector.JaxpSelectorUtils.*;
/**
* xpath2.0HtmlCleanerSaxon HE<br>
*
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* Time: 9:39
* @author code4crafter@gmail.com, hooy <br>
* Date: 13-4-21
* Time: 9:39
*/
public class Xpath2Selector implements Selector {
public class Xpath2Selector implements Selector, NodeSelector {
private String xpathStr;
private final String xpathStr;
private XPathExpression xPathExpression;
private Logger logger = LoggerFactory.getLogger(getClass());
private final Logger logger = LoggerFactory.getLogger(getClass());
public Xpath2Selector(String xpathStr) {
this.xpathStr = xpathStr;
@ -54,25 +49,25 @@ public class Xpath2Selector implements Selector {
}
}
public static Xpath2Selector newInstance(String xpathStr) {
return new Xpath2Selector(xpathStr);
}
enum XPath2NamespaceContext implements NamespaceContext {
INSTANCE;
private final Map<String, String> prefix2NamespaceMap = new ConcurrentHashMap<String, String>();
private final Map<String, String> prefix2NamespaceMap = new ConcurrentHashMap<>();
private final Map<String, List<String>> namespace2PrefixMap = new ConcurrentHashMap<String, List<String>>();
private final Map<String, List<String>> namespace2PrefixMap = new ConcurrentHashMap<>();
private void put(String prefix, String namespaceURI) {
prefix2NamespaceMap.put(prefix, namespaceURI);
List<String> prefixes = namespace2PrefixMap.get(namespaceURI);
if (prefixes == null) {
prefixes = new ArrayList<String>();
namespace2PrefixMap.put(namespaceURI, prefixes);
}
List<String> prefixes = namespace2PrefixMap.computeIfAbsent(namespaceURI, k -> new ArrayList<>());
prefixes.add(prefix);
}
private XPath2NamespaceContext() {
XPath2NamespaceContext() {
put("fn", NamespaceConstant.FN);
put("xslt", NamespaceConstant.XSLT);
put("xhtml", NamespaceConstant.XHTML);
@ -111,32 +106,18 @@ public class Xpath2Selector implements Selector {
@Override
public String select(String text) {
try {
HtmlCleaner htmlCleaner = new HtmlCleaner();
TagNode tagNode = htmlCleaner.clean(text);
Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode);
Object result;
try {
result = xPathExpression.evaluate(document, XPathConstants.NODESET);
} catch (XPathExpressionException e) {
result = xPathExpression.evaluate(document, XPathConstants.STRING);
}
if (result instanceof NodeList) {
NodeList nodeList = (NodeList) result;
if (nodeList.getLength() == 0) {
return null;
}
Node item = nodeList.item(0);
if (item.getNodeType() == Node.ATTRIBUTE_NODE || item.getNodeType() == Node.TEXT_NODE) {
return item.getTextContent();
} else {
StreamResult xmlOutput = new StreamResult(new StringWriter());
Transformer transformer = TransformerFactory.newInstance().newTransformer();
transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
transformer.transform(new DOMSource(item), xmlOutput);
return xmlOutput.getWriter().toString();
}
}
return result.toString();
Document doc = parse(text);
return select(doc);
} catch (Exception e) {
logger.error("select text error! " + xpathStr, e);
}
return null;
}
@Override
public String select(Node node) {
try {
return (String) xPathExpression.evaluate(node, XPathConstants.STRING);
} catch (Exception e) {
logger.error("select text error! " + xpathStr, e);
}
@ -145,38 +126,72 @@ public class Xpath2Selector implements Selector {
/**
 * Parses the given html text and extracts all results of the XPath expression as text.
 * <p>
 * Failures are logged and yield an empty list rather than {@code null}, so callers
 * can always iterate the result.
 *
 * @param text the html text to select from
 * @return the extracted results; empty if nothing matched or selection failed
 */
@Override
public List<String> selectList(String text) {
    try {
        List<String> selected = selectList(parse(text));
        if (selected != null) {
            return selected;
        }
    } catch (Exception e) {
        logger.error("select text error! " + xpathStr, e);
    }
    return new ArrayList<>();
}
/**
 * Evaluates the XPath expression against the given node as a node set and renders
 * every matched node as text.
 * <p>
 * Failures are logged and yield an empty list rather than {@code null}, so callers
 * can always iterate the result.
 *
 * @param node the node to select from
 * @return the extracted results; empty if nothing matched or evaluation failed
 */
@Override
public List<String> selectList(Node node) {
    try {
        NodeList matched = (NodeList) xPathExpression.evaluate(node, XPathConstants.NODESET);
        return nodesToStrings(NodeListToArrayList(matched));
    } catch (Exception e) {
        logger.error("select text error! " + xpathStr, e);
    }
    return new ArrayList<>();
}
public Node selectNode(String text) {
try {
Document doc = parse(text);
return selectNode(doc);
} catch (Exception e) {
logger.error("select text error! " + xpathStr, e);
}
return null;
}
public Node selectNode(Node node) {
try {
return (Node) xPathExpression.evaluate(node, XPathConstants.NODE);
} catch (Exception e) {
logger.error("select text error! " + xpathStr, e);
}
return null;
}
public List<Node> selectNodes(String text) {
try {
Document doc = parse(text);
return selectNodes(doc);
} catch (Exception e) {
logger.error("select text error! " + xpathStr, e);
}
return null;
}
public List<Node> selectNodes(Node node) {
try {
NodeList result = (NodeList) xPathExpression.evaluate(node, XPathConstants.NODESET);
return NodeListToArrayList(result);
} catch (Exception e) {
logger.error("select text error! " + xpathStr, e);
}
return null;
}
/**
 * Builds a DOM document from html text using HtmlCleaner.
 *
 * @param text the html text
 * @return the DOM document created from the cleaned html
 * @throws ParserConfigurationException if the DOM cannot be created
 */
protected static Document parse(String text) throws ParserConfigurationException {
    // HtmlCleaner could not parse <tr></tr> or <td></td> tag directly
    String html = BaseSelectorUtils.preParse(text);
    TagNode root = new HtmlCleaner().clean(html);
    DomSerializer serializer = new DomSerializer(new CleanerProperties());
    return serializer.createDOM(root);
}
}

File diff suppressed because one or more lines are too long

View File

@ -3,7 +3,7 @@
<parent>
<artifactId>webmagic-parent</artifactId>
<groupId>us.codecraft</groupId>
<version>0.8.0</version>
<version>0.9.0</version>
</parent>
<modelVersion>4.0.0</modelVersion>

View File

@ -3,7 +3,7 @@
<parent>
<artifactId>webmagic-parent</artifactId>
<groupId>us.codecraft</groupId>
<version>0.8.0</version>
<version>0.9.0</version>
</parent>
<modelVersion>4.0.0</modelVersion>