move xpath2.0 support to separate package
parent
268bd8d0c4
commit
521fbad987
1
pom.xml
1
pom.xml
|
@ -14,6 +14,7 @@
|
|||
<module>webmagic-samples/</module>
|
||||
<module>webmagic-selenium/</module>
|
||||
<module>webmagic-lucene/</module>
|
||||
<module>webmagic-saxon/</module>
|
||||
</modules>
|
||||
|
||||
<dependencyManagement>
|
||||
|
|
|
@ -27,10 +27,6 @@
|
|||
<artifactId>webmagic-core</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>net.sf.saxon</groupId>
|
||||
<artifactId>Saxon-HE</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>junit</groupId>
|
||||
<artifactId>junit</artifactId>
|
||||
|
|
|
@ -110,11 +110,8 @@ class PageModelExtractor {
|
|||
case XPath:
|
||||
selector = new XpathSelector(value);
|
||||
break;
|
||||
case XPath2:
|
||||
selector = new Xpath2Selector(value);
|
||||
break;
|
||||
default:
|
||||
selector = new Xpath2Selector(value);
|
||||
selector = new XpathSelector(value);
|
||||
}
|
||||
fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.Html, extractBy.notNull(), extractBy.multi());
|
||||
Method setterMethod = getSetterMethod(clazz, field);
|
||||
|
@ -140,11 +137,8 @@ class PageModelExtractor {
|
|||
case XPath:
|
||||
selector = new XpathSelector(value);
|
||||
break;
|
||||
case XPath2:
|
||||
selector = new Xpath2Selector(value);
|
||||
break;
|
||||
default:
|
||||
selector = new Xpath2Selector(value);
|
||||
selector = new XpathSelector(value);
|
||||
}
|
||||
fieldExtractor.setSelector(new AndSelector(fieldExtractor.getSelector(), selector));
|
||||
}
|
||||
|
@ -165,11 +159,8 @@ class PageModelExtractor {
|
|||
case XPath:
|
||||
selector = new XpathSelector(value);
|
||||
break;
|
||||
case XPath2:
|
||||
selector = new Xpath2Selector(value);
|
||||
break;
|
||||
default:
|
||||
selector = new Xpath2Selector(value);
|
||||
selector = new XpathSelector(value);
|
||||
}
|
||||
fieldExtractor.setSelector(new AndSelector(fieldExtractor.getSelector(), selector));
|
||||
}
|
||||
|
@ -191,11 +182,8 @@ class PageModelExtractor {
|
|||
case XPath:
|
||||
selector = new XpathSelector(value);
|
||||
break;
|
||||
case XPath2:
|
||||
selector = new Xpath2Selector(value);
|
||||
break;
|
||||
default:
|
||||
selector = new Xpath2Selector(value);
|
||||
selector = new XpathSelector(value);
|
||||
}
|
||||
fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.RawHtml, extractByRaw.notNull(), extractByRaw.multi());
|
||||
Method setterMethod = getSetterMethod(clazz, field);
|
||||
|
@ -228,7 +216,7 @@ class PageModelExtractor {
|
|||
targetUrlPatterns.add(Pattern.compile("(" + s.replace(".", "\\.").replace("*", "[^\"'#]*") + ")"));
|
||||
}
|
||||
if (!targetUrl.sourceRegion().equals("")) {
|
||||
targetUrlRegionSelector = new Xpath2Selector(targetUrl.sourceRegion());
|
||||
targetUrlRegionSelector = new XpathSelector(targetUrl.sourceRegion());
|
||||
}
|
||||
}
|
||||
annotation = clazz.getAnnotation(HelpUrl.class);
|
||||
|
@ -239,13 +227,13 @@ class PageModelExtractor {
|
|||
helpUrlPatterns.add(Pattern.compile("(" + s.replace(".", "\\.").replace("*", "[^\"'#]*") + ")"));
|
||||
}
|
||||
if (!helpUrl.sourceRegion().equals("")) {
|
||||
helpUrlRegionSelector = new Xpath2Selector(helpUrl.sourceRegion());
|
||||
helpUrlRegionSelector = new XpathSelector(helpUrl.sourceRegion());
|
||||
}
|
||||
}
|
||||
annotation = clazz.getAnnotation(ExtractBy.class);
|
||||
if (annotation != null) {
|
||||
ExtractBy extractBy = (ExtractBy) annotation;
|
||||
extractor = new Extractor(new Xpath2Selector(extractBy.value()), Extractor.Source.Html, extractBy.notNull(), extractBy.multi());
|
||||
extractor = new Extractor(new XpathSelector(extractBy.value()), Extractor.Source.Html, extractBy.notNull(), extractBy.multi());
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -16,9 +16,9 @@ public @interface ExtractBy {
|
|||
|
||||
String value();
|
||||
|
||||
public enum Type {XPath2, XPath, Regex, Css}
|
||||
public enum Type {XPath, Regex, Css}
|
||||
|
||||
Type type() default Type.XPath2;
|
||||
Type type() default Type.XPath;
|
||||
|
||||
boolean notNull() default true;
|
||||
|
||||
|
|
|
@ -6,6 +6,7 @@ import java.lang.annotation.Target;
|
|||
|
||||
/**
|
||||
* 定义类或者字段的抽取规则。<br>
|
||||
*
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* @date: 13-8-1 <br>
|
||||
* Time: 下午8:40 <br>
|
||||
|
@ -16,8 +17,8 @@ public @interface ExtractBy2 {
|
|||
|
||||
String value();
|
||||
|
||||
public enum Type {XPath2, XPath, Regex, Css}
|
||||
public enum Type {XPath, Regex, Css}
|
||||
|
||||
Type type() default Type.XPath2;
|
||||
Type type() default Type.XPath;
|
||||
|
||||
}
|
||||
|
|
|
@ -16,8 +16,8 @@ public @interface ExtractBy3 {
|
|||
|
||||
String value();
|
||||
|
||||
public enum Type {XPath2, XPath, Regex, Css}
|
||||
public enum Type { XPath, Regex, Css}
|
||||
|
||||
Type type() default Type.XPath2;
|
||||
Type type() default Type.XPath;
|
||||
|
||||
}
|
||||
|
|
|
@ -16,9 +16,9 @@ public @interface ExtractByRaw {
|
|||
|
||||
String value();
|
||||
|
||||
public enum Type {XPath2, XPath, Regex, Css}
|
||||
public enum Type {XPath, Regex, Css}
|
||||
|
||||
Type type() default Type.XPath2;
|
||||
Type type() default Type.XPath;
|
||||
|
||||
boolean notNull() default true;
|
||||
|
||||
|
|
|
@ -0,0 +1,3 @@
|
|||
webmagic-saxon
|
||||
-------
|
||||
webmagic的扩展模块,依赖Saxon进行xpath2.0解析支持。Saxon依赖包太大,不作为默认模块引入。
|
|
@ -0,0 +1,30 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<parent>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>webmagic</artifactId>
|
||||
<version>0.2.0</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
<artifactId>webmagic-saxon</artifactId>
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>webmagic-core</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>net.sf.saxon</groupId>
|
||||
<artifactId>Saxon-HE</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>junit</groupId>
|
||||
<artifactId>junit</artifactId>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
|
||||
</project>
|
|
@ -1,25 +1,8 @@
|
|||
package us.codecraft.webmagic.selector;
|
||||
|
||||
import net.sf.saxon.Configuration;
|
||||
import net.sf.saxon.lib.NamespaceConstant;
|
||||
import net.sf.saxon.om.NamespaceResolver;
|
||||
import net.sf.saxon.pull.NamespaceContextImpl;
|
||||
import net.sf.saxon.xpath.JAXPXPathStaticContext;
|
||||
import net.sf.saxon.xpath.XPathEvaluator;
|
||||
import net.sf.saxon.xpath.XPathFactoryImpl;
|
||||
import org.htmlcleaner.CleanerProperties;
|
||||
import org.htmlcleaner.DomSerializer;
|
||||
import org.htmlcleaner.HtmlCleaner;
|
||||
import org.htmlcleaner.TagNode;
|
||||
import org.junit.Assert;
|
||||
import org.junit.Ignore;
|
||||
import org.junit.Test;
|
||||
import org.w3c.dom.Document;
|
||||
import org.w3c.dom.NodeList;
|
||||
|
||||
import javax.xml.xpath.*;
|
||||
import java.util.Collections;
|
||||
import java.util.Iterator;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com <br> Date: 13-4-21 Time: 上午10:06
|
Loading…
Reference in New Issue