move saxon to extension
parent
c6132e0746
commit
268bd8d0c4
|
@ -27,11 +27,6 @@
|
|||
<artifactId>commons-lang3</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>net.sf.saxon</groupId>
|
||||
<artifactId>Saxon-HE</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>log4j</groupId>
|
||||
<artifactId>log4j</artifactId>
|
||||
|
|
|
@ -63,12 +63,6 @@ public class Html extends PlainText {
|
|||
return selectList(xpathSelector, strings);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Selectable xpath2(String xpath) {
|
||||
Xpath2Selector xpathSelector = SelectorFactory.getInstatnce().newXpath2Selector(xpath);
|
||||
return selectList(xpathSelector, strings);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Selectable $(String selector) {
|
||||
CssSelector cssSelector = new CssSelector(selector);
|
||||
|
|
|
@ -34,11 +34,6 @@ public class PlainText implements Selectable {
|
|||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Selectable xpath2(String xpath) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Selectable $(String selector) {
|
||||
throw new UnsupportedOperationException();
|
||||
|
|
|
@ -18,14 +18,6 @@ public interface Selectable {
|
|||
*/
|
||||
public Selectable xpath(String xpath);
|
||||
|
||||
/**
|
||||
* select list with xpath 2.0 syntax
|
||||
*
|
||||
* @param xpath
|
||||
* @return new Selectable after extract
|
||||
*/
|
||||
public Selectable xpath2(String xpath);
|
||||
|
||||
/**
|
||||
* select list with css selector
|
||||
*
|
||||
|
|
|
@ -34,10 +34,6 @@ public class SelectorFactory {
|
|||
return newSelector(XpathSelector.class, xpath);
|
||||
}
|
||||
|
||||
public Xpath2Selector newXpath2Selector(String xpath) {
|
||||
return newSelector(Xpath2Selector.class, xpath);
|
||||
}
|
||||
|
||||
public SmartContentSelector newSmartContentSelector(){
|
||||
return newSelector(SmartContentSelector.class);
|
||||
}
|
||||
|
|
|
@ -1,45 +0,0 @@
|
|||
package us.codecraft.webmagic.selector;
|
||||
|
||||
import net.sf.saxon.xpath.XPathFactoryImpl;
|
||||
import org.htmlcleaner.CleanerProperties;
|
||||
import org.htmlcleaner.DomSerializer;
|
||||
import org.htmlcleaner.HtmlCleaner;
|
||||
import org.htmlcleaner.TagNode;
|
||||
import org.junit.Test;
|
||||
import org.w3c.dom.Document;
|
||||
import org.w3c.dom.NodeList;
|
||||
|
||||
import javax.xml.xpath.XPath;
|
||||
import javax.xml.xpath.XPathConstants;
|
||||
import javax.xml.xpath.XPathExpression;
|
||||
import javax.xml.xpath.XPathFactoryConfigurationException;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* @date: 13-8-2 <br>
|
||||
* Time: 下午5:48 <br>
|
||||
*/
|
||||
public class SaxonTest {
|
||||
|
||||
@Test
|
||||
public void test() throws XPathFactoryConfigurationException {
|
||||
// System.setProperty("javax.xml.xpath.XPathFactory:" + NamespaceConstant.OBJECT_MODEL_SAXON, "net.sf.saxon.xpath.XPathFactoryImpl");
|
||||
// XPathFactory xpf = XPathFactory.newInstance(NamespaceConstant.OBJECT_MODEL_SAXON);
|
||||
String xml = "<root><a>#BBB#</a><a>#CCC#</a><b><a>#DDD#</a></b></root>";
|
||||
try {
|
||||
HtmlCleaner htmlCleaner = new HtmlCleaner();
|
||||
TagNode tagNode = htmlCleaner.clean("");
|
||||
Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode);
|
||||
|
||||
javax.xml.xpath.XPathFactory factory = XPathFactoryImpl.newInstance();
|
||||
XPath xpath = factory.newXPath();
|
||||
XPathExpression expr = xpath.compile("//a[matches(.,'#...#')]");
|
||||
|
||||
Object result = expr.evaluate(document, XPathConstants.NODESET);
|
||||
NodeList nodes = (NodeList) result;
|
||||
System.out.println(nodes);
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
}
|
|
@ -27,6 +27,10 @@
|
|||
<artifactId>webmagic-core</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>net.sf.saxon</groupId>
|
||||
<artifactId>Saxon-HE</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>junit</groupId>
|
||||
<artifactId>junit</artifactId>
|
||||
|
|
|
@ -1380,42 +1380,6 @@ public class XpathSelectorTest {
|
|||
System.out.println(xpathSelector.select(text));
|
||||
}
|
||||
|
||||
//http://sourceforge.net/mailarchive/forum.php?thread_name=4EA92A8A.6080202%40saxonica.com&forum_name=saxon-help
|
||||
@Test
|
||||
public void testSaxon() {
|
||||
String text = "<h1>眉山:扎实推进农业农村工作 促农持续增收<br>\n" +
|
||||
"<span>2013-07-31 23:29:45 来源:<a href=\"http://www.mshw.net\" target=\"_blank\" style=\"color:#AAA\">眉山网</a> 责任编辑:张斯炜</span></h1>";
|
||||
try {
|
||||
HtmlCleaner htmlCleaner = new HtmlCleaner();
|
||||
TagNode tagNode = htmlCleaner.clean(text);
|
||||
Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode);
|
||||
XPathEvaluator xPathEvaluator = new XPathEvaluator();
|
||||
xPathEvaluator.setNamespaceContext(new NamespaceContextImpl(new NamespaceResolver() {
|
||||
|
||||
|
||||
@Override
|
||||
public String getURIForPrefix(String s, boolean b) {
|
||||
return NamespaceConstant.FN;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Iterator<String> iteratePrefixes() {
|
||||
return Collections.singletonList("fn").iterator();
|
||||
}
|
||||
}));
|
||||
XPathExpression expr = xPathEvaluator.compile("fn:substring-before(//h1,'\n')");
|
||||
Object result = expr.evaluate(document, XPathConstants.STRING);
|
||||
Assert.assertNotNull(result);
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
Xpath2Selector xpath2Selector = new Xpath2Selector("fn:substring-before(//h1,'\n')");
|
||||
String select = xpath2Selector.select(text);
|
||||
Assert.assertNotNull(select);
|
||||
Assert.assertNotNull(xpath2Selector.selectList(text));
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testXpath2Selector() {
|
||||
Xpath2Selector xpath2Selector = new Xpath2Selector("//a/@href");
|
Loading…
Reference in New Issue