newselectors
parent
b1cba78bd6
commit
55d4a76ab7
|
@ -25,6 +25,12 @@
|
||||||
<artifactId>commons-lang3</artifactId>
|
<artifactId>commons-lang3</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>us.codecraft</groupId>
|
||||||
|
<artifactId>xsoup</artifactId>
|
||||||
|
<version>0.0.1-SNAPSHOT</version>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>log4j</groupId>
|
<groupId>log4j</groupId>
|
||||||
<artifactId>log4j</artifactId>
|
<artifactId>log4j</artifactId>
|
||||||
|
|
|
@ -0,0 +1,23 @@
|
||||||
|
package us.codecraft.webmagic.selector;
|
||||||
|
|
||||||
|
import org.jsoup.Jsoup;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author code4crafter@gmail.com
|
||||||
|
* @since 0.2.2
|
||||||
|
*/
|
||||||
|
public abstract class BaseElementSelector implements Selector,ElementSelector {
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String select(String text) {
|
||||||
|
return select(Jsoup.parse(text));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public List<String> selectList(String text) {
|
||||||
|
return selectList(Jsoup.parse(text));
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -1,8 +1,6 @@
|
||||||
package us.codecraft.webmagic.selector;
|
package us.codecraft.webmagic.selector;
|
||||||
|
|
||||||
import org.apache.commons.collections.CollectionUtils;
|
import org.apache.commons.collections.CollectionUtils;
|
||||||
import org.jsoup.Jsoup;
|
|
||||||
import org.jsoup.nodes.Document;
|
|
||||||
import org.jsoup.nodes.Element;
|
import org.jsoup.nodes.Element;
|
||||||
import org.jsoup.select.Elements;
|
import org.jsoup.select.Elements;
|
||||||
|
|
||||||
|
@ -15,7 +13,7 @@ import java.util.List;
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
* @since 0.1.0
|
* @since 0.1.0
|
||||||
*/
|
*/
|
||||||
public class CssSelector implements Selector {
|
public class CssSelector extends BaseElementSelector {
|
||||||
|
|
||||||
private String selectorText;
|
private String selectorText;
|
||||||
|
|
||||||
|
@ -30,16 +28,6 @@ public class CssSelector implements Selector {
|
||||||
this.attrName = attrName;
|
this.attrName = attrName;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
|
||||||
public String select(String text) {
|
|
||||||
Document doc = Jsoup.parse(text);
|
|
||||||
Elements elements = doc.select(selectorText);
|
|
||||||
if (CollectionUtils.isEmpty(elements)) {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
return getValue(elements.get(0));
|
|
||||||
}
|
|
||||||
|
|
||||||
private String getValue(Element element) {
|
private String getValue(Element element) {
|
||||||
if (attrName == null) {
|
if (attrName == null) {
|
||||||
return element.outerHtml();
|
return element.outerHtml();
|
||||||
|
@ -51,9 +39,17 @@ public class CssSelector implements Selector {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public List<String> selectList(String text) {
|
public String select(Element element) {
|
||||||
|
Elements elements = element.select(selectorText);
|
||||||
|
if (CollectionUtils.isEmpty(elements)) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
return getValue(elements.get(0));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public List<String> selectList(Element doc) {
|
||||||
List<String> strings = new ArrayList<String>();
|
List<String> strings = new ArrayList<String>();
|
||||||
Document doc = Jsoup.parse(text);
|
|
||||||
Elements elements = doc.select(selectorText);
|
Elements elements = doc.select(selectorText);
|
||||||
if (CollectionUtils.isNotEmpty(elements)) {
|
if (CollectionUtils.isNotEmpty(elements)) {
|
||||||
for (Element element : elements) {
|
for (Element element : elements) {
|
||||||
|
|
|
@ -0,0 +1,32 @@
|
||||||
|
package us.codecraft.webmagic.selector;
|
||||||
|
|
||||||
|
import org.jsoup.nodes.Element;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Selector(extractor) for html elements.<br>
|
||||||
|
*
|
||||||
|
* @author code4crafter@gmail.com <br>
|
||||||
|
* @since 0.2.2
|
||||||
|
*/
|
||||||
|
public interface ElementSelector {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Extract single result in text.<br>
|
||||||
|
* If there are more than one result, only the first will be chosen.
|
||||||
|
*
|
||||||
|
* @param element
|
||||||
|
* @return result
|
||||||
|
*/
|
||||||
|
public String select(Element element);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Extract all results in text.<br>
|
||||||
|
*
|
||||||
|
* @param element
|
||||||
|
* @return results
|
||||||
|
*/
|
||||||
|
public List<String> selectList(Element element);
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,32 @@
|
||||||
|
package us.codecraft.webmagic.selector;
|
||||||
|
|
||||||
|
import org.jsoup.nodes.Element;
|
||||||
|
import us.codecraft.xsoup.XPathEvaluator;
|
||||||
|
import us.codecraft.xsoup.Xsoup;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* XPath selector based on Xsoup.<br>
|
||||||
|
*
|
||||||
|
* @author code4crafter@gmail.com <br>
|
||||||
|
* @since 0.2.2
|
||||||
|
*/
|
||||||
|
public class XsoupSelector extends BaseElementSelector {
|
||||||
|
|
||||||
|
private XPathEvaluator xPathEvaluator;
|
||||||
|
|
||||||
|
public XsoupSelector(String xpathStr) {
|
||||||
|
this.xPathEvaluator = Xsoup.compile(xpathStr);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String select(Element element) {
|
||||||
|
return xPathEvaluator.evaluate(element).get();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public List<String> selectList(Element element) {
|
||||||
|
return xPathEvaluator.evaluate(element).list();
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue