refactor in selectors
parent
85b7cf1563
commit
2c3574537a
|
@ -1,5 +1,8 @@
|
||||||
package us.codecraft.webmagic.selector;
|
package us.codecraft.webmagic.selector;
|
||||||
|
|
||||||
|
import org.jsoup.Jsoup;
|
||||||
|
import org.jsoup.nodes.Document;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
|
@ -11,12 +14,23 @@ import java.util.List;
|
||||||
*/
|
*/
|
||||||
public class Html extends PlainText {
|
public class Html extends PlainText {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Store parsed document for better performance when only one text exist.
|
||||||
|
*/
|
||||||
|
private Document document;
|
||||||
|
|
||||||
public Html(List<String> strings) {
|
public Html(List<String> strings) {
|
||||||
super(strings);
|
super(strings);
|
||||||
}
|
}
|
||||||
|
|
||||||
public Html(String text) {
|
public Html(String text) {
|
||||||
super(text);
|
super(text);
|
||||||
|
this.document = Jsoup.parse(text);
|
||||||
|
}
|
||||||
|
|
||||||
|
public Html(Document document) {
|
||||||
|
super(document.html());
|
||||||
|
this.document = document;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static Html create(String text) {
|
public static Html create(String text) {
|
||||||
|
@ -53,38 +67,34 @@ public class Html extends PlainText {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Selectable links() {
|
public Selectable links() {
|
||||||
XsoupSelector xpathSelector = new XsoupSelector("//a/@href");
|
return xpath("//a/@href");
|
||||||
return selectList(xpathSelector, strings);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Selectable xpath(String xpath) {
|
public Selectable xpath(String xpath) {
|
||||||
XsoupSelector xpathSelector = new XsoupSelector(xpath);
|
XsoupSelector xsoupSelector = new XsoupSelector(xpath);
|
||||||
return selectList(xpathSelector, strings);
|
if (document!=null){
|
||||||
|
return new Html(xsoupSelector.selectList(document));
|
||||||
|
}
|
||||||
|
return selectList(xsoupSelector, strings);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Selectable $(String selector) {
|
public Selectable $(String selector) {
|
||||||
CssSelector cssSelector = Selectors.$(selector);
|
CssSelector cssSelector = Selectors.$(selector);
|
||||||
|
if (document!=null){
|
||||||
|
return new Html(cssSelector.selectList(document));
|
||||||
|
}
|
||||||
return selectList(cssSelector, strings);
|
return selectList(cssSelector, strings);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Selectable $(String selector, String attrName) {
|
public Selectable $(String selector, String attrName) {
|
||||||
CssSelector cssSelector = Selectors.$(selector, attrName);
|
CssSelector cssSelector = Selectors.$(selector, attrName);
|
||||||
|
if (document!=null){
|
||||||
|
return new Html(cssSelector.selectList(document));
|
||||||
|
}
|
||||||
return selectList(cssSelector, strings);
|
return selectList(cssSelector, strings);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
|
||||||
public Selectable text() {
|
|
||||||
TextContentSelector selector = Selectors.text();
|
|
||||||
return select(selector, strings);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public Selectable text(String newlineSeparator) {
|
|
||||||
TextContentSelector selector = Selectors.text(newlineSeparator);
|
|
||||||
return select(selector, strings);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -89,7 +89,7 @@ public class PlainText implements Selectable {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Selectable replace(String regex, String replacement) {
|
public Selectable replace(String regex, String replacement) {
|
||||||
ReplaceSelector replaceSelector = SelectorFactory.getInstatnce().newReplaceSelector(regex, replacement);
|
ReplaceSelector replaceSelector = new ReplaceSelector(regex,replacement);
|
||||||
return select(replaceSelector, strings);
|
return select(replaceSelector, strings);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -107,18 +107,6 @@ public class PlainText implements Selectable {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
|
||||||
public Selectable text() {
|
|
||||||
//do nothing
|
|
||||||
return this;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public Selectable text(String newlineSeparator) {
|
|
||||||
//do nothing
|
|
||||||
return this;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean match() {
|
public boolean match() {
|
||||||
return strings != null && strings.size() > 0;
|
return strings != null && strings.size() > 0;
|
||||||
|
|
|
@ -82,20 +82,6 @@ public interface Selectable {
|
||||||
*/
|
*/
|
||||||
public String toString();
|
public String toString();
|
||||||
|
|
||||||
/**
|
|
||||||
* select text content of html
|
|
||||||
*
|
|
||||||
* @return text
|
|
||||||
*/
|
|
||||||
public Selectable text();
|
|
||||||
|
|
||||||
/**
|
|
||||||
* select text content of html
|
|
||||||
*
|
|
||||||
* @return text
|
|
||||||
*/
|
|
||||||
public Selectable text(String newlineSeparator);
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* if result exist for select
|
* if result exist for select
|
||||||
*
|
*
|
||||||
|
|
|
@ -1,91 +0,0 @@
|
||||||
package us.codecraft.webmagic.selector;
|
|
||||||
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
|
||||||
|
|
||||||
import java.lang.reflect.Constructor;
|
|
||||||
import java.util.Map;
|
|
||||||
import java.util.concurrent.ConcurrentHashMap;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Selector factory with some inner cache.<br>
|
|
||||||
*
|
|
||||||
* @author code4crafter@gmail.com <br>
|
|
||||||
* @since 0.1.0
|
|
||||||
*/
|
|
||||||
public class SelectorFactory {
|
|
||||||
|
|
||||||
private Map<String, Selector> innerCache = new ConcurrentHashMap<String, Selector>();
|
|
||||||
|
|
||||||
private static final SelectorFactory INSTATNCE = new SelectorFactory();
|
|
||||||
|
|
||||||
public static SelectorFactory getInstatnce() {
|
|
||||||
return INSTATNCE;
|
|
||||||
}
|
|
||||||
|
|
||||||
public RegexSelector newRegexSelector(String regex) {
|
|
||||||
return newSelector(RegexSelector.class, regex);
|
|
||||||
}
|
|
||||||
|
|
||||||
public RegexSelector newRegexSelector(String regex, int group) {
|
|
||||||
String cacheKey = getCacheKey(RegexSelector.class, regex, String.valueOf(group));
|
|
||||||
if (innerCache.get(cacheKey) != null) {
|
|
||||||
return (RegexSelector) innerCache.get(cacheKey);
|
|
||||||
}
|
|
||||||
return new RegexSelector(regex, group);
|
|
||||||
}
|
|
||||||
|
|
||||||
public ReplaceSelector newReplaceSelector(String regex, String replacement) {
|
|
||||||
return newSelector(ReplaceSelector.class, regex, replacement);
|
|
||||||
}
|
|
||||||
|
|
||||||
public XpathSelector newXpathSelector(String xpath) {
|
|
||||||
return newSelector(XpathSelector.class, xpath);
|
|
||||||
}
|
|
||||||
|
|
||||||
public SmartContentSelector newSmartContentSelector() {
|
|
||||||
return newSelector(SmartContentSelector.class);
|
|
||||||
}
|
|
||||||
|
|
||||||
public <T extends Selector> T newAndCacheSelector(Class<T> clazz, String... param) {
|
|
||||||
String cacheKey = getCacheKey(RegexSelector.class, param);
|
|
||||||
if (innerCache.get(cacheKey) != null) {
|
|
||||||
return (T) innerCache.get(cacheKey);
|
|
||||||
}
|
|
||||||
T selector = newSelector(clazz, param);
|
|
||||||
if (selector != null) {
|
|
||||||
innerCache.put(cacheKey, selector);
|
|
||||||
}
|
|
||||||
return selector;
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
public <T extends Selector> T newSelector(Class<T> clazz, String... param) {
|
|
||||||
try {
|
|
||||||
if (param.length == 0) {
|
|
||||||
Constructor<T> constructor
|
|
||||||
= clazz.getConstructor();
|
|
||||||
T selector = constructor.newInstance();
|
|
||||||
return selector;
|
|
||||||
} else if (param.length == 1) {
|
|
||||||
Constructor<T> constructor
|
|
||||||
= clazz.getConstructor(String.class);
|
|
||||||
T selector = constructor.newInstance(param[0]);
|
|
||||||
return selector;
|
|
||||||
} else if (param.length == 2) {
|
|
||||||
Constructor<T> constructor
|
|
||||||
= clazz.getConstructor(String.class, String.class);
|
|
||||||
T selector = constructor.newInstance(param[0], param[1]);
|
|
||||||
return selector;
|
|
||||||
} else {
|
|
||||||
throw new UnsupportedOperationException();
|
|
||||||
}
|
|
||||||
} catch (Exception e) {
|
|
||||||
throw new IllegalArgumentException("init object error", e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private String getCacheKey(Class<?> clazz, String... param) {
|
|
||||||
return clazz.toString() + "_" + StringUtils.join(param, "_");
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
|
@ -9,15 +9,15 @@ package us.codecraft.webmagic.selector;
|
||||||
public abstract class Selectors {
|
public abstract class Selectors {
|
||||||
|
|
||||||
public static RegexSelector regex(String expr) {
|
public static RegexSelector regex(String expr) {
|
||||||
return SelectorFactory.getInstatnce().newRegexSelector(expr);
|
return new RegexSelector(expr);
|
||||||
}
|
}
|
||||||
|
|
||||||
public static RegexSelector regex(String expr, int group) {
|
public static RegexSelector regex(String expr, int group) {
|
||||||
return SelectorFactory.getInstatnce().newRegexSelector(expr, group);
|
return new RegexSelector(expr,group);
|
||||||
}
|
}
|
||||||
|
|
||||||
public static SmartContentSelector smartContent() {
|
public static SmartContentSelector smartContent() {
|
||||||
return SelectorFactory.getInstatnce().newSmartContentSelector();
|
return new SmartContentSelector();
|
||||||
}
|
}
|
||||||
|
|
||||||
public static CssSelector $(String expr) {
|
public static CssSelector $(String expr) {
|
||||||
|
@ -29,7 +29,11 @@ public abstract class Selectors {
|
||||||
}
|
}
|
||||||
|
|
||||||
public static XpathSelector xpath(String expr) {
|
public static XpathSelector xpath(String expr) {
|
||||||
return SelectorFactory.getInstatnce().newXpathSelector(expr);
|
return new XpathSelector(expr);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static XsoupSelector xsoup(String expr) {
|
||||||
|
return new XsoupSelector(expr);
|
||||||
}
|
}
|
||||||
|
|
||||||
public static AndSelector and(Selector... selectors) {
|
public static AndSelector and(Selector... selectors) {
|
||||||
|
@ -40,14 +44,6 @@ public abstract class Selectors {
|
||||||
return new OrSelector(selectors);
|
return new OrSelector(selectors);
|
||||||
}
|
}
|
||||||
|
|
||||||
public static TextContentSelector text() {
|
|
||||||
return new TextContentSelector();
|
|
||||||
}
|
|
||||||
|
|
||||||
public static TextContentSelector text(String newlineSeperator) {
|
|
||||||
return new TextContentSelector(newlineSeperator);
|
|
||||||
}
|
|
||||||
|
|
||||||
public static void main(String[] args) {
|
public static void main(String[] args) {
|
||||||
String s = "a";
|
String s = "a";
|
||||||
or(regex("<title>(.*)</title>"), xpath("//title"), $("title")).select(s);
|
or(regex("<title>(.*)</title>"), xpath("//title"), $("title")).select(s);
|
||||||
|
|
|
@ -1,68 +0,0 @@
|
||||||
package us.codecraft.webmagic.selector;
|
|
||||||
|
|
||||||
import org.jsoup.Jsoup;
|
|
||||||
import org.jsoup.nodes.Document;
|
|
||||||
import org.jsoup.nodes.Element;
|
|
||||||
|
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.HashSet;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Extract text content in html.<br>
|
|
||||||
* Algorithm from <a href="http://www.elias.cn/En/ExtMainText">http://www.elias.cn/En/ExtMainText</a>. <br>
|
|
||||||
*
|
|
||||||
* @author code4crafter@gmail.com <br>
|
|
||||||
* @since 0.2.2
|
|
||||||
*/
|
|
||||||
public class TextContentSelector implements Selector {
|
|
||||||
|
|
||||||
private String newLineSeperator = "\n";
|
|
||||||
|
|
||||||
public TextContentSelector() {
|
|
||||||
}
|
|
||||||
|
|
||||||
public TextContentSelector(String newLineSeperator) {
|
|
||||||
this.newLineSeperator = newLineSeperator;
|
|
||||||
}
|
|
||||||
|
|
||||||
private final static Set<String> TAGS_IN_NEWLINE = new HashSet<String>();
|
|
||||||
|
|
||||||
private final static Set<String> TAGS_TO_IGNORE = new HashSet<String>();
|
|
||||||
|
|
||||||
static {
|
|
||||||
TAGS_IN_NEWLINE.addAll(Arrays.asList(new String[]{"p", "div", "h1", "h2", "h3", "h4", "h5", "h6", "br", "li"}));
|
|
||||||
TAGS_TO_IGNORE.addAll(Arrays.asList(new String[]{"head", "style", "script", "noscript", "option"}));
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public String select(String text) {
|
|
||||||
Document doc = Jsoup.parse(text);
|
|
||||||
return select0(doc);
|
|
||||||
}
|
|
||||||
|
|
||||||
protected String select0(Element element) {
|
|
||||||
String tagName = element.tagName().toLowerCase();
|
|
||||||
if (TAGS_TO_IGNORE.contains(tagName)) {
|
|
||||||
return "";
|
|
||||||
}
|
|
||||||
StringBuilder textBuilder = new StringBuilder();
|
|
||||||
textBuilder.append(element.text());
|
|
||||||
if (element.children() != null) {
|
|
||||||
for (Element child : element.children()) {
|
|
||||||
textBuilder.append(select0(child));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (TAGS_IN_NEWLINE.contains(tagName)) {
|
|
||||||
textBuilder.append(newLineSeperator);
|
|
||||||
}
|
|
||||||
return textBuilder.toString();
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public List<String> selectList(String text) {
|
|
||||||
throw new UnsupportedOperationException();
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
|
@ -1,34 +0,0 @@
|
||||||
package us.codecraft.webmagic.selector;
|
|
||||||
|
|
||||||
import junit.framework.Assert;
|
|
||||||
import org.junit.Ignore;
|
|
||||||
import org.junit.Test;
|
|
||||||
import us.codecraft.webmagic.downloader.HttpClientDownloader;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @author code4crafter@gmail.com <br>
|
|
||||||
* @since 0.2.2
|
|
||||||
*/
|
|
||||||
public class TextContentSelectorTest {
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void test() {
|
|
||||||
String html = "<div class=\"edit-comment-hide\">\n" +
|
|
||||||
" <div class=\"js-comment-body comment-body markdown-body markdown-format\">\n" +
|
|
||||||
" <p>Add more powerful selector for content text extract refered to <a href=\"http://www.elias.cn/En/ExtMainText\">http://www.elias.cn/En/ExtMainText</a></p>\n" +
|
|
||||||
" </div>\n" +
|
|
||||||
" </div>";
|
|
||||||
TextContentSelector textContentSelector = new TextContentSelector("<br>");
|
|
||||||
String text = textContentSelector.select(html);
|
|
||||||
Assert.assertNotNull(text);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Ignore("takes long time")
|
|
||||||
@Test
|
|
||||||
public void testDownload() {
|
|
||||||
String s = new HttpClientDownloader().download("http://blog.codecraft.us/blog/2013/08/18/ti-yan-dao-liao-open-sourcede-mei-li/", "utf-8")
|
|
||||||
.smartContent().text().toString();
|
|
||||||
Assert.assertNotNull(s);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
Loading…
Reference in New Issue