Add more options to extractors
parent
c70ed57025
commit
20705b34ac
|
@ -19,10 +19,17 @@ public class CssSelector implements Selector {
|
|||
|
||||
private String selectorText;
|
||||
|
||||
private String attrName;
|
||||
|
||||
/**
 * Creates a selector for the given CSS expression with no attribute name set.
 *
 * @param selectorText CSS selector expression
 */
public CssSelector(String selectorText) {
    this.selectorText = selectorText;
}
|
||||
|
||||
/**
 * Creates a selector for the given CSS expression that reads a named attribute.
 *
 * @param selectorText CSS selector expression
 * @param attrName     name of the attribute to read from matched elements
 */
public CssSelector(String selectorText, String attrName) {
    this.attrName = attrName;
    this.selectorText = selectorText;
}
|
||||
|
||||
@Override
|
||||
public String select(String text) {
|
||||
Document doc = Jsoup.parse(text);
|
||||
|
@ -30,7 +37,15 @@ public class CssSelector implements Selector {
|
|||
if (CollectionUtils.isEmpty(elements)) {
|
||||
return null;
|
||||
}
|
||||
return elements.get(0).outerHtml();
|
||||
return getValue(elements.get(0));
|
||||
}
|
||||
|
||||
private String getValue(Element element) {
|
||||
if (attrName == null) {
|
||||
return element.outerHtml();
|
||||
} else {
|
||||
return element.attr(attrName);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -40,7 +55,10 @@ public class CssSelector implements Selector {
|
|||
Elements elements = doc.select(selectorText);
|
||||
if (CollectionUtils.isNotEmpty(elements)) {
|
||||
for (Element element : elements) {
|
||||
strings.add(element.outerHtml());
|
||||
String value = getValue(element);
|
||||
if (value != null) {
|
||||
strings.add(value);
|
||||
}
|
||||
}
|
||||
}
|
||||
return strings;
|
||||
|
|
|
@ -69,4 +69,10 @@ public class Html extends PlainText {
|
|||
return selectList(cssSelector, strings);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Selectable $(String selector, String attrName) {
|
||||
CssSelector cssSelector = new CssSelector(selector, attrName);
|
||||
return selectList(cssSelector, strings);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -40,6 +40,11 @@ public class PlainText implements Selectable {
|
|||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Selectable $(String selector, String attrName) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Selectable smartContent() {
|
||||
throw new UnsupportedOperationException();
|
||||
|
@ -56,6 +61,12 @@ public class PlainText implements Selectable {
|
|||
return selectList(regexSelector, strings);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Selectable regex(String regex, int group) {
|
||||
RegexSelector regexSelector = SelectorFactory.getInstatnce().newRegexSelector(regex, group);
|
||||
return selectList(regexSelector, strings);
|
||||
}
|
||||
|
||||
protected Selectable select(Selector selector, List<String> strings) {
|
||||
List<String> results = new ArrayList<String>();
|
||||
for (String string : strings) {
|
||||
|
|
|
@ -26,6 +26,15 @@ public interface Selectable {
|
|||
*/
|
||||
public Selectable $(String selector);
|
||||
|
||||
/**
|
||||
* select list with css selector
|
||||
*
|
||||
* @param selector css selector expression
|
||||
* @param attrName attribute name of css selector
|
||||
* @return new Selectable after extract
|
||||
*/
|
||||
public Selectable $(String selector, String attrName);
|
||||
|
||||
/**
|
||||
* select smart content with ReadAbility algorithm
|
||||
*
|
||||
|
@ -41,13 +50,22 @@ public interface Selectable {
|
|||
public Selectable links();
|
||||
|
||||
/**
|
||||
* select list with regex
|
||||
* select list with regex, default group is group 1
|
||||
*
|
||||
* @param regex regular expression to select with
|
||||
* @return new Selectable after extract
|
||||
*/
|
||||
public Selectable regex(String regex);
|
||||
|
||||
/**
|
||||
* select list with regex
|
||||
*
|
||||
* @param regex regular expression to select with
|
||||
* @param group number of the regex group to select
|
||||
* @return new Selectable after extract
|
||||
*/
|
||||
public Selectable regex(String regex, int group);
|
||||
|
||||
/**
|
||||
* replace with regex
|
||||
*
|
||||
|
|
|
@ -26,6 +26,10 @@ public class SelectorFactory {
|
|||
return newSelector(RegexSelector.class, regex);
|
||||
}
|
||||
|
||||
public RegexSelector newRegexSelector(String regex, int group) {
|
||||
return newSelector(RegexSelector.class, regex, String.valueOf(group));
|
||||
}
|
||||
|
||||
public ReplaceSelector newReplaceSelector(String regex, String replacement) {
|
||||
return newSelector(ReplaceSelector.class, regex, replacement);
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue