Add more options to extractors
parent
c70ed57025
commit
20705b34ac
|
@ -19,10 +19,17 @@ public class CssSelector implements Selector {
|
||||||
|
|
||||||
private String selectorText;
|
private String selectorText;
|
||||||
|
|
||||||
|
private String attrName;
|
||||||
|
|
||||||
public CssSelector(String selectorText) {
|
public CssSelector(String selectorText) {
|
||||||
this.selectorText = selectorText;
|
this.selectorText = selectorText;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public CssSelector(String selectorText, String attrName) {
|
||||||
|
this.selectorText = selectorText;
|
||||||
|
this.attrName = attrName;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String select(String text) {
|
public String select(String text) {
|
||||||
Document doc = Jsoup.parse(text);
|
Document doc = Jsoup.parse(text);
|
||||||
|
@ -30,7 +37,15 @@ public class CssSelector implements Selector {
|
||||||
if (CollectionUtils.isEmpty(elements)) {
|
if (CollectionUtils.isEmpty(elements)) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
return elements.get(0).outerHtml();
|
return getValue(elements.get(0));
|
||||||
|
}
|
||||||
|
|
||||||
|
private String getValue(Element element) {
|
||||||
|
if (attrName == null) {
|
||||||
|
return element.outerHtml();
|
||||||
|
} else {
|
||||||
|
return element.attr(attrName);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -40,7 +55,10 @@ public class CssSelector implements Selector {
|
||||||
Elements elements = doc.select(selectorText);
|
Elements elements = doc.select(selectorText);
|
||||||
if (CollectionUtils.isNotEmpty(elements)) {
|
if (CollectionUtils.isNotEmpty(elements)) {
|
||||||
for (Element element : elements) {
|
for (Element element : elements) {
|
||||||
strings.add(element.outerHtml());
|
String value = getValue(element);
|
||||||
|
if (value != null) {
|
||||||
|
strings.add(value);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return strings;
|
return strings;
|
||||||
|
|
|
@ -69,4 +69,10 @@ public class Html extends PlainText {
|
||||||
return selectList(cssSelector, strings);
|
return selectList(cssSelector, strings);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Selectable $(String selector, String attrName) {
|
||||||
|
CssSelector cssSelector = new CssSelector(selector, attrName);
|
||||||
|
return selectList(cssSelector, strings);
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -40,6 +40,11 @@ public class PlainText implements Selectable {
|
||||||
throw new UnsupportedOperationException();
|
throw new UnsupportedOperationException();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Selectable $(String selector, String attrName) {
|
||||||
|
throw new UnsupportedOperationException();
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Selectable smartContent() {
|
public Selectable smartContent() {
|
||||||
throw new UnsupportedOperationException();
|
throw new UnsupportedOperationException();
|
||||||
|
@ -56,6 +61,12 @@ public class PlainText implements Selectable {
|
||||||
return selectList(regexSelector, strings);
|
return selectList(regexSelector, strings);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Selectable regex(String regex, int group) {
|
||||||
|
RegexSelector regexSelector = SelectorFactory.getInstatnce().newRegexSelector(regex, group);
|
||||||
|
return selectList(regexSelector, strings);
|
||||||
|
}
|
||||||
|
|
||||||
protected Selectable select(Selector selector, List<String> strings) {
|
protected Selectable select(Selector selector, List<String> strings) {
|
||||||
List<String> results = new ArrayList<String>();
|
List<String> results = new ArrayList<String>();
|
||||||
for (String string : strings) {
|
for (String string : strings) {
|
||||||
|
|
|
@ -26,6 +26,15 @@ public interface Selectable {
|
||||||
*/
|
*/
|
||||||
public Selectable $(String selector);
|
public Selectable $(String selector);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* select list with css selector
|
||||||
|
*
|
||||||
|
* @param selector css selector expression
|
||||||
|
* @param attrName name of the attribute whose value is extracted from each matched element
|
||||||
|
* @return new Selectable after extract
|
||||||
|
*/
|
||||||
|
public Selectable $(String selector, String attrName);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* select smart content with ReadAbility algorithm
|
* select smart content with ReadAbility algorithm
|
||||||
*
|
*
|
||||||
|
@ -41,13 +50,22 @@ public interface Selectable {
|
||||||
public Selectable links();
|
public Selectable links();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* select list with regex
|
* select list with regex, default group is group 1
|
||||||
*
|
*
|
||||||
* @param regex
|
* @param regex
|
||||||
* @return new Selectable after extract
|
* @return new Selectable after extract
|
||||||
*/
|
*/
|
||||||
public Selectable regex(String regex);
|
public Selectable regex(String regex);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* select list with regex, extracting the specified group
|
||||||
|
*
|
||||||
|
* @param regex regular expression used for extraction
|
||||||
|
* @param group index of the regex group to extract
|
||||||
|
* @return new Selectable after extract
|
||||||
|
*/
|
||||||
|
public Selectable regex(String regex, int group);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* replace with regex
|
* replace with regex
|
||||||
*
|
*
|
||||||
|
|
|
@ -26,6 +26,10 @@ public class SelectorFactory {
|
||||||
return newSelector(RegexSelector.class, regex);
|
return newSelector(RegexSelector.class, regex);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public RegexSelector newRegexSelector(String regex, int group) {
|
||||||
|
return newSelector(RegexSelector.class, regex, String.valueOf(group));
|
||||||
|
}
|
||||||
|
|
||||||
public ReplaceSelector newReplaceSelector(String regex, String replacement) {
|
public ReplaceSelector newReplaceSelector(String regex, String replacement) {
|
||||||
return newSelector(ReplaceSelector.class, regex, replacement);
|
return newSelector(ReplaceSelector.class, regex, replacement);
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue