refactor of selectable (cont'd) #113
1. remove lazy init of Html
2. rename strings to sourceTexts for better meaning
3. make getSourceTexts abstract and DO NOT always store strings
4. instead store parsed elements of document in HtmlNode
master
parent
f9825c214a
commit
41c2ea9498
2
pom.xml
2
pom.xml
|
@ -88,7 +88,7 @@
|
|||
<dependency>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>xsoup</artifactId>
|
||||
<version>0.2.3</version>
|
||||
<version>0.2.4-SNAPSHOT</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.alibaba</groupId>
|
||||
|
|
|
@ -11,17 +11,7 @@ import java.util.List;
|
|||
*/
|
||||
public abstract class AbstractSelectable implements Selectable {
|
||||
|
||||
protected List<String> strings;
|
||||
|
||||
public AbstractSelectable(String text) {
|
||||
List<String> results = new ArrayList<String>();
|
||||
results.add(text);
|
||||
this.strings = results;
|
||||
}
|
||||
|
||||
public AbstractSelectable(List<String> strings) {
|
||||
this.strings = strings;
|
||||
}
|
||||
protected abstract List<String> getSourceTexts();
|
||||
|
||||
@Override
|
||||
public Selectable css(String selector) {
|
||||
|
@ -55,7 +45,7 @@ public abstract class AbstractSelectable implements Selectable {
|
|||
|
||||
@Override
|
||||
public List<String> all() {
|
||||
return strings;
|
||||
return getSourceTexts();
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -74,30 +64,37 @@ public abstract class AbstractSelectable implements Selectable {
|
|||
|
||||
@Override
|
||||
public Selectable select(Selector selector) {
|
||||
return select(selector, strings);
|
||||
return select(selector, getSourceTexts());
|
||||
}
|
||||
|
||||
@Override
|
||||
public Selectable selectList(Selector selector) {
|
||||
return selectList(selector, strings);
|
||||
return selectList(selector, getSourceTexts());
|
||||
}
|
||||
|
||||
@Override
|
||||
public Selectable regex(String regex) {
|
||||
RegexSelector regexSelector = Selectors.regex(regex);
|
||||
return selectList(regexSelector, strings);
|
||||
return selectList(regexSelector, getSourceTexts());
|
||||
}
|
||||
|
||||
@Override
|
||||
public Selectable regex(String regex, int group) {
|
||||
RegexSelector regexSelector = Selectors.regex(regex, group);
|
||||
return selectList(regexSelector, strings);
|
||||
return selectList(regexSelector, getSourceTexts());
|
||||
}
|
||||
|
||||
@Override
|
||||
public Selectable replace(String regex, String replacement) {
|
||||
ReplaceSelector replaceSelector = new ReplaceSelector(regex,replacement);
|
||||
return select(replaceSelector, strings);
|
||||
return select(replaceSelector, getSourceTexts());
|
||||
}
|
||||
|
||||
public String getFirstSourceText() {
|
||||
if (getSourceTexts() != null && getSourceTexts().size() > 0) {
|
||||
return getSourceTexts().get(0);
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -107,6 +104,6 @@ public abstract class AbstractSelectable implements Selectable {
|
|||
|
||||
@Override
|
||||
public boolean match() {
|
||||
return strings != null && strings.size() > 0;
|
||||
return getSourceTexts() != null && getSourceTexts().size() > 0;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -2,7 +2,6 @@ package us.codecraft.webmagic.selector;
|
|||
|
||||
import org.jsoup.Jsoup;
|
||||
import org.jsoup.nodes.Element;
|
||||
import org.jsoup.select.Elements;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
@ -37,16 +36,18 @@ public abstract class BaseElementSelector implements Selector, ElementSelector {
|
|||
return null;
|
||||
}
|
||||
|
||||
public Elements selectElements(String text) {
|
||||
public List<Element> selectElements(String text) {
|
||||
if (text != null) {
|
||||
return selectElements(Jsoup.parse(text));
|
||||
} else {
|
||||
return new Elements();
|
||||
return new ArrayList<Element>();
|
||||
}
|
||||
}
|
||||
|
||||
public abstract Element selectElement(Element element);
|
||||
|
||||
public abstract Elements selectElements(Element element);
|
||||
public abstract List<Element> selectElements(Element element);
|
||||
|
||||
public abstract boolean hasAttribute();
|
||||
|
||||
}
|
||||
|
|
|
@ -57,7 +57,7 @@ public class CssSelector extends BaseElementSelector {
|
|||
|
||||
@Override
|
||||
public String select(Element element) {
|
||||
Elements elements = selectElements(element);
|
||||
List<Element> elements = selectElements(element);
|
||||
if (CollectionUtils.isEmpty(elements)) {
|
||||
return null;
|
||||
}
|
||||
|
@ -67,7 +67,7 @@ public class CssSelector extends BaseElementSelector {
|
|||
@Override
|
||||
public List<String> selectList(Element doc) {
|
||||
List<String> strings = new ArrayList<String>();
|
||||
Elements elements = selectElements(doc);
|
||||
List<Element> elements = selectElements(doc);
|
||||
if (CollectionUtils.isNotEmpty(elements)) {
|
||||
for (Element element : elements) {
|
||||
String value = getValue(element);
|
||||
|
@ -89,7 +89,12 @@ public class CssSelector extends BaseElementSelector {
|
|||
}
|
||||
|
||||
@Override
|
||||
public Elements selectElements(Element element) {
|
||||
public List<Element> selectElements(Element element) {
|
||||
return element.select(selectorText);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasAttribute() {
|
||||
return attrName != null;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -2,10 +2,11 @@ package us.codecraft.webmagic.selector;
|
|||
|
||||
import org.jsoup.Jsoup;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.jsoup.nodes.Element;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
|
@ -14,7 +15,7 @@ import java.util.List;
|
|||
* @author code4crafter@gmail.com <br>
|
||||
* @since 0.1.0
|
||||
*/
|
||||
public class Html extends PlainText {
|
||||
public class Html extends HtmlNode {
|
||||
|
||||
private Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
|
@ -23,130 +24,26 @@ public class Html extends PlainText {
|
|||
*/
|
||||
private Document document;
|
||||
|
||||
private boolean needInitCache = true;
|
||||
|
||||
public Html(List<String> strings) {
|
||||
super(strings);
|
||||
}
|
||||
|
||||
public Html(String text) {
|
||||
super(text);
|
||||
}
|
||||
|
||||
public Html(List<String> strings, boolean needInitCache) {
|
||||
super(strings);
|
||||
this.needInitCache = needInitCache;
|
||||
}
|
||||
|
||||
public Html(String text, boolean needInitCache) {
|
||||
super(text);
|
||||
this.needInitCache = needInitCache;
|
||||
}
|
||||
|
||||
/**
|
||||
* lazy init
|
||||
*/
|
||||
private void initDocument() {
|
||||
if (this.document == null && needInitCache) {
|
||||
needInitCache = false;
|
||||
//just init once whether the parsing succeeds or not
|
||||
try {
|
||||
this.document = Jsoup.parse(getText());
|
||||
this.document = Jsoup.parse(text);
|
||||
} catch (Exception e) {
|
||||
this.document = null;
|
||||
logger.warn("parse document error ", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public Html(Document document) {
|
||||
super(document.html());
|
||||
this.document = document;
|
||||
}
|
||||
|
||||
public static Html create(String text) {
|
||||
return new Html(text);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Selectable select(Selector selector, List<String> strings) {
|
||||
initDocument();
|
||||
List<String> results = new ArrayList<String>();
|
||||
for (String string : strings) {
|
||||
String result = selector.select(string);
|
||||
if (result != null) {
|
||||
results.add(result);
|
||||
}
|
||||
}
|
||||
return new Html(results, false);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Selectable selectList(Selector selector, List<String> strings) {
|
||||
initDocument();
|
||||
List<String> results = new ArrayList<String>();
|
||||
for (String string : strings) {
|
||||
List<String> result = selector.selectList(string);
|
||||
results.addAll(result);
|
||||
}
|
||||
return new Html(results, false);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Selectable smartContent() {
|
||||
initDocument();
|
||||
SmartContentSelector smartContentSelector = Selectors.smartContent();
|
||||
return select(smartContentSelector, strings);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Selectable links() {
|
||||
return xpath("//a/@href");
|
||||
}
|
||||
|
||||
@Override
|
||||
public Selectable xpath(String xpath) {
|
||||
XpathSelector xpathSelector = Selectors.xpath(xpath);
|
||||
if (document != null) {
|
||||
return new Html(xpathSelector.selectList(document), false);
|
||||
}
|
||||
return selectList(xpathSelector, strings);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Selectable $(String selector) {
|
||||
CssSelector cssSelector = Selectors.$(selector);
|
||||
if (document != null) {
|
||||
return new Html(cssSelector.selectList(document), false);
|
||||
}
|
||||
return selectList(cssSelector, strings);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Selectable $(String selector, String attrName) {
|
||||
CssSelector cssSelector = Selectors.$(selector, attrName);
|
||||
if (document != null) {
|
||||
return new Html(cssSelector.selectList(document), false);
|
||||
}
|
||||
return selectList(cssSelector, strings);
|
||||
}
|
||||
|
||||
public Document getDocument() {
|
||||
initDocument();
|
||||
return document;
|
||||
}
|
||||
|
||||
public String getText() {
|
||||
if (strings != null && strings.size() > 0) {
|
||||
return strings.get(0);
|
||||
}
|
||||
return document.html();
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<Selectable> nodes() {
|
||||
ArrayList<Selectable> selectables = new ArrayList<Selectable>();
|
||||
selectables.add(this);
|
||||
return selectables;
|
||||
protected List<Element> getElements() {
|
||||
return Collections.<Element>singletonList(getDocument());
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -158,7 +55,7 @@ public class Html extends PlainText {
|
|||
ElementSelector elementSelector = (ElementSelector) selector;
|
||||
return elementSelector.select(getDocument());
|
||||
} else {
|
||||
return selector.select(getText());
|
||||
return selector.select(getFirstSourceText());
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -167,7 +64,12 @@ public class Html extends PlainText {
|
|||
ElementSelector elementSelector = (ElementSelector) selector;
|
||||
return elementSelector.selectList(getDocument());
|
||||
} else {
|
||||
return selector.selectList(getText());
|
||||
return selector.selectList(getFirstSourceText());
|
||||
}
|
||||
}
|
||||
|
||||
public static Html create(String text) {
|
||||
return new Html(text);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,7 +0,0 @@
|
|||
package us.codecraft.webmagic.selector;
|
||||
|
||||
/**
|
||||
* @author code4crafer@gmail.com
|
||||
*/
|
||||
public class HtmlFragment {
|
||||
}
|
|
@ -0,0 +1,97 @@
|
|||
package us.codecraft.webmagic.selector;
|
||||
|
||||
import org.jsoup.nodes.Element;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* @author code4crafer@gmail.com
|
||||
*/
|
||||
public class HtmlNode extends AbstractSelectable {
|
||||
|
||||
private final List<Element> elements;
|
||||
|
||||
public HtmlNode(List<Element> elements) {
|
||||
this.elements = elements;
|
||||
}
|
||||
|
||||
public HtmlNode() {
|
||||
elements = null;
|
||||
}
|
||||
|
||||
protected List<Element> getElements() {
|
||||
return elements;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Selectable smartContent() {
|
||||
SmartContentSelector smartContentSelector = Selectors.smartContent();
|
||||
return select(smartContentSelector, getSourceTexts());
|
||||
}
|
||||
|
||||
@Override
|
||||
public Selectable links() {
|
||||
return xpath("//a/@href");
|
||||
}
|
||||
|
||||
@Override
|
||||
public Selectable xpath(String xpath) {
|
||||
XpathSelector xpathSelector = Selectors.xpath(xpath);
|
||||
return selectElements(xpathSelector);
|
||||
}
|
||||
|
||||
/**
|
||||
* select elements
|
||||
*
|
||||
* @param elementSelector
|
||||
* @return
|
||||
*/
|
||||
protected Selectable selectElements(BaseElementSelector elementSelector) {
|
||||
if (!elementSelector.hasAttribute()) {
|
||||
List<Element> resultElements = new ArrayList<Element>();
|
||||
for (Element element : getElements()) {
|
||||
List<Element> selectElements = elementSelector.selectElements(element);
|
||||
resultElements.addAll(selectElements);
|
||||
}
|
||||
return new HtmlNode(resultElements);
|
||||
} else {
|
||||
// has attribute, consider as plaintext
|
||||
List<String> resultStrings = new ArrayList<String>();
|
||||
for (Element element : getElements()) {
|
||||
List<String> selectList = elementSelector.selectList(element);
|
||||
resultStrings.addAll(selectList);
|
||||
}
|
||||
return new PlainText(resultStrings);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public Selectable $(String selector) {
|
||||
CssSelector cssSelector = Selectors.$(selector);
|
||||
return selectElements(cssSelector);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Selectable $(String selector, String attrName) {
|
||||
CssSelector cssSelector = Selectors.$(selector, attrName);
|
||||
return selectElements(cssSelector);
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<Selectable> nodes() {
|
||||
ArrayList<Selectable> selectables = new ArrayList<Selectable>();
|
||||
selectables.add(this);
|
||||
return selectables;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<String> getSourceTexts() {
|
||||
List<String> sourceTexts = new ArrayList<String>(getElements().size());
|
||||
for (Element element : getElements()) {
|
||||
sourceTexts.add(element.toString());
|
||||
}
|
||||
return sourceTexts;
|
||||
}
|
||||
}
|
|
@ -26,7 +26,7 @@ public class Json extends PlainText {
|
|||
* @return
|
||||
*/
|
||||
public Json removePadding(String padding) {
|
||||
String text = getText();
|
||||
String text = getFirstSourceText();
|
||||
XTokenQueue tokenQueue = new XTokenQueue(text);
|
||||
tokenQueue.consumeWhitespace();
|
||||
tokenQueue.consume(padding);
|
||||
|
@ -36,29 +36,22 @@ public class Json extends PlainText {
|
|||
}
|
||||
|
||||
public <T> T toObject(Class<T> clazz) {
|
||||
if (getText() == null) {
|
||||
if (getFirstSourceText() == null) {
|
||||
return null;
|
||||
}
|
||||
return JSON.parseObject(getText(), clazz);
|
||||
return JSON.parseObject(getFirstSourceText(), clazz);
|
||||
}
|
||||
|
||||
public <T> List<T> toList(Class<T> clazz) {
|
||||
if (getText() == null) {
|
||||
if (getFirstSourceText() == null) {
|
||||
return null;
|
||||
}
|
||||
return JSON.parseArray(getText(), clazz);
|
||||
}
|
||||
|
||||
public String getText() {
|
||||
if (strings != null && strings.size() > 0) {
|
||||
return strings.get(0);
|
||||
}
|
||||
return null;
|
||||
return JSON.parseArray(getFirstSourceText(), clazz);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Selectable jsonPath(String jsonPath) {
|
||||
JsonPathSelector jsonPathSelector = new JsonPathSelector(jsonPath);
|
||||
return selectList(jsonPathSelector,strings);
|
||||
return selectList(jsonPathSelector,getSourceTexts());
|
||||
}
|
||||
}
|
||||
|
|
|
@ -12,12 +12,15 @@ import java.util.List;
|
|||
*/
|
||||
public class PlainText extends AbstractSelectable {
|
||||
|
||||
public PlainText(List<String> strings) {
|
||||
super(strings);
|
||||
protected List<String> sourceTexts;
|
||||
|
||||
public PlainText(List<String> sourceTexts) {
|
||||
this.sourceTexts = sourceTexts;
|
||||
}
|
||||
|
||||
public PlainText(String text) {
|
||||
super(text);
|
||||
this.sourceTexts = new ArrayList<String>();
|
||||
sourceTexts.add(text);
|
||||
}
|
||||
|
||||
public static PlainText create(String text) {
|
||||
|
@ -51,11 +54,15 @@ public class PlainText extends AbstractSelectable {
|
|||
|
||||
@Override
|
||||
public List<Selectable> nodes() {
|
||||
List<Selectable> nodes = new ArrayList<Selectable>(strings.size());
|
||||
for (String string : strings) {
|
||||
List<Selectable> nodes = new ArrayList<Selectable>(getSourceTexts().size());
|
||||
for (String string : getSourceTexts()) {
|
||||
nodes.add(PlainText.create(string));
|
||||
}
|
||||
return nodes;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<String> getSourceTexts() {
|
||||
return sourceTexts;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -2,7 +2,6 @@ package us.codecraft.webmagic.selector;
|
|||
|
||||
import org.apache.commons.collections.CollectionUtils;
|
||||
import org.jsoup.nodes.Element;
|
||||
import org.jsoup.select.Elements;
|
||||
import us.codecraft.xsoup.XPathEvaluator;
|
||||
import us.codecraft.xsoup.Xsoup;
|
||||
|
||||
|
@ -34,7 +33,7 @@ public class XpathSelector extends BaseElementSelector {
|
|||
|
||||
@Override
|
||||
public Element selectElement(Element element) {
|
||||
Elements elements = selectElements(element);
|
||||
List<Element> elements = selectElements(element);
|
||||
if (CollectionUtils.isNotEmpty(elements)){
|
||||
return elements.get(0);
|
||||
}
|
||||
|
@ -42,7 +41,12 @@ public class XpathSelector extends BaseElementSelector {
|
|||
}
|
||||
|
||||
@Override
|
||||
public Elements selectElements(Element element) {
|
||||
public List<Element> selectElements(Element element) {
|
||||
return xPathEvaluator.evaluate(element).getElements();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasAttribute() {
|
||||
return xPathEvaluator.hasAttribute();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -39,7 +39,7 @@ public class HttpClientDownloaderTest {
|
|||
public void testDownloader() {
|
||||
HttpClientDownloader httpClientDownloader = new HttpClientDownloader();
|
||||
Html html = httpClientDownloader.download("https://github.com");
|
||||
assertTrue(!html.getText().isEmpty());
|
||||
assertTrue(!html.getFirstSourceText().isEmpty());
|
||||
}
|
||||
|
||||
@Test(expected = IllegalArgumentException.class)
|
||||
|
|
Loading…
Reference in New Issue