From 5e9e8b2541a3b85dadd222bc923540d51f30b09c Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sun, 25 Aug 2013 16:30:38 +0800 Subject: [PATCH] add TextContentSelector --- .../us/codecraft/webmagic/selector/Html.java | 22 ++++-- .../webmagic/selector/PlainText.java | 21 +++++- .../webmagic/selector/Selectable.java | 21 ++++++ .../webmagic/selector/Selectors.java | 12 ++++ .../selector/TextContentSelector.java | 68 +++++++++++++++++++ .../selector/TextContentSelectorTest.java | 34 ++++++++++ 6 files changed, 171 insertions(+), 7 deletions(-) create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/selector/TextContentSelector.java create mode 100644 webmagic-core/src/test/java/us/codecraft/webmagic/selector/TextContentSelectorTest.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java index 1d5e8c5..f3d29aa 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java @@ -47,32 +47,44 @@ public class Html extends PlainText { @Override public Selectable smartContent() { - SmartContentSelector smartContentSelector = SelectorFactory.getInstatnce().newSmartContentSelector(); + SmartContentSelector smartContentSelector = Selectors.smartContent(); return select(smartContentSelector, strings); } @Override public Selectable links() { - XpathSelector xpathSelector = SelectorFactory.getInstatnce().newXpathSelector("//a/@href"); + XpathSelector xpathSelector = Selectors.xpath("//a/@href"); return selectList(xpathSelector, strings); } @Override public Selectable xpath(String xpath) { - XpathSelector xpathSelector = SelectorFactory.getInstatnce().newXpathSelector(xpath); + XpathSelector xpathSelector = Selectors.xpath(xpath); return selectList(xpathSelector, strings); } @Override public Selectable $(String selector) { - CssSelector cssSelector = new 
CssSelector(selector); + CssSelector cssSelector = Selectors.$(selector); return selectList(cssSelector, strings); } @Override public Selectable $(String selector, String attrName) { - CssSelector cssSelector = new CssSelector(selector, attrName); + CssSelector cssSelector = Selectors.$(selector, attrName); return selectList(cssSelector, strings); } + @Override + public Selectable text() { + TextContentSelector selector = Selectors.text(); + return select(selector, strings); + } + + @Override + public Selectable text(String newlineSeparator) { + TextContentSelector selector = Selectors.text(newlineSeparator); + return select(selector, strings); + } + } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java index e0501eb..df6926d 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java @@ -57,13 +57,13 @@ public class PlainText implements Selectable { @Override public Selectable regex(String regex) { - RegexSelector regexSelector = SelectorFactory.getInstatnce().newRegexSelector(regex); + RegexSelector regexSelector = Selectors.regex(regex); return selectList(regexSelector, strings); } @Override public Selectable regex(String regex, int group) { - RegexSelector regexSelector = SelectorFactory.getInstatnce().newRegexSelector(regex, group); + RegexSelector regexSelector = Selectors.regex(regex, group); return selectList(regexSelector, strings); } @@ -106,4 +106,21 @@ public class PlainText implements Selectable { return null; } } + + @Override + public Selectable text() { + //do nothing + return this; + } + + @Override + public Selectable text(String newlineSeparator) { + //do nothing + return this; + } + + @Override + public boolean match() { + return strings != null && strings.size() > 0; + } } diff --git 
a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java index 21c9381..398906f 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java @@ -82,6 +82,27 @@ public interface Selectable { */ public String toString(); + /** + * select text content of html + * + * @return text + */ + public Selectable text(); + + /** + * select text content of html + * + * @return text + */ + public Selectable text(String newlineSeparator); + + /** + * if result exist for select + * + * @return true if result exist + */ + public boolean match(); + /** * multi string result * diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java index b52d128..051d6a4 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java @@ -16,6 +16,10 @@ public abstract class Selectors { return SelectorFactory.getInstatnce().newRegexSelector(expr, group); } + public static SmartContentSelector smartContent() { + return SelectorFactory.getInstatnce().newSmartContentSelector(); + } + public static CssSelector $(String expr) { return new CssSelector(expr); } @@ -36,6 +40,14 @@ public abstract class Selectors { return new OrSelector(selectors); } + public static TextContentSelector text() { + return new TextContentSelector(); + } + + public static TextContentSelector text(String newlineSeperator) { + return new TextContentSelector(newlineSeperator); + } + public static void main(String[] args) { String s = "a"; or(regex("(.*)"), xpath("//title"), $("title")).select(s); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/TextContentSelector.java 
b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/TextContentSelector.java new file mode 100644 index 0000000..54e8204 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/TextContentSelector.java @@ -0,0 +1,68 @@ +package us.codecraft.webmagic.selector; + +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; + +import java.util.Arrays; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +/** + * Extract text content in html.
+ * Algorithm from http://www.elias.cn/En/ExtMainText.
+ * + * @author code4crafter@gmail.com
+ * @since 0.2.2
+ */
+public class TextContentSelector implements Selector {
+
+    private String newLineSeperator = "\n"; // appended after every tag listed in TAGS_IN_NEWLINE
+
+    public TextContentSelector() {
+    }
+
+    public TextContentSelector(String newLineSeperator) {
+        this.newLineSeperator = newLineSeperator;
+    }
+
+    // Tags after whose content the newline separator is appended.
+    private static final Set<String> TAGS_IN_NEWLINE = new HashSet<String>(Arrays.asList("p", "div", "h1", "h2", "h3", "h4", "h5", "h6", "br", "li"));
+
+    // Tags whose whole subtree contributes no text to the result.
+    private static final Set<String> TAGS_TO_IGNORE = new HashSet<String>(Arrays.asList("head", "style", "script", "noscript", "option"));
+
+    @Override
+    public String select(String text) {
+        Document doc = Jsoup.parse(text); // parse the input as HTML, then flatten from the document root
+        return select0(doc);
+    }
+
+    // Flattens element to text by walking its child nodes in document order, so every text node is emitted once.
+    // Element.text() already contains all descendant text; appending it and then recursing duplicated nested content.
+    protected String select0(Element element) {
+        String tagName = element.tagName().toLowerCase();
+        if (TAGS_TO_IGNORE.contains(tagName)) {
+            return "";
+        }
+        StringBuilder textBuilder = new StringBuilder();
+        for (org.jsoup.nodes.Node node : element.childNodes()) {
+            if (node instanceof Element) {
+                textBuilder.append(select0((Element) node));
+            } else if (node instanceof org.jsoup.nodes.TextNode) {
+                textBuilder.append(((org.jsoup.nodes.TextNode) node).text());
+            }
+        }
+        if (TAGS_IN_NEWLINE.contains(tagName)) {
+            textBuilder.append(newLineSeperator);
+        }
+        return textBuilder.toString();
+    }
+
+    @Override
+    public List<String> selectList(String text) {
+        throw new UnsupportedOperationException(); // only single-result selection is implemented
+    }
+
+}
diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/TextContentSelectorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/TextContentSelectorTest.java
new file mode 100644
index 0000000..a7a294a
--- /dev/null
+++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/TextContentSelectorTest.java
@@ -0,0 +1,34 @@
+package us.codecraft.webmagic.selector;
+
+import junit.framework.Assert;
+import org.junit.Ignore;
+import org.junit.Test;
+import us.codecraft.webmagic.downloader.HttpClientDownloader;
+
+/**
+ * @author code4crafter@gmail.com
+ * @since 0.2.2
+ */
+public class TextContentSelectorTest {
+
+    @Test
+    public void test() {
+        String html = "<div>\n" + // NOTE(review): fixture tags and the "<br>" separator below were stripped in this copy and are reconstructed - confirm against the original commit
+                "    <div>\n" +
+                "        <p>Add more powerful selector for content text extract refered to http://www.elias.cn/En/ExtMainText</p>\n" +
+                "    </div>\n" +
+                "</div>";
+        TextContentSelector textContentSelector = new TextContentSelector("<br>");
+        String text = textContentSelector.select(html);
+        Assert.assertNotNull(text);
+    }
+
+    @Ignore("takes long time")
+    @Test
+    public void testDownload() {
+        String s = new HttpClientDownloader().download("http://blog.codecraft.us/blog/2013/08/18/ti-yan-dao-liao-open-sourcede-mei-li/", "utf-8")
+                .smartContent().text().toString();
+        Assert.assertNotNull(s); // was assertNotNull(text): "text" is a local of test() and is not in scope here
+    }
+
+}