From f9825c214ab2a6fa6fcd565ed8b2b8f8b52ad38c Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Tue, 27 May 2014 16:00:51 +0800 Subject: [PATCH] refactor selectable for html fragment #113 --- .../webmagic/selector/AbstractSelectable.java | 112 ++++++++++++++++++ .../selector/BaseElementSelector.java | 21 ++++ .../webmagic/selector/CssSelector.java | 18 ++- .../us/codecraft/webmagic/selector/Html.java | 7 ++ .../webmagic/selector/HtmlFragment.java | 7 ++ .../webmagic/selector/PlainText.java | 97 ++------------- .../webmagic/selector/Selectable.java | 6 + .../webmagic/selector/XpathSelector.java | 16 +++ .../webmagic/samples/MamacnPageProcessor.java | 46 +++++++ .../samples/pipeline/OneFilePipeline.java | 50 ++++++++ 10 files changed, 288 insertions(+), 92 deletions(-) create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/selector/AbstractSelectable.java create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlFragment.java create mode 100644 webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MamacnPageProcessor.java create mode 100644 webmagic-samples/src/main/java/us/codecraft/webmagic/samples/pipeline/OneFilePipeline.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AbstractSelectable.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AbstractSelectable.java new file mode 100644 index 0000000..2ac4c70 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AbstractSelectable.java @@ -0,0 +1,112 @@ +package us.codecraft.webmagic.selector; + +import org.apache.commons.collections.CollectionUtils; + +import java.util.ArrayList; +import java.util.List; + +/** + * @author code4crafer@gmail.com + * @since 0.5.2 + */ +public abstract class AbstractSelectable implements Selectable { + + protected List strings; + + public AbstractSelectable(String text) { + List results = new ArrayList(); + results.add(text); + this.strings = results; + } + + public AbstractSelectable(List strings) { + this.strings = strings; + } + + @Override + public Selectable css(String selector) { + return $(selector); + } + + @Override + public Selectable css(String selector, String attrName) { + return $(selector, attrName); + } + + protected Selectable select(Selector selector, List strings) { + List results = new ArrayList(); + for (String string : strings) { + String result = selector.select(string); + if (result != null) { + results.add(result); + } + } + return new PlainText(results); + } + + protected Selectable selectList(Selector selector, List strings) { + List results = new ArrayList(); + for (String string : strings) { + List result = selector.selectList(string); + results.addAll(result); + } + return new PlainText(results); + } + + @Override + public List all() { + return strings; + } + + @Override + public Selectable jsonPath(String jsonPath) { + throw new UnsupportedOperationException(); + } + + @Override + public String get() { + if (CollectionUtils.isNotEmpty(all())) { + return all().get(0); + } else { + return null; + } + } + + @Override + public Selectable select(Selector selector) { + return select(selector, strings); + } + + @Override + public Selectable selectList(Selector selector) { + return selectList(selector, strings); + } + + @Override + public Selectable regex(String regex) { + RegexSelector regexSelector = Selectors.regex(regex); + return selectList(regexSelector, strings); + } + + @Override + public Selectable regex(String regex, int group) { + RegexSelector regexSelector = Selectors.regex(regex, group); + return selectList(regexSelector, strings); + } + + @Override + public Selectable replace(String regex, String replacement) { + ReplaceSelector replaceSelector = new ReplaceSelector(regex,replacement); + return select(replaceSelector, strings); + } + + @Override + public String toString() { + return get(); + } + + @Override + public boolean match() { + return strings != null && strings.size() > 0; + } +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java index 7d9035f..3b9b22d 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java @@ -1,6 +1,8 @@ package us.codecraft.webmagic.selector; import org.jsoup.Jsoup; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; import java.util.ArrayList; import java.util.List; @@ -28,4 +30,23 @@ public abstract class BaseElementSelector implements Selector, ElementSelector { } } + public Element selectElement(String text) { + if (text != null) { + return selectElement(Jsoup.parse(text)); + } + return null; + } + + public Elements selectElements(String text) { + if (text != null) { + return selectElements(Jsoup.parse(text)); + } else { + return new Elements(); + } + } + + public abstract Element selectElement(Element element); + + public abstract Elements selectElements(Element element); + } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java index 185db74..095af35 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java @@ -57,7 +57,7 @@ public class CssSelector extends BaseElementSelector { @Override public String select(Element element) { - Elements elements = element.select(selectorText); + Elements elements = selectElements(element); if (CollectionUtils.isEmpty(elements)) { return null; } @@ -67,7 +67,7 @@ public class CssSelector extends BaseElementSelector { @Override public List selectList(Element doc) { List strings = new ArrayList(); - Elements elements = doc.select(selectorText); + Elements elements = selectElements(doc); if (CollectionUtils.isNotEmpty(elements)) { for (Element element : elements) { String value = getValue(element); @@ -78,4 +78,18 @@ public class CssSelector extends BaseElementSelector { } return strings; } + + @Override + public Element selectElement(Element element) { + Elements elements = element.select(selectorText); + if (CollectionUtils.isNotEmpty(elements)) { + return elements.get(0); + } + return null; + } + + @Override + public Elements selectElements(Element element) { + return element.select(selectorText); + } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java index 34386b5..9748577 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java @@ -142,6 +142,13 @@ public class Html extends PlainText { return document.html(); } + @Override + public List nodes() { + ArrayList selectables = new ArrayList(); + selectables.add(this); + return selectables; + } + /** * @param selector * @return diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlFragment.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlFragment.java new file mode 100644 index 0000000..d427f67 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlFragment.java @@ -0,0 +1,7 @@ +package us.codecraft.webmagic.selector; + +/** + * @author code4crafer@gmail.com + */ +public class HtmlFragment { +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java index efa38d8..c1d034a 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java @@ -1,7 +1,5 @@ package us.codecraft.webmagic.selector; -import org.apache.commons.collections.CollectionUtils; - import java.util.ArrayList; import java.util.List; @@ -12,18 +10,14 @@ import java.util.List; * @author code4crafter@gmail.com
* @since 0.1.0 */ -public class PlainText implements Selectable { - - protected List strings; +public class PlainText extends AbstractSelectable { public PlainText(List strings) { - this.strings = strings; + super(strings); } public PlainText(String text) { - List results = new ArrayList(); - results.add(text); - this.strings = results; + super(text); } public static PlainText create(String text) { @@ -45,16 +39,6 @@ public class PlainText implements Selectable { throw new UnsupportedOperationException(); } - @Override - public Selectable css(String selector) { - return $(selector); - } - - @Override - public Selectable css(String selector, String attrName) { - return $(selector, attrName); - } - @Override public Selectable smartContent() { throw new UnsupportedOperationException(); @@ -66,79 +50,12 @@ public class PlainText implements Selectable { } @Override - public Selectable regex(String regex) { - RegexSelector regexSelector = Selectors.regex(regex); - return selectList(regexSelector, strings); - } - - @Override - public Selectable regex(String regex, int group) { - RegexSelector regexSelector = Selectors.regex(regex, group); - return selectList(regexSelector, strings); - } - - protected Selectable select(Selector selector, List strings) { - List results = new ArrayList(); + public List nodes() { + List nodes = new ArrayList(strings.size()); for (String string : strings) { - String result = selector.select(string); - if (result != null) { - results.add(result); - } + nodes.add(PlainText.create(string)); } - return new PlainText(results); + return nodes; } - protected Selectable selectList(Selector selector, List strings) { - List results = new ArrayList(); - for (String string : strings) { - List result = selector.selectList(string); - results.addAll(result); - } - return new PlainText(results); - } - - @Override - public Selectable replace(String regex, String replacement) { - ReplaceSelector replaceSelector = new ReplaceSelector(regex,replacement); - return select(replaceSelector, strings); - } - - @Override - public List all() { - return strings; - } - - @Override - public Selectable jsonPath(String jsonPath) { - throw new UnsupportedOperationException(); - } - - @Override - public String get() { - if (CollectionUtils.isNotEmpty(all())) { - return all().get(0); - } else { - return null; - } - } - - @Override - public Selectable select(Selector selector) { - return select(selector, strings); - } - - @Override - public Selectable selectList(Selector selector) { - return selectList(selector, strings); - } - - @Override - public String toString() { - return get(); - } - - @Override - public boolean match() { - return strings != null && strings.size() > 0; - } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java index 2cc4ed9..341a077 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java @@ -143,4 +143,10 @@ public interface Selectable { * @return */ public Selectable selectList(Selector selector); + + /** + * get all nodes + * @return + */ + public List nodes(); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java index d1bbcae..4516a3d 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java @@ -1,6 +1,8 @@ package us.codecraft.webmagic.selector; +import org.apache.commons.collections.CollectionUtils; import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; import us.codecraft.xsoup.XPathEvaluator; import us.codecraft.xsoup.Xsoup; @@ -29,4 +31,18 @@ public class XpathSelector extends BaseElementSelector { public List selectList(Element element) { return xPathEvaluator.evaluate(element).list(); } + + @Override + public Element selectElement(Element element) { + Elements elements = selectElements(element); + if (CollectionUtils.isNotEmpty(elements)){ + return elements.get(0); + } + return null; + } + + @Override + public Elements selectElements(Element element) { + return xPathEvaluator.evaluate(element).getElements(); + } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MamacnPageProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MamacnPageProcessor.java new file mode 100644 index 0000000..c639b63 --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MamacnPageProcessor.java @@ -0,0 +1,46 @@ +package us.codecraft.webmagic.samples; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.processor.PageProcessor; +import us.codecraft.webmagic.samples.pipeline.OneFilePipeline; +import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler; +import us.codecraft.webmagic.selector.Selectable; + +import java.io.FileNotFoundException; +import java.io.UnsupportedEncodingException; + +/** + * @author code4crafer@gmail.com + */ +public class MamacnPageProcessor implements PageProcessor { + + private Site site = Site.me().setDomain("www.mama.cn").setSleepTime(100); + + @Override + public void process(Page page) { + Selectable images = page.getHtml().xpath("//ul[@id=ma-thumb-list]/li"); + page.putField("img", images.xpath("//div[@class=picList]/div[@class=pre]/div[@class=npic]//img/@src").get()); + page.putField("title", page.getHtml().xpath("//div[@class=picList]/div[@class=pre]/div[@class=npic]//img/@alt").get()); + page.putField("url", page.getUrl().toString()); + if (page.getResultItems().get("title") == null) { + page.setSkip(true); + } + page.addTargetRequests(page.getHtml().links().regex("http://www\\.mama\\.cn/photo/.*\\.html").all()); + } + + @Override + public Site getSite() { + return site; + } + + public static void main(String[] args) throws FileNotFoundException, UnsupportedEncodingException { + Spider.create(new MamacnPageProcessor()) + .setScheduler(new FileCacheQueueScheduler("/data/webmagic/mamacn")) + .addUrl("http://www.mama.cn/photo/t1-p1.html") + .addPipeline(new OneFilePipeline("/data/webmagic/mamacn/data")) + .thread(5) + .run(); + } +} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/pipeline/OneFilePipeline.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/pipeline/OneFilePipeline.java new file mode 100644 index 0000000..9cb1bc2 --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/pipeline/OneFilePipeline.java @@ -0,0 +1,50 @@ +package us.codecraft.webmagic.samples.pipeline; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import us.codecraft.webmagic.ResultItems; +import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.pipeline.Pipeline; +import us.codecraft.webmagic.utils.FilePersistentBase; + +import java.io.*; +import java.util.Map; + +/** + * @author code4crafer@gmail.com + */ +public class OneFilePipeline extends FilePersistentBase implements Pipeline { + + private Logger logger = LoggerFactory.getLogger(getClass()); + + private PrintWriter printWriter; + + /** + * create a FilePipeline with default path"/data/webmagic/" + */ + public OneFilePipeline() throws FileNotFoundException, UnsupportedEncodingException { + this("/data/webmagic/"); + } + + public OneFilePipeline(String path) throws FileNotFoundException, UnsupportedEncodingException { + setPath(path); + printWriter = new PrintWriter(new OutputStreamWriter(new FileOutputStream(getFile(path)), "UTF-8")); + } + + @Override + public synchronized void process(ResultItems resultItems, Task task) { + printWriter.println("url:\t" + resultItems.getRequest().getUrl()); + for (Map.Entry entry : resultItems.getAll().entrySet()) { + if (entry.getValue() instanceof Iterable) { + Iterable value = (Iterable) entry.getValue(); + printWriter.println(entry.getKey() + ":"); + for (Object o : value) { + printWriter.println(o); + } + } else { + printWriter.println(entry.getKey() + ":\t" + entry.getValue()); + } + } + printWriter.flush(); + } +}