From c143fc662cb0e21ced7ed084aff63d25b09b5b3b Mon Sep 17 00:00:00 2001
From: "yihua.huang"
Date: Sat, 5 Apr 2014 18:17:48 +0800
Subject: [PATCH] add SubPageProcessor #86

---
 .../handler/CompositePageProcessor.java       | 79 +++++++++++++++++++
 .../webmagic/handler/SubPageProcessor.java    | 39 +++++++++
 2 files changed, 118 insertions(+)
 create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/handler/CompositePageProcessor.java
 create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/handler/SubPageProcessor.java

diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/CompositePageProcessor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/CompositePageProcessor.java
new file mode 100644
index 0000000..ecf4aa1
--- /dev/null
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/CompositePageProcessor.java
@@ -0,0 +1,79 @@
+package us.codecraft.webmagic.handler;
+
+import us.codecraft.webmagic.Page;
+import us.codecraft.webmagic.Site;
+import us.codecraft.webmagic.processor.PageProcessor;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * A {@link PageProcessor} that hands each page to an ordered list of
+ * {@link SubPageProcessor}s.
+ *
+ * @author code4crafter@gmail.com
+ * @date 14-4-5
+ */
+public class CompositePageProcessor implements PageProcessor {
+
+    private Site site;
+
+    // Starts out empty so that process() is a no-op, rather than a
+    // NullPointerException, before setSubPageProcessors is called.
+    private List<SubPageProcessor> subPageProcessors = new ArrayList<SubPageProcessor>();
+
+    /**
+     * Offers the page to the sub processors in the order they were set. Each
+     * one whose match(page) returns true processes the page; the loop moves
+     * on to the next sub processor only when that call returns
+     * MatchOtherProcessor.YES.
+     *
+     * @param page the page to process
+     */
+    @Override
+    public void process(Page page) {
+        for (SubPageProcessor subPageProcessor : subPageProcessors) {
+            if (!subPageProcessor.match(page)) {
+                continue;
+            }
+            // Any result other than YES, null included, ends the chain.
+            if (subPageProcessor.process(page) != SubPageProcessor.MatchOtherProcessor.YES) {
+                return;
+            }
+        }
+    }
+
+    /**
+     * Sets the site returned by getSite().
+     *
+     * @param site the site configuration
+     * @return this processor, for call chaining
+     */
+    public CompositePageProcessor setSite(Site site) {
+        this.site = site;
+        return this;
+    }
+
+    /**
+     * Replaces the current sub processors with the given ones, keeping
+     * their order.
+     *
+     * @param subPageProcessors the sub processors to try, in order
+     * @return this processor, for call chaining
+     */
+    public CompositePageProcessor setSubPageProcessors(SubPageProcessor... subPageProcessors) {
+        this.subPageProcessors = new ArrayList<SubPageProcessor>(subPageProcessors.length);
+        for (SubPageProcessor subPageProcessor : subPageProcessors) {
+            this.subPageProcessors.add(subPageProcessor);
+        }
+        return this;
+    }
+
+    /**
+     * @return the site set through setSite, or null if none was set
+     */
+    @Override
+    public Site getSite() {
+        return site;
+    }
+}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/SubPageProcessor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/SubPageProcessor.java
new file mode 100644
index 0000000..c880500
--- /dev/null
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/SubPageProcessor.java
@@ -0,0 +1,39 @@
+package us.codecraft.webmagic.handler;
+
+import us.codecraft.webmagic.Page;
+
+/**
+ * Handles one kind of page on behalf of {@link CompositePageProcessor}.
+ *
+ * @author code4crafter@gmail.com
+ * @date 14-4-5
+ */
+public interface SubPageProcessor {
+
+    /**
+     * Check whether the SubPageProcessor can process the page.
+     * Please DO NOT change page status in this method.
+     *
+     * @param page the page to check
+     * @return true if this processor can process the page
+     */
+    public boolean match(Page page);
+
+    /**
+     * Process the page: extract the urls to fetch, extract the data and
+     * store it.
+     *
+     * @param page the page to process
+     * @return YES to let the following processors try the page as well;
+     *         CompositePageProcessor stops on any other value
+     */
+    public MatchOtherProcessor process(Page page);
+
+    /**
+     * Tells the caller whether to go on matching other processors.
+     */
+    public enum MatchOtherProcessor {
+        YES, NO;
+    }
+
+}