diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/CompositePageProcessor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/CompositePageProcessor.java
new file mode 100644
index 0000000..ecf4aa1
--- /dev/null
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/CompositePageProcessor.java
@@ -0,0 +1,77 @@
+package us.codecraft.webmagic.handler;
+
+import us.codecraft.webmagic.Page;
+import us.codecraft.webmagic.Site;
+import us.codecraft.webmagic.processor.PageProcessor;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * A {@link PageProcessor} that delegates each page to an ordered list of
+ * {@link SubPageProcessor}s.
+ *
+ * @author code4crafter@gmail.com
+ * @date 14-4-5
+ */
+public class CompositePageProcessor implements PageProcessor {
+
+    private Site site;
+
+    // Starts empty so that process() is a no-op, not a NullPointerException,
+    // when no sub-processors have been configured yet.
+    private List<SubPageProcessor> subPageProcessors = new ArrayList<SubPageProcessor>();
+
+    /**
+     * Offers the page to each sub-processor in registration order. Every
+     * sub-processor whose {@code match} returns true processes the page;
+     * iteration stops after the first one that does not answer
+     * {@link SubPageProcessor.MatchOtherProcessor#YES}.
+     *
+     * @param page the page to process
+     */
+    @Override
+    public void process(Page page) {
+        for (SubPageProcessor subPageProcessor : subPageProcessors) {
+            if (!subPageProcessor.match(page)) {
+                continue;
+            }
+            SubPageProcessor.MatchOtherProcessor matchOther = subPageProcessor.process(page);
+            // A null result is treated like NO: only an explicit YES keeps matching.
+            if (matchOther != SubPageProcessor.MatchOtherProcessor.YES) {
+                return;
+            }
+        }
+    }
+
+    /**
+     * Sets the site configuration returned by {@link #getSite()}.
+     *
+     * @param site the site configuration
+     * @return this processor, for chaining
+     */
+    public CompositePageProcessor setSite(Site site) {
+        this.site = site;
+        return this;
+    }
+
+    /**
+     * Replaces the sub-processors with the given ones, keeping their order.
+     *
+     * @param subPageProcessors the sub-processors to consult, in order
+     * @return this processor, for chaining
+     */
+    public CompositePageProcessor setSubPageProcessors(SubPageProcessor... subPageProcessors) {
+        List<SubPageProcessor> processors = new ArrayList<SubPageProcessor>(subPageProcessors.length);
+        for (SubPageProcessor subPageProcessor : subPageProcessors) {
+            processors.add(subPageProcessor);
+        }
+        this.subPageProcessors = processors;
+        return this;
+    }
+
+    @Override
+    public Site getSite() {
+        return site;
+    }
+}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/SubPageProcessor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/SubPageProcessor.java
new file mode 100644
index 0000000..c880500
--- /dev/null
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/SubPageProcessor.java
@@ -0,0 +1,38 @@
+package us.codecraft.webmagic.handler;
+
+import us.codecraft.webmagic.Page;
+
+/**
+ * A processor for one kind of page, consulted by
+ * {@link CompositePageProcessor} alongside other sub-processors.
+ *
+ * @author code4crafter@gmail.com
+ * @date 14-4-5
+ */
+public interface SubPageProcessor {
+
+    /**
+     * Check whether the SubPageProcessor can process the page.
+     * <p>
+     * Please DO NOT change page status in this method.
+     *
+     * @param page the page to check
+     * @return true if this processor can process the page
+     */
+    public boolean match(Page page);
+
+    /**
+     * Process the page: extract urls to fetch, extract the data and store it.
+     *
+     * @param page the page to process
+     * @return {@link MatchOtherProcessor#YES} to continue matching the remaining
+     *         processors, {@link MatchOtherProcessor#NO} to stop
+     */
+    public MatchOtherProcessor process(Page page);
+
+    /** Tells the caller whether to keep matching other processors after this one. */
+    public enum MatchOtherProcessor {
+        YES, NO;
+    }
+
+}