add SubPageProcessor #86

master
yihua.huang 2014-04-05 18:17:48 +08:00
parent 2b2ce9ce13
commit c143fc662c
2 changed files with 82 additions and 0 deletions

View File

@ -0,0 +1,49 @@
package us.codecraft.webmagic.handler;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import java.util.ArrayList;
import java.util.List;
/**
 * A {@link PageProcessor} that delegates each page to a list of
 * {@link SubPageProcessor}s, tried in the order they were configured.
 * <p>
 * For every sub processor whose {@link SubPageProcessor#match(Page)} returns
 * {@code true}, {@link SubPageProcessor#process(Page)} is invoked. Dispatching
 * continues with the remaining sub processors only while the invoked one
 * returns {@link SubPageProcessor.MatchOtherProcessor#YES}; any other result
 * (including {@code null}) ends the handling of the page.
 *
 * @author code4crafter@gmail.com
 * @date 14-4-5
 */
public class CompositePageProcessor implements PageProcessor {

    private Site site;

    // Starts out empty so that process() is a harmless no-op (instead of a
    // NullPointerException) when no sub processors have been configured yet.
    private List<SubPageProcessor> subPageProcessors = new ArrayList<SubPageProcessor>();

    /**
     * Offers the page to the configured sub processors in order.
     *
     * @param page the page to dispatch
     */
    @Override
    public void process(Page page) {
        for (SubPageProcessor subPageProcessor : subPageProcessors) {
            if (!subPageProcessor.match(page)) {
                continue;
            }
            SubPageProcessor.MatchOtherProcessor matchOther = subPageProcessor.process(page);
            // Comparing against YES also covers a null result, so no separate null check is needed.
            if (matchOther != SubPageProcessor.MatchOtherProcessor.YES) {
                return;
            }
        }
    }

    /**
     * Sets the site returned by {@link #getSite()}.
     *
     * @param site the site to store
     * @return this instance, for call chaining
     */
    public CompositePageProcessor setSite(Site site) {
        this.site = site;
        return this;
    }

    /**
     * Replaces the current sub processors with the given ones, keeping their order.
     *
     * @param subPageProcessors the sub processors to use; a {@code null} array
     *                          is treated as "no sub processors"
     * @return this instance, for call chaining
     */
    public CompositePageProcessor setSubPageProcessors(SubPageProcessor... subPageProcessors) {
        List<SubPageProcessor> replacement = new ArrayList<SubPageProcessor>();
        if (subPageProcessors != null) {
            for (SubPageProcessor subPageProcessor : subPageProcessors) {
                replacement.add(subPageProcessor);
            }
        }
        // Assign only after the new list is completely built.
        this.subPageProcessors = replacement;
        return this;
    }

    /**
     * @return the site previously passed to {@link #setSite(Site)}, or
     *         {@code null} if none has been set
     */
    @Override
    public Site getSite() {
        return site;
    }
}

View File

@ -0,0 +1,33 @@
package us.codecraft.webmagic.handler;
import us.codecraft.webmagic.Page;
/**
 * A page handler that first reports whether it applies to a page and, if so,
 * processes it.
 *
 * @author code4crafter@gmail.com
 * @date 14-4-5
 */
public interface SubPageProcessor {

    /**
     * Checks whether this SubPageProcessor can process the page.
     * <p>
     * Please DO NOT change page status in this method.
     *
     * @param page the page to inspect
     * @return {@code true} if this processor can process the page
     */
    boolean match(Page page);

    /**
     * Processes the page: extract urls to fetch, extract the data and store.
     *
     * @param page the page to process
     * @return whether matching should continue with other processors
     */
    MatchOtherProcessor process(Page page);

    /**
     * Result of {@link #process(Page)}: whether to continue to match other
     * processors after this one has handled the page.
     */
    enum MatchOtherProcessor {
        YES,
        NO
    }
}