diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/PatternProcessorDemo.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/PatternProcessorDemo.java new file mode 100644 index 0000000..51a9484 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/PatternProcessorDemo.java @@ -0,0 +1,53 @@ +package us.codecraft.webmagic.example; + +import org.apache.log4j.Logger; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.ResultItems; +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.handler.PatternHandler; +import us.codecraft.webmagic.pipeline.PatternPipeline; +import us.codecraft.webmagic.processor.PatternPageProcessor; + +/** + * Demo that wires one PatternHandler into a PatternPageProcessor and a PatternPipeline. + * User: Sebastian MA + * Date: April 04, 2014 + * Time: 21:23 + */ +public class PatternProcessorDemo { + + private static Logger log = Logger.getLogger(PatternProcessorDemo.class); + + public static void main(String... 
args) { + + PatternPageProcessor processor + = new PatternPageProcessor("http://item.jd.com/981821.html", + PatternPageProcessor.TARGET_PATTERN_ALL + ); + + PatternPipeline pipeline = new PatternPipeline(); + + // define a handler which handles only "http://item.jd.com/.*" + PatternHandler handler = new PatternHandler("http://item.jd.com/.*") { + + @Override + public void onExtract(Page page) { + + log.info("Extracting from " + page.getUrl()); + page.putField("test", "hello world:)"); + } + + @Override + public void onHandle(ResultItems result, Task task) { + + log.info("Handling " + result.getRequest().getUrl()); + log.info("Retrieved test=" + result.get("test")); + } + }; + + handler.register(processor, pipeline); + + Spider.create(processor).thread(5).addPipeline(pipeline).runAsync(); + } +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/PatternHandler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/PatternHandler.java new file mode 100644 index 0000000..51e44e0 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/PatternHandler.java @@ -0,0 +1,113 @@ +package us.codecraft.webmagic.handler; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.ResultItems; +import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.pipeline.PatternPipeline; +import us.codecraft.webmagic.processor.PatternPageProcessor; + +import java.util.UUID; + +/** + * Created with IntelliJ IDEA. + * User: Sebastian MA + * Date: April 03, 2014 + * Time: 10:00 + *
+ * A PatternHandler is in charge of both page extraction and data processing by implementing + * its two abstract methods. + */ +public abstract class PatternHandler { + + /** + * identity of the handler: a random UUID string, also stored as a marker field on pages it extracts. + */ + protected String id; + + /** + * regular expression that a url must match entirely for the page to be handled. + */ + protected String pattern; + + /** + * @param pattern + * url pattern (regular expression) to handle + */ + protected PatternHandler(String pattern) { + + this.pattern = pattern; + this.id = UUID.randomUUID().toString(); + } + + /** + * determines whether the given url matches this handler's pattern. + */ + public boolean match(String url) { + + return url.matches(pattern); + } + + /** + * registers with both the page processor and the pipeline so the handler can take charge of + * both ends of the procedure. + * + * @param processor + * the processor to register with + * @param pipeline + * the pipeline to register with + */ + public void register(PatternPageProcessor processor, PatternPipeline pipeline) { + + processor.addHandler(this); + pipeline.addHandler(this); + } + + public void unregister(PatternPageProcessor processor, PatternPipeline pipeline) { + + processor.removeHandler(this); + pipeline.removeHandler(this); + } + + public boolean process(Page page) { + /* on a url match: store the id marker field on the page, run onExtract, return true */ + if(match(page.getUrl().toString())) { + page.putField(id, true); + onExtract(page); + return true; + } else { + return false; + } + } + + public boolean process(ResultItems resultItems, Task task) { + /* skipped results are never handled */ + if(resultItems.isSkip()) { + return false; + } + /* onHandle runs only when the url matches and the id marker set by process(Page) is present */ + if(match(resultItems.getRequest().getUrl()) && resultItems.get(id) != null) { + onHandle(resultItems, task); + return true; + } else { + return false; + } + } + + /** + * implement this method to extract from the page. + * + * @param page + * the page to extract + */ + public abstract void onExtract(Page page); + + /** + * implement this method to handle the extraction result. 
+ * + * @param result + * extraction result + * @param task the task passed to process(ResultItems, Task) + */ + public abstract void onHandle(ResultItems result, Task task); + +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/PatternPipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/PatternPipeline.java new file mode 100644 index 0000000..582b162 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/PatternPipeline.java @@ -0,0 +1,44 @@ +package us.codecraft.webmagic.pipeline; + +import us.codecraft.webmagic.ResultItems; +import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.handler.PatternHandler; + +import java.util.ArrayList; + +/** + * Created with IntelliJ IDEA. + * User: Sebastian MA + * Date: April 04, 2014 + * Time: 20:44 + */ +public class PatternPipeline implements Pipeline { + + protected ArrayList