From 38a12f864113b383824a1903a1b780487810102d Mon Sep 17 00:00:00 2001
From: Tian
Date: Fri, 4 Apr 2014 22:02:52 +0800
Subject: [PATCH] new feature: PatternProcessor

---
 .../example/PatternProcessorDemo.java      |  57 +++++++
 .../webmagic/handler/PatternHandler.java   | 144 ++++++++++++++++++
 .../webmagic/pipeline/PatternPipeline.java |  59 +++++++
 .../processor/PatternPageProcessor.java    | 102 +++++++++++++
 4 files changed, 362 insertions(+)
 create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/example/PatternProcessorDemo.java
 create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/handler/PatternHandler.java
 create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/PatternPipeline.java
 create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/processor/PatternPageProcessor.java

diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/PatternProcessorDemo.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/PatternProcessorDemo.java
new file mode 100644
index 0000000..51a9484
--- /dev/null
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/PatternProcessorDemo.java
@@ -0,0 +1,57 @@
+package us.codecraft.webmagic.example;
+
+import org.apache.log4j.Logger;
+import us.codecraft.webmagic.Page;
+import us.codecraft.webmagic.ResultItems;
+import us.codecraft.webmagic.Spider;
+import us.codecraft.webmagic.Task;
+import us.codecraft.webmagic.handler.PatternHandler;
+import us.codecraft.webmagic.pipeline.PatternPipeline;
+import us.codecraft.webmagic.processor.PatternPageProcessor;
+
+/**
+ * Created with IntelliJ IDEA.
+ * User: Sebastian MA
+ * Date: April 04, 2014
+ * Time: 21:23
+ * <p>
+ * Demonstrates how one PatternHandler is registered with both a PatternPageProcessor and a
+ * PatternPipeline, which are then handed to a Spider.
+ */
+public class PatternProcessorDemo {
+
+    private static final Logger log = Logger.getLogger(PatternProcessorDemo.class);
+
+    public static void main(String... args) {
+
+        PatternPageProcessor processor
+                = new PatternPageProcessor("http://item.jd.com/981821.html",
+                PatternPageProcessor.TARGET_PATTERN_ALL
+        );
+
+        PatternPipeline pipeline = new PatternPipeline();
+
+        // define a handler which handles only "http://item.jd.com/.*"
+        PatternHandler handler = new PatternHandler("http://item.jd.com/.*") {
+
+            @Override
+            public void onExtract(Page page) {
+
+                log.info("Extracting from " + page.getUrl());
+                page.putField("test", "hello world:)");
+            }
+
+            @Override
+            public void onHandle(ResultItems result, Task task) {
+
+                log.info("Handling " + result.getRequest().getUrl());
+                log.info("Retrieved test=" + result.get("test"));
+            }
+        };
+
+        // one call adds the handler to both the processor and the pipeline
+        handler.register(processor, pipeline);
+
+        Spider.create(processor).thread(5).addPipeline(pipeline).runAsync();
+    }
+}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/PatternHandler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/PatternHandler.java
new file mode 100644
index 0000000..51e44e0
--- /dev/null
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/PatternHandler.java
@@ -0,0 +1,144 @@
+package us.codecraft.webmagic.handler;
+
+import us.codecraft.webmagic.Page;
+import us.codecraft.webmagic.ResultItems;
+import us.codecraft.webmagic.Task;
+import us.codecraft.webmagic.pipeline.PatternPipeline;
+import us.codecraft.webmagic.processor.PatternPageProcessor;
+
+import java.util.UUID;
+
+/**
+ * Created with IntelliJ IDEA.
+ * User: Sebastian MA
+ * Date: April 03, 2014
+ * Time: 10:00
+ * <p>
+ * A PatternHandler is in charge of both page extraction and data processing by implementing
+ * its two abstract methods.
+ */
+public abstract class PatternHandler {
+
+    /**
+     * identity of the handler.
+     */
+    protected String id;
+
+    /**
+     * match pattern. only matched pages should be handled.
+     */
+    protected String pattern;
+
+    /**
+     * @param pattern
+     *         url pattern (regular expression) to handle
+     */
+    protected PatternHandler(String pattern) {
+
+        this.pattern = pattern;
+        this.id = UUID.randomUUID().toString();
+    }
+
+    /**
+     * determines if the page should be handled.
+     *
+     * @param url
+     *         url of the page, may be null
+     * @return true if the whole url matches the pattern; false otherwise, or if url is null
+     */
+    public boolean match(String url) {
+
+        return url != null && url.matches(pattern);
+    }
+
+    /**
+     * registers to both the page processor and the pipeline so the handler can take charge of
+     * both ends of the procedure.
+     *
+     * @param processor
+     *         the processor to handle
+     * @param pipeline
+     *         the pipeline to handle
+     */
+    public void register(PatternPageProcessor processor, PatternPipeline pipeline) {
+
+        processor.addHandler(this);
+        pipeline.addHandler(this);
+    }
+
+    /**
+     * removes this handler from both the page processor and the pipeline.
+     *
+     * @param processor
+     *         the processor to leave
+     * @param pipeline
+     *         the pipeline to leave
+     */
+    public void unregister(PatternPageProcessor processor, PatternPipeline pipeline) {
+
+        processor.removeHandler(this);
+        pipeline.removeHandler(this);
+    }
+
+    /**
+     * extracts from the page if its url matches the pattern.
+     *
+     * @param page
+     *         the page to extract
+     * @return true if the page was handled, false if its url did not match
+     */
+    public boolean process(Page page) {
+
+        if(!match(page.getUrl().toString())) {
+            return false;
+        }
+
+        // leave a marker under this handler's id; process(ResultItems, Task) checks for it
+        page.putField(id, true);
+        onExtract(page);
+        return true;
+    }
+
+    /**
+     * handles the extraction result if it is not skipped, its request url matches the pattern
+     * and it carries the marker of this handler.
+     *
+     * @param resultItems
+     *         extraction result
+     * @param task
+     *         the task passed on to onHandle
+     * @return true if the result was handled, false otherwise
+     */
+    public boolean process(ResultItems resultItems, Task task) {
+
+        if(resultItems.isSkip()) {
+            return false;
+        }
+
+        if(match(resultItems.getRequest().getUrl()) && resultItems.get(id) != null) {
+            onHandle(resultItems, task);
+            return true;
+        } else {
+            return false;
+        }
+    }
+
+    /**
+     * implement this method to extract from page.
+     *
+     * @param page
+     *         the page to extract
+     */
+    public abstract void onExtract(Page page);
+
+    /**
+     * implement this method to handle the extraction result.
+     *
+     * @param result
+     *         extraction result
+     * @param task
+     *         the task passed to process(ResultItems, Task)
+     */
+    public abstract void onHandle(ResultItems result, Task task);
+
+}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/PatternPipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/PatternPipeline.java
new file mode 100644
index 0000000..582b162
--- /dev/null
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/PatternPipeline.java
@@ -0,0 +1,59 @@
+package us.codecraft.webmagic.pipeline;
+
+import us.codecraft.webmagic.ResultItems;
+import us.codecraft.webmagic.Task;
+import us.codecraft.webmagic.handler.PatternHandler;
+
+import java.util.ArrayList;
+
+/**
+ * Created with IntelliJ IDEA.
+ * User: Sebastian MA
+ * Date: April 04, 2014
+ * Time: 20:44
+ * <p>
+ * A PatternPipeline passes every extraction result to each of its PatternHandlers.
+ *
+ * @see us.codecraft.webmagic.handler.PatternHandler
+ */
+public class PatternPipeline implements Pipeline {
+
+    protected ArrayList<PatternHandler> handlers = new ArrayList<PatternHandler>();
+
+    /**
+     * A handler works only if it is added to BOTH the page processor and the pipeline.
+     * Use PatternHandler's register instead.
+     *
+     * @param handler the pattern handler
+     *
+     * @see PatternHandler#register
+     */
+    public void addHandler(PatternHandler handler) {
+
+        handlers.add(handler);
+    }
+
+    /**
+     * Removes a handler from this pipeline only. Use PatternHandler's unregister to remove it
+     * from the page processor as well.
+     *
+     * @param handler the pattern handler
+     *
+     * @see PatternHandler#unregister
+     */
+    public void removeHandler(PatternHandler handler) {
+
+        handlers.remove(handler);
+    }
+
+    /**
+     * Offers the result to every handler; each handler decides by itself whether to handle it.
+     */
+    @Override
+    public void process(ResultItems resultItems, Task task) {
+
+        for(PatternHandler handler : handlers) {
+            handler.process(resultItems, task);
+        }
+    }
+}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/processor/PatternPageProcessor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/processor/PatternPageProcessor.java
new file mode 100644
index 0000000..d7d909c
--- /dev/null
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/processor/PatternPageProcessor.java
@@ -0,0 +1,102 @@
+package us.codecraft.webmagic.processor;
+
+import us.codecraft.webmagic.Page;
+import us.codecraft.webmagic.Site;
+import us.codecraft.webmagic.handler.PatternHandler;
+import us.codecraft.webmagic.utils.UrlUtils;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Created with IntelliJ IDEA.
+ * User: Sebastian MA
+ * Date: April 04, 2014
+ * Time: 15:36
+ * <p>
+ * A PatternPageProcessor uses PatternHandler to set up extraction rules for specific url patterns.
+ *
+ * @see us.codecraft.webmagic.handler.PatternHandler
+ */
+public class PatternPageProcessor implements PageProcessor {
+
+    /**
+     * target pattern which accepts any http:// url.
+     */
+    public static final String TARGET_PATTERN_ALL = "http://*";
+
+    protected Site site;
+
+    /**
+     * regular expression built from the target pattern given to the constructor.
+     */
+    protected String targetPattern;
+
+    protected ArrayList<PatternHandler> handlers = new ArrayList<PatternHandler>();
+
+    /**
+     * @param startUrl
+     *         url to start crawling from, its domain becomes the domain of the site
+     * @param targetPattern
+     *         pattern of the links to follow; "." is taken literally and "*" is a wildcard
+     */
+    public PatternPageProcessor(String startUrl, String targetPattern) {
+
+        this.site = Site.me().addStartUrl(startUrl).setDomain(UrlUtils.getDomain(startUrl));
+        // "*" becomes a run of any characters except quotes and '#'
+        this.targetPattern = "(" + targetPattern.replace(".", "\\.").replace("*",
+                "[^\"'#]*") + ")";
+
+        site.setUserAgent("Chrome/5.0.354.0");
+    }
+
+    /**
+     * Queues every link of the page which matches the target pattern, then lets each handler
+     * whose pattern matches the page url extract from the page.
+     */
+    @Override
+    public void process(Page page) {
+
+        List<String> requests = page.getHtml().links().regex(targetPattern).all();
+        page.addTargetRequests(requests);
+
+        String url = page.getUrl().toString();
+        for(PatternHandler handler : handlers) {
+            if(handler.match(url)) {
+                handler.process(page);
+            }
+        }
+    }
+
+    /**
+     * A handler works only if it is added to BOTH the page processor and the pipeline.
+     * Use PatternHandler's register instead.
+     *
+     * @param handler the pattern handler
+     *
+     * @see PatternHandler#register
+     */
+    public void addHandler(PatternHandler handler) {
+
+        handlers.add(handler);
+    }
+
+    /**
+     * Removes a handler from this processor only. Use PatternHandler's unregister to remove it
+     * from the pipeline as well.
+     *
+     * @param handler the pattern handler
+     *
+     * @see PatternHandler#unregister
+     */
+    public void removeHandler(PatternHandler handler) {
+
+        handlers.remove(handler);
+    }
+
+    @Override
+    public Site getSite() {
+
+        return site;
+    }
+}