From 7ffc6998ef8673d2087dbbfc14ff2c00abbcf6a7 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 27 May 2017 16:20:06 +0800 Subject: [PATCH] add isExtractLinks to OOSpider #575 --- .../webmagic/model/ModelPageProcessor.java | 16 ++++++++++++++-- .../us/codecraft/webmagic/model/OOSpider.java | 5 +++++ 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPageProcessor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPageProcessor.java index 6bfe88d..1d9bf25 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPageProcessor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPageProcessor.java @@ -23,6 +23,8 @@ class ModelPageProcessor implements PageProcessor { private Site site; + private boolean extractLinks = true; + public static ModelPageProcessor create(Site site, Class... clazzs) { ModelPageProcessor modelPageProcessor = new ModelPageProcessor(site); for (Class clazz : clazzs) { @@ -45,8 +47,10 @@ class ModelPageProcessor implements PageProcessor { @Override public void process(Page page) { for (PageModelExtractor pageModelExtractor : pageModelExtractorList) { - extractLinks(page, pageModelExtractor.getHelpUrlRegionSelector(), pageModelExtractor.getHelpUrlPatterns()); - extractLinks(page, pageModelExtractor.getTargetUrlRegionSelector(), pageModelExtractor.getTargetUrlPatterns()); + if (extractLinks) { + extractLinks(page, pageModelExtractor.getHelpUrlRegionSelector(), pageModelExtractor.getHelpUrlPatterns()); + extractLinks(page, pageModelExtractor.getTargetUrlRegionSelector(), pageModelExtractor.getTargetUrlPatterns()); + } Object process = pageModelExtractor.process(page); if (process == null || (process instanceof List && ((List) process).size() == 0)) { continue; @@ -83,4 +87,12 @@ class ModelPageProcessor implements PageProcessor { public Site getSite() { return site; } + + public boolean isExtractLinks() { + return extractLinks; + } + + public void setExtractLinks(boolean extractLinks) { + this.extractLinks = extractLinks; + } } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/OOSpider.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/OOSpider.java index 08dc64a..eaabcca 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/OOSpider.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/OOSpider.java @@ -97,4 +97,9 @@ public class OOSpider extends Spider { return this; } + public OOSpider setIsExtractLinks(boolean isExtractLinks){ + modelPageProcessor.setExtractLinks(isExtractLinks); + return this; + } + }