From b06aa489fba007ba1e634f792045f0b52caf3c80 Mon Sep 17 00:00:00 2001
From: "yihua.huang"
Date: Fri, 18 Apr 2014 17:48:26 +0800
Subject: [PATCH] [BugFix]Only one url from sourceRegion can be extracted #107

---
 .../us/codecraft/webmagic/selector/PlainText.java | 10 ++++++++++
 .../codecraft/webmagic/selector/Selectable.java   | 15 +++++++++++++++
 .../webmagic/model/ModelPageProcessor.java        |  4 +---
 3 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java
index ca40fac..efa38d8 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java
@@ -122,6 +122,16 @@ public class PlainText implements Selectable {
         }
     }
 
+    @Override
+    public Selectable select(Selector selector) {
+        return select(selector, strings);
+    }
+
+    @Override
+    public Selectable selectList(Selector selector) {
+        return selectList(selector, strings);
+    }
+
     @Override
     public String toString() {
         return get();
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java
index cdab8bf..2cc4ed9 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java
@@ -128,4 +128,19 @@ public interface Selectable {
      */
     public Selectable jsonPath(String jsonPath);
 
+    /**
+     * Extracts content from this selectable using a custom {@link Selector}.
+     *
+     * @param selector the custom selector to apply
+     * @return a {@link Selectable} for the extraction result, so further calls can be chained
+     */
+    public Selectable select(Selector selector);
+
+    /**
+     * Extracts content using a custom {@link Selector}; presumably keeps every match, not only one (TODO confirm).
+     *
+     * @param selector the custom selector to apply
+     * @return a {@link Selectable} for the extraction result, so calls such as {@code links()} can be chained
+     */
+    public Selectable selectList(Selector selector);
 }
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPageProcessor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPageProcessor.java
index 3a97e1d..6bfe88d 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPageProcessor.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPageProcessor.java
@@ -7,9 +7,7 @@ import us.codecraft.webmagic.processor.PageProcessor;
 import us.codecraft.webmagic.selector.Selector;
 
 import java.util.ArrayList;
-import java.util.HashSet;
 import java.util.List;
-import java.util.Set;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
@@ -66,7 +64,7 @@ class ModelPageProcessor implements PageProcessor {
         if (urlRegionSelector == null) {
             links = page.getHtml().links().all();
         } else {
-            links = urlRegionSelector.selectList(page.getHtml().toString());
+            links = page.getHtml().selectList(urlRegionSelector).links().all();
         }
         for (String link : links) {
             for (Pattern targetUrlPattern : urlPatterns) {