[BugFix] Only one url from sourceRegion can be extracted #107

master
yihua.huang 2014-04-18 17:48:26 +08:00
parent 08fa3b01c1
commit b06aa489fb
3 changed files with 26 additions and 3 deletions

View File

@ -122,6 +122,16 @@ public class PlainText implements Selectable {
}
}
/**
 * Applies the given custom selector by forwarding to the two-argument
 * {@code select} overload together with this instance's {@code strings}.
 *
 * @param selector custom selector to apply
 * @return whatever the two-argument {@code select} overload returns
 */
@Override
public Selectable select(Selector selector) {
    final Selectable selected = select(selector, strings);
    return selected;
}
/**
 * Applies the given custom selector by forwarding to the two-argument
 * {@code selectList} overload together with this instance's {@code strings}.
 *
 * @param selector custom selector to apply
 * @return whatever the two-argument {@code selectList} overload returns
 */
@Override
public Selectable selectList(Selector selector) {
    final Selectable selected = selectList(selector, strings);
    return selected;
}
@Override
public String toString() {
return get();

View File

@ -128,4 +128,19 @@ public interface Selectable {
*/
public Selectable jsonPath(String jsonPath);
/**
* Extract from this selectable using a caller-supplied custom selector.
*
* @param selector the custom selector to apply
* @return the extraction result as a {@link Selectable}
*/
public Selectable select(Selector selector);
/**
* Extract from this selectable using a caller-supplied custom selector
* (list variant of {@link #select(Selector)}).
* <p>
* NOTE(review): presumably this yields every match of the selector rather
* than a single one; the exact difference from {@link #select(Selector)} is
* defined by the implementations -- confirm there.
*
* @param selector the custom selector to apply
* @return the extraction result as a {@link Selectable}
*/
public Selectable selectList(Selector selector);
}

View File

@ -7,9 +7,7 @@ import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selector;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@ -66,7 +64,7 @@ class ModelPageProcessor implements PageProcessor {
if (urlRegionSelector == null) {
links = page.getHtml().links().all();
} else {
links = urlRegionSelector.selectList(page.getHtml().toString());
links = page.getHtml().selectList(urlRegionSelector).links().all();
}
for (String link : links) {
for (Pattern targetUrlPattern : urlPatterns) {