extractors
parent
20705b34ac
commit
c1471718df
|
@ -43,6 +43,8 @@ public class CssSelector implements Selector {
|
|||
private String getValue(Element element) {
|
||||
if (attrName == null) {
|
||||
return element.outerHtml();
|
||||
} else if ("innerHtml".equalsIgnoreCase(attrName)) {
|
||||
return element.html();
|
||||
} else {
|
||||
return element.attr(attrName);
|
||||
}
|
||||
|
|
|
@ -26,9 +26,9 @@ public class OrSelector implements Selector {
|
|||
@Override
|
||||
public String select(String text) {
|
||||
for (Selector selector : selectors) {
|
||||
text = selector.select(text);
|
||||
if (text != null) {
|
||||
return text;
|
||||
String result = selector.select(text);
|
||||
if (result != null) {
|
||||
return result;
|
||||
}
|
||||
}
|
||||
return null;
|
||||
|
|
|
@ -20,7 +20,9 @@ public class RegexSelector implements Selector {
|
|||
|
||||
private Pattern regex;
|
||||
|
||||
public RegexSelector(String regexStr) {
|
||||
private int group = 1;
|
||||
|
||||
public RegexSelector(String regexStr, int group) {
|
||||
if (StringUtils.isBlank(regexStr)) {
|
||||
throw new IllegalArgumentException("regex must not be empty");
|
||||
}
|
||||
|
@ -36,11 +38,16 @@ public class RegexSelector implements Selector {
|
|||
} catch (PatternSyntaxException e) {
|
||||
throw new IllegalArgumentException("invalid regex", e);
|
||||
}
|
||||
this.group = group;
|
||||
}
|
||||
|
||||
/**
 * Creates a selector for {@code regexStr} that extracts capture group 1.
 * Delegates to the two-argument constructor with {@code group = 1}.
 *
 * @param regexStr the regular expression, forwarded unchanged
 */
public RegexSelector(String regexStr) {
|
||||
this(regexStr, 1);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String select(String text) {
|
||||
return selectGroup(text).get(1);
|
||||
return selectGroup(text).get(group);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -48,7 +55,7 @@ public class RegexSelector implements Selector {
|
|||
List<String> strings = new ArrayList<String>();
|
||||
List<RegexResult> results = selectGroupList(text);
|
||||
for (RegexResult result : results) {
|
||||
strings.add(result.get(1));
|
||||
strings.add(result.get(group));
|
||||
}
|
||||
return strings;
|
||||
}
|
||||
|
|
|
@ -27,7 +27,11 @@ public class SelectorFactory {
|
|||
}
|
||||
|
||||
public RegexSelector newRegexSelector(String regex, int group) {
|
||||
return newSelector(RegexSelector.class, regex, String.valueOf(group));
|
||||
String cacheKey = getCacheKey(RegexSelector.class, regex, String.valueOf(group));
|
||||
if (innerCache.get(cacheKey) != null) {
|
||||
return (RegexSelector) innerCache.get(cacheKey);
|
||||
}
|
||||
return new RegexSelector(regex, group);
|
||||
}
|
||||
|
||||
public ReplaceSelector newReplaceSelector(String regex, String replacement) {
|
||||
|
|
|
@ -0,0 +1,44 @@
|
|||
package us.codecraft.webmagic.selector;
|
||||
|
||||
/**
|
||||
* Convenient methods for selectors.<br>
|
||||
*
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* @since 0.2.1
|
||||
*/
|
||||
public abstract class Selectors {
|
||||
|
||||
public static RegexSelector regex(String expr) {
|
||||
return SelectorFactory.getInstatnce().newRegexSelector(expr);
|
||||
}
|
||||
|
||||
public static RegexSelector regex(String expr, int group) {
|
||||
return SelectorFactory.getInstatnce().newRegexSelector(expr, group);
|
||||
}
|
||||
|
||||
public static CssSelector $(String expr) {
|
||||
return new CssSelector(expr);
|
||||
}
|
||||
|
||||
public static CssSelector $(String expr, String attrName) {
|
||||
return new CssSelector(expr, attrName);
|
||||
}
|
||||
|
||||
public static XpathSelector xpath(String expr) {
|
||||
return SelectorFactory.getInstatnce().newXpathSelector(expr);
|
||||
}
|
||||
|
||||
public static AndSelector and(Selector... selectors) {
|
||||
return new AndSelector(selectors);
|
||||
}
|
||||
|
||||
public static OrSelector or(Selector... selectors) {
|
||||
return new OrSelector(selectors);
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
String s = "a";
|
||||
or(regex("<title>(.*)</title>"), xpath("//title"), $("title")).select(s);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,34 @@
|
|||
package us.codecraft.webmagic.selector;
|
||||
|
||||
import junit.framework.Assert;
|
||||
import org.junit.Test;
|
||||
|
||||
import static us.codecraft.webmagic.selector.Selectors.*;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com <br>
|
||||
*/
|
||||
public class ExtractorsTest {
|
||||
|
||||
String html = "<div><h1>test<a href=\"xxx\">aabbcc</a></h1></div>";
|
||||
|
||||
String html2 = "<title>aabbcc</title>";
|
||||
|
||||
@Test
|
||||
public void testEach() {
|
||||
Assert.assertEquals("<a href=\"xxx\">aabbcc</a>", $("div h1 a").select(html));
|
||||
Assert.assertEquals("xxx", $("div h1 a", "href").select(html));
|
||||
Assert.assertEquals("aabbcc", $("div h1 a", "innerHtml").select(html));
|
||||
Assert.assertEquals("xxx", xpath("//a/@href").select(html));
|
||||
Assert.assertEquals("xxx", regex("a href=\"(.*)\"").select(html));
|
||||
Assert.assertEquals("xxx", regex("(a href)=\"(.*)\"", 2).select(html));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCombo() {
|
||||
Assert.assertEquals("bb", and($("title"), regex("aa(bb)cc")).select(html2));
|
||||
OrSelector or = or($("div h1 a", "innerHtml"), xpath("//title"));
|
||||
Assert.assertEquals("aabbcc", or.select(html));
|
||||
Assert.assertEquals("aabbcc", or.select(html2));
|
||||
}
|
||||
}
|
|
@ -5,8 +5,6 @@ import org.junit.Test;
|
|||
|
||||
/**
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-4-21
|
||||
* Time: 上午7:13
|
||||
*/
|
||||
public class RegexSelectorTest {
|
||||
|
||||
|
|
Loading…
Reference in New Issue