extractors
parent
20705b34ac
commit
c1471718df
|
@ -43,6 +43,8 @@ public class CssSelector implements Selector {
|
||||||
private String getValue(Element element) {
|
private String getValue(Element element) {
|
||||||
if (attrName == null) {
|
if (attrName == null) {
|
||||||
return element.outerHtml();
|
return element.outerHtml();
|
||||||
|
} else if ("innerHtml".equalsIgnoreCase(attrName)) {
|
||||||
|
return element.html();
|
||||||
} else {
|
} else {
|
||||||
return element.attr(attrName);
|
return element.attr(attrName);
|
||||||
}
|
}
|
||||||
|
|
|
@ -26,9 +26,9 @@ public class OrSelector implements Selector {
|
||||||
@Override
|
@Override
|
||||||
public String select(String text) {
|
public String select(String text) {
|
||||||
for (Selector selector : selectors) {
|
for (Selector selector : selectors) {
|
||||||
text = selector.select(text);
|
String result = selector.select(text);
|
||||||
if (text != null) {
|
if (result != null) {
|
||||||
return text;
|
return result;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return null;
|
return null;
|
||||||
|
|
|
@ -20,7 +20,9 @@ public class RegexSelector implements Selector {
|
||||||
|
|
||||||
private Pattern regex;
|
private Pattern regex;
|
||||||
|
|
||||||
public RegexSelector(String regexStr) {
|
private int group = 1;
|
||||||
|
|
||||||
|
public RegexSelector(String regexStr, int group) {
|
||||||
if (StringUtils.isBlank(regexStr)) {
|
if (StringUtils.isBlank(regexStr)) {
|
||||||
throw new IllegalArgumentException("regex must not be empty");
|
throw new IllegalArgumentException("regex must not be empty");
|
||||||
}
|
}
|
||||||
|
@ -36,11 +38,16 @@ public class RegexSelector implements Selector {
|
||||||
} catch (PatternSyntaxException e) {
|
} catch (PatternSyntaxException e) {
|
||||||
throw new IllegalArgumentException("invalid regex", e);
|
throw new IllegalArgumentException("invalid regex", e);
|
||||||
}
|
}
|
||||||
|
this.group = group;
|
||||||
|
}
|
||||||
|
|
||||||
|
public RegexSelector(String regexStr) {
|
||||||
|
this(regexStr, 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String select(String text) {
|
public String select(String text) {
|
||||||
return selectGroup(text).get(1);
|
return selectGroup(text).get(group);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -48,7 +55,7 @@ public class RegexSelector implements Selector {
|
||||||
List<String> strings = new ArrayList<String>();
|
List<String> strings = new ArrayList<String>();
|
||||||
List<RegexResult> results = selectGroupList(text);
|
List<RegexResult> results = selectGroupList(text);
|
||||||
for (RegexResult result : results) {
|
for (RegexResult result : results) {
|
||||||
strings.add(result.get(1));
|
strings.add(result.get(group));
|
||||||
}
|
}
|
||||||
return strings;
|
return strings;
|
||||||
}
|
}
|
||||||
|
|
|
@ -27,7 +27,11 @@ public class SelectorFactory {
|
||||||
}
|
}
|
||||||
|
|
||||||
public RegexSelector newRegexSelector(String regex, int group) {
|
public RegexSelector newRegexSelector(String regex, int group) {
|
||||||
return newSelector(RegexSelector.class, regex, String.valueOf(group));
|
String cacheKey = getCacheKey(RegexSelector.class, regex, String.valueOf(group));
|
||||||
|
if (innerCache.get(cacheKey) != null) {
|
||||||
|
return (RegexSelector) innerCache.get(cacheKey);
|
||||||
|
}
|
||||||
|
return new RegexSelector(regex, group);
|
||||||
}
|
}
|
||||||
|
|
||||||
public ReplaceSelector newReplaceSelector(String regex, String replacement) {
|
public ReplaceSelector newReplaceSelector(String regex, String replacement) {
|
||||||
|
|
|
@ -0,0 +1,44 @@
|
||||||
|
package us.codecraft.webmagic.selector;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Convenient methods for selectors.<br>
|
||||||
|
*
|
||||||
|
* @author code4crafter@gmail.com <br>
|
||||||
|
* @since 0.2.1
|
||||||
|
*/
|
||||||
|
public abstract class Selectors {
|
||||||
|
|
||||||
|
public static RegexSelector regex(String expr) {
|
||||||
|
return SelectorFactory.getInstatnce().newRegexSelector(expr);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static RegexSelector regex(String expr, int group) {
|
||||||
|
return SelectorFactory.getInstatnce().newRegexSelector(expr, group);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static CssSelector $(String expr) {
|
||||||
|
return new CssSelector(expr);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static CssSelector $(String expr, String attrName) {
|
||||||
|
return new CssSelector(expr, attrName);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static XpathSelector xpath(String expr) {
|
||||||
|
return SelectorFactory.getInstatnce().newXpathSelector(expr);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static AndSelector and(Selector... selectors) {
|
||||||
|
return new AndSelector(selectors);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static OrSelector or(Selector... selectors) {
|
||||||
|
return new OrSelector(selectors);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static void main(String[] args) {
|
||||||
|
String s = "a";
|
||||||
|
or(regex("<title>(.*)</title>"), xpath("//title"), $("title")).select(s);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,34 @@
|
||||||
|
package us.codecraft.webmagic.selector;
|
||||||
|
|
||||||
|
import junit.framework.Assert;
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
|
import static us.codecraft.webmagic.selector.Selectors.*;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author code4crafter@gmail.com <br>
|
||||||
|
*/
|
||||||
|
public class ExtractorsTest {
|
||||||
|
|
||||||
|
String html = "<div><h1>test<a href=\"xxx\">aabbcc</a></h1></div>";
|
||||||
|
|
||||||
|
String html2 = "<title>aabbcc</title>";
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testEach() {
|
||||||
|
Assert.assertEquals("<a href=\"xxx\">aabbcc</a>", $("div h1 a").select(html));
|
||||||
|
Assert.assertEquals("xxx", $("div h1 a", "href").select(html));
|
||||||
|
Assert.assertEquals("aabbcc", $("div h1 a", "innerHtml").select(html));
|
||||||
|
Assert.assertEquals("xxx", xpath("//a/@href").select(html));
|
||||||
|
Assert.assertEquals("xxx", regex("a href=\"(.*)\"").select(html));
|
||||||
|
Assert.assertEquals("xxx", regex("(a href)=\"(.*)\"", 2).select(html));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testCombo() {
|
||||||
|
Assert.assertEquals("bb", and($("title"), regex("aa(bb)cc")).select(html2));
|
||||||
|
OrSelector or = or($("div h1 a", "innerHtml"), xpath("//title"));
|
||||||
|
Assert.assertEquals("aabbcc", or.select(html));
|
||||||
|
Assert.assertEquals("aabbcc", or.select(html2));
|
||||||
|
}
|
||||||
|
}
|
|
@ -5,8 +5,6 @@ import org.junit.Test;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
* Date: 13-4-21
|
|
||||||
* Time: 上午7:13
|
|
||||||
*/
|
*/
|
||||||
public class RegexSelectorTest {
|
public class RegexSelectorTest {
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue