update comments for selector
parent
77e6ca2945
commit
17f8ead28f
|
@ -4,10 +4,10 @@ import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 可抽取的html文本。<br>
|
* Selectable plain text.<br>
|
||||||
|
*
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
* Date: 13-4-21
|
* @since 0.1.0
|
||||||
* Time: 上午7:54
|
|
||||||
*/
|
*/
|
||||||
public class Html extends PlainText {
|
public class Html extends PlainText {
|
||||||
|
|
||||||
|
@ -66,7 +66,7 @@ public class Html extends PlainText {
|
||||||
@Override
|
@Override
|
||||||
public Selectable $(String selector) {
|
public Selectable $(String selector) {
|
||||||
CssSelector cssSelector = new CssSelector(selector);
|
CssSelector cssSelector = new CssSelector(selector);
|
||||||
return selectList(cssSelector,strings);
|
return selectList(cssSelector, strings);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,10 +1,10 @@
|
||||||
package us.codecraft.webmagic.selector;
|
package us.codecraft.webmagic.selector;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 封装正则表达式抽取接口的类。<br>
|
* Object contains regex results.<br>
|
||||||
|
*
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
* Date: 13-4-21
|
* @since 0.1.0
|
||||||
* Time: 上午7:39
|
|
||||||
*/
|
*/
|
||||||
class RegexResult {
|
class RegexResult {
|
||||||
|
|
||||||
|
|
|
@ -9,10 +9,10 @@ import java.util.regex.Pattern;
|
||||||
import java.util.regex.PatternSyntaxException;
|
import java.util.regex.PatternSyntaxException;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 正则表达式抽取器。<br>
|
* Selector in regex.<br>
|
||||||
|
*
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
* Date: 13-4-21
|
* @since 0.1.0
|
||||||
* Time: 上午7:09
|
|
||||||
*/
|
*/
|
||||||
public class RegexSelector implements Selector {
|
public class RegexSelector implements Selector {
|
||||||
|
|
||||||
|
@ -21,18 +21,18 @@ public class RegexSelector implements Selector {
|
||||||
private Pattern regex;
|
private Pattern regex;
|
||||||
|
|
||||||
public RegexSelector(String regexStr) {
|
public RegexSelector(String regexStr) {
|
||||||
if (StringUtils.isBlank(regexStr)){
|
if (StringUtils.isBlank(regexStr)) {
|
||||||
throw new IllegalArgumentException("regex must not be empty");
|
throw new IllegalArgumentException("regex must not be empty");
|
||||||
}
|
}
|
||||||
if (!StringUtils.contains(regexStr,"(")&&!StringUtils.contains(regexStr,")")){
|
if (!StringUtils.contains(regexStr, "(") && !StringUtils.contains(regexStr, ")")) {
|
||||||
regexStr="("+regexStr+")";
|
regexStr = "(" + regexStr + ")";
|
||||||
}
|
}
|
||||||
if (!StringUtils.contains(regexStr,"(")||!StringUtils.contains(regexStr,")")){
|
if (!StringUtils.contains(regexStr, "(") || !StringUtils.contains(regexStr, ")")) {
|
||||||
throw new IllegalArgumentException("regex must have capture group 1");
|
throw new IllegalArgumentException("regex must have capture group 1");
|
||||||
}
|
}
|
||||||
this.regexStr = regexStr;
|
this.regexStr = regexStr;
|
||||||
try {
|
try {
|
||||||
regex = Pattern.compile(regexStr,Pattern.DOTALL|Pattern.CASE_INSENSITIVE);
|
regex = Pattern.compile(regexStr, Pattern.DOTALL | Pattern.CASE_INSENSITIVE);
|
||||||
} catch (PatternSyntaxException e) {
|
} catch (PatternSyntaxException e) {
|
||||||
throw new IllegalArgumentException("invalid regex", e);
|
throw new IllegalArgumentException("invalid regex", e);
|
||||||
}
|
}
|
||||||
|
@ -45,7 +45,7 @@ public class RegexSelector implements Selector {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public List<String> selectList(String text) {
|
public List<String> selectList(String text) {
|
||||||
List<String> strings=new ArrayList<String>();
|
List<String> strings = new ArrayList<String>();
|
||||||
List<RegexResult> results = selectGroupList(text);
|
List<RegexResult> results = selectGroupList(text);
|
||||||
for (RegexResult result : results) {
|
for (RegexResult result : results) {
|
||||||
strings.add(result.get(1));
|
strings.add(result.get(1));
|
||||||
|
@ -56,7 +56,7 @@ public class RegexSelector implements Selector {
|
||||||
public RegexResult selectGroup(String text) {
|
public RegexResult selectGroup(String text) {
|
||||||
Matcher matcher = regex.matcher(text);
|
Matcher matcher = regex.matcher(text);
|
||||||
if (matcher.find()) {
|
if (matcher.find()) {
|
||||||
String[] groups = new String[matcher.groupCount()+1];
|
String[] groups = new String[matcher.groupCount() + 1];
|
||||||
for (int i = 0; i < groups.length; i++) {
|
for (int i = 0; i < groups.length; i++) {
|
||||||
groups[i] = matcher.group(i);
|
groups[i] = matcher.group(i);
|
||||||
}
|
}
|
||||||
|
@ -69,7 +69,7 @@ public class RegexSelector implements Selector {
|
||||||
Matcher matcher = regex.matcher(text);
|
Matcher matcher = regex.matcher(text);
|
||||||
List<RegexResult> resultList = new ArrayList<RegexResult>();
|
List<RegexResult> resultList = new ArrayList<RegexResult>();
|
||||||
while (matcher.find()) {
|
while (matcher.find()) {
|
||||||
String[] groups = new String[matcher.groupCount()+1];
|
String[] groups = new String[matcher.groupCount() + 1];
|
||||||
for (int i = 0; i < groups.length; i++) {
|
for (int i = 0; i < groups.length; i++) {
|
||||||
groups[i] = matcher.group(i);
|
groups[i] = matcher.group(i);
|
||||||
}
|
}
|
||||||
|
|
|
@ -6,10 +6,10 @@ import java.util.regex.Pattern;
|
||||||
import java.util.regex.PatternSyntaxException;
|
import java.util.regex.PatternSyntaxException;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 对文本进行替换。<br>
|
* Replace selector.<br>
|
||||||
|
*
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
* Date: 13-4-21
|
* @since 0.1.0
|
||||||
* Time: 上午7:09
|
|
||||||
*/
|
*/
|
||||||
public class ReplaceSelector implements Selector {
|
public class ReplaceSelector implements Selector {
|
||||||
|
|
||||||
|
|
|
@ -3,10 +3,10 @@ package us.codecraft.webmagic.selector;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 可进行抽取的文本。<br>
|
* Selectable text.<br>
|
||||||
|
*
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
* Date: 13-4-20
|
* @since 0.1.0
|
||||||
* Time: 下午7:51
|
|
||||||
*/
|
*/
|
||||||
public interface Selectable {
|
public interface Selectable {
|
||||||
|
|
||||||
|
|
|
@ -3,15 +3,24 @@ package us.codecraft.webmagic.selector;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 抽取器。<br>
|
* Selector (extractor) for text.<br>
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
* Date: 13-4-20
|
|
||||||
* Time: 下午8:02
|
|
||||||
*/
|
*/
|
||||||
public interface Selector {
|
public interface Selector {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Extract single result in text.<br>
|
||||||
|
* If there is more than one result, only the first will be chosen.
|
||||||
|
* @param text
|
||||||
|
* @return result
|
||||||
|
*/
|
||||||
public String select(String text);
|
public String select(String text);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Extract all results in text.<br>
|
||||||
|
* @param text
|
||||||
|
* @return results
|
||||||
|
*/
|
||||||
public List<String> selectList(String text);
|
public List<String> selectList(String text);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -7,10 +7,10 @@ import java.util.Map;
|
||||||
import java.util.concurrent.ConcurrentHashMap;
|
import java.util.concurrent.ConcurrentHashMap;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 产生selector的工厂。<br>
|
* Selector factory with some inner cache.<br>
|
||||||
|
*
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
* Date: 13-4-21
|
* @since 0.1.0
|
||||||
* Time: 上午7:56
|
|
||||||
*/
|
*/
|
||||||
public class SelectorFactory {
|
public class SelectorFactory {
|
||||||
|
|
||||||
|
@ -34,7 +34,7 @@ public class SelectorFactory {
|
||||||
return newSelector(XpathSelector.class, xpath);
|
return newSelector(XpathSelector.class, xpath);
|
||||||
}
|
}
|
||||||
|
|
||||||
public SmartContentSelector newSmartContentSelector(){
|
public SmartContentSelector newSmartContentSelector() {
|
||||||
return newSelector(SmartContentSelector.class);
|
return newSelector(SmartContentSelector.class);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -3,17 +3,19 @@ package us.codecraft.webmagic.selector;
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
import org.htmlcleaner.HtmlCleaner;
|
import org.htmlcleaner.HtmlCleaner;
|
||||||
import org.htmlcleaner.TagNode;
|
import org.htmlcleaner.TagNode;
|
||||||
|
import us.codecraft.webmagic.utils.Experimental;
|
||||||
|
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.util.concurrent.atomic.AtomicInteger;
|
import java.util.concurrent.atomic.AtomicInteger;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* readability算法,基础是找到所有p标签的父节点
|
* Extract the text content of html.<br>
|
||||||
* 写的比较乱,最终效果还在尝试中
|
* Using Readability algorithm: find parents of all p tags.
|
||||||
|
*
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
* Date: 13-4-21
|
* @since 0.1.0
|
||||||
* Time: 下午4:42
|
|
||||||
*/
|
*/
|
||||||
|
@Experimental
|
||||||
public class SmartContentSelector implements Selector {
|
public class SmartContentSelector implements Selector {
|
||||||
|
|
||||||
private Logger logger = Logger.getLogger(getClass());
|
private Logger logger = Logger.getLogger(getClass());
|
||||||
|
|
|
@ -6,10 +6,10 @@ import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* xpath的选择器。包装了HtmlCleaner。<br>
|
* XPath selector based on HtmlCleaner.<br>
|
||||||
|
*
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
* Date: 13-4-21
|
* @since 0.1.0
|
||||||
* Time: 上午9:39
|
|
||||||
*/
|
*/
|
||||||
public class XpathSelector implements Selector {
|
public class XpathSelector implements Selector {
|
||||||
|
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
<html>
|
<html>
|
||||||
<body>
|
<body>
|
||||||
提供了便捷抽取页面内容的工具,对外核心接口是Selectable,内部抽取则是通过实现Selector来定制。
|
Selectors for page extraction. The core API is the interface Selectable, and the internal core is the interface Selector.
|
||||||
</body>
|
</body>
|
||||||
</html>
|
</html>
|
||||||
|
|
Loading…
Reference in New Issue