update comments for selector

master
yihua.huang 2013-08-17 21:33:54 +08:00
parent 77e6ca2945
commit 17f8ead28f
10 changed files with 50 additions and 39 deletions

View File

@ -4,10 +4,10 @@ import java.util.ArrayList;
import java.util.List;
/**
* html<br>
* Selectable plain text.<br>
*
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* Time: 7:54
* @since 0.1.0
*/
public class Html extends PlainText {

View File

@ -1,10 +1,10 @@
package us.codecraft.webmagic.selector;
/**
* <br>
* Object contains regex results.<br>
*
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* Time: 7:39
* @since 0.1.0
*/
class RegexResult {

View File

@ -9,10 +9,10 @@ import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
/**
* <br>
* Selector in regex.<br>
*
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* Time: 7:09
* @since 0.1.0
*/
public class RegexSelector implements Selector {

View File

@ -6,10 +6,10 @@ import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
/**
* <br>
* Replace selector<br>
*
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* Time: 7:09
* @since 0.1.0
*/
public class ReplaceSelector implements Selector {

View File

@ -3,10 +3,10 @@ package us.codecraft.webmagic.selector;
import java.util.List;
/**
* <br>
* Selectable text.<br>
*
* @author code4crafter@gmail.com <br>
* Date: 13-4-20
* Time: 7:51
* @since 0.1.0
*/
public interface Selectable {

View File

@ -3,15 +3,24 @@ package us.codecraft.webmagic.selector;
import java.util.List;
/**
* <br>
* Selector(extractor) for text.<br>
* @author code4crafter@gmail.com <br>
* Date: 13-4-20
* Time: 8:02
*/
public interface Selector {
/**
* Extract single result in text.<br>
* If there are more than one result, only the first will be chosen.
* @param text
* @return result
*/
public String select(String text);
/**
* Extract all results in text.<br>
* @param text
* @return results
*/
public List<String> selectList(String text);
}

View File

@ -7,10 +7,10 @@ import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
/**
* selector<br>
* Selector factory with some inner cache.<br>
*
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* Time: 7:56
* @since 0.1.0
*/
public class SelectorFactory {

View File

@ -3,17 +3,19 @@ package us.codecraft.webmagic.selector;
import org.apache.log4j.Logger;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import us.codecraft.webmagic.utils.Experimental;
import java.util.*;
import java.util.concurrent.atomic.AtomicInteger;
/**
* readabilityp
*
* Extract the text content of html.<br>
* Using Readability algorithm: find parents of all p tags.
*
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* Time: 4:42
* @since 0.1.0
*/
@Experimental
public class SmartContentSelector implements Selector {
private Logger logger = Logger.getLogger(getClass());

View File

@ -6,10 +6,10 @@ import java.util.ArrayList;
import java.util.List;
/**
* xpathHtmlCleaner<br>
* XPath selector based on HtmlCleaner<br>
*
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* Time: 9:39
* @since 0.1.0
*/
public class XpathSelector implements Selector {

View File

@ -1,5 +1,5 @@
<html>
<body>
提供了便捷抽取页面内容的工具。对外核心接口是Selectable，内部抽取则是通过实现Selector来定制。
Selectors for page extraction. The core API is the interface Selectable, and the internal core is the interface Selector.
</body>
</html>