update java docs
parent
96454fd74c
commit
827972d80f
|
@ -22,7 +22,7 @@ public interface Scheduler {
|
|||
/**
|
||||
* 返回下一个要抓取的链接
|
||||
* @param task 定义的任务,以满足单Scheduler多Task的情况
|
||||
* @return
|
||||
* @return 下一个要抓取的链接
|
||||
*/
|
||||
public Request poll(Task task);
|
||||
|
||||
|
|
|
@ -4,6 +4,7 @@ import java.util.ArrayList;
|
|||
import java.util.List;
|
||||
|
||||
/**
|
||||
* 可抽取的html文本。<br>
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-4-21
|
||||
* Time: 上午7:54
|
||||
|
|
|
@ -6,6 +6,7 @@ import java.util.ArrayList;
|
|||
import java.util.List;
|
||||
|
||||
/**
|
||||
* 可抽取的纯文本,不包括xpath和css selector实现。<br>
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-4-21
|
||||
* Time: 上午7:54
|
||||
|
|
|
@ -9,6 +9,7 @@ import java.util.regex.Pattern;
|
|||
import java.util.regex.PatternSyntaxException;
|
||||
|
||||
/**
|
||||
* 正则表达式抽取器。<br>
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-4-21
|
||||
* Time: 上午7:09
|
||||
|
|
|
@ -6,6 +6,7 @@ import java.util.regex.Pattern;
|
|||
import java.util.regex.PatternSyntaxException;
|
||||
|
||||
/**
|
||||
* 对文本进行替换。<br>
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-4-21
|
||||
* Time: 上午7:09
|
||||
|
|
|
@ -3,6 +3,7 @@ package us.codecraft.webmagic.selector;
|
|||
import java.util.List;
|
||||
|
||||
/**
|
||||
* 可进行抽取的文本。<br>
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-4-20
|
||||
* Time: 下午7:51
|
||||
|
@ -20,8 +21,8 @@ public interface Selectable {
|
|||
/**
|
||||
* select list with css selector
|
||||
*
|
||||
* @param
|
||||
* @return
|
||||
* @param selector css selector expression
|
||||
* @return new Selectable after extraction
|
||||
*/
|
||||
public Selectable $(String selector);
|
||||
|
||||
|
|
|
@ -3,6 +3,7 @@ package us.codecraft.webmagic.selector;
|
|||
import java.util.List;
|
||||
|
||||
/**
|
||||
* 抽取器。<br>
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-4-20
|
||||
* Time: 下午8:02
|
||||
|
|
|
@ -7,6 +7,7 @@ import java.util.Map;
|
|||
import java.util.concurrent.ConcurrentHashMap;
|
||||
|
||||
/**
|
||||
* 产生selector的工厂。<br>
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-4-21
|
||||
* Time: 上午7:56
|
||||
|
|
|
@ -6,6 +6,7 @@ import java.util.concurrent.ThreadPoolExecutor;
|
|||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
/**
|
||||
* 线程工具类。<br>
|
||||
* @author code4crafter@gmail.com
|
||||
* Date: 13-6-23
|
||||
* Time: 下午7:11
|
||||
|
|
|
@ -6,6 +6,7 @@ import java.util.regex.Matcher;
|
|||
import java.util.regex.Pattern;
|
||||
|
||||
/**
|
||||
* url及html处理工具类。<br>
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-4-21
|
||||
* Time: 下午1:52
|
||||
|
@ -18,7 +19,7 @@ public class UrlUtils {
|
|||
* 将url相对地址转化为绝对地址
|
||||
* @param url url地址
|
||||
* @param refer url地址来自哪个页面
|
||||
* @return
|
||||
* @return url绝对地址
|
||||
*/
|
||||
public static String canonicalizeUrl(String url, String refer) {
|
||||
if (StringUtils.isBlank(url) || StringUtils.isBlank(refer)) {
|
||||
|
|
Loading…
Reference in New Issue