update comments

parent 6cc1d62a08
commit 5f1f4cbc46
@@ -8,30 +8,19 @@ import java.util.ArrayList;
import java.util.List;

/**
 * <pre class="zh">
 * Page保存了上一次抓取的结果,并可定义待抓取的链接内容。
 *
 * 主要方法:
 * {@link #getUrl()} 获取页面的Url
 * {@link #getHtml()} 获取页面的html内容
 * {@link #putField(String, Object)} 保存抽取的结果
 * {@link #getResultItems()} 获取抽取的结果,在 {@link us.codecraft.webmagic.pipeline.Pipeline} 中调用
 * {@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} 添加待抓取的链接
 *
 * </pre>
 * <pre class="en">
 * Store extracted result and urls to be crawled.
 *
 * Main method:
 * {@link #getUrl()} get url of current page
 * {@link #getHtml()} get content of current page
 * {@link #putField(String, Object)} save extracted result
 * {@link #getResultItems()} get extract results to be used in {@link us.codecraft.webmagic.pipeline.Pipeline}
 * {@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} add urls to crawl
 *
 * </pre>
 * Object storing extracted result and urls to be crawled.<br>
 * Main method: <br>
 * {@link #getUrl()} get url of current page <br>
 * {@link #getHtml()} get content of current page <br>
 * {@link #putField(String, Object)} save extracted result <br>
 * {@link #getResultItems()} get extract results to be used in {@link us.codecraft.webmagic.pipeline.Pipeline}<br>
 * {@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} add urls to crawl <br>
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.1.0
 * @see us.codecraft.webmagic.downloader.Downloader
 * @see us.codecraft.webmagic.processor.PageProcessor
 */
public class Page {

@@ -55,19 +44,19 @@ public class Page {
    }

    /**
     * store extract results
     *
     *
     * @param key 结果的key
     * @param field 结果的value
     * @param key
     * @param field
     */
    public void putField(String key, Object field) {
        resultItems.put(key, field);
    }

    /**
     * 获取页面的html内容
     * get html content of page
     *
     * @return html 页面的html内容
     * @return html
     */
    public Selectable getHtml() {
        return html;

@@ -82,9 +71,9 @@ public class Page {
    }

    /**
     * 添加待抓取的链接
     * add urls to crawl
     *
     * @param requests 待抓取的链接
     * @param requests
     */
    public void addTargetRequests(List<String> requests) {
        synchronized (targetRequests) {

@@ -99,9 +88,9 @@ public class Page {
    }

    /**
     * 添加待抓取的链接
     * add url to crawl
     *
     * @param requestString 待抓取的链接
     * @param requestString
     */
    public void addTargetRequest(String requestString) {
        if (StringUtils.isBlank(requestString) || requestString.equals("#")) {

@@ -114,9 +103,9 @@ public class Page {
    }

    /**
     * 添加待抓取的页面,在需要传递附加信息时使用
     * add requests to crawl
     *
     * @param request 待抓取的页面
     * @param request
     */
    public void addTargetRequest(Request request) {
        synchronized (targetRequests) {

@@ -125,27 +114,22 @@ public class Page {
    }

    /**
     * 获取页面的Url
     * get url of current page
     *
     * @return url 当前页面的url,可用于抽取
     * @return url of current page
     */
    public Selectable getUrl() {
        return url;
    }

    /**
     * 设置url
     *
     * @param url
     */
    public void setUrl(Selectable url) {
        this.url = url;
    }

    /**
     * 获取抓取请求
     * get request of current page
     *
     * @return request 抓取请求
     * @return request
     */
    public Request getRequest() {
        return request;
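
The Page methods documented above are the working surface of a PageProcessor. A minimal sketch of such a processor, assuming the Selectable chain (links/regex/xpath/all) from this codebase's selector package; the domain, regex, and XPath are illustrative only:

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;

public class BlogPageProcessor implements PageProcessor {

    // illustrative settings; the Site options appear later in this commit
    private Site site = Site.me().setDomain("my.oschina.net").setSleepTime(500);

    @Override
    public void process(Page page) {
        // queue follow-up links extracted from the current page
        page.addTargetRequests(page.getHtml().links().regex(".*blog.*").all());
        // store an extracted field; Pipelines read it back via getResultItems()
        page.putField("title", page.getHtml().xpath("//title").toString());
    }

    @Override
    public Site getSite() {
        return site;
    }
}
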
@@ -1,33 +1,17 @@
package us.codecraft.webmagic;

import us.codecraft.webmagic.utils.Experimental;

import java.io.Serializable;
import java.util.HashMap;
import java.util.Map;

/**
 * <div class="zh">
 * Request对象封装了待抓取的url信息。<br/>
 * 在PageProcessor中,Request对象可以通过{@link us.codecraft.webmagic.Page#getRequest()} 获取。<br/>
 * <br/>
 * Request对象包含一个extra属性,可以写入一些必须的上下文,这个特性在某些场合会有用。<br/>
 * <pre>
 * Example:
 * 抓取<a href="${link}">${linktext}</a>时,希望提取链接link,并保存linktext的信息。
 * 在上一个页面:
 * public void process(Page page){
 *     Request request = new Request(link,linktext);
 *     page.addTargetRequest(request)
 * }
 * 在下一个页面:
 * public void process(Page page){
 *     String linktext = (String)page.getRequest().getExtra()[0];
 * }
 * </pre>
 * </div>
 * Object contains url to crawl.<br>
 * It contains some additional information.<br>
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-4-21
 * Time: 上午11:37
 * @since 0.1.0
 */
public class Request implements Serializable {

@@ -36,20 +20,22 @@ public class Request implements Serializable {
    private String url;

    /**
     * 额外参数,可以保存一些需要的上下文信息
     * Store additional information in extras.
     */
    private Map<String, Object> extras;

    /**
     * Priority of the request.<br>
     * The bigger will be processed earlier. <br>
     * Need a scheduler supporting priority.<br>
     * But no scheduler in webmagic supporting priority now (:
     */
    @Experimental
    private double priority;

    public Request() {
    }

    /**
     * 构建一个request对象
     *
     * @param url 必须参数,待抓取的url
     */
    public Request(String url) {
        this.url = url;
    }

@@ -59,12 +45,14 @@ public class Request implements Serializable {
    }

    /**
     * 设置优先级,用于URL队列排序<br>
     * 需扩展Scheduler<br>
     * 目前还没有对应支持优先级的Scheduler实现 =。= <br>
     * @param priority 优先级,越大则越靠前
     * Set the priority of request for sorting.<br>
     * Need a scheduler supporting priority.<br>
     * But no scheduler in webmagic supporting priority now (:
     *
     * @param priority
     * @return this
     */
    @Experimental
    public Request setPriority(double priority) {
        this.priority = priority;
        return this;

@@ -85,11 +73,6 @@ public class Request implements Serializable {
        return this;
    }

    /**
     * 获取待抓取的url
     *
     * @return url 待抓取的url
     */
    public String getUrl() {
        return url;
    }
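
The extras map replaces the array-based getExtra() shown in the old Chinese example above: context written on one page can be read back on the page it links to. A sketch assuming fluent putExtra(String, Object)/getExtra(String) accessors for the extras field; the urls and keys are illustrative:

public void process(Page page) {
    if (page.getUrl().toString().endsWith("/list")) {
        // listing page: attach context to the request for the detail page
        Request request = new Request("http://my.oschina.net/blog/1");
        request.putExtra("linktext", "My first post");   // assumed accessor
        page.addTargetRequest(request);
    } else {
        // detail page: read the context back
        String linktext = (String) page.getRequest().getExtra("linktext"); // assumed accessor
        page.putField("linktext", linktext);
    }
}
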
@@ -4,10 +4,13 @@ import java.util.HashMap;
import java.util.Map;

/**
 * 保存抽取结果的类,由PageProcessor处理得到,传递给{@link us.codecraft.webmagic.pipeline.Pipeline}进行持久化。<br>
 * Object contains extract results.<br>
 * It is contained in Page and will be processed in pipeline.
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-7-25 <br>
 * Time: 下午12:20 <br>
 * @since 0.1.0
 * @see Page
 * @see us.codecraft.webmagic.pipeline.Pipeline
 */
public class ResultItems {

@@ -25,7 +28,7 @@ public class ResultItems {
        return (T) fields.get(key);
    }

    public Map<String, Object> getAll() {
    public Map<String, Object> getAll() {
        return fields;
    }

@@ -44,8 +47,10 @@ public class ResultItems {
    }

    /**
     * 是否忽略这个页面,用于pipeline来判断是否对这个页面进行处理
     * @return 是否忽略 true 忽略
     * Whether to skip the result.<br>
     * Result which is skipped will not be processed by Pipeline.
     *
     * @return whether to skip the result
     */
    public boolean isSkip() {
        return skip;

@@ -53,8 +58,10 @@ public class ResultItems {

    /**
     * 设置是否忽略这个页面,用于pipeline来判断是否对这个页面进行处理
     * @param skip
     * Set whether to skip the result.<br>
     * Result which is skipped will not be processed by Pipeline.
     *
     * @param skip whether to skip the result
     * @return this
     */
    public ResultItems setSkip(boolean skip) {
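
setSkip is how a PageProcessor tells every Pipeline to ignore a page, e.g. list pages that only feed new links. A short sketch; the url check is illustrative:

public void process(Page page) {
    page.addTargetRequests(page.getHtml().links().all());
    if (page.getUrl().toString().contains("/list/")) {
        // pure navigation page: extract nothing, skip all pipelines
        page.getResultItems().setSkip(true);
    }
}
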
@@ -5,12 +5,11 @@ import us.codecraft.webmagic.utils.UrlUtils;
import java.util.*;

/**
 * Site定义一个待抓取的站点的各种信息。<br>
 * 这个类的所有getter方法,一般都只会被爬虫框架内部进行调用。<br>
 * Object contains setting for crawler.<br>
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-4-21
 * Time: 下午12:13
 * @since 0.1.0
 * @see us.codecraft.webmagic.processor.PageProcessor
 */
public class Site {

@@ -22,6 +21,9 @@ public class Site {
    private String charset;

    /**
     * startUrls is the urls the crawler to start with.
     */
    private List<String> startUrls = new ArrayList<String>();

    private int sleepTime = 3000;

@@ -37,19 +39,19 @@ public class Site {
    }

    /**
     * 创建一个Site对象,等价于new Site()
     * new a Site
     *
     * @return 新建的对象
     * @return new site
     */
    public static Site me() {
        return new Site();
    }

    /**
     * 为这个站点添加一个cookie,可用于抓取某些需要登录访问的站点。这个cookie的域名与{@link #getDomain()}是一致的
     * Add a cookie with domain {@link #getDomain()}
     *
     * @param name cookie的名称
     * @param value cookie的值
     * @param name
     * @param value
     * @return this
     */
    public Site addCookie(String name, String value) {

@@ -58,7 +60,7 @@ public class Site {
    }

    /**
     * 为这个站点设置user-agent,很多网站都对user-agent进行了限制,不设置此选项可能会得到期望之外的结果。
     * set user agent
     *
     * @param userAgent userAgent
     * @return this

@@ -69,27 +71,27 @@ public class Site {
    }

    /**
     * 获取已经设置的所有cookie
     * get cookies
     *
     * @return 已经设置的所有cookie
     * @return get cookies
     */
    public Map<String, String> getCookies() {
        return cookies;
    }

    /**
     * 获取已设置的user-agent
     * get user agent
     *
     * @return 已设置的user-agent
     * @return user agent
     */
    public String getUserAgent() {
        return userAgent;
    }

    /**
     * 获取已设置的domain
     * get domain
     *
     * @return 已设置的domain
     * @return get domain
     */
    public String getDomain() {
        if (domain == null) {

@@ -101,10 +103,9 @@ public class Site {
    }

    /**
     * 设置这个站点所在域名,必须项。<br>
     * 目前不支持多个域名的抓取。抓取多个域名请新建一个Spider。
     * set the domain of site.
     *
     * @param domain 爬虫会抓取的域名
     * @param domain
     * @return this
     */
    public Site setDomain(String domain) {

@@ -113,10 +114,10 @@ public class Site {
    }

    /**
     * 设置页面编码,若不设置则自动根据Html meta信息获取。<br>
     * 一般无需设置encoding,如果发现下载的结果是乱码,则可以设置此项。<br>
     * Set charset of page manually.<br>
     * When charset is not set or set to null, it can be auto detected by Http header.
     *
     * @param charset 编码格式,主要是"utf-8"、"gbk"两种
     * @param charset
     * @return this
     */
    public Site setCharset(String charset) {

@@ -125,20 +126,21 @@ public class Site {
    }

    /**
     * 获取已设置的编码
     * get charset set manually
     *
     * @return 已设置的domain
     * @return charset
     */
    public String getCharset() {
        return charset;
    }

    /**
     * 设置可接受的http状态码,仅当状态码在这个集合中时,才会读取页面内容。<br>
     * 默认为200,正常情况下,无须设置此项。<br>
     * 某些站点会错误的返回状态码,此时可以对这个选项进行设置。<br>
     * Set acceptStatCode.<br>
     * When status code of http response is in acceptStatCodes, it will be processed.<br>
     * {200} by default.<br>
     * It is not necessarily to be set.<br>
     *
     * @param acceptStatCode 可接受的状态码
     * @param acceptStatCode
     * @return this
     */
    public Site setAcceptStatCode(Set<Integer> acceptStatCode) {

@@ -147,27 +149,27 @@ public class Site {
    }

    /**
     * 获取可接受的状态码
     * get acceptStatCode
     *
     * @return 可接受的状态码
     * @return acceptStatCode
     */
    public Set<Integer> getAcceptStatCode() {
        return acceptStatCode;
    }

    /**
     * 获取初始页面的地址列表
     * get start urls
     *
     * @return 初始页面的地址列表
     * @return start urls
     */
    public List<String> getStartUrls() {
        return startUrls;
    }

    /**
     * 增加初始页面的地址,可反复调用此方法增加多个初始地址。
     * Add a url to start url.<br>
     *
     * @param startUrl 初始页面的地址
     * @param startUrl
     * @return this
     */
    public Site addStartUrl(String startUrl) {

@@ -176,9 +178,10 @@ public class Site {
    }

    /**
     * 设置两次抓取之间的间隔,避免对目标站点压力过大(或者避免被防火墙屏蔽...)。
     * Set the interval between the processing of two pages.<br>
     * Time unit is micro seconds.<br>
     *
     * @param sleepTime 单位毫秒
     * @param sleepTime
     * @return this
     */
    public Site setSleepTime(int sleepTime) {

@@ -187,25 +190,26 @@ public class Site {
    }

    /**
     * 获取两次抓取之间的间隔
     * Get the interval between the processing of two pages.<br>
     * Time unit is micro seconds.<br>
     *
     * @return 两次抓取之间的间隔,单位毫秒
     * @return the interval between the processing of two pages,
     */
    public int getSleepTime() {
        return sleepTime;
    }

    /**
     * 获取重新下载的次数,默认为0
     * Get retry times when download fail, 0 by default.<br>
     *
     * @return 重新下载的次数
     * @return retry times when download fail
     */
    public int getRetryTimes() {
        return retryTimes;
    }

    /**
     * 设置获取重新下载的次数,默认为0
     * Set retry times when download fail, 0 by default.<br>
     *
     * @return this
     */
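
Since every setter above returns this, a Site is normally configured as a single chain. A sketch using only options named in this commit; all values are illustrative:

Site site = Site.me()
        .setDomain("my.oschina.net")            // required, one domain per spider
        .addStartUrl("http://my.oschina.net/")  // can be called repeatedly
        .setCharset("utf-8")                    // only when auto detection mis-decodes pages
        .setUserAgent("Mozilla/5.0 (compatible; webmagic)")
        .addCookie("uid", "42")                 // for sites that need a login cookie
        .setSleepTime(500)                      // interval between two fetches
        .setRetryTimes(3);                      // re-download attempts on failure
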
@@ -18,25 +18,30 @@ import java.util.concurrent.ExecutorService;
import java.util.concurrent.atomic.AtomicInteger;

/**
 * <pre>
 * webmagic爬虫的入口类。
 *
 * 示例:
 * 定义一个最简单的爬虫:
 * Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*")).run();
 *
 * 使用FilePipeline保存结果到文件:
 * Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*"))
 * .pipeline(new FilePipeline("/data/temp/webmagic/")).run();
 *
 * 使用FileCacheQueueScheduler缓存URL,关闭爬虫后下次自动从停止的页面继续抓取:
 * Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*"))
 * .scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).run();
 * </pre>
 * Entrance of a crawler.<br>
 * A spider contains four modules: Downloader, Scheduler, PageProcessor and Pipeline.<br>
 * Every module is a field of Spider. <br>
 * The modules are defined in interface. <br>
 * You can customize a spider with various implementations of them. <br>
 * Examples: <br>
 * <br>
 * A simple crawler: <br>
 * Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*")).run();<br>
 * <br>
 * Store results to files by FilePipeline: <br>
 * Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*")) <br>
 * .pipeline(new FilePipeline("/data/temp/webmagic/")).run(); <br>
 * <br>
 * Use FileCacheQueueScheduler to store urls and cursor in files, so that a Spider can resume the status when shutdown. <br>
 * Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*")) <br>
 * .scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).run(); <br>
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-4-21
 * Time: 上午6:53
 * @see Downloader
 * @see Scheduler
 * @see PageProcessor
 * @see Pipeline
 * @since 0.1.0
 */
public class Spider implements Runnable, Task {

@@ -222,11 +227,12 @@ public class Spider implements Runnable, Task {

    /**
     * 用某些特定URL进行爬虫测试
     *
     * @param urls 要抓取的url
     */
    public void test(String... urls){
    public void test(String... urls) {
        checkComponent();
        if (urls.length>0){
        if (urls.length > 0) {
            for (String url : urls) {
                processRequest(new Request(url));
            }

@@ -241,7 +247,7 @@ public class Spider implements Runnable, Task {
        }
        pageProcessor.process(page);
        addRequest(page);
        if (!page.getResultItems().isSkip()){
        if (!page.getResultItems().isSkip()) {
            for (Pipeline pipeline : pipelines) {
                pipeline.process(page.getResultItems(), this);
            }

@@ -298,8 +304,8 @@ public class Spider implements Runnable, Task {
        return this;
    }

    public Spider clearPipeline(){
        pipelines=new ArrayList<Pipeline>();
    public Spider clearPipeline() {
        pipelines = new ArrayList<Pipeline>();
        return this;
    }
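
The three snippets in the class comment compose into one runnable entry point; the paths and urls are the ones used in the javadoc itself:

public static void main(String[] args) {
    Spider.create(new SimplePageProcessor("http://my.oschina.net/",
                    "http://my.oschina.net/*blog/*"))
            .pipeline(new FilePipeline("/data/temp/webmagic/"))
            .scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/"))
            .run();
}
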
@@ -38,7 +38,7 @@ public class HttpClientDownloader implements Downloader {
     * 直接下载页面的简便方法
     *
     * @param url
     * @return
     * @return html
     */
    public Html download(String url) {
        Page page = download(new Request(url), null);
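
Given the signature above, the convenience method is a one-liner to use (url illustrative):

Html html = new HttpClientDownloader().download("http://my.oschina.net/");
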
@@ -2,9 +2,6 @@
<body>
<div class="en">
Main class "Spider" and models.
</div>
<div class="zh">
包括webmagic入口类Spider和一些数据传递的实体类。
</div>
</body>
</html>
@@ -1,4 +1,4 @@
package us.codecraft.webmagic.model.annotation;
package us.codecraft.webmagic.utils;

/**
 * @author code4crafter@gmail.com <br>
@@ -1,6 +1,6 @@
package us.codecraft.webmagic;

import us.codecraft.webmagic.model.annotation.Experimental;
import us.codecraft.webmagic.utils.Experimental;

import java.util.Collection;
@@ -4,7 +4,7 @@ import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.log4j.Logger;
import us.codecraft.webmagic.*;
import us.codecraft.webmagic.model.annotation.Experimental;
import us.codecraft.webmagic.utils.Experimental;
import us.codecraft.webmagic.pipeline.Pipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.processor.SimplePageProcessor;
@@ -1,6 +1,6 @@
package us.codecraft.webmagic.model;

import us.codecraft.webmagic.model.annotation.Experimental;
import us.codecraft.webmagic.utils.Experimental;

/**
 * Interface to be implemented by page mode.<br>
@@ -21,7 +21,7 @@ public @interface ComboExtract {
     */
    ExtractBy[] value();

    enum Op {
    public static enum Op {
        /**
         * All extractors will be arranged as a pipeline. <br>
         * The next extractor uses the result of the previous as source.

@@ -49,7 +49,10 @@ public @interface ComboExtract {
     */
    boolean notNull() default false;

    public enum Source {
    /**
     * types of source for extracting.
     */
    public static enum Source {
        /**
         * extract from the content extracted by class extractor
         */
@@ -21,7 +21,10 @@ public @interface ExtractBy {
     */
    String value();

    public enum Type {XPath, Regex, Css}
    /**
     * types of extractor expressions
     */
    public static enum Type {XPath, Regex, Css}

    /**
     * Extractor type, support XPath, CSS Selector and regex.

@@ -38,7 +41,10 @@ public @interface ExtractBy {
     */
    boolean notNull() default false;

    public enum Source {
    /**
     * types of source for extracting.
     */
    public static enum Source {
        /**
         * extract from the content extracted by class extractor
         */
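
These annotations drive the annotation-based page models. A sketch of a model class under two assumptions flagged in the comments: that the companion @TargetUrl annotation from this package applies (it is not part of this diff), and that the type element defaults to XPath:

// hypothetical model class; @TargetUrl is assumed, not shown in this diff
@TargetUrl("http://my.oschina.net/*/blog/*")
public class OschinaBlog {

    @ExtractBy("//title/text()")   // type defaults to Type.XPath (assumed default)
    private String title;

    @ExtractBy(value = "div.content", type = ExtractBy.Type.Css, notNull = true)
    private String content;
}
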
@@ -3,7 +3,7 @@ package us.codecraft.webmagic.pipeline;
import us.codecraft.webmagic.MultiPageModel;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.model.annotation.Experimental;
import us.codecraft.webmagic.utils.Experimental;
import us.codecraft.webmagic.utils.DoubleKeyMap;

import java.util.*;
@@ -16,7 +16,7 @@ import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;

/**
 * Store urls and cursor in files so that a Spider can resume the status when shutdown。<br>
 * Store urls and cursor in files so that a Spider can resume the status when shutdown.<br>
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.2.0