webmagic-core
commit 90bbe9b951 (parent 17f8ead28f)
@@ -9,13 +9,13 @@ import java.util.List;
 
 /**
  *
- * Object storing extracted result and urls to be crawled.<br>
+ * Object storing extracted result and urls to fetch.<br>
  * Main method: <br>
  * {@link #getUrl()} get url of current page <br>
  * {@link #getHtml()} get content of current page <br>
  * {@link #putField(String, Object)} save extracted result <br>
  * {@link #getResultItems()} get extract results to be used in {@link us.codecraft.webmagic.pipeline.Pipeline}<br>
- * {@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} add urls to crawl <br>
+ * {@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} add urls to fetch <br>
  *
  * @author code4crafter@gmail.com <br>
  * @since 0.1.0
@@ -71,7 +71,7 @@ public class Page {
     }
 
     /**
-     * add urls to crawl
+     * add urls to fetch
      *
      * @param requests
      */
@@ -88,7 +88,7 @@ public class Page {
     }
 
     /**
-     * add url to crawl
+     * add url to fetch
      *
      * @param requestString
      */
@@ -103,7 +103,7 @@ public class Page {
     }
 
     /**
-     * add requests to crawl
+     * add requests to fetch
      *
      * @param request
      */

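A minimal illustrative fragment of a PageProcessor.process(Page) body that exercises the four Page methods listed in the Javadoc above; the url, XPath and field names are invented for the example and are not part of the commit.

public void process(Page page) {
    page.putField("url", page.getUrl().toString());           // getUrl(): url of current page
    page.putField("title", page.getHtml().xpath("//title"));  // getHtml() + putField(): save extracted result
    page.addTargetRequests(page.getHtml().links().all());     // addTargetRequests(List): add urls to fetch
    page.addTargetRequest("http://example.com/page/2");       // addTargetRequest(String): add a single url
}
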
@@ -5,16 +5,17 @@ import us.codecraft.webmagic.Request;
 import us.codecraft.webmagic.Task;
 
 /**
- * Downloader是webmagic下载页面的接口。webmagic默认使用了HttpComponent作为下载器,一般情况,你无需自己实现这个接口。<br>
+ * Downloader is the part that downloads web pages and store in Page object. <br>
+ * Downloader has {@link #setThread(int)} method because downloader is always the bottleneck of a crawler,
+ * there are always some mechanisms such as pooling in downloader, and pool size is related to thread numbers.
  *
  * @author code4crafter@gmail.com <br>
- * Date: 13-4-21
- * Time: 下午12:14
+ * @since 0.1.0
  */
 public interface Downloader {
 
     /**
-     * 下载页面,并保存信息到Page对象中。
+     * Downloads web pages and store in Page object.
      *
      * @param request
      * @param task
@@ -23,10 +24,8 @@ public interface Downloader {
     public Page download(Request request, Task task);
 
     /**
-     * 设置线程数,多线程程序一般需要Downloader支持<br>
-     * 如果不考虑多线程的可以不实现这个方法<br>
-     *
-     * @param thread 线程数量
+     * Tell the downloader how many threads the spider used.
+     * @param threadNum number of threads
      */
-    public void setThread(int thread);
+    public void setThread(int threadNum);
 }

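A minimal sketch of a custom Downloader implementing the two methods above by wrapping the bundled HttpClientDownloader; the wrapper class name and the no-arg HttpClientDownloader constructor are assumptions made only for illustration.

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.downloader.Downloader;
import us.codecraft.webmagic.downloader.HttpClientDownloader;

// Illustrative wrapper: logs each request, then delegates the real work.
public class LoggingDownloader implements Downloader {

    private final HttpClientDownloader delegate = new HttpClientDownloader();

    @Override
    public Page download(Request request, Task task) {
        System.out.println("downloading " + request.getUrl());
        return delegate.download(request, task);
    }

    @Override
    public void setThread(int threadNum) {
        // The downloader is usually the bottleneck, so its pool size follows the spider's thread count.
        delegate.setThread(threadNum);
    }
}
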
@@ -4,6 +4,7 @@ import org.apache.commons.io.IOUtils;
 import org.apache.http.Header;
 import org.apache.http.HeaderElement;
 import org.apache.http.HttpResponse;
+import org.apache.http.annotation.ThreadSafe;
 import org.apache.http.client.HttpClient;
 import org.apache.http.client.entity.GzipDecompressingEntity;
 import org.apache.http.client.methods.HttpGet;
@@ -22,12 +23,12 @@ import java.util.Set;
 
 
 /**
- * 封装了HttpClient的下载器。已实现指定次数重试、处理gzip、自定义UA/cookie等功能。<br>
+ * The http downloader based on HttpClient.
  *
  * @author code4crafter@gmail.com <br>
- * Date: 13-4-21
- * Time: 下午12:15
+ * @since 0.1.0
  */
+@ThreadSafe
 public class HttpClientDownloader implements Downloader {
 
     private Logger logger = Logger.getLogger(getClass());
@@ -35,14 +36,14 @@ public class HttpClientDownloader implements Downloader {
     private int poolSize = 1;
 
     /**
-     * 直接下载页面的简便方法
+     * A simple method to download a url.
      *
      * @param url
      * @return html
      */
     public Html download(String url) {
         Page page = download(new Request(url), null);
-        return (Html)page.getHtml();
+        return (Html) page.getHtml();
     }
 
     @Override

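A small usage sketch of the download(String url) convenience method documented above; the demo class name, target url and no-arg constructor are assumptions for illustration only.

import us.codecraft.webmagic.downloader.HttpClientDownloader;
import us.codecraft.webmagic.selector.Html;

// Downloads one page directly and extracts its title via XPath.
public class DownloadDemo {
    public static void main(String[] args) {
        HttpClientDownloader downloader = new HttpClientDownloader();
        Html html = downloader.download("http://example.com/");
        System.out.println(html.xpath("//title"));
    }
}
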
@@ -20,8 +20,7 @@ import java.util.Map;
 
 /**
  * @author code4crafter@gmail.com <br>
- * Date: 13-4-21
- * Time: 下午12:29
+ * @since 0.1.0
  */
 public class HttpClientPool {
 

@@ -1,5 +1,5 @@
 <html>
 <body>
-包含了页面下载的接口Downloader和实现类HttpClientDownloader,该实现类封装了HttpComponent库。
+Downloader is the part that downloads web pages and store in Page object.
 </body>
 </html>

@@ -6,11 +6,11 @@ import us.codecraft.webmagic.Task;
 import java.util.Map;
 
 /**
- * 命令行输出抽取结果。可用于测试。<br>
+ * Write results in console.<br>
+ * Usually used in test.
  *
  * @author code4crafter@gmail.com <br>
- * Date: 13-4-21
- * Time: 下午1:45
+ * @since 0.1.0
  */
 public class ConsolePipeline implements Pipeline {
 

@@ -1,6 +1,7 @@
 package us.codecraft.webmagic.pipeline;
 
 import org.apache.commons.codec.digest.DigestUtils;
+import org.apache.http.annotation.ThreadSafe;
 import org.apache.log4j.Logger;
 import us.codecraft.webmagic.ResultItems;
 import us.codecraft.webmagic.Task;
@@ -12,28 +13,23 @@ import java.io.PrintWriter;
 import java.util.Map;
 
 /**
- * 持久化到文件的接口。
+ * Store results in files.<br>
  *
  * @author code4crafter@gmail.com <br>
- * Date: 13-4-21
- * Time: 下午6:28
+ * @since 0.1.0
  */
+@ThreadSafe
 public class FilePipeline extends FilePersistentBase implements Pipeline {
 
     private Logger logger = Logger.getLogger(getClass());
 
     /**
-     * 新建一个FilePipeline,使用默认保存路径"/data/webmagic/"
+     * create a FilePipeline with default path"/data/webmagic/"
      */
     public FilePipeline() {
         setPath("/data/webmagic/");
     }
 
-    /**
-     * 新建一个FilePipeline
-     *
-     * @param path 文件保存路径
-     */
     public FilePipeline(String path) {
         setPath(path);
     }

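For reference, a two-line sketch of how the two constructors kept by this hunk would be used; the custom path here is only an example, not anything the commit prescribes.

import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.pipeline.Pipeline;

// Default constructor writes under /data/webmagic/; the other takes a custom base path.
Pipeline defaultPipeline = new FilePipeline();
Pipeline customPipeline = new FilePipeline("/tmp/webmagic/");
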
@@ -4,12 +4,21 @@ import us.codecraft.webmagic.ResultItems;
 import us.codecraft.webmagic.Task;
 
 /**
- * Pipeline是数据离线处理和持久化的接口。通过实现Pipeline以实现不同的持久化方式(例如保存到数据库)。
+ * Pipeline is the persistent and offline process part of crawler.<br>
+ * The interface Pipeline can be implemented to customize ways of persistent.
+ *
  * @author code4crafter@gmail.com <br>
- * Date: 13-4-21
- * Time: 下午1:39
+ * @since 0.1.0
+ * @see ConsolePipeline
+ * @see FilePipeline
  */
 public interface Pipeline {
 
-    public void process(ResultItems resultItems,Task task);
+    /**
+     * Process extracted results.
+     *
+     * @param resultItems
+     * @param task
+     */
+    public void process(ResultItems resultItems, Task task);
 }

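A minimal sketch of a custom Pipeline against the interface above. It assumes ResultItems exposes the stored fields as a map via getAll(), the way the bundled console and file pipelines consume them; the class name is invented.

import java.util.Map;

import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;

// Prints every field that was stored earlier with page.putField(key, value).
public class StdoutPipeline implements Pipeline {

    @Override
    public void process(ResultItems resultItems, Task task) {
        for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) {
            System.out.println(entry.getKey() + ":\t" + entry.getValue());
        }
    }
}
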
@@ -1,5 +1,5 @@
 <html>
 <body>
-包含了处理页面抽取结果的接口Pipeline和它的几个实现类。
+Pipeline is the persistent and offline process part of crawler.
 </body>
 </html>

@@ -4,23 +4,33 @@ import us.codecraft.webmagic.Page;
 import us.codecraft.webmagic.Site;
 
 /**
- * 定制爬虫的核心接口。通过实现PageProcessor可以实现一个定制的爬虫。<br>
- * extends the class to implements various spiders.<br>
+ * Interface to be implemented to customize a crawler.<br>
+ * <br>
+ * In PageProcessor, you can customize:
+ * <p/>
+ * start urls and other settings in {@link Site}<br>
+ * how the urls to fetch are detected <br>
+ * how the data are extracted and stored <br>
+ *
  * @author code4crafter@gmail.com <br>
- * Date: 13-4-21
- * Time: 上午11:42
+ * @see Site
+ * @see Page
+ * @since 0.1.0
  */
 public interface PageProcessor {
 
     /**
-     * 定义如何处理页面,包括链接提取、内容抽取等。
+     * process the page, extract urls to fetch, extract the data and store
+     *
      * @param page
      */
     public void process(Page page);
 
     /**
-     * 定义任务一些配置信息,例如开始链接、抓取间隔、自定义cookie、自定义UA等。
+     * get the site settings
+     *
      * @return site
+     * @see Site
      */
     public Site getSite();
 }

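To make the three customization points above concrete, a minimal site-specific PageProcessor might look like the sketch below; the domain, urls, regex and XPaths are invented for illustration.

import java.util.List;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;

public class ExampleBlogProcessor implements PageProcessor {

    // start urls and other settings in Site
    private final Site site = Site.me()
            .addStartUrl("http://blog.example.com/")
            .setDomain("blog.example.com");

    @Override
    public void process(Page page) {
        // how the urls to fetch are detected
        List<String> links = page.getHtml().links()
                .regex("(http://blog\\.example\\.com/\\d+\\.html)").all();
        page.addTargetRequests(links);
        // how the data are extracted and stored
        page.putField("title", page.getHtml().xpath("//h1"));
        page.putField("content", page.getHtml().smartContent());
    }

    @Override
    public Site getSite() {
        return site;
    }
}
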
@@ -7,10 +7,10 @@ import us.codecraft.webmagic.utils.UrlUtils;
 import java.util.List;
 
 /**
- * 非常简单的抽取器。链接抽取使用定义的通配符,并保存抽取整个内容到content字段。<br>
+ * A simple PageProcessor.
+ *
  * @author code4crafter@gmail.com <br>
- * Date: 13-4-22
- * Time: 下午9:15
+ * @since 0.1.0
  */
 public class SimplePageProcessor implements PageProcessor {
 
@@ -22,25 +22,25 @@ public class SimplePageProcessor implements PageProcessor {
         this.site = Site.me().addStartUrl(startUrl).
                 setDomain(UrlUtils.getDomain(startUrl));
         //compile "*" expression to regex
-        this.urlPattern = "("+urlPattern.replace(".","\\.").replace("*","[^\"'#]*")+")";
+        this.urlPattern = "(" + urlPattern.replace(".", "\\.").replace("*", "[^\"'#]*") + ")";
 
     }
 
     @Override
     public void process(Page page) {
         List<String> requests = page.getHtml().links().regex(urlPattern).all();
-        //调用page.addTargetRequests()方法添加待抓取链接
+        //add urls to fetch
         page.addTargetRequests(requests);
-        //xpath方式抽取
+        //extract by XPath
         page.putField("title", page.getHtml().xpath("//title"));
-        //sc表示使用Readability技术抽取正文
         page.putField("html", page.getHtml().toString());
+        //extract by Readability
         page.putField("content", page.getHtml().smartContent());
     }
 
     @Override
     public Site getSite() {
-        //定义抽取站点的相关参数
+        //settings
         return site;
     }
 }

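A worked example of the "*" wildcard compilation in the constructor above, using a made-up pattern; it applies exactly the two replace calls shown in the hunk.

public class UrlPatternDemo {
    public static void main(String[] args) {
        // Hypothetical pattern a caller might pass to SimplePageProcessor.
        String urlPattern = "http://my.site.com/post/*";
        String regex = "(" + urlPattern.replace(".", "\\.").replace("*", "[^\"'#]*") + ")";
        // Prints: (http://my\.site\.com/post/[^"'#]*)
        // Dots are escaped, and "*" matches any run of characters that is not a quote or '#'.
        System.out.println(regex);
    }
}
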
@@ -1,5 +1,5 @@
 <html>
 <body>
-包含了封装页面处理逻辑的接口PageProcessor和一个实现类SimplePageProcessor。实现PageProcessor即可定制一个自己的爬虫。
+PageProcessor custom part of a crawler for specific site.
 </body>
 </html>

@@ -1,5 +1,6 @@
 package us.codecraft.webmagic.scheduler;
 
+import org.apache.http.annotation.ThreadSafe;
 import org.apache.log4j.Logger;
 import us.codecraft.webmagic.Request;
 import us.codecraft.webmagic.Task;
@@ -10,11 +11,13 @@ import java.util.concurrent.BlockingQueue;
 import java.util.concurrent.LinkedBlockingQueue;
 
 /**
- * 内存队列实现的线程安全Scheduler。<br>
+ * Basic Scheduler implementation.<br>
+ * Store urls to fetch in LinkedBlockingQueue and remove duplicate urls by HashMap.
+ *
  * @author code4crafter@gmail.com <br>
- * Date: 13-4-21
- * Time: 下午1:13
+ * @since 0.1.0
  */
+@ThreadSafe
 public class QueueScheduler implements Scheduler {
 
     private Logger logger = Logger.getLogger(getClass());
@@ -24,11 +27,11 @@ public class QueueScheduler implements Scheduler {
     private Set<String> urls = new HashSet<String>();
 
     @Override
-    public synchronized void push(Request request,Task task) {
-        if (logger.isDebugEnabled()){
-            logger.debug("push to queue "+request.getUrl());
+    public synchronized void push(Request request, Task task) {
+        if (logger.isDebugEnabled()) {
+            logger.debug("push to queue " + request.getUrl());
         }
-        if (urls.add(request.getUrl())){
+        if (urls.add(request.getUrl())) {
             queue.add(request);
         }
 

@@ -4,23 +4,27 @@ import us.codecraft.webmagic.Request;
 import us.codecraft.webmagic.Task;
 
 /**
- * 包含url管理和调度的接口。包括url抓取队列,url去重等功能。<br>
- * Scheduler的接口包含一个Task参数,该参数是为单Scheduler多Task预留的(Spider就是一个Task)。<br>
+ * Scheduler is the part of url management.<br>
+ * You can implement interface Scheduler to do:
+ * manage urls to fetch
+ * remove duplicate urls
+ *
  * @author code4crafter@gmail.com <br>
- * Date: 13-4-21
- * Time: 下午1:12
+ * @since 0.1.0
  */
 public interface Scheduler {
 
     /**
-     * 加入一个待抓取的链接
-     * @param request 待抓取的链接
-     * @param task 定义的任务,以满足单Scheduler多Task的情况
+     * add a url to fetch
+     *
+     * @param request
+     * @param task
      */
-    public void push(Request request,Task task);
+    public void push(Request request, Task task);
 
     /**
      * 返回下一个要抓取的链接
+     *
      * @param task 定义的任务,以满足单Scheduler多Task的情况
      * @return 下一个要抓取的链接
      */

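A minimal sketch of a custom Scheduler against the interface above. push(Request, Task) matches the hunk; the companion poll(Task) method returning the next Request is an assumption based on the second, still untranslated Javadoc block, and the class name is invented.

import java.util.HashSet;
import java.util.LinkedList;
import java.util.Queue;
import java.util.Set;

import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.scheduler.Scheduler;

public class SimpleDedupScheduler implements Scheduler {

    private final Queue<Request> queue = new LinkedList<Request>();
    private final Set<String> seen = new HashSet<String>();

    @Override
    public synchronized void push(Request request, Task task) {
        // remove duplicate urls before they enter the fetch queue
        if (seen.add(request.getUrl())) {
            queue.add(request);
        }
    }

    @Override
    public synchronized Request poll(Task task) {
        // hand the next url to fetch back to the spider (null when the queue is empty)
        return queue.poll();
    }
}
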
@@ -1,5 +1,5 @@
 <html>
 <body>
-包含url管理和调度的接口Scheduler及它的几个实现类。
+Scheduler is the part of url management.
 </body>
 </html>

@@ -4,6 +4,8 @@ import java.util.ArrayList;
 import java.util.List;
 
 /**
+ * All selectors will be arranged as a pipeline. <br>
+ * The next selector uses the result of the previous as source.
  * @author code4crafter@gmail.com <br>
  * @since 0.2.0
  */

@@ -10,10 +10,10 @@ import java.util.ArrayList;
 import java.util.List;
 
 /**
- * css风格的选择器。包装了Jsoup。<br>
+ * CSS selector. Based on Jsoup.
+ *
  * @author code4crafter@gmail.com <br>
- * Date: 13-4-21
- * Time: 上午9:39
+ * @since 0.1.0
  */
 public class CssSelector implements Selector {
 

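A hedged usage sketch for the CssSelector above; the single-argument constructor taking a CSS query is an assumption, and the html and class name are made up.

import us.codecraft.webmagic.selector.CssSelector;

public class CssSelectorDemo {
    public static void main(String[] args) {
        String html = "<html><body><div class='title'>Hello webmagic</div></body></html>";
        CssSelector selector = new CssSelector("div.title");
        // select(...) returns the first match found in the text
        System.out.println(selector.select(html));
    }
}
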
@@ -4,6 +4,8 @@ import java.util.ArrayList;
 import java.util.List;
 
 /**
+ * All extractors will do extracting separately, <br>
+ * and the results of extractors will combined as the final result.
  * @author code4crafter@gmail.com <br>
  * @since 0.2.0
  */

@@ -4,13 +4,16 @@ import java.util.List;
 
 /**
  * Selector(extractor) for text.<br>
+ *
  * @author code4crafter@gmail.com <br>
+ * @since 0.1.0
  */
 public interface Selector {
 
     /**
      * Extract single result in text.<br>
      * If there are more than one result, only the first will be chosen.
+     *
      * @param text
      * @return result
      */
@@ -18,6 +21,7 @@ public interface Selector {
 
     /**
      * Extract all results in text.<br>
+     *
      * @param text
      * @return results
      */

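A minimal sketch of implementing the Selector interface above with a plain regex; the method names select and selectList are assumptions matching the two Javadoc blocks (single result / all results), and the class name is invented.

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import us.codecraft.webmagic.selector.Selector;

public class DigitSelector implements Selector {

    private final Pattern pattern = Pattern.compile("\\d+");

    @Override
    public String select(String text) {
        // Extract single result in text: only the first match is chosen.
        Matcher matcher = pattern.matcher(text);
        return matcher.find() ? matcher.group() : null;
    }

    @Override
    public List<String> selectList(String text) {
        // Extract all results in text.
        List<String> results = new ArrayList<String>();
        Matcher matcher = pattern.matcher(text);
        while (matcher.find()) {
            results.add(matcher.group());
        }
        return results;
    }
}
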
@@ -24,7 +24,7 @@
 {@link #getHtml()} get content of current page
 {@link #putField(String, Object)} save extracted result
 {@link #getResultItems()} get extract results to be used in {@link us.codecraft.webmagic.pipeline.Pipeline}
-{@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} add urls to crawl
+{@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} add urls to fetch
 
 </pre>
 