update comments

parent 6cc1d62a08
commit 5f1f4cbc46
@@ -8,30 +8,19 @@ import java.util.ArrayList;
import java.util.List;

/**
 * <pre class="zh">
 * Page保存了上一次抓取的结果,并可定义待抓取的链接内容。
 *
 * 主要方法:
 * {@link #getUrl()} 获取页面的Url
 * {@link #getHtml()} 获取页面的html内容
 * {@link #putField(String, Object)} 保存抽取的结果
 * {@link #getResultItems()} 获取抽取的结果,在 {@link us.codecraft.webmagic.pipeline.Pipeline} 中调用
 * {@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} 添加待抓取的链接
 *
 * </pre>
 * <pre class="en">
 * Store extracted result and urls to be crawled.
 *
 * Main method:
 * {@link #getUrl()} get url of current page
 * {@link #getHtml()} get content of current page
 * {@link #putField(String, Object)} save extracted result
 * {@link #getResultItems()} get extract results to be used in {@link us.codecraft.webmagic.pipeline.Pipeline}
 * {@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} add urls to crawl
 *
 * </pre>
 * Object storing extracted result and urls to be crawled.<br>
 * Main method: <br>
 * {@link #getUrl()} get url of current page <br>
 * {@link #getHtml()} get content of current page <br>
 * {@link #putField(String, Object)} save extracted result <br>
 * {@link #getResultItems()} get extract results to be used in {@link us.codecraft.webmagic.pipeline.Pipeline}<br>
 * {@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} add urls to crawl <br>
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.1.0
 * @see us.codecraft.webmagic.downloader.Downloader
 * @see us.codecraft.webmagic.processor.PageProcessor
 */
public class Page {

@@ -55,19 +44,19 @@ public class Page {
    }

    /**
     * store extract results
     *
     *
     * @param key 结果的key
     * @param field 结果的value
     * @param key
     * @param field
     */
    public void putField(String key, Object field) {
        resultItems.put(key, field);
    }

    /**
     * 获取页面的html内容
     * get html content of page
     *
     * @return html 页面的html内容
     * @return html
     */
    public Selectable getHtml() {
        return html;

@@ -82,9 +71,9 @@ public class Page {
    }

    /**
     * 添加待抓取的链接
     * add urls to crawl
     *
     * @param requests 待抓取的链接
     * @param requests
     */
    public void addTargetRequests(List<String> requests) {
        synchronized (targetRequests) {

@@ -99,9 +88,9 @@ public class Page {
    }

    /**
     * 添加待抓取的链接
     * add url to crawl
     *
     * @param requestString 待抓取的链接
     * @param requestString
     */
    public void addTargetRequest(String requestString) {
        if (StringUtils.isBlank(requestString) || requestString.equals("#")) {

@@ -114,9 +103,9 @@ public class Page {
    }

    /**
     * 添加待抓取的页面,在需要传递附加信息时使用
     * add requests to crawl
     *
     * @param request 待抓取的页面
     * @param request
     */
    public void addTargetRequest(Request request) {
        synchronized (targetRequests) {

@@ -125,27 +114,22 @@ public class Page {
    }

    /**
     * 获取页面的Url
     * get url of current page
     *
     * @return url 当前页面的url,可用于抽取
     * @return url of current page
     */
    public Selectable getUrl() {
        return url;
    }

    /**
     * 设置url
     *
     * @param url
     */
    public void setUrl(Selectable url) {
        this.url = url;
    }

    /**
     * 获取抓取请求
     * get request of current page
     *
     * @return request 抓取请求
     * @return request
     */
    public Request getRequest() {
        return request;
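
The Page methods documented above are the working surface of a PageProcessor. A minimal sketch of such a processor, assuming the Selectable chain (links/regex/xpath/all) from this codebase's selector package; the domain, regex, and XPath are illustrative only:

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;

public class BlogPageProcessor implements PageProcessor {

    // illustrative settings; the Site options appear later in this commit
    private Site site = Site.me().setDomain("my.oschina.net").setSleepTime(500);

    @Override
    public void process(Page page) {
        // queue follow-up links extracted from the current page
        page.addTargetRequests(page.getHtml().links().regex(".*blog.*").all());
        // store an extracted field; Pipelines read it back via getResultItems()
        page.putField("title", page.getHtml().xpath("//title").toString());
    }

    @Override
    public Site getSite() {
        return site;
    }
}
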
@@ -1,33 +1,17 @@
package us.codecraft.webmagic;

import us.codecraft.webmagic.utils.Experimental;

import java.io.Serializable;
import java.util.HashMap;
import java.util.Map;

/**
 * <div class="zh">
 * Request对象封装了待抓取的url信息。<br/>
 * 在PageProcessor中,Request对象可以通过{@link us.codecraft.webmagic.Page#getRequest()} 获取。<br/>
 * <br/>
 * Request对象包含一个extra属性,可以写入一些必须的上下文,这个特性在某些场合会有用。<br/>
 * <pre>
 * Example:
 * 抓取<a href="${link}">${linktext}</a>时,希望提取链接link,并保存linktext的信息。
 * 在上一个页面:
 * public void process(Page page){
 *     Request request = new Request(link,linktext);
 *     page.addTargetRequest(request)
 * }
 * 在下一个页面:
 * public void process(Page page){
 *     String linktext = (String)page.getRequest().getExtra()[0];
 * }
 * </pre>
 * </div>
 * Object contains url to crawl.<br>
 * It contains some additional information.<br>
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-4-21
 * Time: 上午11:37
 * @since 0.1.0
 */
public class Request implements Serializable {

@@ -36,20 +20,22 @@ public class Request implements Serializable {
    private String url;

    /**
     * 额外参数,可以保存一些需要的上下文信息
     * Store additional information in extras.
     */
    private Map<String, Object> extras;

    /**
     * Priority of the request.<br>
     * The bigger will be processed earlier. <br>
     * Need a scheduler supporting priority.<br>
     * But no scheduler in webmagic supporting priority now (:
     */
    @Experimental
    private double priority;

    public Request() {
    }

    /**
     * 构建一个request对象
     *
     * @param url 必须参数,待抓取的url
     */
    public Request(String url) {
        this.url = url;
    }

@@ -59,12 +45,14 @@ public class Request implements Serializable {
    }

    /**
     * 设置优先级,用于URL队列排序<br>
     * 需扩展Scheduler<br>
     * 目前还没有对应支持优先级的Scheduler实现 =。= <br>
     * @param priority 优先级,越大则越靠前
     * Set the priority of request for sorting.<br>
     * Need a scheduler supporting priority.<br>
     * But no scheduler in webmagic supporting priority now (:
     *
     * @param priority
     * @return this
     */
    @Experimental
    public Request setPriority(double priority) {
        this.priority = priority;
        return this;

@@ -85,11 +73,6 @@ public class Request implements Serializable {
        return this;
    }

    /**
     * 获取待抓取的url
     *
     * @return url 待抓取的url
     */
    public String getUrl() {
        return url;
    }
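
The extras map replaces the array-based getExtra() shown in the old Chinese example above: context written on one page can be read back on the page it links to. A sketch assuming fluent putExtra(String, Object)/getExtra(String) accessors for the extras field; the urls and keys are illustrative:

public void process(Page page) {
    if (page.getUrl().toString().endsWith("/list")) {
        // listing page: attach context to the request for the detail page
        Request request = new Request("http://my.oschina.net/blog/1");
        request.putExtra("linktext", "My first post");   // assumed accessor
        page.addTargetRequest(request);
    } else {
        // detail page: read the context back
        String linktext = (String) page.getRequest().getExtra("linktext"); // assumed accessor
        page.putField("linktext", linktext);
    }
}
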
@@ -4,10 +4,13 @@ import java.util.HashMap;
import java.util.Map;

/**
 * 保存抽取结果的类,由PageProcessor处理得到,传递给{@link us.codecraft.webmagic.pipeline.Pipeline}进行持久化。<br>
 * Object contains extract results.<br>
 * It is contained in Page and will be processed in pipeline.
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-7-25 <br>
 * Time: 下午12:20 <br>
 * @since 0.1.0
 * @see Page
 * @see us.codecraft.webmagic.pipeline.Pipeline
 */
public class ResultItems {

@@ -25,7 +28,7 @@ public class ResultItems {
        return (T) fields.get(key);
    }

    public Map<String, Object> getAll() {
    public Map<String, Object> getAll() {
        return fields;
    }

@@ -44,8 +47,10 @@ public class ResultItems {
    }

    /**
     * 是否忽略这个页面,用于pipeline来判断是否对这个页面进行处理
     * @return 是否忽略 true 忽略
     * Whether to skip the result.<br>
     * Result which is skipped will not be processed by Pipeline.
     *
     * @return whether to skip the result
     */
    public boolean isSkip() {
        return skip;

@@ -53,8 +58,10 @@ public class ResultItems {

    /**
     * 设置是否忽略这个页面,用于pipeline来判断是否对这个页面进行处理
     * @param skip
     * Set whether to skip the result.<br>
     * Result which is skipped will not be processed by Pipeline.
     *
     * @param skip whether to skip the result
     * @return this
     */
    public ResultItems setSkip(boolean skip) {
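
setSkip is how a PageProcessor tells every Pipeline to ignore a page, e.g. list pages that only feed new links. A short sketch; the url check is illustrative:

public void process(Page page) {
    page.addTargetRequests(page.getHtml().links().all());
    if (page.getUrl().toString().contains("/list/")) {
        // pure navigation page: extract nothing, skip all pipelines
        page.getResultItems().setSkip(true);
    }
}
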
@@ -5,12 +5,11 @@ import us.codecraft.webmagic.utils.UrlUtils;
import java.util.*;

/**
 * Site定义一个待抓取的站点的各种信息。<br>
 * 这个类的所有getter方法,一般都只会被爬虫框架内部进行调用。<br>
 * Object contains setting for crawler.<br>
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-4-21
 * Time: 下午12:13
 * @since 0.1.0
 * @see us.codecraft.webmagic.processor.PageProcessor
 */
public class Site {

@@ -22,6 +21,9 @@ public class Site {
    private String charset;

    /**
     * startUrls is the urls the crawler to start with.
     */
    private List<String> startUrls = new ArrayList<String>();

    private int sleepTime = 3000;

@@ -37,19 +39,19 @@ public class Site {
    }

    /**
     * 创建一个Site对象,等价于new Site()
     * new a Site
     *
     * @return 新建的对象
     * @return new site
     */
    public static Site me() {
        return new Site();
    }

    /**
     * 为这个站点添加一个cookie,可用于抓取某些需要登录访问的站点。这个cookie的域名与{@link #getDomain()}是一致的
     * Add a cookie with domain {@link #getDomain()}
     *
     * @param name cookie的名称
     * @param value cookie的值
     * @param name
     * @param value
     * @return this
     */
    public Site addCookie(String name, String value) {

@@ -58,7 +60,7 @@ public class Site {
    }

    /**
     * 为这个站点设置user-agent,很多网站都对user-agent进行了限制,不设置此选项可能会得到期望之外的结果。
     * set user agent
     *
     * @param userAgent userAgent
     * @return this

@@ -69,27 +71,27 @@ public class Site {
    }

    /**
     * 获取已经设置的所有cookie
     * get cookies
     *
     * @return 已经设置的所有cookie
     * @return get cookies
     */
    public Map<String, String> getCookies() {
        return cookies;
    }

    /**
     * 获取已设置的user-agent
     * get user agent
     *
     * @return 已设置的user-agent
     * @return user agent
     */
    public String getUserAgent() {
        return userAgent;
    }

    /**
     * 获取已设置的domain
     * get domain
     *
     * @return 已设置的domain
     * @return get domain
     */
    public String getDomain() {
        if (domain == null) {

@@ -101,10 +103,9 @@ public class Site {
    }

    /**
     * 设置这个站点所在域名,必须项。<br>
     * 目前不支持多个域名的抓取。抓取多个域名请新建一个Spider。
     * set the domain of site.
     *
     * @param domain 爬虫会抓取的域名
     * @param domain
     * @return this
     */
    public Site setDomain(String domain) {

@@ -113,10 +114,10 @@ public class Site {
    }

    /**
     * 设置页面编码,若不设置则自动根据Html meta信息获取。<br>
     * 一般无需设置encoding,如果发现下载的结果是乱码,则可以设置此项。<br>
     * Set charset of page manually.<br>
     * When charset is not set or set to null, it can be auto detected by Http header.
     *
     * @param charset 编码格式,主要是"utf-8"、"gbk"两种
     * @param charset
     * @return this
     */
    public Site setCharset(String charset) {

@@ -125,20 +126,21 @@ public class Site {
    }

    /**
     * 获取已设置的编码
     * get charset set manually
     *
     * @return 已设置的domain
     * @return charset
     */
    public String getCharset() {
        return charset;
    }

    /**
     * 设置可接受的http状态码,仅当状态码在这个集合中时,才会读取页面内容。<br>
     * 默认为200,正常情况下,无须设置此项。<br>
     * 某些站点会错误的返回状态码,此时可以对这个选项进行设置。<br>
     * Set acceptStatCode.<br>
     * When status code of http response is in acceptStatCodes, it will be processed.<br>
     * {200} by default.<br>
     * It is not necessarily to be set.<br>
     *
     * @param acceptStatCode 可接受的状态码
     * @param acceptStatCode
     * @return this
     */
    public Site setAcceptStatCode(Set<Integer> acceptStatCode) {

@@ -147,27 +149,27 @@ public class Site {
    }

    /**
     * 获取可接受的状态码
     * get acceptStatCode
     *
     * @return 可接受的状态码
     * @return acceptStatCode
     */
    public Set<Integer> getAcceptStatCode() {
        return acceptStatCode;
    }

    /**
     * 获取初始页面的地址列表
     * get start urls
     *
     * @return 初始页面的地址列表
     * @return start urls
     */
    public List<String> getStartUrls() {
        return startUrls;
    }

    /**
     * 增加初始页面的地址,可反复调用此方法增加多个初始地址。
     * Add a url to start url.<br>
     *
     * @param startUrl 初始页面的地址
     * @param startUrl
     * @return this
     */
    public Site addStartUrl(String startUrl) {

@@ -176,9 +178,10 @@ public class Site {
    }

    /**
     * 设置两次抓取之间的间隔,避免对目标站点压力过大(或者避免被防火墙屏蔽...)。
     * Set the interval between the processing of two pages.<br>
     * Time unit is micro seconds.<br>
     *
     * @param sleepTime 单位毫秒
     * @param sleepTime
     * @return this
     */
    public Site setSleepTime(int sleepTime) {

@@ -187,25 +190,26 @@ public class Site {
    }

    /**
     * 获取两次抓取之间的间隔
     * Get the interval between the processing of two pages.<br>
     * Time unit is micro seconds.<br>
     *
     * @return 两次抓取之间的间隔,单位毫秒
     * @return the interval between the processing of two pages,
     */
    public int getSleepTime() {
        return sleepTime;
    }

    /**
     * 获取重新下载的次数,默认为0
     * Get retry times when download fail, 0 by default.<br>
     *
     * @return 重新下载的次数
     * @return retry times when download fail
     */
    public int getRetryTimes() {
        return retryTimes;
    }

    /**
     * 设置获取重新下载的次数,默认为0
     * Set retry times when download fail, 0 by default.<br>
     *
     * @return this
     */
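
Since every setter above returns this, a Site is normally configured as a single chain. A sketch using only options named in this commit; all values are illustrative:

Site site = Site.me()
        .setDomain("my.oschina.net")            // required, one domain per spider
        .addStartUrl("http://my.oschina.net/")  // can be called repeatedly
        .setCharset("utf-8")                    // only when auto detection mis-decodes pages
        .setUserAgent("Mozilla/5.0 (compatible; webmagic)")
        .addCookie("uid", "42")                 // for sites that need a login cookie
        .setSleepTime(500)                      // interval between two fetches
        .setRetryTimes(3);                      // re-download attempts on failure
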
@@ -18,25 +18,30 @@ import java.util.concurrent.ExecutorService;
import java.util.concurrent.atomic.AtomicInteger;

/**
 * <pre>
 * webmagic爬虫的入口类。
 *
 * 示例:
 * 定义一个最简单的爬虫:
 * Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*")).run();
 *
 * 使用FilePipeline保存结果到文件:
 * Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*"))
 * .pipeline(new FilePipeline("/data/temp/webmagic/")).run();
 *
 * 使用FileCacheQueueScheduler缓存URL,关闭爬虫后下次自动从停止的页面继续抓取:
 * Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*"))
 * .scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).run();
 * </pre>
 * Entrance of a crawler.<br>
 * A spider contains four modules: Downloader, Scheduler, PageProcessor and Pipeline.<br>
 * Every module is a field of Spider. <br>
 * The modules are defined in interface. <br>
 * You can customize a spider with various implementations of them. <br>
 * Examples: <br>
 * <br>
 * A simple crawler: <br>
 * Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*")).run();<br>
 * <br>
 * Store results to files by FilePipeline: <br>
 * Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*")) <br>
 * .pipeline(new FilePipeline("/data/temp/webmagic/")).run(); <br>
 * <br>
 * Use FileCacheQueueScheduler to store urls and cursor in files, so that a Spider can resume the status when shutdown. <br>
 * Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*")) <br>
 * .scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).run(); <br>
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-4-21
 * Time: 上午6:53
 * @see Downloader
 * @see Scheduler
 * @see PageProcessor
 * @see Pipeline
 * @since 0.1.0
 */
public class Spider implements Runnable, Task {

@@ -222,11 +227,12 @@ public class Spider implements Runnable, Task {

    /**
     * 用某些特定URL进行爬虫测试
     *
     * @param urls 要抓取的url
     */
    public void test(String... urls){
    public void test(String... urls) {
        checkComponent();
        if (urls.length>0){
        if (urls.length > 0) {
            for (String url : urls) {
                processRequest(new Request(url));
            }

@@ -241,7 +247,7 @@ public class Spider implements Runnable, Task {
        }
        pageProcessor.process(page);
        addRequest(page);
        if (!page.getResultItems().isSkip()){
        if (!page.getResultItems().isSkip()) {
            for (Pipeline pipeline : pipelines) {
                pipeline.process(page.getResultItems(), this);
            }

@@ -298,8 +304,8 @@ public class Spider implements Runnable, Task {
        return this;
    }

    public Spider clearPipeline(){
        pipelines=new ArrayList<Pipeline>();
    public Spider clearPipeline() {
        pipelines = new ArrayList<Pipeline>();
        return this;
    }
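
The three snippets in the class comment compose into one runnable entry point; the paths and urls are the ones used in the javadoc itself:

public static void main(String[] args) {
    Spider.create(new SimplePageProcessor("http://my.oschina.net/",
                    "http://my.oschina.net/*blog/*"))
            .pipeline(new FilePipeline("/data/temp/webmagic/"))
            .scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/"))
            .run();
}
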
@@ -38,7 +38,7 @@ public class HttpClientDownloader implements Downloader {
     * 直接下载页面的简便方法
     *
     * @param url
     * @return
     * @return html
     */
    public Html download(String url) {
        Page page = download(new Request(url), null);
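
Given the signature above, the convenience method is a one-liner to use (url illustrative):

Html html = new HttpClientDownloader().download("http://my.oschina.net/");
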
@@ -2,9 +2,6 @@
<body>
<div class="en">
Main class "Spider" and models.
</div>
<div class="zh">
包括webmagic入口类Spider和一些数据传递的实体类。
</div>
</body>
</html>
@@ -1,4 +1,4 @@
package us.codecraft.webmagic.model.annotation;
package us.codecraft.webmagic.utils;

/**
 * @author code4crafter@gmail.com <br>
@@ -1,6 +1,6 @@
package us.codecraft.webmagic;

import us.codecraft.webmagic.model.annotation.Experimental;
import us.codecraft.webmagic.utils.Experimental;

import java.util.Collection;
@@ -4,7 +4,7 @@ import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.log4j.Logger;
import us.codecraft.webmagic.*;
import us.codecraft.webmagic.model.annotation.Experimental;
import us.codecraft.webmagic.utils.Experimental;
import us.codecraft.webmagic.pipeline.Pipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.processor.SimplePageProcessor;
@@ -1,6 +1,6 @@
package us.codecraft.webmagic.model;

import us.codecraft.webmagic.model.annotation.Experimental;
import us.codecraft.webmagic.utils.Experimental;

/**
 * Interface to be implemented by page mode.<br>
@@ -21,7 +21,7 @@ public @interface ComboExtract {
     */
    ExtractBy[] value();

    enum Op {
    public static enum Op {
        /**
         * All extractors will be arranged as a pipeline. <br>
         * The next extractor uses the result of the previous as source.

@@ -49,7 +49,10 @@ public @interface ComboExtract {
     */
    boolean notNull() default false;

    public enum Source {
    /**
     * types of source for extracting.
     */
    public static enum Source {
        /**
         * extract from the content extracted by class extractor
         */
@@ -21,7 +21,10 @@ public @interface ExtractBy {
     */
    String value();

    public enum Type {XPath, Regex, Css}
    /**
     * types of extractor expressions
     */
    public static enum Type {XPath, Regex, Css}

    /**
     * Extractor type, support XPath, CSS Selector and regex.

@@ -38,7 +41,10 @@ public @interface ExtractBy {
     */
    boolean notNull() default false;

    public enum Source {
    /**
     * types of source for extracting.
     */
    public static enum Source {
        /**
         * extract from the content extracted by class extractor
         */
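
These annotations drive the annotation-based page models. A sketch of a model class under two assumptions flagged in the comments: that the companion @TargetUrl annotation from this package applies (it is not part of this diff), and that the type element defaults to XPath:

// hypothetical model class; @TargetUrl is assumed, not shown in this diff
@TargetUrl("http://my.oschina.net/*/blog/*")
public class OschinaBlog {

    @ExtractBy("//title/text()")   // type defaults to Type.XPath (assumed default)
    private String title;

    @ExtractBy(value = "div.content", type = ExtractBy.Type.Css, notNull = true)
    private String content;
}
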
@@ -3,7 +3,7 @@ package us.codecraft.webmagic.pipeline;
import us.codecraft.webmagic.MultiPageModel;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.model.annotation.Experimental;
import us.codecraft.webmagic.utils.Experimental;
import us.codecraft.webmagic.utils.DoubleKeyMap;

import java.util.*;
@@ -16,7 +16,7 @@ import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;

/**
 * Store urls and cursor in files so that a Spider can resume the status when shutdown。<br>
 * Store urls and cursor in files so that a Spider can resume the status when shutdown.<br>
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.2.0