diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java index fd881b2..a894269 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java @@ -8,7 +8,7 @@ import java.util.ArrayList; import java.util.List; /** - *
+ ** Page保存了上一次抓取的结果,并可定义待抓取的链接内容。 * * 主要方法: @@ -19,6 +19,17 @@ import java.util.List; * {@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} 添加待抓取的链接 * *+ *+ * Stores the extracted results and the URLs to be crawled. + * + * Main methods: + * {@link #getUrl()} get the URL of the current page + * {@link #getHtml()} get the content of the current page + * {@link #putField(String, Object)} save an extracted result + * {@link #getResultItems()} get the extracted results to be used in {@link us.codecraft.webmagic.pipeline.Pipeline} + * {@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} add URLs to crawl + * + ** * @author code4crafter@gmail.com
*/ @@ -44,7 +55,7 @@ public class Page { } /** - * 保存抽取的结果 + * * * @param key 结果的key * @param field 结果的value diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java index 7a6e557..b9b8ddf 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java @@ -5,6 +5,7 @@ import java.util.HashMap; import java.util.Map; /** + ** Request对象封装了待抓取的url信息。* * @author code4crafter@gmail.com
* 在PageProcessor中,Request对象可以通过{@link us.codecraft.webmagic.Page#getRequest()} 获取。
*
@@ -22,6 +23,7 @@ import java.util.Map; * String linktext = (String)page.getRequest().getExtra()[0]; * } * + *
* Date: 13-4-21 diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/package.html b/webmagic-core/src/main/java/us/codecraft/webmagic/package.html index d5ff540..05328dc 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/package.html +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/package.html @@ -1,5 +1,10 @@ ++ Contains the webmagic entry class "Spider" and the model classes used to pass data around. ++包括webmagic入口类Spider和一些数据传递的实体类。 +diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ComboExtract.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ComboExtract.java new file mode 100644 index 0000000..1f5f008 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ComboExtract.java @@ -0,0 +1,18 @@ +package us.codecraft.webmagic.model.annotation; + +import java.lang.annotation.ElementType; +import java.lang.annotation.Retention; +import java.lang.annotation.Target; + +/** + * @author code4crafter@gmail.com
+ * Date: 13-8-16
+ * Time: 11:09 PM
+ */ +@Retention(java.lang.annotation.RetentionPolicy.RUNTIME) +@Target({ElementType.FIELD, ElementType.TYPE}) +public @interface ComboExtract { + + + +}