diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
index ade2194..efff6ec 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
@@ -74,9 +74,9 @@ public class Spider implements Runnable, Task {
protected final static int STAT_STOPPED = 2;
/**
- * 使用已定义的抽取规则新建一个Spider。
+ * create a spider with pageProcessor.
*
- * @param pageProcessor 已定义的抽取规则
+ * @param pageProcessor the predefined extraction rules
*/
public Spider(PageProcessor pageProcessor) {
this.pageProcessor = pageProcessor;
@@ -85,17 +85,19 @@ public class Spider implements Runnable, Task {
}
/**
- * 使用已定义的抽取规则新建一个Spider。
+ * create a spider with pageProcessor.
*
- * @param pageProcessor 已定义的抽取规则
- * @return 新建的Spider
+ * @param pageProcessor the predefined extraction rules
+ * @return new spider
+ * @see PageProcessor
*/
public static Spider create(PageProcessor pageProcessor) {
return new Spider(pageProcessor);
}
/**
- * 重新设置startUrls,会覆盖Site本身的startUrls。
+ * Set startUrls of Spider.
+ * These override the startUrls of Site.
*
* @param startUrls
* @return this
@@ -107,9 +109,10 @@ public class Spider implements Runnable, Task {
}
/**
- * 为爬虫设置一个唯一ID,用于标志任务,默认情况下使用domain作为uuid,对于单domain多任务的情况,请为重复任务设置不同的ID。
+ * Set a uuid for spider.
+ * Default uuid is domain of site.
*
- * @param uuid 唯一ID
+ * @param uuid unique id used to identify the task
* @return this
*/
public Spider setUUID(String uuid) {
@@ -118,30 +121,86 @@ public class Spider implements Runnable, Task {
}
/**
- * 设置调度器。调度器用于保存待抓取URL,并可以进行去重、同步、持久化等工作。默认情况下使用内存中的阻塞队列进行调度。
+ * set scheduler for Spider
*
- * @param scheduler 调度器
+ * @param scheduler the scheduler that holds the urls waiting to be fetched
* @return this
+ * @deprecated
+ * @see #setScheduler(us.codecraft.webmagic.scheduler.Scheduler)
*/
public Spider scheduler(Scheduler scheduler) {
+ return setScheduler(scheduler);
+ }
+
+ /**
+ * set scheduler for Spider
+ *
+ * @param scheduler
+ * @return this
+ * @since 0.2.1
+ * @see Scheduler
+ */
+ public Spider setScheduler(Scheduler scheduler) {
checkIfNotRunning();
this.scheduler = scheduler;
return this;
}
/**
- * 设置处理管道。处理管道用于最终抽取结果的后处理,例如:保存到文件、保存到数据库等。默认情况下会输出到控制台。
+ * add a pipeline for Spider
*
- * @param pipeline 处理管道
+ * @param pipeline the pipeline used to post-process extracted results
* @return this
+ * @deprecated
+ * @see #addPipeline(us.codecraft.webmagic.pipeline.Pipeline)
*/
public Spider pipeline(Pipeline pipeline) {
+ return addPipeline(pipeline);
+ }
+
+ /**
+ * add a pipeline for Spider
+ *
+ * @param pipeline
+ * @return this
+ * @since 0.2.1
+ * @see Pipeline
+ */
+ public Spider addPipeline(Pipeline pipeline) {
checkIfNotRunning();
this.pipelines.add(pipeline);
return this;
}
+ /**
+ * clear the pipelines set
+ *
+ * @return this
+ */
+ public Spider clearPipeline() {
+ pipelines = new ArrayList();
+ return this;
+ }
+
+ /**
+ * set the downloader of spider
+ *
+ * @param downloader
+ * @return this
+ * @deprecated
+ * @see #setDownloader(us.codecraft.webmagic.downloader.Downloader)
+ */
public Spider downloader(Downloader downloader) {
+ return setDownloader(downloader);
+ }
+
+ /**
+ * set the downloader of spider
+ * @see Downloader
+ * @param downloader
+ * @return this
+ */
+ public Spider setDownloader(Downloader downloader) {
checkIfNotRunning();
this.downloader = downloader;
return this;
@@ -226,9 +285,9 @@ public class Spider implements Runnable, Task {
}
/**
- * 用某些特定URL进行爬虫测试
+ * Process specific urls without url discovery.
*
- * @param urls 要抓取的url
+ * @param urls urls to process
*/
public void test(String... urls) {
checkComponent();
@@ -284,9 +343,9 @@ public class Spider implements Runnable, Task {
}
/**
- * 建立多个线程下载
+ * start with more than one thread
*
- * @param threadNum 线程数
+ * @param threadNum number of threads
* @return this
*/
public Spider thread(int threadNum) {
@@ -304,11 +363,6 @@ public class Spider implements Runnable, Task {
return this;
}
- public Spider clearPipeline() {
- pipelines = new ArrayList();
- return this;
- }
-
@Override
public String getUUID() {
if (uuid != null) {
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Task.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Task.java
index 14c1d31..6781fcb 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/Task.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Task.java
@@ -1,21 +1,25 @@
package us.codecraft.webmagic;
/**
- * 抓取任务的抽象接口。
+ * Interface for identifying different tasks.
+ *
* @author code4crafter@gmail.com
- * Date: 13-6-18
- * Time: 下午2:57
+ * @since 0.1.0
+ * @see us.codecraft.webmagic.scheduler.Scheduler
+ * @see us.codecraft.webmagic.pipeline.Pipeline
*/
public interface Task {
/**
- * 返回唯一标志该任务的字符串,以区分不同任务。
+ * unique id for a task.
+ *
* @return uuid
*/
public String getUUID();
/**
- * 返回任务抓取的站点信息
+ * site of a task
+ *
* @return site
*/
public Site getSite();