diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/PagedModel.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/MultiPageModel.java
similarity index 76%
rename from webmagic-extension/src/main/java/us/codecraft/webmagic/PagedModel.java
rename to webmagic-extension/src/main/java/us/codecraft/webmagic/MultiPageModel.java
index b94cb25..2e1b713 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/PagedModel.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/MultiPageModel.java
@@ -8,7 +8,7 @@ import java.util.Collection;
* Date: 13-8-4
* Time: 下午5:18
*/
-public interface PagedModel {
+public interface MultiPageModel {
public String getPageKey();
@@ -16,6 +16,6 @@ public interface PagedModel {
public String getPage();
- public PagedModel combine(PagedModel pagedModel);
+ public MultiPageModel combine(MultiPageModel multiPageModel);
}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/FileCache.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/FileCache.java
index bf32435..a78a343 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/FileCache.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/FileCache.java
@@ -17,7 +17,6 @@ import java.io.*;
/**
* Download file and saved to file for cache.
*
- *
* @author code4crafter@gmail.com
* @since 0.2.1
*/
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/Experimental.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/Experimental.java
new file mode 100644
index 0000000..f619d12
--- /dev/null
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/Experimental.java
@@ -0,0 +1,8 @@
+package us.codecraft.webmagic.model.annotation;
+
+/**
+ * @author code4crafter@gmail.com
+ * Marks features that are experimental and not yet stable.
+ */
+public @interface Experimental {
+}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePageModelPipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePageModelPipeline.java
index c66f52b..3be53d3 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePageModelPipeline.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePageModelPipeline.java
@@ -14,29 +14,24 @@ import java.io.IOException;
import java.io.PrintWriter;
/**
- * JSON格式持久化到文件的接口。
- * 如果持久化的文件名是乱码,请再运行的环境变量里加上LANG=zh_CN.UTF-8。
+ * Store result objects (page models) to files in JSON format.
+ * Use the model's key() as file name if the model implements HasKey.
+ * Otherwise use the MD5 hex digest of the model's reflective toString() as file name.
*
* @author code4crafter@gmail.com
- * Date: 13-4-21
- * Time: 下午6:28
+ * @since 0.2.0
*/
public class JsonFilePageModelPipeline extends FilePersistentBase implements PageModelPipeline {
private Logger logger = Logger.getLogger(getClass());
/**
- * 新建一个JsonFilePageModelPipeline,使用默认保存路径"/data/webmagic/"
+ * new JsonFilePageModelPipeline with default path "/data/webmagic/"
*/
public JsonFilePageModelPipeline() {
setPath("/data/webmagic/");
}
- /**
- * 新建一个JsonFilePageModelPipeline
- *
- * @param path 文件保存路径
- */
public JsonFilePageModelPipeline(String path) {
setPath(path);
}
@@ -47,7 +42,7 @@ public class JsonFilePageModelPipeline extends FilePersistentBase implements Pag
try {
String filename;
if (o instanceof HasKey) {
- filename = path + ((HasKey)o).key() + ".json";
+ filename = path + ((HasKey) o).key() + ".json";
} else {
filename = path + DigestUtils.md5Hex(ToStringBuilder.reflectionToString(o)) + ".json";
}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePipeline.java
index f2478f0..03313a9 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePipeline.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePipeline.java
@@ -13,28 +13,22 @@ import java.io.IOException;
import java.io.PrintWriter;
/**
- * JSON格式持久化到文件的接口。
+ * Store results to files in JSON format.
*
* @author code4crafter@gmail.com
- * Date: 13-4-21
- * Time: 下午6:28
+ * @since 0.2.0
*/
public class JsonFilePipeline extends FilePersistentBase implements Pipeline {
private Logger logger = Logger.getLogger(getClass());
/**
- * 新建一个JsonFilePipeline,使用默认保存路径"/data/webmagic/"
+ * new JsonFilePipeline with default path "/data/webmagic"
*/
public JsonFilePipeline() {
setPath("/data/webmagic");
}
- /**
- * 新建一个JsonFilePipeline
- *
- * @param path 文件保存路径
- */
public JsonFilePipeline(String path) {
setPath(path);
}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/PagedPipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/MultiPagePipeline.java
similarity index 56%
rename from webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/PagedPipeline.java
rename to webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/MultiPagePipeline.java
index beda667..81c684b 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/PagedPipeline.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/MultiPagePipeline.java
@@ -1,26 +1,28 @@
package us.codecraft.webmagic.pipeline;
-import us.codecraft.webmagic.PagedModel;
+import us.codecraft.webmagic.MultiPageModel;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
+import us.codecraft.webmagic.model.annotation.Experimental;
import us.codecraft.webmagic.utils.DoubleKeyMap;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
/**
- * 用于实现分页的Pipeline。
- * 在使用redis做分布式爬虫时,请不要使用此功能。
+ * A pipeline that combines the results from more than one page together.
+ * Used for news and articles that span more than one web page.
+ * MultiPagePipeline stores the parts of an object and outputs them once all parts have been extracted.
*
* @author code4crafter@gmail.com
- * Date: 13-8-4
- * Time: 下午5:15
+ * @since 0.2.0
*/
-public class PagedPipeline implements Pipeline {
+@Experimental
+public class MultiPagePipeline implements Pipeline {
private DoubleKeyMap pageMap = new DoubleKeyMap(ConcurrentHashMap.class);
- private DoubleKeyMap objectMap = new DoubleKeyMap(ConcurrentHashMap.class);
+ private DoubleKeyMap objectMap = new DoubleKeyMap(ConcurrentHashMap.class);
@Override
public void process(ResultItems resultItems, Task task) {
@@ -34,20 +36,20 @@ public class PagedPipeline implements Pipeline {
private void handleObject(Iterator> iterator) {
Map.Entry objectEntry = iterator.next();
Object o = objectEntry.getValue();
- if (o instanceof PagedModel) {
- PagedModel pagedModel = (PagedModel) o;
- pageMap.put(pagedModel.getPageKey(), pagedModel.getPage(), Boolean.TRUE);
- if (pagedModel.getOtherPages() != null) {
- for (String otherPage : pagedModel.getOtherPages()) {
- Boolean aBoolean = pageMap.get(pagedModel.getPageKey(), otherPage);
+ if (o instanceof MultiPageModel) {
+ MultiPageModel multiPageModel = (MultiPageModel) o;
+ pageMap.put(multiPageModel.getPageKey(), multiPageModel.getPage(), Boolean.TRUE);
+ if (multiPageModel.getOtherPages() != null) {
+ for (String otherPage : multiPageModel.getOtherPages()) {
+ Boolean aBoolean = pageMap.get(multiPageModel.getPageKey(), otherPage);
if (aBoolean == null) {
- pageMap.put(pagedModel.getPageKey(), otherPage, Boolean.FALSE);
+ pageMap.put(multiPageModel.getPageKey(), otherPage, Boolean.FALSE);
}
}
}
//check if all pages are processed
- Map booleanMap = pageMap.get(pagedModel.getPageKey());
- objectMap.put(pagedModel.getPageKey(), pagedModel.getPage(), pagedModel);
+ Map booleanMap = pageMap.get(multiPageModel.getPageKey());
+ objectMap.put(multiPageModel.getPageKey(), multiPageModel.getPage(), multiPageModel);
if (booleanMap == null) {
return;
}
@@ -57,12 +59,12 @@ public class PagedPipeline implements Pipeline {
return;
}
}
- List> entryList = new ArrayList>();
- entryList.addAll(objectMap.get(pagedModel.getPageKey()).entrySet());
+ List> entryList = new ArrayList>();
+ entryList.addAll(objectMap.get(multiPageModel.getPageKey()).entrySet());
if (entryList.size() != 0) {
- Collections.sort(entryList, new Comparator>() {
+ Collections.sort(entryList, new Comparator>() {
@Override
- public int compare(Map.Entry o1, Map.Entry o2) {
+ public int compare(Map.Entry o1, Map.Entry o2) {
try {
int i1 = Integer.parseInt(o1.getKey());
int i2 = Integer.parseInt(o2.getKey());
@@ -72,7 +74,7 @@ public class PagedPipeline implements Pipeline {
}
}
});
- PagedModel value = entryList.get(0).getValue();
+ MultiPageModel value = entryList.get(0).getValue();
for (int i = 1; i < entryList.size(); i++) {
value = value.combine(entryList.get(i).getValue());
}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java
index a8dc23a..3f691cd 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java
@@ -16,10 +16,10 @@ import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
/**
- * 磁盘文件实现的url管理模块,可以保证在长时间执行的任务中断后,下次启动从中断位置重新开始。
+ * Store urls and cursor in files so that a Spider can resume its state after a shutdown.
+ *
* @author code4crafter@gmail.com
- * Date: 13-4-21
- * Time: 下午1:13
+ * @since 0.2.0
*/
public class FileCacheQueueScheduler implements Scheduler {
@@ -46,8 +46,8 @@ public class FileCacheQueueScheduler implements Scheduler {
private Set urls;
public FileCacheQueueScheduler(String filePath) {
- if (!filePath.endsWith("/")&&!filePath.endsWith("\\")){
- filePath+="/";
+ if (!filePath.endsWith("/") && !filePath.endsWith("\\")) {
+ filePath += "/";
}
this.filePath = filePath;
}
@@ -95,7 +95,7 @@ public class FileCacheQueueScheduler implements Scheduler {
readCursorFile();
readUrlFile();
} catch (IOException e) {
- logger.error("init file error",e);
+ logger.error("init file error", e);
}
}
@@ -122,7 +122,7 @@ public class FileCacheQueueScheduler implements Scheduler {
}
private String getFileName(String filename) {
- return filePath + task.getUUID() + filename;
+ return filePath + task.getUUID() + filename;
}
@Override
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java
index c377a12..e0912de 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java
@@ -9,11 +9,10 @@ import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
/**
- * 使用redis管理url,构建一个分布式的爬虫。
+ * Use Redis as url scheduler for distributed crawlers.
*
* @author code4crafter@gmail.com
- * Date: 13-7-25
- * Time: 上午7:07
+ * @since 0.2.0
*/
public class RedisScheduler implements Scheduler {
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java
index 8331416..5a41c47 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java
@@ -6,9 +6,8 @@ import java.util.ArrayList;
import java.util.List;
/**
+ * JsonPath
* @author code4crafter@gmail.com
- * Date: 13-8-12
- * Time: 下午12:54
*/
public class JsonPathSelector implements Selector {
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java
index b284a15..ba763c0 100755
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java
@@ -4,7 +4,6 @@ import java.util.Map;
/**
* @author code4crafter@gmail.com
- * Date Dec 14, 2012
*/
public class DoubleKeyMap extends MultiKeyMapBase {
private Map> map;
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java
index 89fdc9a..a7d8378 100755
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java
@@ -9,7 +9,7 @@ import java.util.HashMap;
import java.util.Map;
/**
- * multikey map, some basic objects *
+ * multi-key map, some basic objects *
*
* @author yihua.huang
*/
diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/scheduler/RedisSchedulerTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/scheduler/RedisSchedulerTest.java
index 0819e43..1518763 100644
--- a/webmagic-extension/src/test/java/us/codecraft/webmagic/scheduler/RedisSchedulerTest.java
+++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/scheduler/RedisSchedulerTest.java
@@ -9,8 +9,6 @@ import us.codecraft.webmagic.Task;
/**
* @author code4crafter@gmail.com
- * Date: 13-7-25
- * Time: 上午7:51
*/
public class RedisSchedulerTest {
diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/selector/JsonPathSelectorTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/selector/JsonPathSelectorTest.java
index 1cd8dc1..c38efe9 100644
--- a/webmagic-extension/src/test/java/us/codecraft/webmagic/selector/JsonPathSelectorTest.java
+++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/selector/JsonPathSelectorTest.java
@@ -7,8 +7,6 @@ import java.util.List;
/**
* @author code4crafter@gmai.com
- * Date: 13-8-12
- * Time: 下午1:12
*/
public class JsonPathSelectorTest {
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/main/QuickStarter.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/main/QuickStarter.java
index 52be272..69adabb 100644
--- a/webmagic-samples/src/main/java/us/codecraft/webmagic/main/QuickStarter.java
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/main/QuickStarter.java
@@ -6,7 +6,7 @@ import us.codecraft.webmagic.model.samples.IteyeBlog;
import us.codecraft.webmagic.model.samples.News163;
import us.codecraft.webmagic.model.samples.OschinaBlog;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
-import us.codecraft.webmagic.pipeline.PagedPipeline;
+import us.codecraft.webmagic.pipeline.MultiPagePipeline;
import java.util.LinkedHashMap;
import java.util.Map;
@@ -40,7 +40,7 @@ public class QuickStarter {
key = readKey(key);
System.out.println("The demo started and will last 20 seconds...");
//Start spider
- OOSpider.create(Site.me().addStartUrl(urlMap.get(key)), clazzMap.get(key)).pipeline(new PagedPipeline()).pipeline(new ConsolePipeline()).runAsync();
+ OOSpider.create(Site.me().addStartUrl(urlMap.get(key)), clazzMap.get(key)).pipeline(new MultiPagePipeline()).pipeline(new ConsolePipeline()).runAsync();
try {
Thread.sleep(20000);
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java
index 2bc3f95..946e737 100644
--- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java
@@ -1,6 +1,6 @@
package us.codecraft.webmagic.model.samples;
-import us.codecraft.webmagic.PagedModel;
+import us.codecraft.webmagic.MultiPageModel;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.model.annotation.ComboExtract;
@@ -8,7 +8,7 @@ import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.model.annotation.ExtractByUrl;
import us.codecraft.webmagic.model.annotation.TargetUrl;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
-import us.codecraft.webmagic.pipeline.PagedPipeline;
+import us.codecraft.webmagic.pipeline.MultiPagePipeline;
import us.codecraft.webmagic.scheduler.RedisScheduler;
import java.util.Collection;
@@ -20,7 +20,7 @@ import java.util.List;
* Time: 下午8:17
*/
@TargetUrl("http://news.163.com/\\d+/\\d+/\\d+/\\w+*.html")
-public class News163 implements PagedModel {
+public class News163 implements MultiPageModel {
@ExtractByUrl("http://news\\.163\\.com/\\d+/\\d+/\\d+/([^_]*).*\\.html")
private String pageKey;
@@ -58,10 +58,10 @@ public class News163 implements PagedModel {
}
@Override
- public PagedModel combine(PagedModel pagedModel) {
+ public MultiPageModel combine(MultiPageModel multiPageModel) {
News163 news163 = new News163();
news163.title = this.title;
- News163 pagedModel1 = (News163) pagedModel;
+ News163 pagedModel1 = (News163) multiPageModel;
news163.content = this.content + pagedModel1.content;
return news163;
}
@@ -77,7 +77,7 @@ public class News163 implements PagedModel {
public static void main(String[] args) {
OOSpider.create(Site.me().addStartUrl("http://news.163.com/13/0802/05/958I1E330001124J_2.html"), News163.class)
- .scheduler(new RedisScheduler("localhost")).clearPipeline().pipeline(new PagedPipeline()).pipeline(new ConsolePipeline()).run();
+ .scheduler(new RedisScheduler("localhost")).clearPipeline().pipeline(new MultiPagePipeline()).pipeline(new ConsolePipeline()).run();
}
}
diff --git a/zh_docs/us/codecraft/webmagic/PagedModel-cmnt.xml b/zh_docs/us/codecraft/webmagic/PagedModel-cmnt.xml
index a57d07b..17ffd7a 100644
--- a/zh_docs/us/codecraft/webmagic/PagedModel-cmnt.xml
+++ b/zh_docs/us/codecraft/webmagic/PagedModel-cmnt.xml
@@ -4,7 +4,7 @@
Sat Aug 17 14:14:45 CST 2013
-
+
@author code4crafter@gmail.com
Date: 13-8-4
diff --git a/zh_docs/us/codecraft/webmagic/pipeline/PagedPipeline-cmnt.xml b/zh_docs/us/codecraft/webmagic/pipeline/PagedPipeline-cmnt.xml
index eb41808..128521c 100644
--- a/zh_docs/us/codecraft/webmagic/pipeline/PagedPipeline-cmnt.xml
+++ b/zh_docs/us/codecraft/webmagic/pipeline/PagedPipeline-cmnt.xml
@@ -4,7 +4,7 @@
Sat Aug 17 14:14:46 CST 2013
-
+
在使用redis做分布式爬虫时,请不要使用此功能。