From 59aad6a7f4939fcf65c3b9f480fa202ba2fe9dbc Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 17 Aug 2013 18:33:05 +0800 Subject: [PATCH] comments in english --- .../{PagedModel.java => MultiPageModel.java} | 4 +- .../webmagic/downloader/FileCache.java | 1 - .../model/annotation/Experimental.java | 8 ++++ .../pipeline/JsonFilePageModelPipeline.java | 17 +++---- .../webmagic/pipeline/JsonFilePipeline.java | 12 ++--- ...edPipeline.java => MultiPagePipeline.java} | 44 ++++++++++--------- .../scheduler/FileCacheQueueScheduler.java | 14 +++--- .../webmagic/scheduler/RedisScheduler.java | 5 +-- .../webmagic/selector/JsonPathSelector.java | 3 +- .../webmagic/utils/DoubleKeyMap.java | 1 - .../webmagic/utils/MultiKeyMapBase.java | 2 +- .../scheduler/RedisSchedulerTest.java | 2 - .../selector/JsonPathSelectorTest.java | 2 - .../codecraft/webmagic/main/QuickStarter.java | 4 +- .../webmagic/model/samples/News163.java | 12 ++--- .../us/codecraft/webmagic/PagedModel-cmnt.xml | 2 +- .../webmagic/pipeline/PagedPipeline-cmnt.xml | 2 +- 17 files changed, 63 insertions(+), 72 deletions(-) rename webmagic-extension/src/main/java/us/codecraft/webmagic/{PagedModel.java => MultiPageModel.java} (76%) create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/Experimental.java rename webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/{PagedPipeline.java => MultiPagePipeline.java} (56%) diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/PagedModel.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/MultiPageModel.java similarity index 76% rename from webmagic-extension/src/main/java/us/codecraft/webmagic/PagedModel.java rename to webmagic-extension/src/main/java/us/codecraft/webmagic/MultiPageModel.java index b94cb25..2e1b713 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/PagedModel.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/MultiPageModel.java @@ -8,7 +8,7 @@ import 
java.util.Collection; * Date: 13-8-4
* Time: 下午5:18
*/ -public interface PagedModel { +public interface MultiPageModel { public String getPageKey(); @@ -16,6 +16,6 @@ public interface PagedModel { public String getPage(); - public PagedModel combine(PagedModel pagedModel); + public MultiPageModel combine(MultiPageModel multiPageModel); } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/FileCache.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/FileCache.java index bf32435..a78a343 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/FileCache.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/FileCache.java @@ -17,7 +17,6 @@ import java.io.*; /** * Download file and saved to file for cache.
* - * * @author code4crafter@gmail.com * @since 0.2.1 */ diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/Experimental.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/Experimental.java new file mode 100644 index 0000000..f619d12 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/Experimental.java @@ -0,0 +1,8 @@ +package us.codecraft.webmagic.model.annotation; + +/** + * @author code4crafter@gmail.com
+ * Stands for features not stable. + */ +public @interface Experimental { +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePageModelPipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePageModelPipeline.java index c66f52b..3be53d3 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePageModelPipeline.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePageModelPipeline.java @@ -14,29 +14,24 @@ import java.io.IOException; import java.io.PrintWriter; /** - * JSON格式持久化到文件的接口。
- * 如果持久化的文件名是乱码,请再运行的环境变量里加上LANG=zh_CN.UTF-8。
+ * Store result objects (page models) to files in JSON format.
+ * Use the model's key() as file name if the model implements HasKey.
+ * Otherwise use the MD5 hex digest of the model's reflection string as file name. * * @author code4crafter@gmail.com
- * Date: 13-4-21 - * Time: 下午6:28 + * @since 0.2.0 */ public class JsonFilePageModelPipeline extends FilePersistentBase implements PageModelPipeline { private Logger logger = Logger.getLogger(getClass()); /** - * 新建一个JsonFilePageModelPipeline,使用默认保存路径"/data/webmagic/" + * new JsonFilePageModelPipeline with default path "/data/webmagic/" */ public JsonFilePageModelPipeline() { setPath("/data/webmagic/"); } - /** - * 新建一个JsonFilePageModelPipeline - * - * @param path 文件保存路径 - */ public JsonFilePageModelPipeline(String path) { setPath(path); } @@ -47,7 +42,7 @@ public class JsonFilePageModelPipeline extends FilePersistentBase implements Pag try { String filename; if (o instanceof HasKey) { - filename = path + ((HasKey)o).key() + ".json"; + filename = path + ((HasKey) o).key() + ".json"; } else { filename = path + DigestUtils.md5Hex(ToStringBuilder.reflectionToString(o)) + ".json"; } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePipeline.java index f2478f0..03313a9 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePipeline.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePipeline.java @@ -13,28 +13,22 @@ import java.io.IOException; import java.io.PrintWriter; /** - * JSON格式持久化到文件的接口。 + * Store results to files in JSON format。
* * @author code4crafter@gmail.com
- * Date: 13-4-21 - * Time: 下午6:28 + * @since 0.2.0 */ public class JsonFilePipeline extends FilePersistentBase implements Pipeline { private Logger logger = Logger.getLogger(getClass()); /** - * 新建一个JsonFilePipeline,使用默认保存路径"/data/webmagic/" + * new JsonFilePageModelPipeline with default path "/data/webmagic/" */ public JsonFilePipeline() { setPath("/data/webmagic"); } - /** - * 新建一个JsonFilePipeline - * - * @param path 文件保存路径 - */ public JsonFilePipeline(String path) { setPath(path); } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/PagedPipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/MultiPagePipeline.java similarity index 56% rename from webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/PagedPipeline.java rename to webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/MultiPagePipeline.java index beda667..81c684b 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/PagedPipeline.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/MultiPagePipeline.java @@ -1,26 +1,28 @@ package us.codecraft.webmagic.pipeline; -import us.codecraft.webmagic.PagedModel; +import us.codecraft.webmagic.MultiPageModel; import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.model.annotation.Experimental; import us.codecraft.webmagic.utils.DoubleKeyMap; import java.util.*; import java.util.concurrent.ConcurrentHashMap; /** - * 用于实现分页的Pipeline。
- * 在使用redis做分布式爬虫时,请不要使用此功能。
+ * A pipeline combines the result in more than one page together.
+ * Used for news and articles containing more than one web page.
+ * MultiPagePipeline will store parts of object and output them when all parts are extracted.
* * @author code4crafter@gmail.com
- * Date: 13-8-4
- * Time: 下午5:15
+ * @since 0.2.0 */ -public class PagedPipeline implements Pipeline { +@Experimental +public class MultiPagePipeline implements Pipeline { private DoubleKeyMap pageMap = new DoubleKeyMap(ConcurrentHashMap.class); - private DoubleKeyMap objectMap = new DoubleKeyMap(ConcurrentHashMap.class); + private DoubleKeyMap objectMap = new DoubleKeyMap(ConcurrentHashMap.class); @Override public void process(ResultItems resultItems, Task task) { @@ -34,20 +36,20 @@ public class PagedPipeline implements Pipeline { private void handleObject(Iterator> iterator) { Map.Entry objectEntry = iterator.next(); Object o = objectEntry.getValue(); - if (o instanceof PagedModel) { - PagedModel pagedModel = (PagedModel) o; - pageMap.put(pagedModel.getPageKey(), pagedModel.getPage(), Boolean.TRUE); - if (pagedModel.getOtherPages() != null) { - for (String otherPage : pagedModel.getOtherPages()) { - Boolean aBoolean = pageMap.get(pagedModel.getPageKey(), otherPage); + if (o instanceof MultiPageModel) { + MultiPageModel multiPageModel = (MultiPageModel) o; + pageMap.put(multiPageModel.getPageKey(), multiPageModel.getPage(), Boolean.TRUE); + if (multiPageModel.getOtherPages() != null) { + for (String otherPage : multiPageModel.getOtherPages()) { + Boolean aBoolean = pageMap.get(multiPageModel.getPageKey(), otherPage); if (aBoolean == null) { - pageMap.put(pagedModel.getPageKey(), otherPage, Boolean.FALSE); + pageMap.put(multiPageModel.getPageKey(), otherPage, Boolean.FALSE); } } } //check if all pages are processed - Map booleanMap = pageMap.get(pagedModel.getPageKey()); - objectMap.put(pagedModel.getPageKey(), pagedModel.getPage(), pagedModel); + Map booleanMap = pageMap.get(multiPageModel.getPageKey()); + objectMap.put(multiPageModel.getPageKey(), multiPageModel.getPage(), multiPageModel); if (booleanMap == null) { return; } @@ -57,12 +59,12 @@ public class PagedPipeline implements Pipeline { return; } } - List> entryList = new ArrayList>(); - 
entryList.addAll(objectMap.get(pagedModel.getPageKey()).entrySet()); + List> entryList = new ArrayList>(); + entryList.addAll(objectMap.get(multiPageModel.getPageKey()).entrySet()); if (entryList.size() != 0) { - Collections.sort(entryList, new Comparator>() { + Collections.sort(entryList, new Comparator>() { @Override - public int compare(Map.Entry o1, Map.Entry o2) { + public int compare(Map.Entry o1, Map.Entry o2) { try { int i1 = Integer.parseInt(o1.getKey()); int i2 = Integer.parseInt(o2.getKey()); @@ -72,7 +74,7 @@ public class PagedPipeline implements Pipeline { } } }); - PagedModel value = entryList.get(0).getValue(); + MultiPageModel value = entryList.get(0).getValue(); for (int i = 1; i < entryList.size(); i++) { value = value.combine(entryList.get(i).getValue()); } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java index a8dc23a..3f691cd 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java @@ -16,10 +16,10 @@ import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicInteger; /** - * 磁盘文件实现的url管理模块,可以保证在长时间执行的任务中断后,下次启动从中断位置重新开始。
+ * Store urls and cursor in files so that a Spider can resume its status after a shutdown.
+ * * @author code4crafter@gmail.com
- * Date: 13-4-21 - * Time: 下午1:13 + * @since 0.2.0 */ public class FileCacheQueueScheduler implements Scheduler { @@ -46,8 +46,8 @@ public class FileCacheQueueScheduler implements Scheduler { private Set urls; public FileCacheQueueScheduler(String filePath) { - if (!filePath.endsWith("/")&&!filePath.endsWith("\\")){ - filePath+="/"; + if (!filePath.endsWith("/") && !filePath.endsWith("\\")) { + filePath += "/"; } this.filePath = filePath; } @@ -95,7 +95,7 @@ public class FileCacheQueueScheduler implements Scheduler { readCursorFile(); readUrlFile(); } catch (IOException e) { - logger.error("init file error",e); + logger.error("init file error", e); } } @@ -122,7 +122,7 @@ public class FileCacheQueueScheduler implements Scheduler { } private String getFileName(String filename) { - return filePath + task.getUUID() + filename; + return filePath + task.getUUID() + filename; } @Override diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java index c377a12..e0912de 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java @@ -9,11 +9,10 @@ import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; /** - * 使用redis管理url,构建一个分布式的爬虫。
+ * Use Redis as the url scheduler for distributed crawlers.
* * @author code4crafter@gmail.com
- * Date: 13-7-25
- * Time: 上午7:07
+ * @since 0.2.0 */ public class RedisScheduler implements Scheduler { diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java index 8331416..5a41c47 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java @@ -6,9 +6,8 @@ import java.util.ArrayList; import java.util.List; /** + * JsonPath * @author code4crafter@gmail.com
- * Date: 13-8-12
- * Time: 下午12:54
*/ public class JsonPathSelector implements Selector { diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java index b284a15..ba763c0 100755 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java @@ -4,7 +4,6 @@ import java.util.Map; /** * @author code4crafter@gmail.com - * Date Dec 14, 2012 */ public class DoubleKeyMap extends MultiKeyMapBase { private Map> map; diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java index 89fdc9a..a7d8378 100755 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java @@ -9,7 +9,7 @@ import java.util.HashMap; import java.util.Map; /** - * multikey map, some basic objects * + * multi-key map, some basic objects * * * @author yihua.huang */ diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/scheduler/RedisSchedulerTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/scheduler/RedisSchedulerTest.java index 0819e43..1518763 100644 --- a/webmagic-extension/src/test/java/us/codecraft/webmagic/scheduler/RedisSchedulerTest.java +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/scheduler/RedisSchedulerTest.java @@ -9,8 +9,6 @@ import us.codecraft.webmagic.Task; /** * @author code4crafter@gmail.com
- * Date: 13-7-25
- * Time: 上午7:51
*/ public class RedisSchedulerTest { diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/selector/JsonPathSelectorTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/selector/JsonPathSelectorTest.java index 1cd8dc1..c38efe9 100644 --- a/webmagic-extension/src/test/java/us/codecraft/webmagic/selector/JsonPathSelectorTest.java +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/selector/JsonPathSelectorTest.java @@ -7,8 +7,6 @@ import java.util.List; /** * @author code4crafter@gmai.com
- * Date: 13-8-12
- * Time: 下午1:12
*/ public class JsonPathSelectorTest { diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/main/QuickStarter.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/main/QuickStarter.java index 52be272..69adabb 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/main/QuickStarter.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/main/QuickStarter.java @@ -6,7 +6,7 @@ import us.codecraft.webmagic.model.samples.IteyeBlog; import us.codecraft.webmagic.model.samples.News163; import us.codecraft.webmagic.model.samples.OschinaBlog; import us.codecraft.webmagic.pipeline.ConsolePipeline; -import us.codecraft.webmagic.pipeline.PagedPipeline; +import us.codecraft.webmagic.pipeline.MultiPagePipeline; import java.util.LinkedHashMap; import java.util.Map; @@ -40,7 +40,7 @@ public class QuickStarter { key = readKey(key); System.out.println("The demo started and will last 20 seconds..."); //Start spider - OOSpider.create(Site.me().addStartUrl(urlMap.get(key)), clazzMap.get(key)).pipeline(new PagedPipeline()).pipeline(new ConsolePipeline()).runAsync(); + OOSpider.create(Site.me().addStartUrl(urlMap.get(key)), clazzMap.get(key)).pipeline(new MultiPagePipeline()).pipeline(new ConsolePipeline()).runAsync(); try { Thread.sleep(20000); diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java index 2bc3f95..946e737 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java @@ -1,6 +1,6 @@ package us.codecraft.webmagic.model.samples; -import us.codecraft.webmagic.PagedModel; +import us.codecraft.webmagic.MultiPageModel; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.model.OOSpider; import us.codecraft.webmagic.model.annotation.ComboExtract; @@ -8,7 +8,7 @@ import 
us.codecraft.webmagic.model.annotation.ExtractBy; import us.codecraft.webmagic.model.annotation.ExtractByUrl; import us.codecraft.webmagic.model.annotation.TargetUrl; import us.codecraft.webmagic.pipeline.ConsolePipeline; -import us.codecraft.webmagic.pipeline.PagedPipeline; +import us.codecraft.webmagic.pipeline.MultiPagePipeline; import us.codecraft.webmagic.scheduler.RedisScheduler; import java.util.Collection; @@ -20,7 +20,7 @@ import java.util.List; * Time: 下午8:17
*/ @TargetUrl("http://news.163.com/\\d+/\\d+/\\d+/\\w+*.html") -public class News163 implements PagedModel { +public class News163 implements MultiPageModel { @ExtractByUrl("http://news\\.163\\.com/\\d+/\\d+/\\d+/([^_]*).*\\.html") private String pageKey; @@ -58,10 +58,10 @@ public class News163 implements PagedModel { } @Override - public PagedModel combine(PagedModel pagedModel) { + public MultiPageModel combine(MultiPageModel multiPageModel) { News163 news163 = new News163(); news163.title = this.title; - News163 pagedModel1 = (News163) pagedModel; + News163 pagedModel1 = (News163) multiPageModel; news163.content = this.content + pagedModel1.content; return news163; } @@ -77,7 +77,7 @@ public class News163 implements PagedModel { public static void main(String[] args) { OOSpider.create(Site.me().addStartUrl("http://news.163.com/13/0802/05/958I1E330001124J_2.html"), News163.class) - .scheduler(new RedisScheduler("localhost")).clearPipeline().pipeline(new PagedPipeline()).pipeline(new ConsolePipeline()).run(); + .scheduler(new RedisScheduler("localhost")).clearPipeline().pipeline(new MultiPagePipeline()).pipeline(new ConsolePipeline()).run(); } } diff --git a/zh_docs/us/codecraft/webmagic/PagedModel-cmnt.xml b/zh_docs/us/codecraft/webmagic/PagedModel-cmnt.xml index a57d07b..17ffd7a 100644 --- a/zh_docs/us/codecraft/webmagic/PagedModel-cmnt.xml +++ b/zh_docs/us/codecraft/webmagic/PagedModel-cmnt.xml @@ -4,7 +4,7 @@ Sat Aug 17 14:14:45 CST 2013 - + @author code4crafter@gmail.com
Date: 13-8-4
diff --git a/zh_docs/us/codecraft/webmagic/pipeline/PagedPipeline-cmnt.xml b/zh_docs/us/codecraft/webmagic/pipeline/PagedPipeline-cmnt.xml index eb41808..128521c 100644 --- a/zh_docs/us/codecraft/webmagic/pipeline/PagedPipeline-cmnt.xml +++ b/zh_docs/us/codecraft/webmagic/pipeline/PagedPipeline-cmnt.xml @@ -4,7 +4,7 @@ Sat Aug 17 14:14:46 CST 2013 - + 在使用redis做分布式爬虫时,请不要使用此功能。