comments in english
parent
e566a53936
commit
59aad6a7f4
|
@ -8,7 +8,7 @@ import java.util.Collection;
|
|||
* Date: 13-8-4 <br>
|
||||
* Time: 下午5:18 <br>
|
||||
*/
|
||||
public interface PagedModel {
|
||||
public interface MultiPageModel {
|
||||
|
||||
public String getPageKey();
|
||||
|
||||
|
@ -16,6 +16,6 @@ public interface PagedModel {
|
|||
|
||||
public String getPage();
|
||||
|
||||
public PagedModel combine(PagedModel pagedModel);
|
||||
public MultiPageModel combine(MultiPageModel multiPageModel);
|
||||
|
||||
}
|
|
@ -17,7 +17,6 @@ import java.io.*;
|
|||
/**
|
||||
* Downloads a file and saves it to a file for caching.<br>
|
||||
*
|
||||
*
|
||||
* @author code4crafter@gmail.com
|
||||
* @since 0.2.1
|
||||
*/
|
||||
|
|
|
@ -0,0 +1,8 @@
|
|||
package us.codecraft.webmagic.model.annotation;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Marks features that are not yet stable.
|
||||
*/
|
||||
public @interface Experimental {
|
||||
}
|
|
@ -14,29 +14,24 @@ import java.io.IOException;
|
|||
import java.io.PrintWriter;
|
||||
|
||||
/**
|
||||
* JSON格式持久化到文件的接口。<br>
|
||||
* 如果持久化的文件名是乱码,请再运行的环境变量里加上LANG=zh_CN.UTF-8。<br>
|
||||
* Stores result objects (page models) to files in JSON format.<br>
|
||||
* Uses the value of HasKey.key() as the file name if the model implements HasKey.<br>
|
||||
* Otherwise uses the MD5 hex digest of the model's reflection-based string representation as the file name.
|
||||
*
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-4-21
|
||||
* Time: 下午6:28
|
||||
* @since 0.2.0
|
||||
*/
|
||||
public class JsonFilePageModelPipeline extends FilePersistentBase implements PageModelPipeline {
|
||||
|
||||
private Logger logger = Logger.getLogger(getClass());
|
||||
|
||||
/**
|
||||
* 新建一个JsonFilePageModelPipeline,使用默认保存路径"/data/webmagic/"
|
||||
* Creates a JsonFilePageModelPipeline with the default path "/data/webmagic/".
|
||||
*/
|
||||
public JsonFilePageModelPipeline() {
|
||||
setPath("/data/webmagic/");
|
||||
}
|
||||
|
||||
/**
|
||||
* 新建一个JsonFilePageModelPipeline
|
||||
*
|
||||
* @param path 文件保存路径
|
||||
*/
|
||||
public JsonFilePageModelPipeline(String path) {
|
||||
setPath(path);
|
||||
}
|
||||
|
@ -47,7 +42,7 @@ public class JsonFilePageModelPipeline extends FilePersistentBase implements Pag
|
|||
try {
|
||||
String filename;
|
||||
if (o instanceof HasKey) {
|
||||
filename = path + ((HasKey)o).key() + ".json";
|
||||
filename = path + ((HasKey) o).key() + ".json";
|
||||
} else {
|
||||
filename = path + DigestUtils.md5Hex(ToStringBuilder.reflectionToString(o)) + ".json";
|
||||
}
|
||||
|
|
|
@ -13,28 +13,22 @@ import java.io.IOException;
|
|||
import java.io.PrintWriter;
|
||||
|
||||
/**
|
||||
* JSON格式持久化到文件的接口。
|
||||
* Stores results to files in JSON format.<br>
|
||||
*
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-4-21
|
||||
* Time: 下午6:28
|
||||
* @since 0.2.0
|
||||
*/
|
||||
public class JsonFilePipeline extends FilePersistentBase implements Pipeline {
|
||||
|
||||
private Logger logger = Logger.getLogger(getClass());
|
||||
|
||||
/**
|
||||
* 新建一个JsonFilePipeline,使用默认保存路径"/data/webmagic/"
|
||||
* Creates a JsonFilePipeline with the default path "/data/webmagic".
|
||||
*/
|
||||
public JsonFilePipeline() {
|
||||
setPath("/data/webmagic");
|
||||
}
|
||||
|
||||
/**
|
||||
* 新建一个JsonFilePipeline
|
||||
*
|
||||
* @param path 文件保存路径
|
||||
*/
|
||||
public JsonFilePipeline(String path) {
|
||||
setPath(path);
|
||||
}
|
||||
|
|
|
@ -1,26 +1,28 @@
|
|||
package us.codecraft.webmagic.pipeline;
|
||||
|
||||
import us.codecraft.webmagic.PagedModel;
|
||||
import us.codecraft.webmagic.MultiPageModel;
|
||||
import us.codecraft.webmagic.ResultItems;
|
||||
import us.codecraft.webmagic.Task;
|
||||
import us.codecraft.webmagic.model.annotation.Experimental;
|
||||
import us.codecraft.webmagic.utils.DoubleKeyMap;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
|
||||
/**
|
||||
* 用于实现分页的Pipeline。<br>
|
||||
* 在使用redis做分布式爬虫时,请不要使用此功能。<br>
|
||||
* A pipeline that combines the results extracted from more than one page into a single result.<br>
|
||||
* Used for news and articles containing more than one web page. <br>
|
||||
* MultiPagePipeline stores the parts of an object and outputs them once all parts have been extracted.<br>
|
||||
*
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-8-4 <br>
|
||||
* Time: 下午5:15 <br>
|
||||
* @since 0.2.0
|
||||
*/
|
||||
public class PagedPipeline implements Pipeline {
|
||||
@Experimental
|
||||
public class MultiPagePipeline implements Pipeline {
|
||||
|
||||
private DoubleKeyMap<String, String, Boolean> pageMap = new DoubleKeyMap<String, String, Boolean>(ConcurrentHashMap.class);
|
||||
|
||||
private DoubleKeyMap<String, String, PagedModel> objectMap = new DoubleKeyMap<String, String, PagedModel>(ConcurrentHashMap.class);
|
||||
private DoubleKeyMap<String, String, MultiPageModel> objectMap = new DoubleKeyMap<String, String, MultiPageModel>(ConcurrentHashMap.class);
|
||||
|
||||
@Override
|
||||
public void process(ResultItems resultItems, Task task) {
|
||||
|
@ -34,20 +36,20 @@ public class PagedPipeline implements Pipeline {
|
|||
private void handleObject(Iterator<Map.Entry<String, Object>> iterator) {
|
||||
Map.Entry<String, Object> objectEntry = iterator.next();
|
||||
Object o = objectEntry.getValue();
|
||||
if (o instanceof PagedModel) {
|
||||
PagedModel pagedModel = (PagedModel) o;
|
||||
pageMap.put(pagedModel.getPageKey(), pagedModel.getPage(), Boolean.TRUE);
|
||||
if (pagedModel.getOtherPages() != null) {
|
||||
for (String otherPage : pagedModel.getOtherPages()) {
|
||||
Boolean aBoolean = pageMap.get(pagedModel.getPageKey(), otherPage);
|
||||
if (o instanceof MultiPageModel) {
|
||||
MultiPageModel multiPageModel = (MultiPageModel) o;
|
||||
pageMap.put(multiPageModel.getPageKey(), multiPageModel.getPage(), Boolean.TRUE);
|
||||
if (multiPageModel.getOtherPages() != null) {
|
||||
for (String otherPage : multiPageModel.getOtherPages()) {
|
||||
Boolean aBoolean = pageMap.get(multiPageModel.getPageKey(), otherPage);
|
||||
if (aBoolean == null) {
|
||||
pageMap.put(pagedModel.getPageKey(), otherPage, Boolean.FALSE);
|
||||
pageMap.put(multiPageModel.getPageKey(), otherPage, Boolean.FALSE);
|
||||
}
|
||||
}
|
||||
}
|
||||
//check if all pages are processed
|
||||
Map<String, Boolean> booleanMap = pageMap.get(pagedModel.getPageKey());
|
||||
objectMap.put(pagedModel.getPageKey(), pagedModel.getPage(), pagedModel);
|
||||
Map<String, Boolean> booleanMap = pageMap.get(multiPageModel.getPageKey());
|
||||
objectMap.put(multiPageModel.getPageKey(), multiPageModel.getPage(), multiPageModel);
|
||||
if (booleanMap == null) {
|
||||
return;
|
||||
}
|
||||
|
@ -57,12 +59,12 @@ public class PagedPipeline implements Pipeline {
|
|||
return;
|
||||
}
|
||||
}
|
||||
List<Map.Entry<String, PagedModel>> entryList = new ArrayList<Map.Entry<String, PagedModel>>();
|
||||
entryList.addAll(objectMap.get(pagedModel.getPageKey()).entrySet());
|
||||
List<Map.Entry<String, MultiPageModel>> entryList = new ArrayList<Map.Entry<String, MultiPageModel>>();
|
||||
entryList.addAll(objectMap.get(multiPageModel.getPageKey()).entrySet());
|
||||
if (entryList.size() != 0) {
|
||||
Collections.sort(entryList, new Comparator<Map.Entry<String, PagedModel>>() {
|
||||
Collections.sort(entryList, new Comparator<Map.Entry<String, MultiPageModel>>() {
|
||||
@Override
|
||||
public int compare(Map.Entry<String, PagedModel> o1, Map.Entry<String, PagedModel> o2) {
|
||||
public int compare(Map.Entry<String, MultiPageModel> o1, Map.Entry<String, MultiPageModel> o2) {
|
||||
try {
|
||||
int i1 = Integer.parseInt(o1.getKey());
|
||||
int i2 = Integer.parseInt(o2.getKey());
|
||||
|
@ -72,7 +74,7 @@ public class PagedPipeline implements Pipeline {
|
|||
}
|
||||
}
|
||||
});
|
||||
PagedModel value = entryList.get(0).getValue();
|
||||
MultiPageModel value = entryList.get(0).getValue();
|
||||
for (int i = 1; i < entryList.size(); i++) {
|
||||
value = value.combine(entryList.get(i).getValue());
|
||||
}
|
|
@ -16,10 +16,10 @@ import java.util.concurrent.atomic.AtomicBoolean;
|
|||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
|
||||
/**
|
||||
* 磁盘文件实现的url管理模块,可以保证在长时间执行的任务中断后,下次启动从中断位置重新开始。<br>
|
||||
* Stores URLs and the cursor in files so that a Spider can resume its state after a shutdown.<br>
|
||||
*
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-4-21
|
||||
* Time: 下午1:13
|
||||
* @since 0.2.0
|
||||
*/
|
||||
public class FileCacheQueueScheduler implements Scheduler {
|
||||
|
||||
|
@ -46,8 +46,8 @@ public class FileCacheQueueScheduler implements Scheduler {
|
|||
private Set<String> urls;
|
||||
|
||||
public FileCacheQueueScheduler(String filePath) {
|
||||
if (!filePath.endsWith("/")&&!filePath.endsWith("\\")){
|
||||
filePath+="/";
|
||||
if (!filePath.endsWith("/") && !filePath.endsWith("\\")) {
|
||||
filePath += "/";
|
||||
}
|
||||
this.filePath = filePath;
|
||||
}
|
||||
|
@ -95,7 +95,7 @@ public class FileCacheQueueScheduler implements Scheduler {
|
|||
readCursorFile();
|
||||
readUrlFile();
|
||||
} catch (IOException e) {
|
||||
logger.error("init file error",e);
|
||||
logger.error("init file error", e);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -122,7 +122,7 @@ public class FileCacheQueueScheduler implements Scheduler {
|
|||
}
|
||||
|
||||
private String getFileName(String filename) {
|
||||
return filePath + task.getUUID() + filename;
|
||||
return filePath + task.getUUID() + filename;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -9,11 +9,10 @@ import us.codecraft.webmagic.Request;
|
|||
import us.codecraft.webmagic.Task;
|
||||
|
||||
/**
|
||||
* 使用redis管理url,构建一个分布式的爬虫。<br>
|
||||
* Uses Redis as the URL scheduler for distributed crawlers.<br>
|
||||
*
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-7-25 <br>
|
||||
* Time: 上午7:07 <br>
|
||||
* @since 0.2.0
|
||||
*/
|
||||
public class RedisScheduler implements Scheduler {
|
||||
|
||||
|
|
|
@ -6,9 +6,8 @@ import java.util.ArrayList;
|
|||
import java.util.List;
|
||||
|
||||
/**
|
||||
* JsonPath
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-8-12 <br>
|
||||
* Time: 下午12:54 <br>
|
||||
*/
|
||||
public class JsonPathSelector implements Selector {
|
||||
|
||||
|
|
|
@ -4,7 +4,6 @@ import java.util.Map;
|
|||
|
||||
/**
|
||||
* @author code4crafter@gmail.com
|
||||
* Date Dec 14, 2012
|
||||
*/
|
||||
public class DoubleKeyMap<K1, K2, V> extends MultiKeyMapBase {
|
||||
private Map<K1, Map<K2, V>> map;
|
||||
|
|
|
@ -9,7 +9,7 @@ import java.util.HashMap;
|
|||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* multikey map, some basic objects *
|
||||
* multi-key map, some basic objects *
|
||||
*
|
||||
* @author yihua.huang
|
||||
*/
|
||||
|
|
|
@ -9,8 +9,6 @@ import us.codecraft.webmagic.Task;
|
|||
|
||||
/**
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-7-25 <br>
|
||||
* Time: 上午7:51 <br>
|
||||
*/
|
||||
public class RedisSchedulerTest {
|
||||
|
||||
|
|
|
@ -7,8 +7,6 @@ import java.util.List;
|
|||
|
||||
/**
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-8-12 <br>
|
||||
* Time: 下午1:12 <br>
|
||||
*/
|
||||
public class JsonPathSelectorTest {
|
||||
|
||||
|
|
|
@ -6,7 +6,7 @@ import us.codecraft.webmagic.model.samples.IteyeBlog;
|
|||
import us.codecraft.webmagic.model.samples.News163;
|
||||
import us.codecraft.webmagic.model.samples.OschinaBlog;
|
||||
import us.codecraft.webmagic.pipeline.ConsolePipeline;
|
||||
import us.codecraft.webmagic.pipeline.PagedPipeline;
|
||||
import us.codecraft.webmagic.pipeline.MultiPagePipeline;
|
||||
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.Map;
|
||||
|
@ -40,7 +40,7 @@ public class QuickStarter {
|
|||
key = readKey(key);
|
||||
System.out.println("The demo started and will last 20 seconds...");
|
||||
//Start spider
|
||||
OOSpider.create(Site.me().addStartUrl(urlMap.get(key)), clazzMap.get(key)).pipeline(new PagedPipeline()).pipeline(new ConsolePipeline()).runAsync();
|
||||
OOSpider.create(Site.me().addStartUrl(urlMap.get(key)), clazzMap.get(key)).pipeline(new MultiPagePipeline()).pipeline(new ConsolePipeline()).runAsync();
|
||||
|
||||
try {
|
||||
Thread.sleep(20000);
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
package us.codecraft.webmagic.model.samples;
|
||||
|
||||
import us.codecraft.webmagic.PagedModel;
|
||||
import us.codecraft.webmagic.MultiPageModel;
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.model.OOSpider;
|
||||
import us.codecraft.webmagic.model.annotation.ComboExtract;
|
||||
|
@ -8,7 +8,7 @@ import us.codecraft.webmagic.model.annotation.ExtractBy;
|
|||
import us.codecraft.webmagic.model.annotation.ExtractByUrl;
|
||||
import us.codecraft.webmagic.model.annotation.TargetUrl;
|
||||
import us.codecraft.webmagic.pipeline.ConsolePipeline;
|
||||
import us.codecraft.webmagic.pipeline.PagedPipeline;
|
||||
import us.codecraft.webmagic.pipeline.MultiPagePipeline;
|
||||
import us.codecraft.webmagic.scheduler.RedisScheduler;
|
||||
|
||||
import java.util.Collection;
|
||||
|
@ -20,7 +20,7 @@ import java.util.List;
|
|||
* Time: 下午8:17 <br>
|
||||
*/
|
||||
@TargetUrl("http://news.163.com/\\d+/\\d+/\\d+/\\w+*.html")
|
||||
public class News163 implements PagedModel {
|
||||
public class News163 implements MultiPageModel {
|
||||
|
||||
@ExtractByUrl("http://news\\.163\\.com/\\d+/\\d+/\\d+/([^_]*).*\\.html")
|
||||
private String pageKey;
|
||||
|
@ -58,10 +58,10 @@ public class News163 implements PagedModel {
|
|||
}
|
||||
|
||||
@Override
|
||||
public PagedModel combine(PagedModel pagedModel) {
|
||||
public MultiPageModel combine(MultiPageModel multiPageModel) {
|
||||
News163 news163 = new News163();
|
||||
news163.title = this.title;
|
||||
News163 pagedModel1 = (News163) pagedModel;
|
||||
News163 pagedModel1 = (News163) multiPageModel;
|
||||
news163.content = this.content + pagedModel1.content;
|
||||
return news163;
|
||||
}
|
||||
|
@ -77,7 +77,7 @@ public class News163 implements PagedModel {
|
|||
|
||||
public static void main(String[] args) {
|
||||
OOSpider.create(Site.me().addStartUrl("http://news.163.com/13/0802/05/958I1E330001124J_2.html"), News163.class)
|
||||
.scheduler(new RedisScheduler("localhost")).clearPipeline().pipeline(new PagedPipeline()).pipeline(new ConsolePipeline()).run();
|
||||
.scheduler(new RedisScheduler("localhost")).clearPipeline().pipeline(new MultiPagePipeline()).pipeline(new ConsolePipeline()).run();
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
<date-generated>Sat Aug 17 14:14:45 CST 2013</date-generated>
|
||||
</meta>
|
||||
<comment>
|
||||
<key><![CDATA[us.codecraft.webmagic.PagedModel]]></key>
|
||||
<key><![CDATA[us.codecraft.webmagic.MultiPageModel]]></key>
|
||||
<data><![CDATA[ 实现此接口以进行支持爬虫分页抓取。<br>
|
||||
@author code4crafter@gmail.com <br>
|
||||
Date: 13-8-4 <br>
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
<date-generated>Sat Aug 17 14:14:46 CST 2013</date-generated>
|
||||
</meta>
|
||||
<comment>
|
||||
<key><![CDATA[us.codecraft.webmagic.pipeline.PagedPipeline]]></key>
|
||||
<key><![CDATA[us.codecraft.webmagic.pipeline.MultiPagePipeline]]></key>
|
||||
<data><![CDATA[ 用于实现分页的Pipeline。<br>
|
||||
在使用redis做分布式爬虫时,请不要使用此功能。<br>
|
||||
|
||||
|
|
Loading…
Reference in New Issue