add list output support
parent
2bb6f84742
commit
54904851ea
|
@ -20,6 +20,15 @@ public class ConsolePipeline implements Pipeline{
|
|||
}
|
||||
System.out.println("get page: "+resultItems.getRequest().getUrl());
|
||||
for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) {
|
||||
if (entry.getValue() instanceof Iterable) {
|
||||
Iterable value = (Iterable) entry.getValue();
|
||||
System.out.println(entry.getKey() + ":");
|
||||
for (Object o : value) {
|
||||
System.out.println(o);
|
||||
}
|
||||
} else {
|
||||
System.out.println(entry.getKey() + ":\t" + entry.getValue());
|
||||
}
|
||||
System.out.println(entry.getKey()+":\t"+entry.getValue());
|
||||
}
|
||||
}
|
||||
|
|
|
@ -13,9 +13,10 @@ import java.util.Map;
|
|||
|
||||
/**
|
||||
* 持久化到文件的接口。
|
||||
*
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-4-21
|
||||
* Time: 下午6:28
|
||||
* Date: 13-4-21
|
||||
* Time: 下午6:28
|
||||
*/
|
||||
public class FilePipeline implements Pipeline {
|
||||
|
||||
|
@ -32,6 +33,7 @@ public class FilePipeline implements Pipeline {
|
|||
|
||||
/**
|
||||
* 新建一个FilePipeline
|
||||
*
|
||||
* @param path 文件保存路径
|
||||
*/
|
||||
public FilePipeline(String path) {
|
||||
|
@ -45,18 +47,26 @@ public class FilePipeline implements Pipeline {
|
|||
if (!file.exists()) {
|
||||
file.mkdirs();
|
||||
}
|
||||
if (resultItems.isSkip()){
|
||||
if (resultItems.isSkip()) {
|
||||
return;
|
||||
}
|
||||
try {
|
||||
PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl())+".html"));
|
||||
PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".html"));
|
||||
printWriter.println("url:\t" + resultItems.getRequest().getUrl());
|
||||
for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) {
|
||||
printWriter.println(entry.getKey()+":\t"+entry.getValue());
|
||||
if (entry.getValue() instanceof Iterable) {
|
||||
Iterable value = (Iterable) entry.getValue();
|
||||
printWriter.println(entry.getKey() + ":");
|
||||
for (Object o : value) {
|
||||
printWriter.println(o);
|
||||
}
|
||||
} else {
|
||||
printWriter.println(entry.getKey() + ":\t" + entry.getValue());
|
||||
}
|
||||
}
|
||||
printWriter.close();
|
||||
} catch (IOException e) {
|
||||
logger.warn("write file error",e);
|
||||
logger.warn("write file error", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,49 @@
|
|||
package us.codecraft.webmagic.samples;
|
||||
|
||||
import org.apache.commons.collections.CollectionUtils;
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.Spider;
|
||||
import us.codecraft.webmagic.pipeline.FilePipeline;
|
||||
import us.codecraft.webmagic.processor.PageProcessor;
|
||||
import us.codecraft.webmagic.scheduler.RedisScheduler;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-4-21
|
||||
* Time: 下午8:08
|
||||
*/
|
||||
public class InfoQMiniBookProcessor implements PageProcessor {
|
||||
|
||||
private Site site;
|
||||
|
||||
@Override
|
||||
public void process(Page page) {
|
||||
page.addTargetRequests(page.getHtml().links().regex("http://www\\.infoq\\.com/cn/minibooks/.*").all());
|
||||
List<String> all = page.getHtml().links().regex(".*\\.pdf").all();
|
||||
if (CollectionUtils.isNotEmpty(all)) {
|
||||
page.putField("pdf", all);
|
||||
} else {
|
||||
page.getResultItems().setSkip(true);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public Site getSite() {
|
||||
if (site == null) {
|
||||
site = Site.me().setDomain("www.infoq.com").addStartUrl("http://www.infoq.com/cn/minibooks").addCookie("RegisteredUserCookie", "sDDDc8dIAgZSq67uJSXhtpQaHEi1XDOH").
|
||||
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
|
||||
}
|
||||
return site;
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
Spider.create(new InfoQMiniBookProcessor())
|
||||
.scheduler(new RedisScheduler("localhost"))
|
||||
.pipeline(new FilePipeline("/data/temp/webmagic/"))
|
||||
.thread(5)
|
||||
.run();
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue