update pipeline api

master
yihua.huang 2013-07-25 13:32:39 +08:00
parent 55d80129bf
commit 65dc372152
26 changed files with 119 additions and 95 deletions

View File

@ -6,8 +6,6 @@ import us.codecraft.webmagic.utils.UrlUtils;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
/**
* <pre>
@ -27,7 +25,7 @@ public class Page {
private Request request;
private Map<String, Selectable> fields = new ConcurrentHashMap<String, Selectable>();
private ResultItems resultItems = new ResultItems();
private Selectable html;
@ -35,44 +33,16 @@ public class Page {
private List<Request> targetRequests = new ArrayList<Request>();
private boolean skip;
private Object extra;
/**
* Whether pipelines should skip this page.
* @return true if the page should be skipped
*/
public boolean isSkip() {
return skip;
}
/**
* Sets whether pipelines should skip this page.
* @param skip true to skip the page
*/
public void setSkip(boolean skip) {
this.skip = skip;
}
public Page() {
}
/**
* Returns the extracted fields read by {@link us.codecraft.webmagic.pipeline.Pipeline}.
* @return fields
*/
public Map<String, Selectable> getFields() {
return fields;
}
/**
* Stores an extracted result under the given key.
* @param key key
* @param field value
*/
public void putField(String key, Selectable field) {
fields.put(key, field);
public void putField(String key, Object field) {
resultItems.put(key, field);
}
/**
@ -157,23 +127,10 @@ public class Page {
public void setRequest(Request request) {
this.request = request;
this.resultItems.setRequest(request);
}
/**
*
* @param <T>
* @return
*/
public <T> T getExtra() {
return (T)extra;
}
/**
*
* @param extra
* @param <T>
*/
public <T> void setExtra(T extra) {
this.extra = extra;
public ResultItems getResultItems() {
return resultItems;
}
}

View File

@ -0,0 +1,64 @@
package us.codecraft.webmagic;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;
/**
 * Container for the results a PageProcessor extracts from a page, handed to each
 * {@link us.codecraft.webmagic.pipeline.Pipeline} for output.<br>
 * Fields are iterated in the order they were first put, so pipelines that dump
 * {@link #getAll()} produce stable, predictable output.<br>
 * This class performs no synchronization of its own.<br>
 * @author yihua.huang@dianping.com <br>
 * @date: 13-7-25 <br>
 * Time: 12:20 <br>
 */
public class ResultItems {

    /** Extracted values keyed by field name; insertion-ordered for deterministic iteration. */
    private Map<String, Object> fields = new LinkedHashMap<String, Object>();

    /** The request these results were extracted for. */
    private Request request;

    /** Flag a pipeline may consult to ignore this result. */
    private boolean skip;

    /**
     * Looks up a stored value and casts it to the type the caller expects.
     *
     * @param key field name
     * @param <T> type the caller expects the value to have
     * @return the stored value, or {@code null} if the key is absent or mapped to {@code null}
     * @throws ClassCastException at the call site if the stored value is not a {@code T}
     */
    @SuppressWarnings("unchecked") // the caller chooses T; a mismatch surfaces as ClassCastException at the call site
    public <T> T get(String key) {
        // Casting null is always safe, so one lookup covers both the present and absent case.
        return (T) fields.get(key);
    }

    /**
     * Returns the backing map of all stored fields.
     *
     * @return the live field map (not a copy); changes to it are visible through this object
     */
    public Map<String, Object> getAll() {
        return fields;
    }

    /**
     * Stores a value under the given key, replacing any previous value for that key.
     *
     * @param key field name
     * @param value value to store
     * @param <T> type of the value
     * @return this, for chaining
     */
    public <T> ResultItems put(String key, T value) {
        fields.put(key, value);
        return this;
    }

    /**
     * Returns the request associated with these results.
     *
     * @return the request, or {@code null} if none has been set
     */
    public Request getRequest() {
        return request;
    }

    /**
     * Associates a request with these results.
     *
     * @param request the request the results belong to
     * @return this, for chaining
     */
    public ResultItems setRequest(Request request) {
        this.request = request;
        return this;
    }

    /**
     * Whether this result is marked to be skipped. Honoring the flag is up to each pipeline.
     *
     * @return true if the result is marked as skipped
     */
    public boolean isSkip() {
        return skip;
    }

    /**
     * Marks or unmarks this result as one pipelines should skip.
     *
     * @param skip true to mark the result as skipped
     * @return this, for chaining
     */
    public ResultItems setSkip(boolean skip) {
        this.skip = skip;
        return this;
    }
}

View File

@ -196,7 +196,7 @@ public class Spider implements Runnable, Task {
pageProcessor.process(page);
addRequest(page);
for (Pipeline pipeline : pipelines) {
pipeline.process(page, this);
pipeline.process(page.getResultItems(), this);
}
sleep(site.getSleepTime());
}

View File

@ -1,8 +1,7 @@
package us.codecraft.webmagic.pipeline;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.selector.Selectable;
import java.util.Map;
@ -15,13 +14,10 @@ import java.util.Map;
public class ConsolePipeline implements Pipeline{
@Override
public void process(Page page,Task task) {
System.out.println("get page: "+page.getUrl());
for (Map.Entry<String, Selectable> entry : page.getFields().entrySet()) {
System.out.println(entry.getKey()+":\t"+entry.getValue().toStrings());
}
if (page.getExtra()!=null){
System.out.println(page.getExtra());
public void process(ResultItems resultItems,Task task) {
System.out.println("get page: "+resultItems.getRequest().getUrl());
for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) {
System.out.println(entry.getKey()+":\t"+entry.getValue());
}
}
}

View File

@ -2,13 +2,14 @@ package us.codecraft.webmagic.pipeline;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.log4j.Logger;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.Map;
/**
*
@ -38,16 +39,18 @@ public class FilePipeline implements Pipeline {
}
@Override
public void process(Page page, Task task) {
public void process(ResultItems resultItems, Task task) {
String path = this.path + "/" + task.getUUID() + "/";
File file = new File(path);
if (!file.exists()) {
file.mkdirs();
}
try {
PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(page.getUrl().toString())));
printWriter.println("url:\t" + page.getUrl());
printWriter.println("html:\t" + page.getHtml());
PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl())));
printWriter.println("url:\t" + resultItems.getRequest().getUrl());
for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) {
printWriter.println(entry.getKey()+":\t"+entry.getValue());
}
printWriter.close();
} catch (IOException e) {
logger.warn("write file error",e);

View File

@ -1,6 +1,6 @@
package us.codecraft.webmagic.pipeline;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
/**
@ -11,5 +11,5 @@ import us.codecraft.webmagic.Task;
*/
public interface Pipeline {
public void process(Page page,Task task);
public void process(ResultItems resultItems,Task task);
}

View File

@ -30,12 +30,13 @@ public class SimplePageProcessor implements PageProcessor {
@Override
public void process(Page page) {
List<String> requests = page.getHtml().links().regex(urlPattern).toStrings();
List<String> requests = page.getHtml().links().regex(urlPattern).all();
//调用page.addTargetRequests()方法添加待抓取链接
page.addTargetRequests(requests);
//xpath方式抽取
page.putField("title", page.getHtml().xpath("//title"));
//sc表示使用Readability技术抽取正文
page.putField("html", page.getHtml().toString());
page.putField("content", page.getHtml().smartContent());
}

View File

@ -82,14 +82,14 @@ public class PlainText implements Selectable {
}
@Override
public List<String> toStrings() {
public List<String> all() {
return strings;
}
@Override
public String toString() {
if (CollectionUtils.isNotEmpty(toStrings())) {
return toStrings().get(0);
if (CollectionUtils.isNotEmpty(all())) {
return all().get(0);
} else {
return null;
}

View File

@ -69,5 +69,5 @@ public interface Selectable {
*
* @return multi string result
*/
public List<String> toStrings();
public List<String> all();
}

View File

@ -1351,7 +1351,7 @@ public class XpathSelectorTest {
public void testOschina() {
Html html1 = new Html(html);
Assert.assertEquals("再次吐槽easyui", html1.xpath(".//*[@class='QTitle']/h1/a").toString());
Assert.assertNotNull(html1.$("a[href]").xpath("//@href").toStrings());
Assert.assertNotNull(html1.$("a[href]").xpath("//@href").all());
}
}

View File

@ -4,7 +4,7 @@ import freemarker.template.Configuration;
import freemarker.template.Template;
import freemarker.template.TemplateException;
import org.apache.commons.codec.digest.DigestUtils;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import java.io.File;
@ -39,8 +39,8 @@ public class FreemarkerPipeline implements Pipeline {
@Override
public void process(Page page, Task task) {
if (page.isSkip()) {
public void process(ResultItems resultItems, Task task) {
if (resultItems.isSkip()) {
return;
}
String path = this.path + "" + task.getUUID() + "/";
@ -49,8 +49,8 @@ public class FreemarkerPipeline implements Pipeline {
file.mkdirs();
}
try {
PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(page.getUrl().toString()) + ".html"));
template.process(page.getFields(), printWriter);
PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".html"));
template.process(resultItems.getAll(), printWriter);
printWriter.close();
} catch (TemplateException e) {
} catch (IOException e) {

View File

@ -28,7 +28,9 @@ public class RedisScheduler implements Scheduler{
@Override
public synchronized void push(Request request, Task task) {
Jedis jedis = pool.getResource();
//使用SortedSet进行url去重
if (jedis.zrank(SET_PREFIX+task.getUUID(),request.getUrl())==null){
//使用List保存队列
jedis.rpush(QUEUE_PREFIX+task.getUUID(),request.getUrl());
jedis.zadd(SET_PREFIX+task.getUUID(),System.currentTimeMillis(),request.getUrl());
}

View File

@ -1,6 +1,7 @@
package us.codecraft.webmagic.scheduler;
import org.junit.Before;
import org.junit.Ignore;
import org.junit.Test;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
@ -20,6 +21,7 @@ public class RedisSchedulerTest {
redisScheduler = new RedisScheduler("localhost");
}
@Ignore("environment depended")
@Test
public void test() {
Task task = new Task() {
@ -35,7 +37,6 @@ public class RedisSchedulerTest {
};
redisScheduler.push(new Request("http://www.ibm.com/developerworks/cn/java/j-javadev2-22/"), task);
Request poll = redisScheduler.poll(task);
System.out.println(poll.getUrl());
}
}

View File

@ -20,13 +20,13 @@ public class DiandianBlogProcessor implements PageProcessor {
//a()表示提取链接links()表示提取所有链接
//getHtml()返回Html对象支持链式调用
//r()表示用正则表达式提取一条内容regex()表示提取多条内容
//toString()表示取单条结果,toStrings()表示取多条
List<String> requests = page.getHtml().links().regex("(.*/post/.*)").toStrings();
//toString()表示取单条结果,all()表示取多条
List<String> requests = page.getHtml().links().regex("(.*/post/.*)").all();
//使用page.addTargetRequests()方法将待抓取的链接加入队列
page.addTargetRequests(requests);
//page.putField(key,value)将抽取的内容加入结果Map
//x()和xs()使用xpath进行抽取
page.putField("title", page.getHtml().xpath("//title").regex("(.*?)\\|"));
page.putField("title", page.getHtml().xpath("//title").regex("(.*?)\\|").toString());
//smartContent()使用readability技术直接抽取正文对于规整的文本有比较好的抽取正确率
page.putField("content", page.getHtml().smartContent());
page.putField("date", page.getUrl().regex("post/(\\d+-\\d+-\\d+)/"));

View File

@ -18,7 +18,7 @@ public class DianpingProcessor implements PageProcessor {
@Override
public void process(Page page) {
List<String> requests = page.getHtml().links().regex("http://info-search-web121361\\.alpha\\.dp:8080/search/.*").toStrings();
List<String> requests = page.getHtml().links().regex("http://info-search-web121361\\.alpha\\.dp:8080/search/.*").all();
page.addTargetRequests(requests);
}

View File

@ -18,9 +18,9 @@ public class DiaoyuwengProcessor implements PageProcessor {
@Override
public void process(Page page) {
List<String> requests = page.getHtml().links().regex("(http://www\\.diaoyuweng\\.com/home\\.php\\?mod=space&uid=88304&do=thread&view=me&type=thread&order=dateline&from=space&page=\\d+)").toStrings();
List<String> requests = page.getHtml().links().regex("(http://www\\.diaoyuweng\\.com/home\\.php\\?mod=space&uid=88304&do=thread&view=me&type=thread&order=dateline&from=space&page=\\d+)").all();
page.addTargetRequests(requests);
requests = page.getHtml().links().regex("(http://www\\.diaoyuweng\\.com/thread-\\d+-1-1.html)").toStrings();
requests = page.getHtml().links().regex("(http://www\\.diaoyuweng\\.com/thread-\\d+-1-1.html)").all();
page.addTargetRequests(requests);
if (page.getUrl().toString().contains("thread")){
page.putField("title", page.getHtml().xpath("//a[@id='thread_subject']"));

View File

@ -15,7 +15,7 @@ public class F58PageProcesser implements PageProcessor {
@Override
public void process(Page page) {
List<String> strings = page.getHtml().regex("<a[^<>]*href=[\"']{1}(/yewu/.*?)[\"']{1}").toStrings();
List<String> strings = page.getHtml().regex("<a[^<>]*href=[\"']{1}(/yewu/.*?)[\"']{1}").all();
page.addTargetRequests(strings);
page.putField("title",page.getHtml().regex("<title>(.*)</title>"));
page.putField("body",page.getHtml().xpath("//dd[@class='w133']"));

View File

@ -20,7 +20,7 @@ public class GlobalProcessor implements PageProcessor {
@Override
public void process(Page page) {
final List<String> requests = page.getHtml().links().toStrings();
final List<String> requests = page.getHtml().links().all();
page.addTargetRequests(requests);
}

View File

@ -15,7 +15,7 @@ public class HuxiuProcessor implements PageProcessor {
@Override
public void process(Page page) {
//http://progressdaily.diandian.com/post/2013-01-24/40046867275
List<String> requests = page.getHtml().regex("<a[^<>\"']*href=[\"']{1}([/]{0,1}article[^<>#\"']*?)[\"']{1}").toStrings();
List<String> requests = page.getHtml().regex("<a[^<>\"']*href=[\"']{1}([/]{0,1}article[^<>#\"']*?)[\"']{1}").all();
page.addTargetRequests(requests);
page.putField("title",page.getHtml().xpath("//div[@class='neirong']//h1[@class='ph xs5']"));
page.putField("content",page.getHtml().smartContent());

View File

@ -15,12 +15,12 @@ public class MeicanProcessor implements PageProcessor {
@Override
public void process(Page page) {
//http://progressdaily.diandian.com/post/2013-01-24/40046867275
List<String> requests = page.getHtml().xpath("//a[@class=\"area_link flat_btn\"]/@href").toStrings();
List<String> requests = page.getHtml().xpath("//a[@class=\"area_link flat_btn\"]/@href").all();
if (requests.size() > 2) {
requests = requests.subList(0, 2);
}
page.addTargetRequests(requests);
page.addTargetRequests(page.getHtml().links().regex("(.*/restaurant/[^#]+)").toStrings());
page.addTargetRequests(page.getHtml().links().regex("(.*/restaurant/[^#]+)").all());
page.putField("items", page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"name\"]"));
page.putField("prices", page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"price_outer\"]/span[@class=\"price\"]"));
}

View File

@ -14,7 +14,7 @@ import java.util.List;
public class NjuBBSProcessor implements PageProcessor {
@Override
public void process(Page page) {
List<String> requests = page.getHtml().regex("<a[^<>]*href=(bbstcon\\?board=Pictures&file=[^>]*)").toStrings();
List<String> requests = page.getHtml().regex("<a[^<>]*href=(bbstcon\\?board=Pictures&file=[^>]*)").all();
page.addTargetRequests(requests);
page.putField("title",page.getHtml().xpath("//div[@id='content']//h2/a"));
page.putField("content",page.getHtml().smartContent());

View File

@ -15,7 +15,7 @@ public class OschinaBlogPageProcesser implements PageProcessor {
@Override
public void process(Page page) {
List<String> strings = page.getHtml().links().regex("(http://my\\.oschina\\.net)").toStrings();
List<String> strings = page.getHtml().links().regex("(http://my\\.oschina\\.net)").all();
page.addTargetRequests(strings);
page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1"));
page.putField("content", page.getHtml().smartContent());

View File

@ -15,7 +15,7 @@ public class OschinaPageProcesser implements PageProcessor {
@Override
public void process(Page page) {
List<String> strings = page.getHtml().regex("<a[^<>]*href=[\"']{1}(http://www\\.oschina\\.net/question/[\\w]+)[\"']{1}").toStrings();
List<String> strings = page.getHtml().regex("<a[^<>]*href=[\"']{1}(http://www\\.oschina\\.net/question/[\\w]+)[\"']{1}").all();
page.addTargetRequests(strings);
page.putField("title", page.getHtml().xpath("//div[@class='QTitle']/h1/a"));
page.putField("content", page.getHtml().xpath("//div[@class='Question']//div[@class='Content']/div[@class='detail']"));

View File

@ -18,7 +18,7 @@ public class QzoneBlogProcessor implements PageProcessor {
//http://b1.cnc.qzone.qq.com/cgi-bin/blognew/get_abs?hostUin=233017404&uin=233017404&blogType=0&statYear=2013&source=0&statYear=2013&g_tk=291639571&g_tk=291639571&reqInfo=7&pos=0&num=15&source=0&rand=0.46480297949165106
// &cateName=&cateHex=&statYear=2013&reqInfo=7&pos=0&num=15&sortType=0&source=0&rand=0.46480297949165106&g_tk=291639571&verbose=1&ref=qzone
List<String> requests = page.getHtml().regex("<a[^<>]*href=[\"']{1}(http://17dujingdian\\.com/post/[^#]*?)[\"']{1}").toStrings();
List<String> requests = page.getHtml().regex("<a[^<>]*href=[\"']{1}(http://17dujingdian\\.com/post/[^#]*?)[\"']{1}").all();
page.addTargetRequests(requests);
page.putField("title",page.getHtml().xpath("//div[@id='content']//h2/a"));
page.putField("content",page.getHtml().smartContent());

View File

@ -16,7 +16,7 @@ public class SinaBlogProcesser implements PageProcessor {
@Override
public void process(Page page) {
page.addTargetRequests(page.getHtml().xpath("//div[@class='articalfrontback SG_j_linedot1 clearfix']").links().toStrings());
page.addTargetRequests(page.getHtml().xpath("//div[@class='articalfrontback SG_j_linedot1 clearfix']").links().all());
page.putField("title", page.getHtml().xpath("//div[@class='articalTitle']/h2"));
page.putField("content",page.getHtml().xpath("//div[@id='articlebody']//div[@class='articalContent']"));
page.putField("id",page.getUrl().regex("http://blog\\.sina\\.com\\.cn/s/blog_(\\w+)"));

View File

@ -15,7 +15,7 @@ public class TianyaPageProcesser implements PageProcessor {
@Override
public void process(Page page) {
List<String> strings = page.getHtml().regex("<a[^<>]*href=[\"']{1}(/post-free.*?\\.shtml)[\"']{1}").toStrings();
List<String> strings = page.getHtml().regex("<a[^<>]*href=[\"']{1}(/post-free.*?\\.shtml)[\"']{1}").all();
page.addTargetRequests(strings);
page.putField("title", page.getHtml().xpath("//div[@id='post_head']//span[@class='s_title']//b"));
page.putField("body",page.getHtml().smartContent());