update pipeline api
parent
55d80129bf
commit
65dc372152
|
@ -6,8 +6,6 @@ import us.codecraft.webmagic.utils.UrlUtils;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
|
||||||
import java.util.concurrent.ConcurrentHashMap;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* <pre>
|
* <pre>
|
||||||
|
@ -27,7 +25,7 @@ public class Page {
|
||||||
|
|
||||||
private Request request;
|
private Request request;
|
||||||
|
|
||||||
private Map<String, Selectable> fields = new ConcurrentHashMap<String, Selectable>();
|
private ResultItems resultItems = new ResultItems();
|
||||||
|
|
||||||
private Selectable html;
|
private Selectable html;
|
||||||
|
|
||||||
|
@ -35,44 +33,16 @@ public class Page {
|
||||||
|
|
||||||
private List<Request> targetRequests = new ArrayList<Request>();
|
private List<Request> targetRequests = new ArrayList<Request>();
|
||||||
|
|
||||||
private boolean skip;
|
|
||||||
|
|
||||||
private Object extra;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* 是否忽略这个页面,用于pipeline来判断是否对这个页面进行处理
|
|
||||||
* @return 是否忽略 true 忽略
|
|
||||||
*/
|
|
||||||
public boolean isSkip() {
|
|
||||||
return skip;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* 设置是否忽略这个页面,用于pipeline来判断是否对这个页面进行处理
|
|
||||||
* @param skip 是否忽略 true 忽略
|
|
||||||
*/
|
|
||||||
public void setSkip(boolean skip) {
|
|
||||||
this.skip = skip;
|
|
||||||
}
|
|
||||||
|
|
||||||
public Page() {
|
public Page() {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* 获取抽取的结果,在{@link us.codecraft.webmagic.pipeline.Pipeline} 中调用
|
|
||||||
* @return fields 抽取的结果
|
|
||||||
*/
|
|
||||||
public Map<String, Selectable> getFields() {
|
|
||||||
return fields;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 保存抽取的结果
|
* 保存抽取的结果
|
||||||
* @param key 结果的key
|
* @param key 结果的key
|
||||||
* @param field 结果的value
|
* @param field 结果的value
|
||||||
*/
|
*/
|
||||||
public void putField(String key, Selectable field) {
|
public void putField(String key, Object field) {
|
||||||
fields.put(key, field);
|
resultItems.put(key, field);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -157,23 +127,10 @@ public class Page {
|
||||||
|
|
||||||
public void setRequest(Request request) {
|
public void setRequest(Request request) {
|
||||||
this.request = request;
|
this.request = request;
|
||||||
|
this.resultItems.setRequest(request);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
public ResultItems getResultItems() {
|
||||||
* 获取附加对象
|
return resultItems;
|
||||||
* @param <T> 对象类型
|
|
||||||
* @return 对象内容
|
|
||||||
*/
|
|
||||||
public <T> T getExtra() {
|
|
||||||
return (T)extra;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* 设置附加对象
|
|
||||||
* @param extra 对象内容
|
|
||||||
* @param <T> 对象类型
|
|
||||||
*/
|
|
||||||
public <T> void setExtra(T extra) {
|
|
||||||
this.extra = extra;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,64 @@
|
||||||
|
package us.codecraft.webmagic;
|
||||||
|
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 保存抽取结果的类,由PageProcessor处理得到,传递给{@link us.codecraft.webmagic.pipeline.Pipeline}进行持久化。<br>
|
||||||
|
* @author yihua.huang@dianping.com <br>
|
||||||
|
* @date: 13-7-25 <br>
|
||||||
|
* Time: 下午12:20 <br>
|
||||||
|
*/
|
||||||
|
public class ResultItems {
|
||||||
|
|
||||||
|
private Map<String, Object> fields = new HashMap<String, Object>();
|
||||||
|
|
||||||
|
private Request request;
|
||||||
|
|
||||||
|
private boolean skip;
|
||||||
|
|
||||||
|
public <T> T get(String key) {
|
||||||
|
Object o = fields.get(key);
|
||||||
|
if (o == null) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
return (T) fields.get(key);
|
||||||
|
}
|
||||||
|
|
||||||
|
public Map<String, Object> getAll() {
|
||||||
|
return fields;
|
||||||
|
}
|
||||||
|
|
||||||
|
public <T> ResultItems put(String key, T value) {
|
||||||
|
fields.put(key, value);
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Request getRequest() {
|
||||||
|
return request;
|
||||||
|
}
|
||||||
|
|
||||||
|
public ResultItems setRequest(Request request) {
|
||||||
|
this.request = request;
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 是否忽略这个页面,用于pipeline来判断是否对这个页面进行处理
|
||||||
|
* @return 是否忽略 true 忽略
|
||||||
|
*/
|
||||||
|
public boolean isSkip() {
|
||||||
|
return skip;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 设置是否忽略这个页面,用于pipeline来判断是否对这个页面进行处理
|
||||||
|
* @param skip
|
||||||
|
* @return this
|
||||||
|
*/
|
||||||
|
public ResultItems setSkip(boolean skip) {
|
||||||
|
this.skip = skip;
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
}
|
|
@ -196,7 +196,7 @@ public class Spider implements Runnable, Task {
|
||||||
pageProcessor.process(page);
|
pageProcessor.process(page);
|
||||||
addRequest(page);
|
addRequest(page);
|
||||||
for (Pipeline pipeline : pipelines) {
|
for (Pipeline pipeline : pipelines) {
|
||||||
pipeline.process(page, this);
|
pipeline.process(page.getResultItems(), this);
|
||||||
}
|
}
|
||||||
sleep(site.getSleepTime());
|
sleep(site.getSleepTime());
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,8 +1,7 @@
|
||||||
package us.codecraft.webmagic.pipeline;
|
package us.codecraft.webmagic.pipeline;
|
||||||
|
|
||||||
import us.codecraft.webmagic.Page;
|
import us.codecraft.webmagic.ResultItems;
|
||||||
import us.codecraft.webmagic.Task;
|
import us.codecraft.webmagic.Task;
|
||||||
import us.codecraft.webmagic.selector.Selectable;
|
|
||||||
|
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
|
@ -15,13 +14,10 @@ import java.util.Map;
|
||||||
public class ConsolePipeline implements Pipeline{
|
public class ConsolePipeline implements Pipeline{
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void process(Page page,Task task) {
|
public void process(ResultItems resultItems,Task task) {
|
||||||
System.out.println("get page: "+page.getUrl());
|
System.out.println("get page: "+resultItems.getRequest().getUrl());
|
||||||
for (Map.Entry<String, Selectable> entry : page.getFields().entrySet()) {
|
for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) {
|
||||||
System.out.println(entry.getKey()+":\t"+entry.getValue().toStrings());
|
System.out.println(entry.getKey()+":\t"+entry.getValue());
|
||||||
}
|
|
||||||
if (page.getExtra()!=null){
|
|
||||||
System.out.println(page.getExtra());
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -2,13 +2,14 @@ package us.codecraft.webmagic.pipeline;
|
||||||
|
|
||||||
import org.apache.commons.codec.digest.DigestUtils;
|
import org.apache.commons.codec.digest.DigestUtils;
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
import us.codecraft.webmagic.Page;
|
import us.codecraft.webmagic.ResultItems;
|
||||||
import us.codecraft.webmagic.Task;
|
import us.codecraft.webmagic.Task;
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.FileWriter;
|
import java.io.FileWriter;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.PrintWriter;
|
import java.io.PrintWriter;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 持久化到文件的接口。
|
* 持久化到文件的接口。
|
||||||
|
@ -38,16 +39,18 @@ public class FilePipeline implements Pipeline {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void process(Page page, Task task) {
|
public void process(ResultItems resultItems, Task task) {
|
||||||
String path = this.path + "/" + task.getUUID() + "/";
|
String path = this.path + "/" + task.getUUID() + "/";
|
||||||
File file = new File(path);
|
File file = new File(path);
|
||||||
if (!file.exists()) {
|
if (!file.exists()) {
|
||||||
file.mkdirs();
|
file.mkdirs();
|
||||||
}
|
}
|
||||||
try {
|
try {
|
||||||
PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(page.getUrl().toString())));
|
PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl())));
|
||||||
printWriter.println("url:\t" + page.getUrl());
|
printWriter.println("url:\t" + resultItems.getRequest().getUrl());
|
||||||
printWriter.println("html:\t" + page.getHtml());
|
for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) {
|
||||||
|
printWriter.println(entry.getKey()+":\t"+entry.getValue());
|
||||||
|
}
|
||||||
printWriter.close();
|
printWriter.close();
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
logger.warn("write file error",e);
|
logger.warn("write file error",e);
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
package us.codecraft.webmagic.pipeline;
|
package us.codecraft.webmagic.pipeline;
|
||||||
|
|
||||||
import us.codecraft.webmagic.Page;
|
import us.codecraft.webmagic.ResultItems;
|
||||||
import us.codecraft.webmagic.Task;
|
import us.codecraft.webmagic.Task;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -11,5 +11,5 @@ import us.codecraft.webmagic.Task;
|
||||||
*/
|
*/
|
||||||
public interface Pipeline {
|
public interface Pipeline {
|
||||||
|
|
||||||
public void process(Page page,Task task);
|
public void process(ResultItems resultItems,Task task);
|
||||||
}
|
}
|
||||||
|
|
|
@ -30,12 +30,13 @@ public class SimplePageProcessor implements PageProcessor {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void process(Page page) {
|
public void process(Page page) {
|
||||||
List<String> requests = page.getHtml().links().regex(urlPattern).toStrings();
|
List<String> requests = page.getHtml().links().regex(urlPattern).all();
|
||||||
//调用page.addTargetRequests()方法添加待抓取链接
|
//调用page.addTargetRequests()方法添加待抓取链接
|
||||||
page.addTargetRequests(requests);
|
page.addTargetRequests(requests);
|
||||||
//xpath方式抽取
|
//xpath方式抽取
|
||||||
page.putField("title", page.getHtml().xpath("//title"));
|
page.putField("title", page.getHtml().xpath("//title"));
|
||||||
//sc表示使用Readability技术抽取正文
|
//sc表示使用Readability技术抽取正文
|
||||||
|
page.putField("html", page.getHtml().toString());
|
||||||
page.putField("content", page.getHtml().smartContent());
|
page.putField("content", page.getHtml().smartContent());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -82,14 +82,14 @@ public class PlainText implements Selectable {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public List<String> toStrings() {
|
public List<String> all() {
|
||||||
return strings;
|
return strings;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String toString() {
|
public String toString() {
|
||||||
if (CollectionUtils.isNotEmpty(toStrings())) {
|
if (CollectionUtils.isNotEmpty(all())) {
|
||||||
return toStrings().get(0);
|
return all().get(0);
|
||||||
} else {
|
} else {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
|
@ -69,5 +69,5 @@ public interface Selectable {
|
||||||
*
|
*
|
||||||
* @return multi string result
|
* @return multi string result
|
||||||
*/
|
*/
|
||||||
public List<String> toStrings();
|
public List<String> all();
|
||||||
}
|
}
|
||||||
|
|
|
@ -1351,7 +1351,7 @@ public class XpathSelectorTest {
|
||||||
public void testOschina() {
|
public void testOschina() {
|
||||||
Html html1 = new Html(html);
|
Html html1 = new Html(html);
|
||||||
Assert.assertEquals("再次吐槽easyui", html1.xpath(".//*[@class='QTitle']/h1/a").toString());
|
Assert.assertEquals("再次吐槽easyui", html1.xpath(".//*[@class='QTitle']/h1/a").toString());
|
||||||
Assert.assertNotNull(html1.$("a[href]").xpath("//@href").toStrings());
|
Assert.assertNotNull(html1.$("a[href]").xpath("//@href").all());
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -4,7 +4,7 @@ import freemarker.template.Configuration;
|
||||||
import freemarker.template.Template;
|
import freemarker.template.Template;
|
||||||
import freemarker.template.TemplateException;
|
import freemarker.template.TemplateException;
|
||||||
import org.apache.commons.codec.digest.DigestUtils;
|
import org.apache.commons.codec.digest.DigestUtils;
|
||||||
import us.codecraft.webmagic.Page;
|
import us.codecraft.webmagic.ResultItems;
|
||||||
import us.codecraft.webmagic.Task;
|
import us.codecraft.webmagic.Task;
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
|
@ -39,8 +39,8 @@ public class FreemarkerPipeline implements Pipeline {
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void process(Page page, Task task) {
|
public void process(ResultItems resultItems, Task task) {
|
||||||
if (page.isSkip()) {
|
if (resultItems.isSkip()) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
String path = this.path + "" + task.getUUID() + "/";
|
String path = this.path + "" + task.getUUID() + "/";
|
||||||
|
@ -49,8 +49,8 @@ public class FreemarkerPipeline implements Pipeline {
|
||||||
file.mkdirs();
|
file.mkdirs();
|
||||||
}
|
}
|
||||||
try {
|
try {
|
||||||
PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(page.getUrl().toString()) + ".html"));
|
PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".html"));
|
||||||
template.process(page.getFields(), printWriter);
|
template.process(resultItems.getAll(), printWriter);
|
||||||
printWriter.close();
|
printWriter.close();
|
||||||
} catch (TemplateException e) {
|
} catch (TemplateException e) {
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
|
|
|
@ -28,7 +28,9 @@ public class RedisScheduler implements Scheduler{
|
||||||
@Override
|
@Override
|
||||||
public synchronized void push(Request request, Task task) {
|
public synchronized void push(Request request, Task task) {
|
||||||
Jedis jedis = pool.getResource();
|
Jedis jedis = pool.getResource();
|
||||||
|
//使用SortedSet进行url去重
|
||||||
if (jedis.zrank(SET_PREFIX+task.getUUID(),request.getUrl())==null){
|
if (jedis.zrank(SET_PREFIX+task.getUUID(),request.getUrl())==null){
|
||||||
|
//使用List保存队列
|
||||||
jedis.rpush(QUEUE_PREFIX+task.getUUID(),request.getUrl());
|
jedis.rpush(QUEUE_PREFIX+task.getUUID(),request.getUrl());
|
||||||
jedis.zadd(SET_PREFIX+task.getUUID(),System.currentTimeMillis(),request.getUrl());
|
jedis.zadd(SET_PREFIX+task.getUUID(),System.currentTimeMillis(),request.getUrl());
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
package us.codecraft.webmagic.scheduler;
|
package us.codecraft.webmagic.scheduler;
|
||||||
|
|
||||||
import org.junit.Before;
|
import org.junit.Before;
|
||||||
|
import org.junit.Ignore;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
import us.codecraft.webmagic.Request;
|
import us.codecraft.webmagic.Request;
|
||||||
import us.codecraft.webmagic.Site;
|
import us.codecraft.webmagic.Site;
|
||||||
|
@ -20,6 +21,7 @@ public class RedisSchedulerTest {
|
||||||
redisScheduler = new RedisScheduler("localhost");
|
redisScheduler = new RedisScheduler("localhost");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Ignore("environment depended")
|
||||||
@Test
|
@Test
|
||||||
public void test() {
|
public void test() {
|
||||||
Task task = new Task() {
|
Task task = new Task() {
|
||||||
|
@ -35,7 +37,6 @@ public class RedisSchedulerTest {
|
||||||
};
|
};
|
||||||
redisScheduler.push(new Request("http://www.ibm.com/developerworks/cn/java/j-javadev2-22/"), task);
|
redisScheduler.push(new Request("http://www.ibm.com/developerworks/cn/java/j-javadev2-22/"), task);
|
||||||
Request poll = redisScheduler.poll(task);
|
Request poll = redisScheduler.poll(task);
|
||||||
System.out.println(poll.getUrl());
|
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -20,13 +20,13 @@ public class DiandianBlogProcessor implements PageProcessor {
|
||||||
//a()表示提取链接,links()表示提取所有链接
|
//a()表示提取链接,links()表示提取所有链接
|
||||||
//getHtml()返回Html对象,支持链式调用
|
//getHtml()返回Html对象,支持链式调用
|
||||||
//r()表示用正则表达式提取一条内容,regex()表示提取多条内容
|
//r()表示用正则表达式提取一条内容,regex()表示提取多条内容
|
||||||
//toString()表示取单条结果,toStrings()表示取多条
|
//toString()表示取单条结果,all()表示取多条
|
||||||
List<String> requests = page.getHtml().links().regex("(.*/post/.*)").toStrings();
|
List<String> requests = page.getHtml().links().regex("(.*/post/.*)").all();
|
||||||
//使用page.addTargetRequests()方法将待抓取的链接加入队列
|
//使用page.addTargetRequests()方法将待抓取的链接加入队列
|
||||||
page.addTargetRequests(requests);
|
page.addTargetRequests(requests);
|
||||||
//page.putField(key,value)将抽取的内容加入结果Map
|
//page.putField(key,value)将抽取的内容加入结果Map
|
||||||
//x()和xs()使用xpath进行抽取
|
//x()和xs()使用xpath进行抽取
|
||||||
page.putField("title", page.getHtml().xpath("//title").regex("(.*?)\\|"));
|
page.putField("title", page.getHtml().xpath("//title").regex("(.*?)\\|").toString());
|
||||||
//smartContent()使用readability技术直接抽取正文,对于规整的文本有比较好的抽取正确率
|
//smartContent()使用readability技术直接抽取正文,对于规整的文本有比较好的抽取正确率
|
||||||
page.putField("content", page.getHtml().smartContent());
|
page.putField("content", page.getHtml().smartContent());
|
||||||
page.putField("date", page.getUrl().regex("post/(\\d+-\\d+-\\d+)/"));
|
page.putField("date", page.getUrl().regex("post/(\\d+-\\d+-\\d+)/"));
|
||||||
|
|
|
@ -18,7 +18,7 @@ public class DianpingProcessor implements PageProcessor {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void process(Page page) {
|
public void process(Page page) {
|
||||||
List<String> requests = page.getHtml().links().regex("http://info-search-web121361\\.alpha\\.dp:8080/search/.*").toStrings();
|
List<String> requests = page.getHtml().links().regex("http://info-search-web121361\\.alpha\\.dp:8080/search/.*").all();
|
||||||
page.addTargetRequests(requests);
|
page.addTargetRequests(requests);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -18,9 +18,9 @@ public class DiaoyuwengProcessor implements PageProcessor {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void process(Page page) {
|
public void process(Page page) {
|
||||||
List<String> requests = page.getHtml().links().regex("(http://www\\.diaoyuweng\\.com/home\\.php\\?mod=space&uid=88304&do=thread&view=me&type=thread&order=dateline&from=space&page=\\d+)").toStrings();
|
List<String> requests = page.getHtml().links().regex("(http://www\\.diaoyuweng\\.com/home\\.php\\?mod=space&uid=88304&do=thread&view=me&type=thread&order=dateline&from=space&page=\\d+)").all();
|
||||||
page.addTargetRequests(requests);
|
page.addTargetRequests(requests);
|
||||||
requests = page.getHtml().links().regex("(http://www\\.diaoyuweng\\.com/thread-\\d+-1-1.html)").toStrings();
|
requests = page.getHtml().links().regex("(http://www\\.diaoyuweng\\.com/thread-\\d+-1-1.html)").all();
|
||||||
page.addTargetRequests(requests);
|
page.addTargetRequests(requests);
|
||||||
if (page.getUrl().toString().contains("thread")){
|
if (page.getUrl().toString().contains("thread")){
|
||||||
page.putField("title", page.getHtml().xpath("//a[@id='thread_subject']"));
|
page.putField("title", page.getHtml().xpath("//a[@id='thread_subject']"));
|
||||||
|
|
|
@ -15,7 +15,7 @@ public class F58PageProcesser implements PageProcessor {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void process(Page page) {
|
public void process(Page page) {
|
||||||
List<String> strings = page.getHtml().regex("<a[^<>]*href=[\"']{1}(/yewu/.*?)[\"']{1}").toStrings();
|
List<String> strings = page.getHtml().regex("<a[^<>]*href=[\"']{1}(/yewu/.*?)[\"']{1}").all();
|
||||||
page.addTargetRequests(strings);
|
page.addTargetRequests(strings);
|
||||||
page.putField("title",page.getHtml().regex("<title>(.*)</title>"));
|
page.putField("title",page.getHtml().regex("<title>(.*)</title>"));
|
||||||
page.putField("body",page.getHtml().xpath("//dd[@class='w133']"));
|
page.putField("body",page.getHtml().xpath("//dd[@class='w133']"));
|
||||||
|
|
|
@ -20,7 +20,7 @@ public class GlobalProcessor implements PageProcessor {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void process(Page page) {
|
public void process(Page page) {
|
||||||
final List<String> requests = page.getHtml().links().toStrings();
|
final List<String> requests = page.getHtml().links().all();
|
||||||
page.addTargetRequests(requests);
|
page.addTargetRequests(requests);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -15,7 +15,7 @@ public class HuxiuProcessor implements PageProcessor {
|
||||||
@Override
|
@Override
|
||||||
public void process(Page page) {
|
public void process(Page page) {
|
||||||
//http://progressdaily.diandian.com/post/2013-01-24/40046867275
|
//http://progressdaily.diandian.com/post/2013-01-24/40046867275
|
||||||
List<String> requests = page.getHtml().regex("<a[^<>\"']*href=[\"']{1}([/]{0,1}article[^<>#\"']*?)[\"']{1}").toStrings();
|
List<String> requests = page.getHtml().regex("<a[^<>\"']*href=[\"']{1}([/]{0,1}article[^<>#\"']*?)[\"']{1}").all();
|
||||||
page.addTargetRequests(requests);
|
page.addTargetRequests(requests);
|
||||||
page.putField("title",page.getHtml().xpath("//div[@class='neirong']//h1[@class='ph xs5']"));
|
page.putField("title",page.getHtml().xpath("//div[@class='neirong']//h1[@class='ph xs5']"));
|
||||||
page.putField("content",page.getHtml().smartContent());
|
page.putField("content",page.getHtml().smartContent());
|
||||||
|
|
|
@ -15,12 +15,12 @@ public class MeicanProcessor implements PageProcessor {
|
||||||
@Override
|
@Override
|
||||||
public void process(Page page) {
|
public void process(Page page) {
|
||||||
//http://progressdaily.diandian.com/post/2013-01-24/40046867275
|
//http://progressdaily.diandian.com/post/2013-01-24/40046867275
|
||||||
List<String> requests = page.getHtml().xpath("//a[@class=\"area_link flat_btn\"]/@href").toStrings();
|
List<String> requests = page.getHtml().xpath("//a[@class=\"area_link flat_btn\"]/@href").all();
|
||||||
if (requests.size() > 2) {
|
if (requests.size() > 2) {
|
||||||
requests = requests.subList(0, 2);
|
requests = requests.subList(0, 2);
|
||||||
}
|
}
|
||||||
page.addTargetRequests(requests);
|
page.addTargetRequests(requests);
|
||||||
page.addTargetRequests(page.getHtml().links().regex("(.*/restaurant/[^#]+)").toStrings());
|
page.addTargetRequests(page.getHtml().links().regex("(.*/restaurant/[^#]+)").all());
|
||||||
page.putField("items", page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"name\"]"));
|
page.putField("items", page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"name\"]"));
|
||||||
page.putField("prices", page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"price_outer\"]/span[@class=\"price\"]"));
|
page.putField("prices", page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"price_outer\"]/span[@class=\"price\"]"));
|
||||||
}
|
}
|
||||||
|
|
|
@ -14,7 +14,7 @@ import java.util.List;
|
||||||
public class NjuBBSProcessor implements PageProcessor {
|
public class NjuBBSProcessor implements PageProcessor {
|
||||||
@Override
|
@Override
|
||||||
public void process(Page page) {
|
public void process(Page page) {
|
||||||
List<String> requests = page.getHtml().regex("<a[^<>]*href=(bbstcon\\?board=Pictures&file=[^>]*)").toStrings();
|
List<String> requests = page.getHtml().regex("<a[^<>]*href=(bbstcon\\?board=Pictures&file=[^>]*)").all();
|
||||||
page.addTargetRequests(requests);
|
page.addTargetRequests(requests);
|
||||||
page.putField("title",page.getHtml().xpath("//div[@id='content']//h2/a"));
|
page.putField("title",page.getHtml().xpath("//div[@id='content']//h2/a"));
|
||||||
page.putField("content",page.getHtml().smartContent());
|
page.putField("content",page.getHtml().smartContent());
|
||||||
|
|
|
@ -15,7 +15,7 @@ public class OschinaBlogPageProcesser implements PageProcessor {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void process(Page page) {
|
public void process(Page page) {
|
||||||
List<String> strings = page.getHtml().links().regex("(http://my\\.oschina\\.net)").toStrings();
|
List<String> strings = page.getHtml().links().regex("(http://my\\.oschina\\.net)").all();
|
||||||
page.addTargetRequests(strings);
|
page.addTargetRequests(strings);
|
||||||
page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1"));
|
page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1"));
|
||||||
page.putField("content", page.getHtml().smartContent());
|
page.putField("content", page.getHtml().smartContent());
|
||||||
|
|
|
@ -15,7 +15,7 @@ public class OschinaPageProcesser implements PageProcessor {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void process(Page page) {
|
public void process(Page page) {
|
||||||
List<String> strings = page.getHtml().regex("<a[^<>]*href=[\"']{1}(http://www\\.oschina\\.net/question/[\\w]+)[\"']{1}").toStrings();
|
List<String> strings = page.getHtml().regex("<a[^<>]*href=[\"']{1}(http://www\\.oschina\\.net/question/[\\w]+)[\"']{1}").all();
|
||||||
page.addTargetRequests(strings);
|
page.addTargetRequests(strings);
|
||||||
page.putField("title", page.getHtml().xpath("//div[@class='QTitle']/h1/a"));
|
page.putField("title", page.getHtml().xpath("//div[@class='QTitle']/h1/a"));
|
||||||
page.putField("content", page.getHtml().xpath("//div[@class='Question']//div[@class='Content']/div[@class='detail']"));
|
page.putField("content", page.getHtml().xpath("//div[@class='Question']//div[@class='Content']/div[@class='detail']"));
|
||||||
|
|
|
@ -18,7 +18,7 @@ public class QzoneBlogProcessor implements PageProcessor {
|
||||||
|
|
||||||
//http://b1.cnc.qzone.qq.com/cgi-bin/blognew/get_abs?hostUin=233017404&uin=233017404&blogType=0&statYear=2013&source=0&statYear=2013&g_tk=291639571&g_tk=291639571&reqInfo=7&pos=0&num=15&source=0&rand=0.46480297949165106
|
//http://b1.cnc.qzone.qq.com/cgi-bin/blognew/get_abs?hostUin=233017404&uin=233017404&blogType=0&statYear=2013&source=0&statYear=2013&g_tk=291639571&g_tk=291639571&reqInfo=7&pos=0&num=15&source=0&rand=0.46480297949165106
|
||||||
// &cateName=&cateHex=&statYear=2013&reqInfo=7&pos=0&num=15&sortType=0&source=0&rand=0.46480297949165106&g_tk=291639571&verbose=1&ref=qzone
|
// &cateName=&cateHex=&statYear=2013&reqInfo=7&pos=0&num=15&sortType=0&source=0&rand=0.46480297949165106&g_tk=291639571&verbose=1&ref=qzone
|
||||||
List<String> requests = page.getHtml().regex("<a[^<>]*href=[\"']{1}(http://17dujingdian\\.com/post/[^#]*?)[\"']{1}").toStrings();
|
List<String> requests = page.getHtml().regex("<a[^<>]*href=[\"']{1}(http://17dujingdian\\.com/post/[^#]*?)[\"']{1}").all();
|
||||||
page.addTargetRequests(requests);
|
page.addTargetRequests(requests);
|
||||||
page.putField("title",page.getHtml().xpath("//div[@id='content']//h2/a"));
|
page.putField("title",page.getHtml().xpath("//div[@id='content']//h2/a"));
|
||||||
page.putField("content",page.getHtml().smartContent());
|
page.putField("content",page.getHtml().smartContent());
|
||||||
|
|
|
@ -16,7 +16,7 @@ public class SinaBlogProcesser implements PageProcessor {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void process(Page page) {
|
public void process(Page page) {
|
||||||
page.addTargetRequests(page.getHtml().xpath("//div[@class='articalfrontback SG_j_linedot1 clearfix']").links().toStrings());
|
page.addTargetRequests(page.getHtml().xpath("//div[@class='articalfrontback SG_j_linedot1 clearfix']").links().all());
|
||||||
page.putField("title", page.getHtml().xpath("//div[@class='articalTitle']/h2"));
|
page.putField("title", page.getHtml().xpath("//div[@class='articalTitle']/h2"));
|
||||||
page.putField("content",page.getHtml().xpath("//div[@id='articlebody']//div[@class='articalContent']"));
|
page.putField("content",page.getHtml().xpath("//div[@id='articlebody']//div[@class='articalContent']"));
|
||||||
page.putField("id",page.getUrl().regex("http://blog\\.sina\\.com\\.cn/s/blog_(\\w+)"));
|
page.putField("id",page.getUrl().regex("http://blog\\.sina\\.com\\.cn/s/blog_(\\w+)"));
|
||||||
|
|
|
@ -15,7 +15,7 @@ public class TianyaPageProcesser implements PageProcessor {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void process(Page page) {
|
public void process(Page page) {
|
||||||
List<String> strings = page.getHtml().regex("<a[^<>]*href=[\"']{1}(/post-free.*?\\.shtml)[\"']{1}").toStrings();
|
List<String> strings = page.getHtml().regex("<a[^<>]*href=[\"']{1}(/post-free.*?\\.shtml)[\"']{1}").all();
|
||||||
page.addTargetRequests(strings);
|
page.addTargetRequests(strings);
|
||||||
page.putField("title", page.getHtml().xpath("//div[@id='post_head']//span[@class='s_title']//b"));
|
page.putField("title", page.getHtml().xpath("//div[@id='post_head']//span[@class='s_title']//b"));
|
||||||
page.putField("body",page.getHtml().smartContent());
|
page.putField("body",page.getHtml().smartContent());
|
||||||
|
|
Loading…
Reference in New Issue