update pipeline api
parent
55d80129bf
commit
65dc372152
|
@ -6,8 +6,6 @@ import us.codecraft.webmagic.utils.UrlUtils;
|
|||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
|
||||
/**
|
||||
* <pre>
|
||||
|
@ -27,7 +25,7 @@ public class Page {
|
|||
|
||||
private Request request;
|
||||
|
||||
private Map<String, Selectable> fields = new ConcurrentHashMap<String, Selectable>();
|
||||
private ResultItems resultItems = new ResultItems();
|
||||
|
||||
private Selectable html;
|
||||
|
||||
|
@ -35,44 +33,16 @@ public class Page {
|
|||
|
||||
private List<Request> targetRequests = new ArrayList<Request>();
|
||||
|
||||
private boolean skip;
|
||||
|
||||
private Object extra;
|
||||
|
||||
/**
|
||||
* 是否忽略这个页面,用于pipeline来判断是否对这个页面进行处理
|
||||
* @return 是否忽略 true 忽略
|
||||
*/
|
||||
public boolean isSkip() {
|
||||
return skip;
|
||||
}
|
||||
|
||||
/**
|
||||
* 设置是否忽略这个页面,用于pipeline来判断是否对这个页面进行处理
|
||||
* @param skip 是否忽略 true 忽略
|
||||
*/
|
||||
public void setSkip(boolean skip) {
|
||||
this.skip = skip;
|
||||
}
|
||||
|
||||
public Page() {
|
||||
}
|
||||
|
||||
/**
|
||||
* 获取抽取的结果,在{@link us.codecraft.webmagic.pipeline.Pipeline} 中调用
|
||||
* @return fields 抽取的结果
|
||||
*/
|
||||
public Map<String, Selectable> getFields() {
|
||||
return fields;
|
||||
}
|
||||
|
||||
/**
|
||||
* 保存抽取的结果
|
||||
* @param key 结果的key
|
||||
* @param field 结果的value
|
||||
*/
|
||||
public void putField(String key, Selectable field) {
|
||||
fields.put(key, field);
|
||||
public void putField(String key, Object field) {
|
||||
resultItems.put(key, field);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -157,23 +127,10 @@ public class Page {
|
|||
|
||||
public void setRequest(Request request) {
|
||||
this.request = request;
|
||||
this.resultItems.setRequest(request);
|
||||
}
|
||||
|
||||
/**
|
||||
* 获取附加对象
|
||||
* @param <T> 对象类型
|
||||
* @return 对象内容
|
||||
*/
|
||||
public <T> T getExtra() {
|
||||
return (T)extra;
|
||||
}
|
||||
|
||||
/**
|
||||
* 设置附加对象
|
||||
* @param extra 对象内容
|
||||
* @param <T> 对象类型
|
||||
*/
|
||||
public <T> void setExtra(T extra) {
|
||||
this.extra = extra;
|
||||
public ResultItems getResultItems() {
|
||||
return resultItems;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,64 @@
|
|||
package us.codecraft.webmagic;
|
||||
|
||||
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;
|
||||
|
||||
/**
|
||||
* 保存抽取结果的类,由PageProcessor处理得到,传递给{@link us.codecraft.webmagic.pipeline.Pipeline}进行持久化。<br>
|
||||
* @author yihua.huang@dianping.com <br>
|
||||
* @date: 13-7-25 <br>
|
||||
* Time: 下午12:20 <br>
|
||||
*/
|
||||
public class ResultItems {
|
||||
|
||||
private Map<String, Object> fields = new HashMap<String, Object>();
|
||||
|
||||
private Request request;
|
||||
|
||||
private boolean skip;
|
||||
|
||||
public <T> T get(String key) {
|
||||
Object o = fields.get(key);
|
||||
if (o == null) {
|
||||
return null;
|
||||
}
|
||||
return (T) fields.get(key);
|
||||
}
|
||||
|
||||
public Map<String, Object> getAll() {
|
||||
return fields;
|
||||
}
|
||||
|
||||
public <T> ResultItems put(String key, T value) {
|
||||
fields.put(key, value);
|
||||
return this;
|
||||
}
|
||||
|
||||
public Request getRequest() {
|
||||
return request;
|
||||
}
|
||||
|
||||
public ResultItems setRequest(Request request) {
|
||||
this.request = request;
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* 是否忽略这个页面,用于pipeline来判断是否对这个页面进行处理
|
||||
* @return 是否忽略 true 忽略
|
||||
*/
|
||||
public boolean isSkip() {
|
||||
return skip;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* 设置是否忽略这个页面,用于pipeline来判断是否对这个页面进行处理
|
||||
* @param skip
|
||||
* @return this
|
||||
*/
|
||||
public ResultItems setSkip(boolean skip) {
|
||||
this.skip = skip;
|
||||
return this;
|
||||
}
|
||||
}
|
|
@ -196,7 +196,7 @@ public class Spider implements Runnable, Task {
|
|||
pageProcessor.process(page);
|
||||
addRequest(page);
|
||||
for (Pipeline pipeline : pipelines) {
|
||||
pipeline.process(page, this);
|
||||
pipeline.process(page.getResultItems(), this);
|
||||
}
|
||||
sleep(site.getSleepTime());
|
||||
}
|
||||
|
|
|
@ -1,8 +1,7 @@
|
|||
package us.codecraft.webmagic.pipeline;
|
||||
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.ResultItems;
|
||||
import us.codecraft.webmagic.Task;
|
||||
import us.codecraft.webmagic.selector.Selectable;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
|
@ -15,13 +14,10 @@ import java.util.Map;
|
|||
public class ConsolePipeline implements Pipeline{
|
||||
|
||||
@Override
|
||||
public void process(Page page,Task task) {
|
||||
System.out.println("get page: "+page.getUrl());
|
||||
for (Map.Entry<String, Selectable> entry : page.getFields().entrySet()) {
|
||||
System.out.println(entry.getKey()+":\t"+entry.getValue().toStrings());
|
||||
}
|
||||
if (page.getExtra()!=null){
|
||||
System.out.println(page.getExtra());
|
||||
public void process(ResultItems resultItems,Task task) {
|
||||
System.out.println("get page: "+resultItems.getRequest().getUrl());
|
||||
for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) {
|
||||
System.out.println(entry.getKey()+":\t"+entry.getValue());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -2,13 +2,14 @@ package us.codecraft.webmagic.pipeline;
|
|||
|
||||
import org.apache.commons.codec.digest.DigestUtils;
|
||||
import org.apache.log4j.Logger;
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.ResultItems;
|
||||
import us.codecraft.webmagic.Task;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileWriter;
|
||||
import java.io.IOException;
|
||||
import java.io.PrintWriter;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* 持久化到文件的接口。
|
||||
|
@ -38,16 +39,18 @@ public class FilePipeline implements Pipeline {
|
|||
}
|
||||
|
||||
@Override
|
||||
public void process(Page page, Task task) {
|
||||
public void process(ResultItems resultItems, Task task) {
|
||||
String path = this.path + "/" + task.getUUID() + "/";
|
||||
File file = new File(path);
|
||||
if (!file.exists()) {
|
||||
file.mkdirs();
|
||||
}
|
||||
try {
|
||||
PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(page.getUrl().toString())));
|
||||
printWriter.println("url:\t" + page.getUrl());
|
||||
printWriter.println("html:\t" + page.getHtml());
|
||||
PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl())));
|
||||
printWriter.println("url:\t" + resultItems.getRequest().getUrl());
|
||||
for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) {
|
||||
printWriter.println(entry.getKey()+":\t"+entry.getValue());
|
||||
}
|
||||
printWriter.close();
|
||||
} catch (IOException e) {
|
||||
logger.warn("write file error",e);
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
package us.codecraft.webmagic.pipeline;
|
||||
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.ResultItems;
|
||||
import us.codecraft.webmagic.Task;
|
||||
|
||||
/**
|
||||
|
@ -11,5 +11,5 @@ import us.codecraft.webmagic.Task;
|
|||
*/
|
||||
public interface Pipeline {
|
||||
|
||||
public void process(Page page,Task task);
|
||||
public void process(ResultItems resultItems,Task task);
|
||||
}
|
||||
|
|
|
@ -30,12 +30,13 @@ public class SimplePageProcessor implements PageProcessor {
|
|||
|
||||
@Override
|
||||
public void process(Page page) {
|
||||
List<String> requests = page.getHtml().links().regex(urlPattern).toStrings();
|
||||
List<String> requests = page.getHtml().links().regex(urlPattern).all();
|
||||
//调用page.addTargetRequests()方法添加待抓取链接
|
||||
page.addTargetRequests(requests);
|
||||
//xpath方式抽取
|
||||
page.putField("title", page.getHtml().xpath("//title"));
|
||||
//sc表示使用Readability技术抽取正文
|
||||
page.putField("html", page.getHtml().toString());
|
||||
page.putField("content", page.getHtml().smartContent());
|
||||
}
|
||||
|
||||
|
|
|
@ -82,14 +82,14 @@ public class PlainText implements Selectable {
|
|||
}
|
||||
|
||||
@Override
|
||||
public List<String> toStrings() {
|
||||
public List<String> all() {
|
||||
return strings;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
if (CollectionUtils.isNotEmpty(toStrings())) {
|
||||
return toStrings().get(0);
|
||||
if (CollectionUtils.isNotEmpty(all())) {
|
||||
return all().get(0);
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
|
|
|
@ -69,5 +69,5 @@ public interface Selectable {
|
|||
*
|
||||
* @return multi string result
|
||||
*/
|
||||
public List<String> toStrings();
|
||||
public List<String> all();
|
||||
}
|
||||
|
|
|
@ -1351,7 +1351,7 @@ public class XpathSelectorTest {
|
|||
public void testOschina() {
|
||||
Html html1 = new Html(html);
|
||||
Assert.assertEquals("再次吐槽easyui", html1.xpath(".//*[@class='QTitle']/h1/a").toString());
|
||||
Assert.assertNotNull(html1.$("a[href]").xpath("//@href").toStrings());
|
||||
Assert.assertNotNull(html1.$("a[href]").xpath("//@href").all());
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -4,7 +4,7 @@ import freemarker.template.Configuration;
|
|||
import freemarker.template.Template;
|
||||
import freemarker.template.TemplateException;
|
||||
import org.apache.commons.codec.digest.DigestUtils;
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.ResultItems;
|
||||
import us.codecraft.webmagic.Task;
|
||||
|
||||
import java.io.File;
|
||||
|
@ -39,8 +39,8 @@ public class FreemarkerPipeline implements Pipeline {
|
|||
|
||||
|
||||
@Override
|
||||
public void process(Page page, Task task) {
|
||||
if (page.isSkip()) {
|
||||
public void process(ResultItems resultItems, Task task) {
|
||||
if (resultItems.isSkip()) {
|
||||
return;
|
||||
}
|
||||
String path = this.path + "" + task.getUUID() + "/";
|
||||
|
@ -49,8 +49,8 @@ public class FreemarkerPipeline implements Pipeline {
|
|||
file.mkdirs();
|
||||
}
|
||||
try {
|
||||
PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(page.getUrl().toString()) + ".html"));
|
||||
template.process(page.getFields(), printWriter);
|
||||
PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".html"));
|
||||
template.process(resultItems.getAll(), printWriter);
|
||||
printWriter.close();
|
||||
} catch (TemplateException e) {
|
||||
} catch (IOException e) {
|
||||
|
|
|
@ -28,7 +28,9 @@ public class RedisScheduler implements Scheduler{
|
|||
@Override
|
||||
public synchronized void push(Request request, Task task) {
|
||||
Jedis jedis = pool.getResource();
|
||||
//使用SortedSet进行url去重
|
||||
if (jedis.zrank(SET_PREFIX+task.getUUID(),request.getUrl())==null){
|
||||
//使用List保存队列
|
||||
jedis.rpush(QUEUE_PREFIX+task.getUUID(),request.getUrl());
|
||||
jedis.zadd(SET_PREFIX+task.getUUID(),System.currentTimeMillis(),request.getUrl());
|
||||
}
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
package us.codecraft.webmagic.scheduler;
|
||||
|
||||
import org.junit.Before;
|
||||
import org.junit.Ignore;
|
||||
import org.junit.Test;
|
||||
import us.codecraft.webmagic.Request;
|
||||
import us.codecraft.webmagic.Site;
|
||||
|
@ -20,6 +21,7 @@ public class RedisSchedulerTest {
|
|||
redisScheduler = new RedisScheduler("localhost");
|
||||
}
|
||||
|
||||
@Ignore("environment depended")
|
||||
@Test
|
||||
public void test() {
|
||||
Task task = new Task() {
|
||||
|
@ -35,7 +37,6 @@ public class RedisSchedulerTest {
|
|||
};
|
||||
redisScheduler.push(new Request("http://www.ibm.com/developerworks/cn/java/j-javadev2-22/"), task);
|
||||
Request poll = redisScheduler.poll(task);
|
||||
System.out.println(poll.getUrl());
|
||||
|
||||
}
|
||||
}
|
||||
|
|
|
@ -20,13 +20,13 @@ public class DiandianBlogProcessor implements PageProcessor {
|
|||
//a()表示提取链接,links()表示提取所有链接
|
||||
//getHtml()返回Html对象,支持链式调用
|
||||
//r()表示用正则表达式提取一条内容,regex()表示提取多条内容
|
||||
//toString()表示取单条结果,toStrings()表示取多条
|
||||
List<String> requests = page.getHtml().links().regex("(.*/post/.*)").toStrings();
|
||||
//toString()表示取单条结果,all()表示取多条
|
||||
List<String> requests = page.getHtml().links().regex("(.*/post/.*)").all();
|
||||
//使用page.addTargetRequests()方法将待抓取的链接加入队列
|
||||
page.addTargetRequests(requests);
|
||||
//page.putField(key,value)将抽取的内容加入结果Map
|
||||
//x()和xs()使用xpath进行抽取
|
||||
page.putField("title", page.getHtml().xpath("//title").regex("(.*?)\\|"));
|
||||
page.putField("title", page.getHtml().xpath("//title").regex("(.*?)\\|").toString());
|
||||
//smartContent()使用readability技术直接抽取正文,对于规整的文本有比较好的抽取正确率
|
||||
page.putField("content", page.getHtml().smartContent());
|
||||
page.putField("date", page.getUrl().regex("post/(\\d+-\\d+-\\d+)/"));
|
||||
|
|
|
@ -18,7 +18,7 @@ public class DianpingProcessor implements PageProcessor {
|
|||
|
||||
@Override
|
||||
public void process(Page page) {
|
||||
List<String> requests = page.getHtml().links().regex("http://info-search-web121361\\.alpha\\.dp:8080/search/.*").toStrings();
|
||||
List<String> requests = page.getHtml().links().regex("http://info-search-web121361\\.alpha\\.dp:8080/search/.*").all();
|
||||
page.addTargetRequests(requests);
|
||||
}
|
||||
|
||||
|
|
|
@ -18,9 +18,9 @@ public class DiaoyuwengProcessor implements PageProcessor {
|
|||
|
||||
@Override
|
||||
public void process(Page page) {
|
||||
List<String> requests = page.getHtml().links().regex("(http://www\\.diaoyuweng\\.com/home\\.php\\?mod=space&uid=88304&do=thread&view=me&type=thread&order=dateline&from=space&page=\\d+)").toStrings();
|
||||
List<String> requests = page.getHtml().links().regex("(http://www\\.diaoyuweng\\.com/home\\.php\\?mod=space&uid=88304&do=thread&view=me&type=thread&order=dateline&from=space&page=\\d+)").all();
|
||||
page.addTargetRequests(requests);
|
||||
requests = page.getHtml().links().regex("(http://www\\.diaoyuweng\\.com/thread-\\d+-1-1.html)").toStrings();
|
||||
requests = page.getHtml().links().regex("(http://www\\.diaoyuweng\\.com/thread-\\d+-1-1.html)").all();
|
||||
page.addTargetRequests(requests);
|
||||
if (page.getUrl().toString().contains("thread")){
|
||||
page.putField("title", page.getHtml().xpath("//a[@id='thread_subject']"));
|
||||
|
|
|
@ -15,7 +15,7 @@ public class F58PageProcesser implements PageProcessor {
|
|||
|
||||
@Override
|
||||
public void process(Page page) {
|
||||
List<String> strings = page.getHtml().regex("<a[^<>]*href=[\"']{1}(/yewu/.*?)[\"']{1}").toStrings();
|
||||
List<String> strings = page.getHtml().regex("<a[^<>]*href=[\"']{1}(/yewu/.*?)[\"']{1}").all();
|
||||
page.addTargetRequests(strings);
|
||||
page.putField("title",page.getHtml().regex("<title>(.*)</title>"));
|
||||
page.putField("body",page.getHtml().xpath("//dd[@class='w133']"));
|
||||
|
|
|
@ -20,7 +20,7 @@ public class GlobalProcessor implements PageProcessor {
|
|||
|
||||
@Override
|
||||
public void process(Page page) {
|
||||
final List<String> requests = page.getHtml().links().toStrings();
|
||||
final List<String> requests = page.getHtml().links().all();
|
||||
page.addTargetRequests(requests);
|
||||
|
||||
}
|
||||
|
|
|
@ -15,7 +15,7 @@ public class HuxiuProcessor implements PageProcessor {
|
|||
@Override
|
||||
public void process(Page page) {
|
||||
//http://progressdaily.diandian.com/post/2013-01-24/40046867275
|
||||
List<String> requests = page.getHtml().regex("<a[^<>\"']*href=[\"']{1}([/]{0,1}article[^<>#\"']*?)[\"']{1}").toStrings();
|
||||
List<String> requests = page.getHtml().regex("<a[^<>\"']*href=[\"']{1}([/]{0,1}article[^<>#\"']*?)[\"']{1}").all();
|
||||
page.addTargetRequests(requests);
|
||||
page.putField("title",page.getHtml().xpath("//div[@class='neirong']//h1[@class='ph xs5']"));
|
||||
page.putField("content",page.getHtml().smartContent());
|
||||
|
|
|
@ -15,12 +15,12 @@ public class MeicanProcessor implements PageProcessor {
|
|||
@Override
|
||||
public void process(Page page) {
|
||||
//http://progressdaily.diandian.com/post/2013-01-24/40046867275
|
||||
List<String> requests = page.getHtml().xpath("//a[@class=\"area_link flat_btn\"]/@href").toStrings();
|
||||
List<String> requests = page.getHtml().xpath("//a[@class=\"area_link flat_btn\"]/@href").all();
|
||||
if (requests.size() > 2) {
|
||||
requests = requests.subList(0, 2);
|
||||
}
|
||||
page.addTargetRequests(requests);
|
||||
page.addTargetRequests(page.getHtml().links().regex("(.*/restaurant/[^#]+)").toStrings());
|
||||
page.addTargetRequests(page.getHtml().links().regex("(.*/restaurant/[^#]+)").all());
|
||||
page.putField("items", page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"name\"]"));
|
||||
page.putField("prices", page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"price_outer\"]/span[@class=\"price\"]"));
|
||||
}
|
||||
|
|
|
@ -14,7 +14,7 @@ import java.util.List;
|
|||
public class NjuBBSProcessor implements PageProcessor {
|
||||
@Override
|
||||
public void process(Page page) {
|
||||
List<String> requests = page.getHtml().regex("<a[^<>]*href=(bbstcon\\?board=Pictures&file=[^>]*)").toStrings();
|
||||
List<String> requests = page.getHtml().regex("<a[^<>]*href=(bbstcon\\?board=Pictures&file=[^>]*)").all();
|
||||
page.addTargetRequests(requests);
|
||||
page.putField("title",page.getHtml().xpath("//div[@id='content']//h2/a"));
|
||||
page.putField("content",page.getHtml().smartContent());
|
||||
|
|
|
@ -15,7 +15,7 @@ public class OschinaBlogPageProcesser implements PageProcessor {
|
|||
|
||||
@Override
|
||||
public void process(Page page) {
|
||||
List<String> strings = page.getHtml().links().regex("(http://my\\.oschina\\.net)").toStrings();
|
||||
List<String> strings = page.getHtml().links().regex("(http://my\\.oschina\\.net)").all();
|
||||
page.addTargetRequests(strings);
|
||||
page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1"));
|
||||
page.putField("content", page.getHtml().smartContent());
|
||||
|
|
|
@ -15,7 +15,7 @@ public class OschinaPageProcesser implements PageProcessor {
|
|||
|
||||
@Override
|
||||
public void process(Page page) {
|
||||
List<String> strings = page.getHtml().regex("<a[^<>]*href=[\"']{1}(http://www\\.oschina\\.net/question/[\\w]+)[\"']{1}").toStrings();
|
||||
List<String> strings = page.getHtml().regex("<a[^<>]*href=[\"']{1}(http://www\\.oschina\\.net/question/[\\w]+)[\"']{1}").all();
|
||||
page.addTargetRequests(strings);
|
||||
page.putField("title", page.getHtml().xpath("//div[@class='QTitle']/h1/a"));
|
||||
page.putField("content", page.getHtml().xpath("//div[@class='Question']//div[@class='Content']/div[@class='detail']"));
|
||||
|
|
|
@ -18,7 +18,7 @@ public class QzoneBlogProcessor implements PageProcessor {
|
|||
|
||||
//http://b1.cnc.qzone.qq.com/cgi-bin/blognew/get_abs?hostUin=233017404&uin=233017404&blogType=0&statYear=2013&source=0&statYear=2013&g_tk=291639571&g_tk=291639571&reqInfo=7&pos=0&num=15&source=0&rand=0.46480297949165106
|
||||
// &cateName=&cateHex=&statYear=2013&reqInfo=7&pos=0&num=15&sortType=0&source=0&rand=0.46480297949165106&g_tk=291639571&verbose=1&ref=qzone
|
||||
List<String> requests = page.getHtml().regex("<a[^<>]*href=[\"']{1}(http://17dujingdian\\.com/post/[^#]*?)[\"']{1}").toStrings();
|
||||
List<String> requests = page.getHtml().regex("<a[^<>]*href=[\"']{1}(http://17dujingdian\\.com/post/[^#]*?)[\"']{1}").all();
|
||||
page.addTargetRequests(requests);
|
||||
page.putField("title",page.getHtml().xpath("//div[@id='content']//h2/a"));
|
||||
page.putField("content",page.getHtml().smartContent());
|
||||
|
|
|
@ -16,7 +16,7 @@ public class SinaBlogProcesser implements PageProcessor {
|
|||
|
||||
@Override
|
||||
public void process(Page page) {
|
||||
page.addTargetRequests(page.getHtml().xpath("//div[@class='articalfrontback SG_j_linedot1 clearfix']").links().toStrings());
|
||||
page.addTargetRequests(page.getHtml().xpath("//div[@class='articalfrontback SG_j_linedot1 clearfix']").links().all());
|
||||
page.putField("title", page.getHtml().xpath("//div[@class='articalTitle']/h2"));
|
||||
page.putField("content",page.getHtml().xpath("//div[@id='articlebody']//div[@class='articalContent']"));
|
||||
page.putField("id",page.getUrl().regex("http://blog\\.sina\\.com\\.cn/s/blog_(\\w+)"));
|
||||
|
|
|
@ -15,7 +15,7 @@ public class TianyaPageProcesser implements PageProcessor {
|
|||
|
||||
@Override
|
||||
public void process(Page page) {
|
||||
List<String> strings = page.getHtml().regex("<a[^<>]*href=[\"']{1}(/post-free.*?\\.shtml)[\"']{1}").toStrings();
|
||||
List<String> strings = page.getHtml().regex("<a[^<>]*href=[\"']{1}(/post-free.*?\\.shtml)[\"']{1}").all();
|
||||
page.addTargetRequests(strings);
|
||||
page.putField("title", page.getHtml().xpath("//div[@id='post_head']//span[@class='s_title']//b"));
|
||||
page.putField("body",page.getHtml().smartContent());
|
||||
|
|
Loading…
Reference in New Issue