update package
parent
cfb8990453
commit
21cae2ff2e
|
@ -9,7 +9,7 @@ import java.util.List;
|
|||
|
||||
/**
|
||||
* <pre>
|
||||
*Page保存了上一次抓取的结果,并可定义待抓取的链接内容。
|
||||
* Page保存了上一次抓取的结果,并可定义待抓取的链接内容。
|
||||
*
|
||||
* 主要方法:
|
||||
* {@link #getUrl()} 获取页面的Url
|
||||
|
@ -19,6 +19,7 @@ import java.util.List;
|
|||
* {@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} 添加待抓取的链接
|
||||
*
|
||||
* </pre>
|
||||
*
|
||||
* @author code4crafter@gmail.com <br>
|
||||
*/
|
||||
public class Page {
|
||||
|
@ -36,9 +37,16 @@ public class Page {
|
|||
public Page() {
|
||||
}
|
||||
|
||||
public Page setSkip(boolean skip) {
|
||||
resultItems.setSkip(skip);
|
||||
return this;
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* 保存抽取的结果
|
||||
* @param key 结果的key
|
||||
*
|
||||
* @param key 结果的key
|
||||
* @param field 结果的value
|
||||
*/
|
||||
public void putField(String key, Object field) {
|
||||
|
@ -47,6 +55,7 @@ public class Page {
|
|||
|
||||
/**
|
||||
* 获取页面的html内容
|
||||
*
|
||||
* @return html 页面的html内容
|
||||
*/
|
||||
public Selectable getHtml() {
|
||||
|
@ -63,6 +72,7 @@ public class Page {
|
|||
|
||||
/**
|
||||
* 添加待抓取的链接
|
||||
*
|
||||
* @param requests 待抓取的链接
|
||||
*/
|
||||
public void addTargetRequests(List<String> requests) {
|
||||
|
@ -79,6 +89,7 @@ public class Page {
|
|||
|
||||
/**
|
||||
* 添加待抓取的链接
|
||||
*
|
||||
* @param requestString 待抓取的链接
|
||||
*/
|
||||
public void addTargetRequest(String requestString) {
|
||||
|
@ -93,6 +104,7 @@ public class Page {
|
|||
|
||||
/**
|
||||
* 添加待抓取的页面,在需要传递附加信息时使用
|
||||
*
|
||||
* @param request 待抓取的页面
|
||||
*/
|
||||
public void addTargetRequest(Request request) {
|
||||
|
@ -103,6 +115,7 @@ public class Page {
|
|||
|
||||
/**
|
||||
* 获取页面的Url
|
||||
*
|
||||
* @return url 当前页面的url,可用于抽取
|
||||
*/
|
||||
public Selectable getUrl() {
|
||||
|
@ -111,6 +124,7 @@ public class Page {
|
|||
|
||||
/**
|
||||
* 设置url
|
||||
*
|
||||
* @param url
|
||||
*/
|
||||
public void setUrl(Selectable url) {
|
||||
|
@ -119,6 +133,7 @@ public class Page {
|
|||
|
||||
/**
|
||||
* 获取抓取请求
|
||||
*
|
||||
* @return request 抓取请求
|
||||
*/
|
||||
public Request getRequest() {
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
package us.codecraft.webmagic.oo;
|
||||
package us.codecraft.webmagic.model;
|
||||
|
||||
import us.codecraft.webmagic.Page;
|
||||
|
|
@ -1,4 +1,4 @@
|
|||
package us.codecraft.webmagic.oo;
|
||||
package us.codecraft.webmagic.model;
|
||||
|
||||
import org.apache.commons.lang3.builder.ToStringBuilder;
|
||||
import us.codecraft.webmagic.Task;
|
|
@ -1,4 +1,4 @@
|
|||
package us.codecraft.webmagic.oo;
|
||||
package us.codecraft.webmagic.model;
|
||||
|
||||
import java.lang.annotation.ElementType;
|
||||
import java.lang.annotation.Retention;
|
|
@ -1,4 +1,4 @@
|
|||
package us.codecraft.webmagic.oo;
|
||||
package us.codecraft.webmagic.model;
|
||||
|
||||
import java.lang.annotation.ElementType;
|
||||
import java.lang.annotation.Retention;
|
|
@ -1,4 +1,4 @@
|
|||
package us.codecraft.webmagic.oo;
|
||||
package us.codecraft.webmagic.model;
|
||||
|
||||
import us.codecraft.webmagic.selector.Selector;
|
||||
|
|
@ -1,4 +1,4 @@
|
|||
package us.codecraft.webmagic.oo;
|
||||
package us.codecraft.webmagic.model;
|
||||
|
||||
import us.codecraft.webmagic.selector.Selector;
|
||||
|
|
@ -1,4 +1,4 @@
|
|||
package us.codecraft.webmagic.oo;
|
||||
package us.codecraft.webmagic.model;
|
||||
|
||||
import java.lang.annotation.ElementType;
|
||||
import java.lang.annotation.Retention;
|
|
@ -1,4 +1,4 @@
|
|||
package us.codecraft.webmagic.oo;
|
||||
package us.codecraft.webmagic.model;
|
||||
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.Spider;
|
|
@ -1,4 +1,4 @@
|
|||
package us.codecraft.webmagic.oo;
|
||||
package us.codecraft.webmagic.model;
|
||||
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.Request;
|
|
@ -1,4 +1,4 @@
|
|||
package us.codecraft.webmagic.oo;
|
||||
package us.codecraft.webmagic.model;
|
||||
|
||||
import us.codecraft.webmagic.ResultItems;
|
||||
import us.codecraft.webmagic.Task;
|
|
@ -1,4 +1,4 @@
|
|||
package us.codecraft.webmagic.oo;
|
||||
package us.codecraft.webmagic.model;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import us.codecraft.webmagic.Page;
|
|
@ -1,4 +1,4 @@
|
|||
package us.codecraft.webmagic.oo;
|
||||
package us.codecraft.webmagic.model;
|
||||
|
||||
import us.codecraft.webmagic.Task;
|
||||
|
|
@ -1,4 +1,4 @@
|
|||
package us.codecraft.webmagic.oo;
|
||||
package us.codecraft.webmagic.model;
|
||||
|
||||
import java.lang.annotation.ElementType;
|
||||
import java.lang.annotation.Retention;
|
|
@ -1,4 +1,4 @@
|
|||
package us.codecraft.webmagic.oo;
|
||||
package us.codecraft.webmagic.model;
|
||||
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.Site;
|
||||
|
@ -10,7 +10,7 @@ import java.util.List;
|
|||
* @date: 13-8-1 <br>
|
||||
* Time: 下午10:18 <br>
|
||||
*/
|
||||
@TargetUrl(value="http://my.oschina.net/flashsword/blog/*",sourceRegion = "//div[@class='BlogLinks']//a/@href")
|
||||
@TargetUrl("http://my.oschina.net/flashsword/blog/*")
|
||||
public class OschinaBlog implements AfterExtractor {
|
||||
|
||||
@ExtractBy("//title")
|
||||
|
@ -27,6 +27,7 @@ public class OschinaBlog implements AfterExtractor {
|
|||
System.out.println("title:\t"+title);
|
||||
System.out.println("content:\t"+content);
|
||||
System.out.println("tags:\t" + tags);
|
||||
page.setSkip(true);
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
|
@ -1,4 +1,4 @@
|
|||
package us.codecraft.webmagic.oo;
|
||||
package us.codecraft.webmagic.model;
|
||||
|
||||
import org.junit.Ignore;
|
||||
import org.junit.Test;
|
|
@ -1,4 +1,4 @@
|
|||
package us.codecraft.webmagic.oo.samples;
|
||||
package us.codecraft.webmagic.model.samples;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com <br>
|
|
@ -1,9 +1,9 @@
|
|||
package us.codecraft.webmagic.oo.samples;
|
||||
package us.codecraft.webmagic.model.samples;
|
||||
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.oo.ExtractBy;
|
||||
import us.codecraft.webmagic.oo.OOSpider;
|
||||
import us.codecraft.webmagic.oo.TargetUrl;
|
||||
import us.codecraft.webmagic.model.ExtractBy;
|
||||
import us.codecraft.webmagic.model.OOSpider;
|
||||
import us.codecraft.webmagic.model.TargetUrl;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com <br>
|
|
@ -1,8 +1,8 @@
|
|||
package us.codecraft.webmagic.oo.samples;
|
||||
package us.codecraft.webmagic.model.samples;
|
||||
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.oo.*;
|
||||
import us.codecraft.webmagic.model.*;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com <br>
|
|
@ -1,9 +1,9 @@
|
|||
package us.codecraft.webmagic.oo.samples;
|
||||
package us.codecraft.webmagic.model.samples;
|
||||
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.oo.ExtractBy;
|
||||
import us.codecraft.webmagic.oo.OOSpider;
|
||||
import us.codecraft.webmagic.oo.TargetUrl;
|
||||
import us.codecraft.webmagic.model.ExtractBy;
|
||||
import us.codecraft.webmagic.model.OOSpider;
|
||||
import us.codecraft.webmagic.model.TargetUrl;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com <br>
|
Loading…
Reference in New Issue