update comments
parent
59aad6a7f4
commit
c59c1fe80d
|
@ -1,21 +1,45 @@
|
||||||
package us.codecraft.webmagic;
|
package us.codecraft.webmagic;
|
||||||
|
|
||||||
|
import us.codecraft.webmagic.model.annotation.Experimental;
|
||||||
|
|
||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 实现此接口以进行支持爬虫分页抓取。<br>
|
* Extract an object that spans more than one page, such as news and articles.<br>
|
||||||
|
*
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
* Date: 13-8-4 <br>
|
|
||||||
* Time: 下午5:18 <br>
|
|
||||||
*/
|
*/
|
||||||
|
@Experimental
|
||||||
public interface MultiPageModel {
|
public interface MultiPageModel {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Page key is the identifier for the object.
|
||||||
|
*
|
||||||
|
* @return page key
|
||||||
|
*/
|
||||||
public String getPageKey();
|
public String getPageKey();
|
||||||
|
|
||||||
public Collection<String> getOtherPages();
|
/**
|
||||||
|
* Page is the identifier of a single page among the pages of one object.
|
||||||
|
*
|
||||||
|
* @return page
|
||||||
|
*/
|
||||||
public String getPage();
|
public String getPage();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Other pages to be extracted.<br>
|
||||||
|
* It is used to judge whether an object contains more than one page, and whether the pages of the object are all extracted.
|
||||||
|
*
|
||||||
|
* @return other pages
|
||||||
|
*/
|
||||||
|
public Collection<String> getOtherPages();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Combine multiPageModels into a whole object.
|
||||||
|
*
|
||||||
|
* @param multiPageModel
|
||||||
|
* @return multiPageModel combined
|
||||||
|
*/
|
||||||
public MultiPageModel combine(MultiPageModel multiPageModel);
|
public MultiPageModel combine(MultiPageModel multiPageModel);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -6,8 +6,6 @@ import us.codecraft.webmagic.Page;
|
||||||
* 实现这个接口即可在抽取后进行后处理。<br>
|
* 实现这个接口即可在抽取后进行后处理。<br>
|
||||||
*
|
*
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
* Date: 13-8-3 <br>
|
|
||||||
* Time: 上午9:42 <br>
|
|
||||||
*/
|
*/
|
||||||
public interface AfterExtractor {
|
public interface AfterExtractor {
|
||||||
|
|
||||||
|
|
|
@ -5,10 +5,26 @@ import us.codecraft.webmagic.Spider;
|
||||||
import us.codecraft.webmagic.processor.PageProcessor;
|
import us.codecraft.webmagic.processor.PageProcessor;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 基于Model的Spider,封装后的入口类。<br>
|
* The spider for the page model extractor.<br>
|
||||||
|
* In webmagic, we call a POJO containing extract result as "page model". <br>
|
||||||
|
* You can customize a crawler by writing a page model with annotations. <br>
|
||||||
|
* Such as:
|
||||||
|
* <pre>
|
||||||
|
* {@literal @}TargetUrl("http://my.oschina.net/flashsword/blog/\\d+")
|
||||||
|
* public class OschinaBlog{
|
||||||
|
*
|
||||||
|
* {@literal @}ExtractBy("//title")
|
||||||
|
* private String title;
|
||||||
|
*
|
||||||
|
* {@literal @}ExtractBy(value = "div.BlogContent",type = ExtractBy.Type.Css)
|
||||||
|
* private String content;
|
||||||
|
*
|
||||||
|
* {@literal @}ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true)
|
||||||
|
* private List&lt;String&gt; tags;
|
||||||
|
* }
|
||||||
|
* </pre>
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
* Date: 13-8-3 <br>
|
* @since 0.2.0
|
||||||
* Time: 上午9:51 <br>
|
|
||||||
*/
|
*/
|
||||||
public class OOSpider extends Spider {
|
public class OOSpider extends Spider {
|
||||||
|
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
<html>
|
<html>
|
||||||
<body>
|
<body>
|
||||||
webmagic对抓取器编写的面向模型(称为PageModel)的封装。基于POJO及注解即可实现一个PageProcessor。
|
Page model and annotations used to customize a crawler.
|
||||||
</body>
|
</body>
|
||||||
</html>
|
</html>
|
||||||
|
|
|
@ -2,7 +2,6 @@ package us.codecraft.webmagic.utils;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @author code4crafter@gmail.com
|
* @author code4crafter@gmail.com
|
||||||
* Date Dec 14, 2012
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
|
|
Loading…
Reference in New Issue