complete extension comments in en
parent
c59c1fe80d
commit
a994b1c9fd
|
@ -8,6 +8,7 @@ import java.util.Collection;
|
||||||
* Extract an object that spans more than one page, such as news and articles.<br>
|
* Extract an object that spans more than one page, such as news and articles.<br>
|
||||||
*
|
*
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
|
* @since 0.2.0
|
||||||
*/
|
*/
|
||||||
@Experimental
|
@Experimental
|
||||||
public interface MultiPageModel {
|
public interface MultiPageModel {
|
||||||
|
|
|
@ -3,9 +3,10 @@ package us.codecraft.webmagic.model;
|
||||||
import us.codecraft.webmagic.Page;
|
import us.codecraft.webmagic.Page;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 实现这个接口即可在抽取后进行后处理。<br>
|
* Interface to be implemented by page models that need to do something after fields are extracted.<br>
|
||||||
*
|
*
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
|
* @since 0.2.0
|
||||||
*/
|
*/
|
||||||
public interface AfterExtractor {
|
public interface AfterExtractor {
|
||||||
|
|
||||||
|
|
|
@ -4,9 +4,10 @@ import org.apache.commons.lang3.builder.ToStringBuilder;
|
||||||
import us.codecraft.webmagic.Task;
|
import us.codecraft.webmagic.Task;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
* Print page model in console.<br>
|
||||||
|
* Usually used in test.<br>
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
* Date: 13-8-3 <br>
|
* @since 0.2.0
|
||||||
* Time: 下午3:41 <br>
|
|
||||||
*/
|
*/
|
||||||
public class ConsolePageModelPipeline implements PageModelPipeline {
|
public class ConsolePageModelPipeline implements PageModelPipeline {
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -3,9 +3,9 @@ package us.codecraft.webmagic.model;
|
||||||
import us.codecraft.webmagic.selector.Selector;
|
import us.codecraft.webmagic.selector.Selector;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
* The object contains 'ExtractBy' information.
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
* Date: 13-8-1 <br>
|
* @since 0.2.0
|
||||||
* Time: 下午9:48 <br>
|
|
||||||
*/
|
*/
|
||||||
class Extractor {
|
class Extractor {
|
||||||
|
|
||||||
|
|
|
@ -6,18 +6,18 @@ import java.lang.reflect.Field;
|
||||||
import java.lang.reflect.Method;
|
import java.lang.reflect.Method;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
* Wrapper of field and extractor.
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
* Date: 13-8-1 <br>
|
* @since 0.2.0
|
||||||
* Time: 下午9:48 <br>
|
|
||||||
*/
|
*/
|
||||||
class FieldExtractor extends Extractor{
|
class FieldExtractor extends Extractor {
|
||||||
|
|
||||||
private final Field field;
|
private final Field field;
|
||||||
|
|
||||||
private Method setterMethod;
|
private Method setterMethod;
|
||||||
|
|
||||||
public FieldExtractor(Field field, Selector selector, Source source, boolean notNull,boolean multi) {
|
public FieldExtractor(Field field, Selector selector, Source source, boolean notNull, boolean multi) {
|
||||||
super(selector, source, notNull,multi);
|
super(selector, source, notNull, multi);
|
||||||
this.field = field;
|
this.field = field;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1,18 +1,18 @@
|
||||||
package us.codecraft.webmagic.model;
|
package us.codecraft.webmagic.model;
|
||||||
|
|
||||||
|
import us.codecraft.webmagic.model.annotation.Experimental;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 标志一个Model的key。<br>
|
* Interface to be implemented by page models.<br>
|
||||||
* 实现了这个接口的Model在输出时会使用getKey()作为标志(例如JsonFilePageModelPipeline中持久化的文件名)。<br>
|
* Can be used to identify a page model, or be used as name of file storing the object.<br>
|
||||||
* 如果持久化的文件名是乱码,请再运行的环境变量里加上LANG=zh_CN.UTF-8 。<br>
|
|
||||||
*
|
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
* Date: 13-8-10 <br>
|
* @since 0.2.0
|
||||||
* Time: 上午7:39 <br>
|
|
||||||
*/
|
*/
|
||||||
|
@Experimental
|
||||||
public interface HasKey {
|
public interface HasKey {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 在输出时会使用key作为标志(例如JsonFilePageModelPipeline中持久化的文件名)。
|
*
|
||||||
*
|
*
|
||||||
* @return key
|
* @return key
|
||||||
*/
|
*/
|
||||||
|
|
|
@ -14,10 +14,10 @@ import java.util.regex.Matcher;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 基于PageProcessor的扩展点。<br>
|
* The extension to PageProcessor for page model extractor.
|
||||||
|
*
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
* Date: 13-8-1 <br>
|
* @since 0.2.0
|
||||||
* Time: 下午8:46 <br>
|
|
||||||
*/
|
*/
|
||||||
class ModelPageProcessor implements PageProcessor {
|
class ModelPageProcessor implements PageProcessor {
|
||||||
|
|
||||||
|
|
|
@ -11,11 +11,10 @@ import java.util.Map;
|
||||||
import java.util.concurrent.ConcurrentHashMap;
|
import java.util.concurrent.ConcurrentHashMap;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 基于Pipeline的扩展点,用于实现注解格式的Pipeline。<br>
|
* The extension to Pipeline for page model extractor.
|
||||||
* 与PageModelPipeline是一对多的关系(原谅作者没有更好的名字了)。<br>
|
*
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
* Date: 13-8-2 <br>
|
* @since 0.2.0
|
||||||
* Time: 上午10:47 <br>
|
|
||||||
*/
|
*/
|
||||||
class ModelPipeline implements Pipeline {
|
class ModelPipeline implements Pipeline {
|
||||||
|
|
||||||
|
|
|
@ -42,7 +42,7 @@ public class OOSpider extends Spider {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 创建一个爬虫。<br>
|
* Create a spider.
|
||||||
* @param site
|
* @param site
|
||||||
* @param pageModelPipeline
|
* @param pageModelPipeline
|
||||||
* @param pageModels
|
* @param pageModels
|
||||||
|
|
|
@ -15,11 +15,10 @@ import java.util.List;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Model主要逻辑类。将一个带注解的POJO转换为一个PageModelExtractor。<br>
|
* The main internal logic of page model extractor.
|
||||||
*
|
*
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
* Date: 13-8-1 <br>
|
* @since 0.2.0
|
||||||
* Time: 下午9:33 <br>
|
|
||||||
*/
|
*/
|
||||||
class PageModelExtractor {
|
class PageModelExtractor {
|
||||||
|
|
||||||
|
|
|
@ -3,9 +3,10 @@ package us.codecraft.webmagic.model;
|
||||||
import us.codecraft.webmagic.Task;
|
import us.codecraft.webmagic.Task;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
* Implement PageModelPipeline to persist your page model.
|
||||||
|
*
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
* Date: 13-8-3 <br>
|
* @since 0.2.0
|
||||||
* Time: 上午9:34 <br>
|
|
||||||
*/
|
*/
|
||||||
public interface PageModelPipeline<T> {
|
public interface PageModelPipeline<T> {
|
||||||
|
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
<html>
|
<html>
|
||||||
<body>
|
<body>
|
||||||
Annotations for define a class.
|
Annotations for defining an extractor.
|
||||||
</body>
|
</body>
|
||||||
</html>
|
</html>
|
||||||
|
|
|
@ -6,8 +6,11 @@ import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* JsonPath
|
* JsonPath selector.<br>
|
||||||
|
* Used to extract content from JSON.<br>
|
||||||
|
*
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
|
* @since 0.2.1
|
||||||
*/
|
*/
|
||||||
public class JsonPathSelector implements Selector {
|
public class JsonPathSelector implements Selector {
|
||||||
|
|
||||||
|
@ -43,7 +46,7 @@ public class JsonPathSelector implements Selector {
|
||||||
return list;
|
return list;
|
||||||
}
|
}
|
||||||
if (object instanceof List) {
|
if (object instanceof List) {
|
||||||
return (List<String>)object;
|
return (List<String>) object;
|
||||||
} else {
|
} else {
|
||||||
list.add(object.toString());
|
list.add(object.toString());
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue