add type convert
parent
fba330872b
commit
b18216245b
|
@ -1,5 +1,6 @@
|
|||
package us.codecraft.webmagic.model;
|
||||
|
||||
import us.codecraft.webmagic.model.formatter.ObjectFormatter;
|
||||
import us.codecraft.webmagic.selector.Selector;
|
||||
|
||||
import java.lang.reflect.Field;
|
||||
|
@ -16,6 +17,8 @@ class FieldExtractor extends Extractor {
|
|||
|
||||
private Method setterMethod;
|
||||
|
||||
private ObjectFormatter objectFormatter;
|
||||
|
||||
public FieldExtractor(Field field, Selector selector, Source source, boolean notNull, boolean multi) {
|
||||
super(selector, source, notNull, multi);
|
||||
this.field = field;
|
||||
|
@ -44,4 +47,12 @@ class FieldExtractor extends Extractor {
|
|||
boolean isNotNull() {
|
||||
return notNull;
|
||||
}
|
||||
|
||||
ObjectFormatter getObjectFormatter() {
|
||||
return objectFormatter;
|
||||
}
|
||||
|
||||
void setObjectFormatter(ObjectFormatter objectFormatter) {
|
||||
this.objectFormatter = objectFormatter;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,8 +1,12 @@
|
|||
package us.codecraft.webmagic.model;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.log4j.Logger;
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.model.annotation.*;
|
||||
import us.codecraft.webmagic.model.formatter.BasicTypeFormatter;
|
||||
import us.codecraft.webmagic.model.formatter.ObjectFormatter;
|
||||
import us.codecraft.webmagic.model.formatter.ObjectFormatters;
|
||||
import us.codecraft.webmagic.selector.*;
|
||||
import us.codecraft.webmagic.utils.ExtractorUtils;
|
||||
|
||||
|
@ -36,6 +40,8 @@ class PageModelExtractor {
|
|||
|
||||
private Extractor objectExtractor;
|
||||
|
||||
private Logger logger = Logger.getLogger(getClass());
|
||||
|
||||
public static PageModelExtractor create(Class clazz) {
|
||||
PageModelExtractor pageModelExtractor = new PageModelExtractor();
|
||||
pageModelExtractor.init(clazz);
|
||||
|
@ -62,16 +68,60 @@ class PageModelExtractor {
|
|||
fieldExtractor = fieldExtractorTmp;
|
||||
}
|
||||
if (fieldExtractor != null) {
|
||||
if (!fieldExtractor.isMulti() && !String.class.isAssignableFrom(field.getType())) {
|
||||
throw new IllegalStateException("Field " + field.getName() + " must be string");
|
||||
} else if (fieldExtractor.isMulti() && !List.class.isAssignableFrom(field.getType())) {
|
||||
throw new IllegalStateException("Field " + field.getName() + " must be list");
|
||||
}
|
||||
checkFormat(field, fieldExtractor);
|
||||
fieldExtractors.add(fieldExtractor);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void checkFormat(Field field, FieldExtractor fieldExtractor) {
|
||||
if (!fieldExtractor.isMulti() && !String.class.isAssignableFrom(field.getType())) {
|
||||
Class<?> fieldClazz = BasicTypeFormatter.detectBasicClass(field.getType());
|
||||
ObjectFormatter objectFormatter = getObjectFormatter(field, fieldClazz);
|
||||
if (objectFormatter == null) {
|
||||
throw new IllegalStateException("Can't find formatter for field " + field.getName() + " of type " + fieldClazz);
|
||||
} else {
|
||||
fieldExtractor.setObjectFormatter(objectFormatter);
|
||||
}
|
||||
} else if (fieldExtractor.isMulti()) {
|
||||
if (!List.class.isAssignableFrom(field.getType())) {
|
||||
throw new IllegalStateException("Field " + field.getName() + " must be list");
|
||||
}
|
||||
Formatter formatter = field.getAnnotation(Formatter.class);
|
||||
if (formatter != null) {
|
||||
if (!formatter.subClazz().equals(Void.class)) {
|
||||
ObjectFormatter objectFormatter = getObjectFormatter(field, formatter.subClazz());
|
||||
if (objectFormatter == null) {
|
||||
throw new IllegalStateException("Can't find formatter for field " + field.getName() + " of type " + formatter.subClazz());
|
||||
} else {
|
||||
fieldExtractor.setObjectFormatter(objectFormatter);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private ObjectFormatter getObjectFormatter(Field field, Class<?> fieldClazz) {
|
||||
Formatter formatter = field.getAnnotation(Formatter.class);
|
||||
if (formatter != null) {
|
||||
if (!formatter.formatter().equals(ObjectFormatter.class)) {
|
||||
return initFormatter(formatter);
|
||||
}
|
||||
}
|
||||
return ObjectFormatters.get(fieldClazz);
|
||||
}
|
||||
|
||||
private ObjectFormatter initFormatter(Formatter formatter) {
|
||||
try {
|
||||
return formatter.formatter().newInstance();
|
||||
} catch (InstantiationException e) {
|
||||
logger.error("init ObjectFormatter fail", e);
|
||||
} catch (IllegalAccessException e) {
|
||||
logger.error("init ObjectFormatter fail", e);
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
private FieldExtractor getAnnotationExtractByUrl(Class clazz, Field field) {
|
||||
FieldExtractor fieldExtractor = null;
|
||||
ExtractByUrl extractByUrl = field.getAnnotation(ExtractByUrl.class);
|
||||
|
@ -231,7 +281,12 @@ class PageModelExtractor {
|
|||
if ((value == null || value.size() == 0) && fieldExtractor.isNotNull()) {
|
||||
return null;
|
||||
}
|
||||
if (fieldExtractor.getObjectFormatter() != null) {
|
||||
List<Object> converted = convert(value, fieldExtractor.getObjectFormatter());
|
||||
setField(o, fieldExtractor, converted);
|
||||
} else {
|
||||
setField(o, fieldExtractor, value);
|
||||
}
|
||||
} else {
|
||||
String value;
|
||||
switch (fieldExtractor.getSource()) {
|
||||
|
@ -254,22 +309,47 @@ class PageModelExtractor {
|
|||
if (value == null && fieldExtractor.isNotNull()) {
|
||||
return null;
|
||||
}
|
||||
if (fieldExtractor.getObjectFormatter() != null) {
|
||||
Object converted = convert(value, fieldExtractor.getObjectFormatter());
|
||||
setField(o, fieldExtractor, converted);
|
||||
} else {
|
||||
setField(o, fieldExtractor, value);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (AfterExtractor.class.isAssignableFrom(clazz)) {
|
||||
((AfterExtractor) o).afterProcess(page);
|
||||
}
|
||||
} catch (InstantiationException e) {
|
||||
e.printStackTrace();
|
||||
logger.error("extract fail", e);
|
||||
} catch (IllegalAccessException e) {
|
||||
e.printStackTrace();
|
||||
logger.error("extract fail", e);
|
||||
} catch (InvocationTargetException e) {
|
||||
e.printStackTrace();
|
||||
logger.error("extract fail", e);
|
||||
}
|
||||
return o;
|
||||
}
|
||||
|
||||
private Object convert(String value, ObjectFormatter objectFormatter) {
|
||||
try {
|
||||
return objectFormatter.format(value);
|
||||
} catch (Exception e) {
|
||||
logger.error("convert " + value + " to " + objectFormatter.clazz() + " error!", e);
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
private List<Object> convert(List<String> values, ObjectFormatter objectFormatter) {
|
||||
List<Object> objects = new ArrayList<Object>();
|
||||
for (String value : values) {
|
||||
Object converted = convert(value, objectFormatter);
|
||||
if (converted != null) {
|
||||
objects.add(converted);
|
||||
}
|
||||
}
|
||||
return objects;
|
||||
}
|
||||
|
||||
private void setField(Object o, FieldExtractor fieldExtractor, Object value) throws IllegalAccessException, InvocationTargetException {
|
||||
if (fieldExtractor.getSetterMethod() != null) {
|
||||
fieldExtractor.getSetterMethod().invoke(o, value);
|
||||
|
|
|
@ -0,0 +1,41 @@
|
|||
package us.codecraft.webmagic.model.annotation;
|
||||
|
||||
import us.codecraft.webmagic.model.formatter.ObjectFormatter;
|
||||
|
||||
import java.lang.annotation.ElementType;
|
||||
import java.lang.annotation.Retention;
|
||||
import java.lang.annotation.Target;
|
||||
|
||||
/**
|
||||
* Define how the result string is convert to an object for field.
|
||||
*
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* @since 0.3.2
|
||||
*/
|
||||
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
|
||||
@Target({ElementType.FIELD})
|
||||
public @interface Formatter {
|
||||
|
||||
/**
|
||||
* Set formatter params.
|
||||
*
|
||||
* @return formatter params
|
||||
*/
|
||||
String[] value();
|
||||
|
||||
/**
|
||||
* Specific the class of field of class of elements in collection for field. <br/>
|
||||
* It is not necessary to be set because we can detect the class by class of field,
|
||||
* unless you use a collection as a field. <br/>
|
||||
*
|
||||
* @return the class of field
|
||||
*/
|
||||
Class subClazz() default Void.class;
|
||||
|
||||
/**
|
||||
* If there are more than one formatter for a class, just specify the implement.
|
||||
* @return implement
|
||||
*/
|
||||
Class<? extends ObjectFormatter> formatter() default ObjectFormatter.class;
|
||||
|
||||
}
|
|
@ -0,0 +1,150 @@
|
|||
package us.codecraft.webmagic.model.formatter;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com
|
||||
* @since 0.3.2
|
||||
*/
|
||||
public abstract class BasicTypeFormatter<T> implements ObjectFormatter<T> {
|
||||
|
||||
@Override
|
||||
public void initParam(String[] extra) {
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public T format(String raw) throws Exception {
|
||||
if (raw == null) {
|
||||
return null;
|
||||
}
|
||||
raw = raw.trim();
|
||||
return formatTrimmed(raw);
|
||||
}
|
||||
|
||||
protected abstract T formatTrimmed(String raw) throws Exception;
|
||||
|
||||
public static final List<ObjectFormatter> basicTypeFormatters = Arrays.<ObjectFormatter>asList(new IntegerFormatter(),
|
||||
new LongFormatter(), new DoubleFormatter(), new FloatFormatter(), new ShortFormatter(),
|
||||
new CharactorFormatter(), new ByteFormatter(), new BooleanFormatter());
|
||||
|
||||
public static Class<?> detectBasicClass(Class<?> type) {
|
||||
if (type.equals(Integer.TYPE) || type.equals(Integer.class)) {
|
||||
return Integer.class;
|
||||
} else if (type.equals(Long.TYPE) || type.equals(Long.class)) {
|
||||
return Long.class;
|
||||
} else if (type.equals(Double.TYPE) || type.equals(Double.class)) {
|
||||
return Double.class;
|
||||
} else if (type.equals(Float.TYPE) || type.equals(Float.class)) {
|
||||
return Float.class;
|
||||
} else if (type.equals(Short.TYPE) || type.equals(Short.class)) {
|
||||
return Short.class;
|
||||
} else if (type.equals(Character.TYPE) || type.equals(Character.class)) {
|
||||
return Character.class;
|
||||
} else if (type.equals(Byte.TYPE) || type.equals(Byte.class)) {
|
||||
return Byte.class;
|
||||
} else if (type.equals(Boolean.TYPE) || type.equals(Boolean.class)) {
|
||||
return Boolean.class;
|
||||
}
|
||||
return type;
|
||||
}
|
||||
|
||||
public static class IntegerFormatter extends BasicTypeFormatter<Integer> {
|
||||
@Override
|
||||
public Integer formatTrimmed(String raw) throws Exception {
|
||||
return Integer.parseInt(raw);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Class<Integer> clazz() {
|
||||
return Integer.class;
|
||||
}
|
||||
}
|
||||
|
||||
public static class LongFormatter extends BasicTypeFormatter<Long> {
|
||||
@Override
|
||||
public Long formatTrimmed(String raw) throws Exception {
|
||||
return Long.parseLong(raw);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Class<Long> clazz() {
|
||||
return Long.class;
|
||||
}
|
||||
}
|
||||
|
||||
public static class DoubleFormatter extends BasicTypeFormatter<Double> {
|
||||
@Override
|
||||
public Double formatTrimmed(String raw) throws Exception {
|
||||
return Double.parseDouble(raw);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Class<Double> clazz() {
|
||||
return Double.class;
|
||||
}
|
||||
}
|
||||
|
||||
public static class FloatFormatter extends BasicTypeFormatter<Float> {
|
||||
@Override
|
||||
public Float formatTrimmed(String raw) throws Exception {
|
||||
return Float.parseFloat(raw);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Class<Float> clazz() {
|
||||
return Float.class;
|
||||
}
|
||||
}
|
||||
|
||||
public static class ShortFormatter extends BasicTypeFormatter<Short> {
|
||||
@Override
|
||||
public Short formatTrimmed(String raw) throws Exception {
|
||||
return Short.parseShort(raw);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Class<Short> clazz() {
|
||||
return Short.class;
|
||||
}
|
||||
}
|
||||
|
||||
public static class CharactorFormatter extends BasicTypeFormatter<Character> {
|
||||
@Override
|
||||
public Character formatTrimmed(String raw) throws Exception {
|
||||
return raw.charAt(0);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Class<Character> clazz() {
|
||||
return Character.class;
|
||||
}
|
||||
}
|
||||
|
||||
public static class ByteFormatter extends BasicTypeFormatter<Byte> {
|
||||
@Override
|
||||
public Byte formatTrimmed(String raw) throws Exception {
|
||||
return Byte.parseByte(raw, 10);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Class<Byte> clazz() {
|
||||
return Byte.class;
|
||||
}
|
||||
}
|
||||
|
||||
public static class BooleanFormatter extends BasicTypeFormatter<Boolean> {
|
||||
@Override
|
||||
public Boolean formatTrimmed(String raw) throws Exception {
|
||||
return Boolean.parseBoolean(raw);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Class<Boolean> clazz() {
|
||||
return Boolean.class;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
|
@ -0,0 +1,29 @@
|
|||
package us.codecraft.webmagic.model.formatter;
|
||||
|
||||
import org.apache.commons.lang3.time.DateUtils;
|
||||
|
||||
import java.util.Date;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com
|
||||
* @since 0.3.2
|
||||
*/
|
||||
public class DateFormatter implements ObjectFormatter<Date> {
|
||||
|
||||
private String[] datePatterns = new String[]{"YYYY-MM-dd HH:mm"};
|
||||
|
||||
@Override
|
||||
public Date format(String raw) throws Exception {
|
||||
return DateUtils.parseDate(raw, datePatterns);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Class<Date> clazz() {
|
||||
return Date.class;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void initParam(String[] extra) {
|
||||
datePatterns = extra;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,14 @@
|
|||
package us.codecraft.webmagic.model.formatter;
|
||||
|
||||
/**
 * Converts extracted text into an object of type {@code T}.
 *
 * @param <T> the type produced by this formatter
 * @author code4crafter@gmail.com
 */
public interface ObjectFormatter<T> {

    /**
     * Returns the class this formatter produces.
     *
     * @return the target class
     */
    Class<T> clazz();

    /**
     * Supplies extra parameters to the formatter (for example date patterns).
     *
     * @param extra formatter-specific parameters
     */
    void initParam(String[] extra);

    /**
     * Converts the raw text.
     *
     * @param raw the extracted text
     * @return the converted value
     * @throws Exception if the text cannot be converted
     */
    T format(String raw) throws Exception;

}
|
|
@ -0,0 +1,27 @@
|
|||
package us.codecraft.webmagic.model.formatter;
|
||||
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com
|
||||
* @since 0.3.2
|
||||
*/
|
||||
public class ObjectFormatters {
|
||||
|
||||
private static Map<Class, ObjectFormatter> formatterMap = new ConcurrentHashMap<Class, ObjectFormatter>();
|
||||
|
||||
static {
|
||||
for (ObjectFormatter basicTypeFormatter : BasicTypeFormatter.basicTypeFormatters) {
|
||||
put(basicTypeFormatter);
|
||||
}
|
||||
}
|
||||
|
||||
public static void put(ObjectFormatter objectFormatter) {
|
||||
formatterMap.put(objectFormatter.clazz(), objectFormatter);
|
||||
}
|
||||
|
||||
public static <T> ObjectFormatter<T> get(Class<T> clazz){
|
||||
return formatterMap.get(clazz);
|
||||
}
|
||||
}
|
|
@ -32,10 +32,10 @@ public class GithubRepo implements HasKey {
|
|||
private List<String> language;
|
||||
|
||||
@ExtractBy("//ul[@class='pagehead-actions']/li[2]//a[@class='social-count js-social-count']/text()")
|
||||
private String star;
|
||||
private int star;
|
||||
|
||||
@ExtractBy("//ul[@class='pagehead-actions']/li[3]//a[@class='social-count']/text()")
|
||||
private String fork;
|
||||
private int fork;
|
||||
|
||||
@ExtractByUrl
|
||||
private String url;
|
||||
|
@ -46,8 +46,8 @@ public class GithubRepo implements HasKey {
|
|||
, new PageModelPipeline<GithubRepo>() {
|
||||
@Override
|
||||
public void process(GithubRepo o, Task task) {
|
||||
Assert.assertEquals("78",o.getStar().trim());
|
||||
Assert.assertEquals("65",o.getFork().trim());
|
||||
Assert.assertEquals(78, o.getStar());
|
||||
Assert.assertEquals(65, o.getFork());
|
||||
}
|
||||
}, GithubRepo.class).setDownloader(new MockDownloader()).test("https://github.com/code4craft/webmagic");
|
||||
}
|
||||
|
@ -77,11 +77,11 @@ public class GithubRepo implements HasKey {
|
|||
return url;
|
||||
}
|
||||
|
||||
public String getStar() {
|
||||
public int getStar() {
|
||||
return star;
|
||||
}
|
||||
|
||||
public String getFork() {
|
||||
public int getFork() {
|
||||
return fork;
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue