Refactor of processSingle in PageModelExtractor (#1155)

master
François Gibier 2024-04-05 15:51:08 +02:00 committed by GitHub
parent f10fabcb58
commit 05e5eefc7d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
12 changed files with 217 additions and 186 deletions

View File

@ -10,6 +10,12 @@
<artifactId>webmagic-extension</artifactId> <artifactId>webmagic-extension</artifactId>
<dependencies> <dependencies>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>1.18.32</version>
<scope>provided</scope>
</dependency>
<dependency> <dependency>
<groupId>redis.clients</groupId> <groupId>redis.clients</groupId>
<artifactId>jedis</artifactId> <artifactId>jedis</artifactId>

View File

@ -1,5 +1,7 @@
package us.codecraft.webmagic.model; package us.codecraft.webmagic.model;
import lombok.Getter;
import lombok.Setter;
import us.codecraft.webmagic.selector.Selector; import us.codecraft.webmagic.selector.Selector;
/** /**
@ -7,17 +9,19 @@ import us.codecraft.webmagic.selector.Selector;
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* @since 0.2.0 * @since 0.2.0
*/ */
class Extractor { public class Extractor {
@Getter @Setter
protected Selector selector; protected Selector selector;
@Getter
protected final Source source; protected final Source source;
protected final boolean notNull; protected final boolean notNull;
protected final boolean multi; protected final boolean multi;
static enum Source {Html, Url, RawHtml, RawText} public static enum Source {Html, Url, RawHtml, RawText}
public Extractor(Selector selector, Source source, boolean notNull, boolean multi) { public Extractor(Selector selector, Source source, boolean notNull, boolean multi) {
this.selector = selector; this.selector = selector;
@ -26,23 +30,11 @@ class Extractor {
this.multi = multi; this.multi = multi;
} }
Selector getSelector() { public boolean isNotNull() {
return selector;
}
Source getSource() {
return source;
}
boolean isNotNull() {
return notNull; return notNull;
} }
boolean isMulti() { public boolean isMulti() {
return multi; return multi;
} }
void setSelector(Selector selector) {
this.selector = selector;
}
} }

View File

@ -6,53 +6,27 @@ import us.codecraft.webmagic.selector.Selector;
import java.lang.reflect.Field; import java.lang.reflect.Field;
import java.lang.reflect.Method; import java.lang.reflect.Method;
import lombok.Getter;
import lombok.Setter;
/** /**
* Wrapper of field and extractor. * Wrapper of field and extractor.
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* @since 0.2.0 * @since 0.2.0
*/ */
class FieldExtractor extends Extractor { public class FieldExtractor extends Extractor {
@Getter
private final Field field; private final Field field;
@Getter @Setter
private Method setterMethod; private Method setterMethod;
@Getter @Setter
private ObjectFormatter objectFormatter; private ObjectFormatter objectFormatter;
public FieldExtractor(Field field, Selector selector, Source source, boolean notNull, boolean multi) { public FieldExtractor(Field field, Selector selector, Source source, boolean notNull, boolean multi) {
super(selector, source, notNull, multi); super(selector, source, notNull, multi);
this.field = field; this.field = field;
} }
Field getField() {
return field;
}
Selector getSelector() {
return selector;
}
Source getSource() {
return source;
}
void setSetterMethod(Method setterMethod) {
this.setterMethod = setterMethod;
}
Method getSetterMethod() {
return setterMethod;
}
boolean isNotNull() {
return notNull;
}
ObjectFormatter getObjectFormatter() {
return objectFormatter;
}
void setObjectFormatter(ObjectFormatter objectFormatter) {
this.objectFormatter = objectFormatter;
}
} }

View File

@ -3,17 +3,21 @@ package us.codecraft.webmagic.model;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import lombok.Getter;
import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.model.annotation.*; import us.codecraft.webmagic.model.annotation.*;
import us.codecraft.webmagic.model.formatter.ObjectFormatter; import us.codecraft.webmagic.model.fields.PageField;
import us.codecraft.webmagic.model.formatter.ObjectFormatterBuilder; import us.codecraft.webmagic.model.formatter.ObjectFormatterBuilder;
import us.codecraft.webmagic.model.selections.MultipleSelection;
import us.codecraft.webmagic.model.selections.Selection;
import us.codecraft.webmagic.model.selections.SingleSelection;
import us.codecraft.webmagic.selector.*; import us.codecraft.webmagic.selector.*;
import us.codecraft.webmagic.utils.ClassUtils; import us.codecraft.webmagic.utils.ClassUtils;
import us.codecraft.webmagic.utils.ExtractorUtils; import us.codecraft.webmagic.utils.ExtractorUtils;
import java.lang.annotation.Annotation; import java.lang.annotation.Annotation;
import java.lang.reflect.Field; import java.lang.reflect.Field;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method; import java.lang.reflect.Method;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
@ -29,14 +33,19 @@ import static us.codecraft.webmagic.model.annotation.ExtractBy.Source.RawText;
*/ */
class PageModelExtractor { class PageModelExtractor {
@Getter
private List<Pattern> targetUrlPatterns = new ArrayList<Pattern>(); private List<Pattern> targetUrlPatterns = new ArrayList<Pattern>();
@Getter
private Selector targetUrlRegionSelector; private Selector targetUrlRegionSelector;
@Getter
private List<Pattern> helpUrlPatterns = new ArrayList<Pattern>(); private List<Pattern> helpUrlPatterns = new ArrayList<Pattern>();
@Getter
private Selector helpUrlRegionSelector; private Selector helpUrlRegionSelector;
@Getter
private Class clazz; private Class clazz;
private List<FieldExtractor> fieldExtractors; private List<FieldExtractor> fieldExtractors;
@ -233,145 +242,16 @@ class PageModelExtractor {
try { try {
o = clazz.newInstance(); o = clazz.newInstance();
for (FieldExtractor fieldExtractor : fieldExtractors) { for (FieldExtractor fieldExtractor : fieldExtractors) {
if (fieldExtractor.isMulti()) { Selection selection = fieldExtractor.isMulti() ? new MultipleSelection() : new SingleSelection();
List<String> value=getMultiValueFromSource(page, fieldExtractor, html, isRaw); PageField field = selection.extractField(page, html, isRaw, fieldExtractor);
if ((value == null || value.size() == 0) && fieldExtractor.isNotNull()) { if (!field.operation(o, fieldExtractor, logger))
return null; return null;
}
if (fieldExtractor.getObjectFormatter() != null) {
List<Object> converted = convertMultiValue(value, fieldExtractor.getObjectFormatter());
setField(o, fieldExtractor, converted);
} else {
setField(o, fieldExtractor, value);
}
} else {
String value=getSingleValueFromSource(page, fieldExtractor, html, isRaw);
if (value == null && fieldExtractor.isNotNull()) {
return null;
}
if (fieldExtractor.getObjectFormatter() != null) {
Object converted = convertSingleValue(value, fieldExtractor.getObjectFormatter());
if (converted == null && fieldExtractor.isNotNull()) {
return null;
}
setField(o, fieldExtractor, converted);
} else {
setField(o, fieldExtractor, value);
}
}
} }
if (AfterExtractor.class.isAssignableFrom(clazz)) { if (AfterExtractor.class.isAssignableFrom(clazz))
((AfterExtractor) o).afterProcess(page); ((AfterExtractor) o).afterProcess(page);
} } catch (Exception e) {
} catch (InstantiationException e) {
logger.error("extract fail", e);
} catch (IllegalAccessException e) {
logger.error("extract fail", e);
} catch (InvocationTargetException e) {
logger.error("extract fail", e); logger.error("extract fail", e);
} }
return o; return o;
} }
private List<String> getMultiValueFromSource(Page page, FieldExtractor fieldExtractor, String html, boolean isRaw) {
List<String> value;
switch (fieldExtractor.getSource()) {
case RawHtml:
value = page.getHtml().selectDocumentForList(fieldExtractor.getSelector());
break;
case Html:
if (isRaw) {
value = page.getHtml().selectDocumentForList(fieldExtractor.getSelector());
} else {
value = fieldExtractor.getSelector().selectList(html);
}
break;
case Url:
value = fieldExtractor.getSelector().selectList(page.getUrl().toString());
break;
case RawText:
value = fieldExtractor.getSelector().selectList(page.getRawText());
break;
default:
value = fieldExtractor.getSelector().selectList(html);
}
return value;
}
private String getSingleValueFromSource(Page page, FieldExtractor fieldExtractor, String html, boolean isRaw) {
String value;
switch (fieldExtractor.getSource()) {
case RawHtml:
value = page.getHtml().selectDocument(fieldExtractor.getSelector());
break;
case Html:
if (isRaw) {
value = page.getHtml().selectDocument(fieldExtractor.getSelector());
} else {
value = fieldExtractor.getSelector().select(html);
}
break;
case Url:
value = fieldExtractor.getSelector().select(page.getUrl().toString());
break;
case RawText:
value = fieldExtractor.getSelector().select(page.getRawText());
break;
default:
value = fieldExtractor.getSelector().select(html);
}
return value;
}
private Object convertSingleValue(String value, ObjectFormatter objectFormatter) {
try {
Object format = objectFormatter.format(value);
logger.debug("String {} is converted to {}", value, format);
return format;
} catch (Exception e) {
logger.error("convert " + value + " to " + objectFormatter.clazz() + " error!", e);
}
return null;
}
private List<Object> convertMultiValue(List<String> values, ObjectFormatter objectFormatter) {
List<Object> objects = new ArrayList<Object>();
for (String value : values) {
Object converted = convertSingleValue(value, objectFormatter);
if (converted != null) {
objects.add(converted);
}
}
return objects;
}
private void setField(Object o, FieldExtractor fieldExtractor, Object value) throws IllegalAccessException, InvocationTargetException {
if (value == null) {
return;
}
if (fieldExtractor.getSetterMethod() != null) {
fieldExtractor.getSetterMethod().invoke(o, value);
}
fieldExtractor.getField().set(o, value);
}
Class getClazz() {
return clazz;
}
List<Pattern> getTargetUrlPatterns() {
return targetUrlPatterns;
}
List<Pattern> getHelpUrlPatterns() {
return helpUrlPatterns;
}
Selector getTargetUrlRegionSelector() {
return targetUrlRegionSelector;
}
Selector getHelpUrlRegionSelector() {
return helpUrlRegionSelector;
}
} }

View File

@ -0,0 +1,42 @@
package us.codecraft.webmagic.model.fields;
import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;
import java.util.List;
import org.slf4j.Logger;
import lombok.Getter;
import us.codecraft.webmagic.model.FieldExtractor;
import us.codecraft.webmagic.model.formatter.ObjectFormatter;
public class MultipleField extends PageField {
@Getter
private List<String> fieldNames;
public MultipleField(List<String> fieldNames) {
this.fieldNames = fieldNames;
}
public boolean operation(Object o, FieldExtractor fieldExtractor, Logger logger) throws IllegalAccessException, InvocationTargetException {
if ((this.fieldNames == null || this.fieldNames.size() == 0) && fieldExtractor.isNotNull())
return false;
if (fieldExtractor.getObjectFormatter() != null) {
List<Object> converted = this.convert(this.fieldNames, fieldExtractor.getObjectFormatter(), logger);
setField(o, fieldExtractor, converted);
}
else
setField(o, fieldExtractor, this.fieldNames);
return true;
}
private List<Object> convert(List<String> values, ObjectFormatter objectFormatter, Logger logger) {
List<Object> objects = new ArrayList<>();
for (String value : values) {
Object converted = this.convert(value, objectFormatter, logger);
if (converted != null)
objects.add(converted);
}
return objects;
}
}

View File

@ -0,0 +1,31 @@
package us.codecraft.webmagic.model.fields;
import java.lang.reflect.InvocationTargetException;
import org.slf4j.Logger;
import us.codecraft.webmagic.model.FieldExtractor;
import us.codecraft.webmagic.model.formatter.ObjectFormatter;
/**
 * Base class for an extracted value that can be written into a model object.
 * Subclasses decide how the value is validated; this class supplies the shared
 * conversion and assignment helpers.
 */
public abstract class PageField {

    /**
     * Applies the extracted value to the given model instance.
     *
     * @param o              the model instance being populated
     * @param fieldExtractor describes the target field, its setter and its formatter
     * @param logger         logger used while converting values
     * @return {@code false} to signal that the value is unacceptable for this field
     * @throws IllegalAccessException    if the field or setter cannot be accessed
     * @throws InvocationTargetException if the setter method throws
     */
    public abstract boolean operation(Object o, FieldExtractor fieldExtractor, Logger logger) throws IllegalAccessException, InvocationTargetException;

    /**
     * Runs {@code value} through the formatter.
     *
     * @return the formatted object, or {@code null} when formatting threw an
     *         exception (the failure is logged, not propagated)
     */
    protected Object convert(String value, ObjectFormatter objectFormatter, Logger logger) {
        Object result;
        try {
            Object formatted = objectFormatter.format(value);
            logger.debug("String {} is converted to {}", value, formatted);
            result = formatted;
        } catch (Exception e) {
            logger.error("convert " + value + " to " + objectFormatter.clazz() + " error!", e);
            result = null;
        }
        return result;
    }

    /**
     * Assigns {@code value} to the target field of {@code o}; a {@code null}
     * value is ignored. When a setter method is configured it is invoked first,
     * and the field itself is then assigned as well.
     */
    protected void setField(Object o, FieldExtractor fieldExtractor, Object value) throws IllegalAccessException, InvocationTargetException {
        if (value == null) {
            return;
        }
        if (fieldExtractor.getSetterMethod() != null) {
            fieldExtractor.getSetterMethod().invoke(o, value);
        }
        fieldExtractor.getField().set(o, value);
    }
}

View File

@ -0,0 +1,28 @@
package us.codecraft.webmagic.model.fields;
import java.lang.reflect.InvocationTargetException;
import org.slf4j.Logger;
import lombok.Getter;
import us.codecraft.webmagic.model.FieldExtractor;
public class SingleField extends PageField {
@Getter
private String fieldName;
public SingleField(String fieldName) {
this.fieldName = fieldName;
}
public boolean operation(Object o, FieldExtractor fieldExtractor, Logger logger) throws IllegalAccessException, InvocationTargetException {
if (fieldExtractor.getObjectFormatter() != null) {
Object converted = this.convert(this.fieldName, fieldExtractor.getObjectFormatter(), logger);
if (converted == null && fieldExtractor.isNotNull())
return false;
setField(o, fieldExtractor, converted);
} else
setField(o, fieldExtractor, this.fieldName);
return true;
}
}

View File

@ -0,0 +1,36 @@
package us.codecraft.webmagic.model.selections;
import java.util.List;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.model.FieldExtractor;
import us.codecraft.webmagic.model.fields.MultipleField;
public class MultipleSelection implements Selection {
public MultipleField extractField(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
List<String> fieldsName;
switch (fieldExtractor.getSource()) {
case RawHtml:
fieldsName = page.getHtml().selectDocumentForList(fieldExtractor.getSelector());
break;
case Html:
if (isRaw)
fieldsName = page.getHtml().selectDocumentForList(fieldExtractor.getSelector());
else
fieldsName = fieldExtractor.getSelector().selectList(html);
break;
case Url:
fieldsName = fieldExtractor.getSelector().selectList(page.getUrl().toString());
break;
case RawText:
fieldsName = fieldExtractor.getSelector().selectList(page.getRawText());
break;
default:
fieldsName = fieldExtractor.getSelector().selectList(html);
}
if ((fieldsName == null || fieldsName.size() == 0) && fieldExtractor.isNotNull()) {
return null;
}
return new MultipleField(fieldsName);
}
}

View File

@ -0,0 +1,9 @@
package us.codecraft.webmagic.model.selections;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.model.FieldExtractor;
import us.codecraft.webmagic.model.fields.PageField;
/**
 * Strategy for pulling the value(s) of one model field out of a page.
 */
public interface Selection {

    /**
     * Extracts the value(s) targeted by {@code fieldExtractor}.
     *
     * @param page           the page being processed
     * @param html           html text handed in by the caller to select from
     * @param isRaw          flag handed in by the caller that influences which source is read
     * @param fieldExtractor supplies the selector and the source to read from
     * @return the extracted value(s) wrapped in a {@link PageField}
     */
    PageField extractField(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor);
}

View File

@ -0,0 +1,33 @@
package us.codecraft.webmagic.model.selections;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.model.FieldExtractor;
import us.codecraft.webmagic.model.fields.SingleField;
public class SingleSelection implements Selection {
public SingleField extractField(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
String field;
switch (fieldExtractor.getSource()) {
case RawHtml:
field = page.getHtml().selectDocument(fieldExtractor.getSelector());
break;
case Html:
if (isRaw)
field = page.getHtml().selectDocument(fieldExtractor.getSelector());
else
field = fieldExtractor.getSelector().select(html);
break;
case Url:
field = fieldExtractor.getSelector().select(page.getUrl().toString());
break;
case RawText:
field = fieldExtractor.getSelector().select(page.getRawText());
break;
default:
field = fieldExtractor.getSelector().select(html);
}
if (field == null && fieldExtractor.isNotNull())
return null;
return new SingleField(field);
}
}