Refactor of processSingle in PageModelExtractor (#1155)
parent
f10fabcb58
commit
05e5eefc7d
|
@ -10,6 +10,12 @@
|
||||||
<artifactId>webmagic-extension</artifactId>
|
<artifactId>webmagic-extension</artifactId>
|
||||||
|
|
||||||
<dependencies>
|
<dependencies>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.projectlombok</groupId>
|
||||||
|
<artifactId>lombok</artifactId>
|
||||||
|
<version>1.18.32</version>
|
||||||
|
<scope>provided</scope>
|
||||||
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>redis.clients</groupId>
|
<groupId>redis.clients</groupId>
|
||||||
<artifactId>jedis</artifactId>
|
<artifactId>jedis</artifactId>
|
||||||
|
|
|
@ -1,5 +1,7 @@
|
||||||
package us.codecraft.webmagic.model;
|
package us.codecraft.webmagic.model;
|
||||||
|
|
||||||
|
import lombok.Getter;
|
||||||
|
import lombok.Setter;
|
||||||
import us.codecraft.webmagic.selector.Selector;
|
import us.codecraft.webmagic.selector.Selector;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -7,17 +9,19 @@ import us.codecraft.webmagic.selector.Selector;
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
* @since 0.2.0
|
* @since 0.2.0
|
||||||
*/
|
*/
|
||||||
class Extractor {
|
public class Extractor {
|
||||||
|
|
||||||
|
@Getter @Setter
|
||||||
protected Selector selector;
|
protected Selector selector;
|
||||||
|
|
||||||
|
@Getter
|
||||||
protected final Source source;
|
protected final Source source;
|
||||||
|
|
||||||
protected final boolean notNull;
|
protected final boolean notNull;
|
||||||
|
|
||||||
protected final boolean multi;
|
protected final boolean multi;
|
||||||
|
|
||||||
static enum Source {Html, Url, RawHtml, RawText}
|
public static enum Source {Html, Url, RawHtml, RawText}
|
||||||
|
|
||||||
public Extractor(Selector selector, Source source, boolean notNull, boolean multi) {
|
public Extractor(Selector selector, Source source, boolean notNull, boolean multi) {
|
||||||
this.selector = selector;
|
this.selector = selector;
|
||||||
|
@ -26,23 +30,11 @@ class Extractor {
|
||||||
this.multi = multi;
|
this.multi = multi;
|
||||||
}
|
}
|
||||||
|
|
||||||
Selector getSelector() {
|
public boolean isNotNull() {
|
||||||
return selector;
|
|
||||||
}
|
|
||||||
|
|
||||||
Source getSource() {
|
|
||||||
return source;
|
|
||||||
}
|
|
||||||
|
|
||||||
boolean isNotNull() {
|
|
||||||
return notNull;
|
return notNull;
|
||||||
}
|
}
|
||||||
|
|
||||||
boolean isMulti() {
|
public boolean isMulti() {
|
||||||
return multi;
|
return multi;
|
||||||
}
|
}
|
||||||
|
|
||||||
void setSelector(Selector selector) {
|
|
||||||
this.selector = selector;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -6,53 +6,27 @@ import us.codecraft.webmagic.selector.Selector;
|
||||||
import java.lang.reflect.Field;
|
import java.lang.reflect.Field;
|
||||||
import java.lang.reflect.Method;
|
import java.lang.reflect.Method;
|
||||||
|
|
||||||
|
import lombok.Getter;
|
||||||
|
import lombok.Setter;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Wrapper of field and extractor.
|
* Wrapper of field and extractor.
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
* @since 0.2.0
|
* @since 0.2.0
|
||||||
*/
|
*/
|
||||||
class FieldExtractor extends Extractor {
|
public class FieldExtractor extends Extractor {
|
||||||
|
|
||||||
|
@Getter
|
||||||
private final Field field;
|
private final Field field;
|
||||||
|
|
||||||
|
@Getter @Setter
|
||||||
private Method setterMethod;
|
private Method setterMethod;
|
||||||
|
|
||||||
|
@Getter @Setter
|
||||||
private ObjectFormatter objectFormatter;
|
private ObjectFormatter objectFormatter;
|
||||||
|
|
||||||
public FieldExtractor(Field field, Selector selector, Source source, boolean notNull, boolean multi) {
|
public FieldExtractor(Field field, Selector selector, Source source, boolean notNull, boolean multi) {
|
||||||
super(selector, source, notNull, multi);
|
super(selector, source, notNull, multi);
|
||||||
this.field = field;
|
this.field = field;
|
||||||
}
|
}
|
||||||
|
|
||||||
Field getField() {
|
|
||||||
return field;
|
|
||||||
}
|
|
||||||
|
|
||||||
Selector getSelector() {
|
|
||||||
return selector;
|
|
||||||
}
|
|
||||||
|
|
||||||
Source getSource() {
|
|
||||||
return source;
|
|
||||||
}
|
|
||||||
|
|
||||||
void setSetterMethod(Method setterMethod) {
|
|
||||||
this.setterMethod = setterMethod;
|
|
||||||
}
|
|
||||||
|
|
||||||
Method getSetterMethod() {
|
|
||||||
return setterMethod;
|
|
||||||
}
|
|
||||||
|
|
||||||
boolean isNotNull() {
|
|
||||||
return notNull;
|
|
||||||
}
|
|
||||||
|
|
||||||
ObjectFormatter getObjectFormatter() {
|
|
||||||
return objectFormatter;
|
|
||||||
}
|
|
||||||
|
|
||||||
void setObjectFormatter(ObjectFormatter objectFormatter) {
|
|
||||||
this.objectFormatter = objectFormatter;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -3,17 +3,21 @@ package us.codecraft.webmagic.model;
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import lombok.Getter;
|
||||||
import us.codecraft.webmagic.Page;
|
import us.codecraft.webmagic.Page;
|
||||||
import us.codecraft.webmagic.model.annotation.*;
|
import us.codecraft.webmagic.model.annotation.*;
|
||||||
import us.codecraft.webmagic.model.formatter.ObjectFormatter;
|
import us.codecraft.webmagic.model.fields.PageField;
|
||||||
import us.codecraft.webmagic.model.formatter.ObjectFormatterBuilder;
|
import us.codecraft.webmagic.model.formatter.ObjectFormatterBuilder;
|
||||||
|
import us.codecraft.webmagic.model.selections.MultipleSelection;
|
||||||
|
import us.codecraft.webmagic.model.selections.Selection;
|
||||||
|
import us.codecraft.webmagic.model.selections.SingleSelection;
|
||||||
import us.codecraft.webmagic.selector.*;
|
import us.codecraft.webmagic.selector.*;
|
||||||
import us.codecraft.webmagic.utils.ClassUtils;
|
import us.codecraft.webmagic.utils.ClassUtils;
|
||||||
import us.codecraft.webmagic.utils.ExtractorUtils;
|
import us.codecraft.webmagic.utils.ExtractorUtils;
|
||||||
|
|
||||||
import java.lang.annotation.Annotation;
|
import java.lang.annotation.Annotation;
|
||||||
import java.lang.reflect.Field;
|
import java.lang.reflect.Field;
|
||||||
import java.lang.reflect.InvocationTargetException;
|
|
||||||
import java.lang.reflect.Method;
|
import java.lang.reflect.Method;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
@ -29,14 +33,19 @@ import static us.codecraft.webmagic.model.annotation.ExtractBy.Source.RawText;
|
||||||
*/
|
*/
|
||||||
class PageModelExtractor {
|
class PageModelExtractor {
|
||||||
|
|
||||||
|
@Getter
|
||||||
private List<Pattern> targetUrlPatterns = new ArrayList<Pattern>();
|
private List<Pattern> targetUrlPatterns = new ArrayList<Pattern>();
|
||||||
|
|
||||||
|
@Getter
|
||||||
private Selector targetUrlRegionSelector;
|
private Selector targetUrlRegionSelector;
|
||||||
|
|
||||||
|
@Getter
|
||||||
private List<Pattern> helpUrlPatterns = new ArrayList<Pattern>();
|
private List<Pattern> helpUrlPatterns = new ArrayList<Pattern>();
|
||||||
|
|
||||||
|
@Getter
|
||||||
private Selector helpUrlRegionSelector;
|
private Selector helpUrlRegionSelector;
|
||||||
|
|
||||||
|
@Getter
|
||||||
private Class clazz;
|
private Class clazz;
|
||||||
|
|
||||||
private List<FieldExtractor> fieldExtractors;
|
private List<FieldExtractor> fieldExtractors;
|
||||||
|
@ -233,145 +242,16 @@ class PageModelExtractor {
|
||||||
try {
|
try {
|
||||||
o = clazz.newInstance();
|
o = clazz.newInstance();
|
||||||
for (FieldExtractor fieldExtractor : fieldExtractors) {
|
for (FieldExtractor fieldExtractor : fieldExtractors) {
|
||||||
if (fieldExtractor.isMulti()) {
|
Selection selection = fieldExtractor.isMulti() ? new MultipleSelection() : new SingleSelection();
|
||||||
List<String> value=getMultiValueFromSource(page, fieldExtractor, html, isRaw);
|
PageField field = selection.extractField(page, html, isRaw, fieldExtractor);
|
||||||
if ((value == null || value.size() == 0) && fieldExtractor.isNotNull()) {
|
if (!field.operation(o, fieldExtractor, logger))
|
||||||
return null;
|
return null;
|
||||||
}
|
|
||||||
if (fieldExtractor.getObjectFormatter() != null) {
|
|
||||||
List<Object> converted = convertMultiValue(value, fieldExtractor.getObjectFormatter());
|
|
||||||
setField(o, fieldExtractor, converted);
|
|
||||||
} else {
|
|
||||||
setField(o, fieldExtractor, value);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
String value=getSingleValueFromSource(page, fieldExtractor, html, isRaw);
|
|
||||||
if (value == null && fieldExtractor.isNotNull()) {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
if (fieldExtractor.getObjectFormatter() != null) {
|
|
||||||
Object converted = convertSingleValue(value, fieldExtractor.getObjectFormatter());
|
|
||||||
if (converted == null && fieldExtractor.isNotNull()) {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
setField(o, fieldExtractor, converted);
|
|
||||||
} else {
|
|
||||||
setField(o, fieldExtractor, value);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
if (AfterExtractor.class.isAssignableFrom(clazz)) {
|
if (AfterExtractor.class.isAssignableFrom(clazz))
|
||||||
((AfterExtractor) o).afterProcess(page);
|
((AfterExtractor) o).afterProcess(page);
|
||||||
}
|
} catch (Exception e) {
|
||||||
} catch (InstantiationException e) {
|
|
||||||
logger.error("extract fail", e);
|
|
||||||
} catch (IllegalAccessException e) {
|
|
||||||
logger.error("extract fail", e);
|
|
||||||
} catch (InvocationTargetException e) {
|
|
||||||
logger.error("extract fail", e);
|
logger.error("extract fail", e);
|
||||||
}
|
}
|
||||||
return o;
|
return o;
|
||||||
}
|
}
|
||||||
|
|
||||||
private List<String> getMultiValueFromSource(Page page, FieldExtractor fieldExtractor, String html, boolean isRaw) {
|
|
||||||
List<String> value;
|
|
||||||
switch (fieldExtractor.getSource()) {
|
|
||||||
case RawHtml:
|
|
||||||
value = page.getHtml().selectDocumentForList(fieldExtractor.getSelector());
|
|
||||||
break;
|
|
||||||
case Html:
|
|
||||||
if (isRaw) {
|
|
||||||
value = page.getHtml().selectDocumentForList(fieldExtractor.getSelector());
|
|
||||||
} else {
|
|
||||||
value = fieldExtractor.getSelector().selectList(html);
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
case Url:
|
|
||||||
value = fieldExtractor.getSelector().selectList(page.getUrl().toString());
|
|
||||||
break;
|
|
||||||
case RawText:
|
|
||||||
value = fieldExtractor.getSelector().selectList(page.getRawText());
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
value = fieldExtractor.getSelector().selectList(html);
|
|
||||||
}
|
|
||||||
return value;
|
|
||||||
}
|
|
||||||
|
|
||||||
private String getSingleValueFromSource(Page page, FieldExtractor fieldExtractor, String html, boolean isRaw) {
|
|
||||||
String value;
|
|
||||||
switch (fieldExtractor.getSource()) {
|
|
||||||
case RawHtml:
|
|
||||||
value = page.getHtml().selectDocument(fieldExtractor.getSelector());
|
|
||||||
break;
|
|
||||||
case Html:
|
|
||||||
if (isRaw) {
|
|
||||||
value = page.getHtml().selectDocument(fieldExtractor.getSelector());
|
|
||||||
} else {
|
|
||||||
value = fieldExtractor.getSelector().select(html);
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
case Url:
|
|
||||||
value = fieldExtractor.getSelector().select(page.getUrl().toString());
|
|
||||||
break;
|
|
||||||
case RawText:
|
|
||||||
value = fieldExtractor.getSelector().select(page.getRawText());
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
value = fieldExtractor.getSelector().select(html);
|
|
||||||
}
|
|
||||||
return value;
|
|
||||||
}
|
|
||||||
|
|
||||||
private Object convertSingleValue(String value, ObjectFormatter objectFormatter) {
|
|
||||||
try {
|
|
||||||
Object format = objectFormatter.format(value);
|
|
||||||
logger.debug("String {} is converted to {}", value, format);
|
|
||||||
return format;
|
|
||||||
} catch (Exception e) {
|
|
||||||
logger.error("convert " + value + " to " + objectFormatter.clazz() + " error!", e);
|
|
||||||
}
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
private List<Object> convertMultiValue(List<String> values, ObjectFormatter objectFormatter) {
|
|
||||||
List<Object> objects = new ArrayList<Object>();
|
|
||||||
for (String value : values) {
|
|
||||||
Object converted = convertSingleValue(value, objectFormatter);
|
|
||||||
if (converted != null) {
|
|
||||||
objects.add(converted);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return objects;
|
|
||||||
}
|
|
||||||
|
|
||||||
private void setField(Object o, FieldExtractor fieldExtractor, Object value) throws IllegalAccessException, InvocationTargetException {
|
|
||||||
if (value == null) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
if (fieldExtractor.getSetterMethod() != null) {
|
|
||||||
fieldExtractor.getSetterMethod().invoke(o, value);
|
|
||||||
}
|
|
||||||
fieldExtractor.getField().set(o, value);
|
|
||||||
}
|
|
||||||
|
|
||||||
Class getClazz() {
|
|
||||||
return clazz;
|
|
||||||
}
|
|
||||||
|
|
||||||
List<Pattern> getTargetUrlPatterns() {
|
|
||||||
return targetUrlPatterns;
|
|
||||||
}
|
|
||||||
|
|
||||||
List<Pattern> getHelpUrlPatterns() {
|
|
||||||
return helpUrlPatterns;
|
|
||||||
}
|
|
||||||
|
|
||||||
Selector getTargetUrlRegionSelector() {
|
|
||||||
return targetUrlRegionSelector;
|
|
||||||
}
|
|
||||||
|
|
||||||
Selector getHelpUrlRegionSelector() {
|
|
||||||
return helpUrlRegionSelector;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,42 @@
|
||||||
|
package us.codecraft.webmagic.model.fields;
|
||||||
|
|
||||||
|
import java.lang.reflect.InvocationTargetException;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
|
||||||
|
import lombok.Getter;
|
||||||
|
import us.codecraft.webmagic.model.FieldExtractor;
|
||||||
|
import us.codecraft.webmagic.model.formatter.ObjectFormatter;
|
||||||
|
|
||||||
|
public class MultipleField extends PageField {
|
||||||
|
@Getter
|
||||||
|
private List<String> fieldNames;
|
||||||
|
|
||||||
|
public MultipleField(List<String> fieldNames) {
|
||||||
|
this.fieldNames = fieldNames;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean operation(Object o, FieldExtractor fieldExtractor, Logger logger) throws IllegalAccessException, InvocationTargetException {
|
||||||
|
if ((this.fieldNames == null || this.fieldNames.size() == 0) && fieldExtractor.isNotNull())
|
||||||
|
return false;
|
||||||
|
if (fieldExtractor.getObjectFormatter() != null) {
|
||||||
|
List<Object> converted = this.convert(this.fieldNames, fieldExtractor.getObjectFormatter(), logger);
|
||||||
|
setField(o, fieldExtractor, converted);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
setField(o, fieldExtractor, this.fieldNames);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
private List<Object> convert(List<String> values, ObjectFormatter objectFormatter, Logger logger) {
|
||||||
|
List<Object> objects = new ArrayList<>();
|
||||||
|
for (String value : values) {
|
||||||
|
Object converted = this.convert(value, objectFormatter, logger);
|
||||||
|
if (converted != null)
|
||||||
|
objects.add(converted);
|
||||||
|
}
|
||||||
|
return objects;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,31 @@
|
||||||
|
package us.codecraft.webmagic.model.fields;
|
||||||
|
|
||||||
|
import java.lang.reflect.InvocationTargetException;
|
||||||
|
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
|
||||||
|
import us.codecraft.webmagic.model.FieldExtractor;
|
||||||
|
import us.codecraft.webmagic.model.formatter.ObjectFormatter;
|
||||||
|
|
||||||
|
public abstract class PageField {
|
||||||
|
public abstract boolean operation(Object o, FieldExtractor fieldExtractor, Logger logger) throws IllegalAccessException, InvocationTargetException;
|
||||||
|
|
||||||
|
protected Object convert(String value, ObjectFormatter objectFormatter, Logger logger) {
|
||||||
|
try {
|
||||||
|
Object format = objectFormatter.format(value);
|
||||||
|
logger.debug("String {} is converted to {}", value, format);
|
||||||
|
return format;
|
||||||
|
} catch (Exception e) {
|
||||||
|
logger.error("convert " + value + " to " + objectFormatter.clazz() + " error!", e);
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
protected void setField(Object o, FieldExtractor fieldExtractor, Object value) throws IllegalAccessException, InvocationTargetException {
|
||||||
|
if (value != null) {
|
||||||
|
if (fieldExtractor.getSetterMethod() != null)
|
||||||
|
fieldExtractor.getSetterMethod().invoke(o, value);
|
||||||
|
fieldExtractor.getField().set(o, value);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,28 @@
|
||||||
|
package us.codecraft.webmagic.model.fields;
|
||||||
|
|
||||||
|
import java.lang.reflect.InvocationTargetException;
|
||||||
|
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
|
||||||
|
import lombok.Getter;
|
||||||
|
import us.codecraft.webmagic.model.FieldExtractor;
|
||||||
|
|
||||||
|
public class SingleField extends PageField {
|
||||||
|
@Getter
|
||||||
|
private String fieldName;
|
||||||
|
|
||||||
|
public SingleField(String fieldName) {
|
||||||
|
this.fieldName = fieldName;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean operation(Object o, FieldExtractor fieldExtractor, Logger logger) throws IllegalAccessException, InvocationTargetException {
|
||||||
|
if (fieldExtractor.getObjectFormatter() != null) {
|
||||||
|
Object converted = this.convert(this.fieldName, fieldExtractor.getObjectFormatter(), logger);
|
||||||
|
if (converted == null && fieldExtractor.isNotNull())
|
||||||
|
return false;
|
||||||
|
setField(o, fieldExtractor, converted);
|
||||||
|
} else
|
||||||
|
setField(o, fieldExtractor, this.fieldName);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,36 @@
|
||||||
|
package us.codecraft.webmagic.model.selections;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import us.codecraft.webmagic.Page;
|
||||||
|
import us.codecraft.webmagic.model.FieldExtractor;
|
||||||
|
import us.codecraft.webmagic.model.fields.MultipleField;
|
||||||
|
|
||||||
|
public class MultipleSelection implements Selection {
|
||||||
|
public MultipleField extractField(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
|
||||||
|
List<String> fieldsName;
|
||||||
|
switch (fieldExtractor.getSource()) {
|
||||||
|
case RawHtml:
|
||||||
|
fieldsName = page.getHtml().selectDocumentForList(fieldExtractor.getSelector());
|
||||||
|
break;
|
||||||
|
case Html:
|
||||||
|
if (isRaw)
|
||||||
|
fieldsName = page.getHtml().selectDocumentForList(fieldExtractor.getSelector());
|
||||||
|
else
|
||||||
|
fieldsName = fieldExtractor.getSelector().selectList(html);
|
||||||
|
break;
|
||||||
|
case Url:
|
||||||
|
fieldsName = fieldExtractor.getSelector().selectList(page.getUrl().toString());
|
||||||
|
break;
|
||||||
|
case RawText:
|
||||||
|
fieldsName = fieldExtractor.getSelector().selectList(page.getRawText());
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
fieldsName = fieldExtractor.getSelector().selectList(html);
|
||||||
|
}
|
||||||
|
if ((fieldsName == null || fieldsName.size() == 0) && fieldExtractor.isNotNull()) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
return new MultipleField(fieldsName);
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,9 @@
|
||||||
|
package us.codecraft.webmagic.model.selections;
|
||||||
|
|
||||||
|
import us.codecraft.webmagic.Page;
|
||||||
|
import us.codecraft.webmagic.model.FieldExtractor;
|
||||||
|
import us.codecraft.webmagic.model.fields.PageField;
|
||||||
|
|
||||||
|
public interface Selection {
|
||||||
|
public PageField extractField(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor);
|
||||||
|
}
|
|
@ -0,0 +1,33 @@
|
||||||
|
package us.codecraft.webmagic.model.selections;
|
||||||
|
|
||||||
|
import us.codecraft.webmagic.Page;
|
||||||
|
import us.codecraft.webmagic.model.FieldExtractor;
|
||||||
|
import us.codecraft.webmagic.model.fields.SingleField;
|
||||||
|
|
||||||
|
public class SingleSelection implements Selection {
|
||||||
|
public SingleField extractField(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
|
||||||
|
String field;
|
||||||
|
switch (fieldExtractor.getSource()) {
|
||||||
|
case RawHtml:
|
||||||
|
field = page.getHtml().selectDocument(fieldExtractor.getSelector());
|
||||||
|
break;
|
||||||
|
case Html:
|
||||||
|
if (isRaw)
|
||||||
|
field = page.getHtml().selectDocument(fieldExtractor.getSelector());
|
||||||
|
else
|
||||||
|
field = fieldExtractor.getSelector().select(html);
|
||||||
|
break;
|
||||||
|
case Url:
|
||||||
|
field = fieldExtractor.getSelector().select(page.getUrl().toString());
|
||||||
|
break;
|
||||||
|
case RawText:
|
||||||
|
field = fieldExtractor.getSelector().select(page.getRawText());
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
field = fieldExtractor.getSelector().select(html);
|
||||||
|
}
|
||||||
|
if (field == null && fieldExtractor.isNotNull())
|
||||||
|
return null;
|
||||||
|
return new SingleField(field);
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue