From 05e5eefc7d9e7dd8fd8b85cb297b2f5e30f56e6d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Gibier?= Date: Fri, 5 Apr 2024 15:51:08 +0200 Subject: [PATCH] Refactor of processSingle in PageModelExtractor (#1155) --- webmagic-extension/pom.xml | 6 + .../codecraft/webmagic/model/Extractor.java | 24 +-- .../webmagic/model/FieldExtractor.java | 40 +---- .../webmagic/model/PageModelExtractor.java | 154 ++---------------- .../webmagic/model/fields/MultipleField.java | 42 +++++ .../webmagic/model/fields/PageField.java | 31 ++++ .../webmagic/model/fields/SingleField.java | 28 ++++ .../model/selections/MultipleSelection.java | 36 ++++ .../webmagic/model/selections/Selection.java | 9 + .../model/selections/SingleSelection.java | 33 ++++ .../webmagic/utils/DoubleKeyMap.java | 0 .../webmagic/utils/MultiKeyMapBase.java | 0 12 files changed, 217 insertions(+), 186 deletions(-) create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/MultipleField.java create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/PageField.java create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/SingleField.java create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/model/selections/MultipleSelection.java create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/model/selections/Selection.java create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/model/selections/SingleSelection.java mode change 100755 => 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java mode change 100755 => 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index a234a4f..8d2c070 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -10,6 +10,12 @@ webmagic-extension + + org.projectlombok + lombok + 1.18.32 + provided + redis.clients jedis diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/Extractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/Extractor.java index f1d2f84..d64adff 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/Extractor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/Extractor.java @@ -1,5 +1,7 @@ package us.codecraft.webmagic.model; +import lombok.Getter; +import lombok.Setter; import us.codecraft.webmagic.selector.Selector; /** @@ -7,17 +9,19 @@ import us.codecraft.webmagic.selector.Selector; * @author code4crafter@gmail.com
* @since 0.2.0 */ -class Extractor { +public class Extractor { + @Getter @Setter protected Selector selector; + @Getter protected final Source source; protected final boolean notNull; protected final boolean multi; - static enum Source {Html, Url, RawHtml, RawText} + public static enum Source {Html, Url, RawHtml, RawText} public Extractor(Selector selector, Source source, boolean notNull, boolean multi) { this.selector = selector; @@ -26,23 +30,11 @@ class Extractor { this.multi = multi; } - Selector getSelector() { - return selector; - } - - Source getSource() { - return source; - } - - boolean isNotNull() { + public boolean isNotNull() { return notNull; } - boolean isMulti() { + public boolean isMulti() { return multi; } - - void setSelector(Selector selector) { - this.selector = selector; - } } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java index a2cba13..a49ea77 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java @@ -6,53 +6,27 @@ import us.codecraft.webmagic.selector.Selector; import java.lang.reflect.Field; import java.lang.reflect.Method; +import lombok.Getter; +import lombok.Setter; + /** * Wrapper of field and extractor. * @author code4crafter@gmail.com
* @since 0.2.0 */ -class FieldExtractor extends Extractor { +public class FieldExtractor extends Extractor { + @Getter private final Field field; + @Getter @Setter private Method setterMethod; + @Getter @Setter private ObjectFormatter objectFormatter; public FieldExtractor(Field field, Selector selector, Source source, boolean notNull, boolean multi) { super(selector, source, notNull, multi); this.field = field; } - - Field getField() { - return field; - } - - Selector getSelector() { - return selector; - } - - Source getSource() { - return source; - } - - void setSetterMethod(Method setterMethod) { - this.setterMethod = setterMethod; - } - - Method getSetterMethod() { - return setterMethod; - } - - boolean isNotNull() { - return notNull; - } - - ObjectFormatter getObjectFormatter() { - return objectFormatter; - } - - void setObjectFormatter(ObjectFormatter objectFormatter) { - this.objectFormatter = objectFormatter; - } } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java index d8947de..de71717 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java @@ -3,17 +3,21 @@ package us.codecraft.webmagic.model; import org.apache.commons.lang3.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; + +import lombok.Getter; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.model.annotation.*; -import us.codecraft.webmagic.model.formatter.ObjectFormatter; +import us.codecraft.webmagic.model.fields.PageField; import us.codecraft.webmagic.model.formatter.ObjectFormatterBuilder; +import us.codecraft.webmagic.model.selections.MultipleSelection; +import us.codecraft.webmagic.model.selections.Selection; +import us.codecraft.webmagic.model.selections.SingleSelection; import us.codecraft.webmagic.selector.*; import us.codecraft.webmagic.utils.ClassUtils; import us.codecraft.webmagic.utils.ExtractorUtils; import java.lang.annotation.Annotation; import java.lang.reflect.Field; -import java.lang.reflect.InvocationTargetException; import java.lang.reflect.Method; import java.util.ArrayList; import java.util.List; @@ -29,14 +33,19 @@ import static us.codecraft.webmagic.model.annotation.ExtractBy.Source.RawText; */ class PageModelExtractor { + @Getter private List targetUrlPatterns = new ArrayList(); + @Getter private Selector targetUrlRegionSelector; + @Getter private List helpUrlPatterns = new ArrayList(); + @Getter private Selector helpUrlRegionSelector; + @Getter private Class clazz; private List fieldExtractors; @@ -233,145 +242,16 @@ class PageModelExtractor { try { o = clazz.newInstance(); for (FieldExtractor fieldExtractor : fieldExtractors) { - if (fieldExtractor.isMulti()) { - List value=getMultiValueFromSource(page, fieldExtractor, html, isRaw); - if ((value == null || value.size() == 0) && fieldExtractor.isNotNull()) { - return null; - } - if (fieldExtractor.getObjectFormatter() != null) { - List converted = convertMultiValue(value, fieldExtractor.getObjectFormatter()); - setField(o, fieldExtractor, converted); - } else { - setField(o, fieldExtractor, value); - } - } else { - String value=getSingleValueFromSource(page, fieldExtractor, html, isRaw); - if (value == null && fieldExtractor.isNotNull()) { - return null; - } - if (fieldExtractor.getObjectFormatter() != null) { - Object converted = convertSingleValue(value, fieldExtractor.getObjectFormatter()); - if (converted == null && fieldExtractor.isNotNull()) { - return null; - } - setField(o, fieldExtractor, converted); - } else { - setField(o, fieldExtractor, value); - } - } + Selection selection = fieldExtractor.isMulti() ? new MultipleSelection() : new SingleSelection(); + PageField field = selection.extractField(page, html, isRaw, fieldExtractor); + if (!field.operation(o, fieldExtractor, logger)) + return null; } - if (AfterExtractor.class.isAssignableFrom(clazz)) { + if (AfterExtractor.class.isAssignableFrom(clazz)) ((AfterExtractor) o).afterProcess(page); - } - } catch (InstantiationException e) { - logger.error("extract fail", e); - } catch (IllegalAccessException e) { - logger.error("extract fail", e); - } catch (InvocationTargetException e) { + } catch (Exception e) { logger.error("extract fail", e); } return o; } - - private List getMultiValueFromSource(Page page, FieldExtractor fieldExtractor, String html, boolean isRaw) { - List value; - switch (fieldExtractor.getSource()) { - case RawHtml: - value = page.getHtml().selectDocumentForList(fieldExtractor.getSelector()); - break; - case Html: - if (isRaw) { - value = page.getHtml().selectDocumentForList(fieldExtractor.getSelector()); - } else { - value = fieldExtractor.getSelector().selectList(html); - } - break; - case Url: - value = fieldExtractor.getSelector().selectList(page.getUrl().toString()); - break; - case RawText: - value = fieldExtractor.getSelector().selectList(page.getRawText()); - break; - default: - value = fieldExtractor.getSelector().selectList(html); - } - return value; - } - - private String getSingleValueFromSource(Page page, FieldExtractor fieldExtractor, String html, boolean isRaw) { - String value; - switch (fieldExtractor.getSource()) { - case RawHtml: - value = page.getHtml().selectDocument(fieldExtractor.getSelector()); - break; - case Html: - if (isRaw) { - value = page.getHtml().selectDocument(fieldExtractor.getSelector()); - } else { - value = fieldExtractor.getSelector().select(html); - } - break; - case Url: - value = fieldExtractor.getSelector().select(page.getUrl().toString()); - break; - case RawText: - value = fieldExtractor.getSelector().select(page.getRawText()); - break; - default: - value = fieldExtractor.getSelector().select(html); - } - return value; - } - - private Object convertSingleValue(String value, ObjectFormatter objectFormatter) { - try { - Object format = objectFormatter.format(value); - logger.debug("String {} is converted to {}", value, format); - return format; - } catch (Exception e) { - logger.error("convert " + value + " to " + objectFormatter.clazz() + " error!", e); - } - return null; - } - - private List convertMultiValue(List values, ObjectFormatter objectFormatter) { - List objects = new ArrayList(); - for (String value : values) { - Object converted = convertSingleValue(value, objectFormatter); - if (converted != null) { - objects.add(converted); - } - } - return objects; - } - - private void setField(Object o, FieldExtractor fieldExtractor, Object value) throws IllegalAccessException, InvocationTargetException { - if (value == null) { - return; - } - if (fieldExtractor.getSetterMethod() != null) { - fieldExtractor.getSetterMethod().invoke(o, value); - } - fieldExtractor.getField().set(o, value); - } - - Class getClazz() { - return clazz; - } - - List getTargetUrlPatterns() { - return targetUrlPatterns; - } - - List getHelpUrlPatterns() { - return helpUrlPatterns; - } - - Selector getTargetUrlRegionSelector() { - return targetUrlRegionSelector; - } - - Selector getHelpUrlRegionSelector() { - return helpUrlRegionSelector; - } } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/MultipleField.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/MultipleField.java new file mode 100644 index 0000000..4a4bf38 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/MultipleField.java @@ -0,0 +1,42 @@ +package us.codecraft.webmagic.model.fields; + +import java.lang.reflect.InvocationTargetException; +import java.util.ArrayList; +import java.util.List; + +import org.slf4j.Logger; + +import lombok.Getter; +import us.codecraft.webmagic.model.FieldExtractor; +import us.codecraft.webmagic.model.formatter.ObjectFormatter; + +public class MultipleField extends PageField { + @Getter + private List fieldNames; + + public MultipleField(List fieldNames) { + this.fieldNames = fieldNames; + } + + public boolean operation(Object o, FieldExtractor fieldExtractor, Logger logger) throws IllegalAccessException, InvocationTargetException { + if ((this.fieldNames == null || this.fieldNames.size() == 0) && fieldExtractor.isNotNull()) + return false; + if (fieldExtractor.getObjectFormatter() != null) { + List converted = this.convert(this.fieldNames, fieldExtractor.getObjectFormatter(), logger); + setField(o, fieldExtractor, converted); + } + else + setField(o, fieldExtractor, this.fieldNames); + return true; + } + + private List convert(List values, ObjectFormatter objectFormatter, Logger logger) { + List objects = new ArrayList<>(); + for (String value : values) { + Object converted = this.convert(value, objectFormatter, logger); + if (converted != null) + objects.add(converted); + } + return objects; + } +} \ No newline at end of file diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/PageField.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/PageField.java new file mode 100644 index 0000000..ad44283 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/PageField.java @@ -0,0 +1,31 @@ +package us.codecraft.webmagic.model.fields; + +import java.lang.reflect.InvocationTargetException; + +import org.slf4j.Logger; + +import us.codecraft.webmagic.model.FieldExtractor; +import us.codecraft.webmagic.model.formatter.ObjectFormatter; + +public abstract class PageField { + public abstract boolean operation(Object o, FieldExtractor fieldExtractor, Logger logger) throws IllegalAccessException, InvocationTargetException; + + protected Object convert(String value, ObjectFormatter objectFormatter, Logger logger) { + try { + Object format = objectFormatter.format(value); + logger.debug("String {} is converted to {}", value, format); + return format; + } catch (Exception e) { + logger.error("convert " + value + " to " + objectFormatter.clazz() + " error!", e); + } + return null; + } + + protected void setField(Object o, FieldExtractor fieldExtractor, Object value) throws IllegalAccessException, InvocationTargetException { + if (value != null) { + if (fieldExtractor.getSetterMethod() != null) + fieldExtractor.getSetterMethod().invoke(o, value); + fieldExtractor.getField().set(o, value); + } + } +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/SingleField.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/SingleField.java new file mode 100644 index 0000000..136a1c5 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/SingleField.java @@ -0,0 +1,28 @@ +package us.codecraft.webmagic.model.fields; + +import java.lang.reflect.InvocationTargetException; + +import org.slf4j.Logger; + +import lombok.Getter; +import us.codecraft.webmagic.model.FieldExtractor; + +public class SingleField extends PageField { + @Getter + private String fieldName; + + public SingleField(String fieldName) { + this.fieldName = fieldName; + } + + public boolean operation(Object o, FieldExtractor fieldExtractor, Logger logger) throws IllegalAccessException, InvocationTargetException { + if (fieldExtractor.getObjectFormatter() != null) { + Object converted = this.convert(this.fieldName, fieldExtractor.getObjectFormatter(), logger); + if (converted == null && fieldExtractor.isNotNull()) + return false; + setField(o, fieldExtractor, converted); + } else + setField(o, fieldExtractor, this.fieldName); + return true; + } +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/selections/MultipleSelection.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/selections/MultipleSelection.java new file mode 100644 index 0000000..d49f9c5 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/selections/MultipleSelection.java @@ -0,0 +1,36 @@ +package us.codecraft.webmagic.model.selections; + +import java.util.List; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.model.FieldExtractor; +import us.codecraft.webmagic.model.fields.MultipleField; + +public class MultipleSelection implements Selection { + public MultipleField extractField(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) { + List fieldsName; + switch (fieldExtractor.getSource()) { + case RawHtml: + fieldsName = page.getHtml().selectDocumentForList(fieldExtractor.getSelector()); + break; + case Html: + if (isRaw) + fieldsName = page.getHtml().selectDocumentForList(fieldExtractor.getSelector()); + else + fieldsName = fieldExtractor.getSelector().selectList(html); + break; + case Url: + fieldsName = fieldExtractor.getSelector().selectList(page.getUrl().toString()); + break; + case RawText: + fieldsName = fieldExtractor.getSelector().selectList(page.getRawText()); + break; + default: + fieldsName = fieldExtractor.getSelector().selectList(html); + } + if ((fieldsName == null || fieldsName.size() == 0) && fieldExtractor.isNotNull()) { + return null; + } + return new MultipleField(fieldsName); + } +} \ No newline at end of file diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/selections/Selection.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/selections/Selection.java new file mode 100644 index 0000000..e70ab9d --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/selections/Selection.java @@ -0,0 +1,9 @@ +package us.codecraft.webmagic.model.selections; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.model.FieldExtractor; +import us.codecraft.webmagic.model.fields.PageField; + +public interface Selection { + public PageField extractField(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor); +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/selections/SingleSelection.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/selections/SingleSelection.java new file mode 100644 index 0000000..a4c1fe4 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/selections/SingleSelection.java @@ -0,0 +1,33 @@ +package us.codecraft.webmagic.model.selections; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.model.FieldExtractor; +import us.codecraft.webmagic.model.fields.SingleField; + +public class SingleSelection implements Selection { + public SingleField extractField(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) { + String field; + switch (fieldExtractor.getSource()) { + case RawHtml: + field = page.getHtml().selectDocument(fieldExtractor.getSelector()); + break; + case Html: + if (isRaw) + field = page.getHtml().selectDocument(fieldExtractor.getSelector()); + else + field = fieldExtractor.getSelector().select(html); + break; + case Url: + field = fieldExtractor.getSelector().select(page.getUrl().toString()); + break; + case RawText: + field = fieldExtractor.getSelector().select(page.getRawText()); + break; + default: + field = fieldExtractor.getSelector().select(html); + } + if (field == null && fieldExtractor.isNotNull()) + return null; + return new SingleField(field); + } +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java old mode 100755 new mode 100644 diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java old mode 100755 new mode 100644