Changed refactor of processSingle again, this one is a better version (#1157)
* Refactor of processSingle in PageModelExtractor
* Changed my refactor of processSingle, this one is a lot better
* Changed my refactor of processSingle, this one is a lot better
master
parent
05e5eefc7d
commit
2df7dca871
|
@ -2,6 +2,8 @@ package us.codecraft.webmagic.model;
|
||||||
|
|
||||||
import lombok.Getter;
|
import lombok.Getter;
|
||||||
import lombok.Setter;
|
import lombok.Setter;
|
||||||
|
|
||||||
|
import us.codecraft.webmagic.model.sources.Source;
|
||||||
import us.codecraft.webmagic.selector.Selector;
|
import us.codecraft.webmagic.selector.Selector;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -20,9 +22,7 @@ public class Extractor {
|
||||||
protected final boolean notNull;
|
protected final boolean notNull;
|
||||||
|
|
||||||
protected final boolean multi;
|
protected final boolean multi;
|
||||||
|
|
||||||
public static enum Source {Html, Url, RawHtml, RawText}
|
|
||||||
|
|
||||||
public Extractor(Selector selector, Source source, boolean notNull, boolean multi) {
|
public Extractor(Selector selector, Source source, boolean notNull, boolean multi) {
|
||||||
this.selector = selector;
|
this.selector = selector;
|
||||||
this.source = source;
|
this.source = source;
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
package us.codecraft.webmagic.model;
|
package us.codecraft.webmagic.model;
|
||||||
|
|
||||||
import us.codecraft.webmagic.model.formatter.ObjectFormatter;
|
import us.codecraft.webmagic.model.formatter.ObjectFormatter;
|
||||||
|
import us.codecraft.webmagic.model.sources.Source;
|
||||||
import us.codecraft.webmagic.selector.Selector;
|
import us.codecraft.webmagic.selector.Selector;
|
||||||
|
|
||||||
import java.lang.reflect.Field;
|
import java.lang.reflect.Field;
|
||||||
|
|
|
@ -9,9 +9,9 @@ import us.codecraft.webmagic.Page;
|
||||||
import us.codecraft.webmagic.model.annotation.*;
|
import us.codecraft.webmagic.model.annotation.*;
|
||||||
import us.codecraft.webmagic.model.fields.PageField;
|
import us.codecraft.webmagic.model.fields.PageField;
|
||||||
import us.codecraft.webmagic.model.formatter.ObjectFormatterBuilder;
|
import us.codecraft.webmagic.model.formatter.ObjectFormatterBuilder;
|
||||||
import us.codecraft.webmagic.model.selections.MultipleSelection;
|
import us.codecraft.webmagic.model.sources.Source;
|
||||||
import us.codecraft.webmagic.model.selections.Selection;
|
import us.codecraft.webmagic.model.sources.SourceTextExtractor;
|
||||||
import us.codecraft.webmagic.model.selections.SingleSelection;
|
import us.codecraft.webmagic.model.sources.Source.*;
|
||||||
import us.codecraft.webmagic.selector.*;
|
import us.codecraft.webmagic.selector.*;
|
||||||
import us.codecraft.webmagic.utils.ClassUtils;
|
import us.codecraft.webmagic.utils.ClassUtils;
|
||||||
import us.codecraft.webmagic.utils.ExtractorUtils;
|
import us.codecraft.webmagic.utils.ExtractorUtils;
|
||||||
|
@ -95,7 +95,7 @@ class PageModelExtractor {
|
||||||
regexPattern = ".*";
|
regexPattern = ".*";
|
||||||
}
|
}
|
||||||
fieldExtractor = new FieldExtractor(field,
|
fieldExtractor = new FieldExtractor(field,
|
||||||
new RegexSelector(regexPattern), FieldExtractor.Source.Url, extractByUrl.notNull(),
|
new RegexSelector(regexPattern), new Url(), extractByUrl.notNull(),
|
||||||
extractByUrl.multi() || List.class.isAssignableFrom(field.getType()));
|
extractByUrl.multi() || List.class.isAssignableFrom(field.getType()));
|
||||||
Method setterMethod = getSetterMethod(clazz, field);
|
Method setterMethod = getSetterMethod(clazz, field);
|
||||||
if (setterMethod != null) {
|
if (setterMethod != null) {
|
||||||
|
@ -121,7 +121,7 @@ class PageModelExtractor {
|
||||||
default:
|
default:
|
||||||
selector = new AndSelector(ExtractorUtils.getSelectors(extractBies));
|
selector = new AndSelector(ExtractorUtils.getSelectors(extractBies));
|
||||||
}
|
}
|
||||||
fieldExtractor = new FieldExtractor(field, selector, comboExtract.source() == ComboExtract.Source.RawHtml ? FieldExtractor.Source.RawHtml : FieldExtractor.Source.Html,
|
fieldExtractor = new FieldExtractor(field, selector, comboExtract.source() == ComboExtract.Source.RawHtml ? new RawHtml() : new SelectedHtml(),
|
||||||
comboExtract.notNull(), comboExtract.multi() || List.class.isAssignableFrom(field.getType()));
|
comboExtract.notNull(), comboExtract.multi() || List.class.isAssignableFrom(field.getType()));
|
||||||
Method setterMethod = getSetterMethod(clazz, field);
|
Method setterMethod = getSetterMethod(clazz, field);
|
||||||
if (setterMethod != null) {
|
if (setterMethod != null) {
|
||||||
|
@ -136,26 +136,23 @@ class PageModelExtractor {
|
||||||
ExtractBy extractBy = field.getAnnotation(ExtractBy.class);
|
ExtractBy extractBy = field.getAnnotation(ExtractBy.class);
|
||||||
if (extractBy != null) {
|
if (extractBy != null) {
|
||||||
Selector selector = ExtractorUtils.getSelector(extractBy);
|
Selector selector = ExtractorUtils.getSelector(extractBy);
|
||||||
ExtractBy.Source source0 = extractBy.source();
|
ExtractBy.Source extractSource = extractBy.source();
|
||||||
if (extractBy.type()== ExtractBy.Type.JsonPath){
|
if (extractBy.type()== ExtractBy.Type.JsonPath)
|
||||||
source0 = RawText;
|
extractSource = RawText;
|
||||||
}
|
Source source = null;
|
||||||
FieldExtractor.Source source = null;
|
switch (extractSource) {
|
||||||
switch (source0){
|
|
||||||
case RawText:
|
case RawText:
|
||||||
source = FieldExtractor.Source.RawText;
|
source = new RawText();
|
||||||
break;
|
break;
|
||||||
case RawHtml:
|
case RawHtml:
|
||||||
source = FieldExtractor.Source.RawHtml;
|
source = new RawHtml();
|
||||||
break;
|
break;
|
||||||
case SelectedHtml:
|
case SelectedHtml:
|
||||||
source =FieldExtractor.Source.Html;
|
source = new SelectedHtml();
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
source =FieldExtractor.Source.Html;
|
source = new SelectedHtml();
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fieldExtractor = new FieldExtractor(field, selector, source,
|
fieldExtractor = new FieldExtractor(field, selector, source,
|
||||||
extractBy.notNull(), List.class.isAssignableFrom(field.getType()));
|
extractBy.notNull(), List.class.isAssignableFrom(field.getType()));
|
||||||
fieldExtractor.setSetterMethod(getSetterMethod(clazz, field));
|
fieldExtractor.setSetterMethod(getSetterMethod(clazz, field));
|
||||||
|
@ -202,7 +199,7 @@ class PageModelExtractor {
|
||||||
annotation = clazz.getAnnotation(ExtractBy.class);
|
annotation = clazz.getAnnotation(ExtractBy.class);
|
||||||
if (annotation != null) {
|
if (annotation != null) {
|
||||||
ExtractBy extractBy = (ExtractBy) annotation;
|
ExtractBy extractBy = (ExtractBy) annotation;
|
||||||
objectExtractor = new Extractor(new XpathSelector(extractBy.value()), Extractor.Source.Html, extractBy.notNull(), extractBy.multi());
|
objectExtractor = new Extractor(new XpathSelector(extractBy.value()), new SelectedHtml(), extractBy.notNull(), extractBy.multi());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -242,8 +239,7 @@ class PageModelExtractor {
|
||||||
try {
|
try {
|
||||||
o = clazz.newInstance();
|
o = clazz.newInstance();
|
||||||
for (FieldExtractor fieldExtractor : fieldExtractors) {
|
for (FieldExtractor fieldExtractor : fieldExtractors) {
|
||||||
Selection selection = fieldExtractor.isMulti() ? new MultipleSelection() : new SingleSelection();
|
PageField field = SourceTextExtractor.getText(page, html, isRaw, fieldExtractor);
|
||||||
PageField field = selection.extractField(page, html, isRaw, fieldExtractor);
|
|
||||||
if (!field.operation(o, fieldExtractor, logger))
|
if (!field.operation(o, fieldExtractor, logger))
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,36 +0,0 @@
|
||||||
package us.codecraft.webmagic.model.selections;
|
|
||||||
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
import us.codecraft.webmagic.Page;
|
|
||||||
import us.codecraft.webmagic.model.FieldExtractor;
|
|
||||||
import us.codecraft.webmagic.model.fields.MultipleField;
|
|
||||||
|
|
||||||
/**
 * Selection for multi-valued fields: chooses the text to run the selector
 * against from {@code fieldExtractor.getSource()} and wraps the resulting
 * list of strings in a {@link MultipleField}.
 */
public class MultipleSelection implements Selection {
|
|
||||||
/**
 * Applies the field's selector to the configured source and collects the matches.
 *
 * @param page           supplies the parsed HTML, the URL and the raw text
 * @param html           text selected from for the Html source when isRaw is false, and in the default branch
 * @param isRaw          when true, the Html source selects from page.getHtml() instead of html
 * @param fieldExtractor supplies the source kind, the selector and the notNull flag
 * @return a MultipleField holding the matches, or null when the selection is
 *         null or empty and the field is marked notNull
 */
public MultipleField extractField(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
|
|
||||||
List<String> fieldsName;
|
|
||||||
switch (fieldExtractor.getSource()) {
|
|
||||||
case RawHtml:
|
|
||||||
fieldsName = page.getHtml().selectDocumentForList(fieldExtractor.getSelector());
|
|
||||||
break;
|
|
||||||
case Html:
|
|
||||||
// isRaw switches the Html source to the page's own document, same call as RawHtml
|
|
||||||
if (isRaw)
|
|
||||||
fieldsName = page.getHtml().selectDocumentForList(fieldExtractor.getSelector());
|
|
||||||
else
|
|
||||||
fieldsName = fieldExtractor.getSelector().selectList(html);
|
|
||||||
break;
|
|
||||||
case Url:
|
|
||||||
fieldsName = fieldExtractor.getSelector().selectList(page.getUrl().toString());
|
|
||||||
break;
|
|
||||||
case RawText:
|
|
||||||
fieldsName = fieldExtractor.getSelector().selectList(page.getRawText());
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
fieldsName = fieldExtractor.getSelector().selectList(html);
|
|
||||||
}
|
|
||||||
// a null return (not an empty field) signals a missing required value to the caller
|
|
||||||
if ((fieldsName == null || fieldsName.size() == 0) && fieldExtractor.isNotNull()) {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
return new MultipleField(fieldsName);
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,9 +0,0 @@
|
||||||
package us.codecraft.webmagic.model.selections;
|
|
||||||
|
|
||||||
import us.codecraft.webmagic.Page;
|
|
||||||
import us.codecraft.webmagic.model.FieldExtractor;
|
|
||||||
import us.codecraft.webmagic.model.fields.PageField;
|
|
||||||
|
|
||||||
/**
 * Contract for turning one field extractor into a {@link PageField}; the
 * implementations visible in this change are SingleSelection and MultipleSelection.
 */
public interface Selection {
|
|
||||||
/**
 * Extracts the value(s) for one field.
 *
 * @param page           the page being processed
 * @param html           text that implementations select from when the source is not taken from the page itself
 * @param isRaw          flag implementations consult to choose between html and page.getHtml()
 * @param fieldExtractor the extractor describing the field
 * @return the extracted field; the visible implementations return null when a notNull field has no value
 */
public PageField extractField(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor);
|
|
||||||
}
|
|
|
@ -1,33 +0,0 @@
|
||||||
package us.codecraft.webmagic.model.selections;
|
|
||||||
|
|
||||||
import us.codecraft.webmagic.Page;
|
|
||||||
import us.codecraft.webmagic.model.FieldExtractor;
|
|
||||||
import us.codecraft.webmagic.model.fields.SingleField;
|
|
||||||
|
|
||||||
/**
 * Selection for single-valued fields: chooses the text to run the selector
 * against from {@code fieldExtractor.getSource()} and wraps the resulting
 * string in a {@link SingleField}.
 */
public class SingleSelection implements Selection {
|
|
||||||
/**
 * Applies the field's selector to the configured source and returns one match.
 *
 * @param page           supplies the parsed HTML, the URL and the raw text
 * @param html           text selected from for the Html source when isRaw is false, and in the default branch
 * @param isRaw          when true, the Html source selects from page.getHtml() instead of html
 * @param fieldExtractor supplies the source kind, the selector and the notNull flag
 * @return a SingleField holding the match, or null when the match is null
 *         and the field is marked notNull
 */
public SingleField extractField(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
|
|
||||||
String field;
|
|
||||||
switch (fieldExtractor.getSource()) {
|
|
||||||
case RawHtml:
|
|
||||||
field = page.getHtml().selectDocument(fieldExtractor.getSelector());
|
|
||||||
break;
|
|
||||||
case Html:
|
|
||||||
// isRaw switches the Html source to the page's own document, same call as RawHtml
|
|
||||||
if (isRaw)
|
|
||||||
field = page.getHtml().selectDocument(fieldExtractor.getSelector());
|
|
||||||
else
|
|
||||||
field = fieldExtractor.getSelector().select(html);
|
|
||||||
break;
|
|
||||||
case Url:
|
|
||||||
field = fieldExtractor.getSelector().select(page.getUrl().toString());
|
|
||||||
break;
|
|
||||||
case RawText:
|
|
||||||
field = fieldExtractor.getSelector().select(page.getRawText());
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
field = fieldExtractor.getSelector().select(html);
|
|
||||||
}
|
|
||||||
// a null return (not a field wrapping null) signals a missing required value to the caller
|
|
||||||
if (field == null && fieldExtractor.isNotNull())
|
|
||||||
return null;
|
|
||||||
|
|
||||||
return new SingleField(field);
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -0,0 +1,68 @@
|
||||||
|
package us.codecraft.webmagic.model.sources;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import us.codecraft.webmagic.Page;
|
||||||
|
import us.codecraft.webmagic.model.FieldExtractor;
|
||||||
|
|
||||||
|
/**
 * Describes where a field's selector reads its input from. Each nested
 * implementation picks a different input (page document, html argument,
 * page URL or page raw text) and applies {@code fieldExtractor.getSelector()} to it.
 */
public interface Source {
|
||||||
|
/**
 * Selects a single value.
 *
 * @param page           supplies the parsed HTML, the URL and the raw text
 * @param html           text used by implementations that do not read from the page
 * @param isRaw          consulted only by SelectedHtml among the nested implementations
 * @param fieldExtractor supplies the selector to apply
 * @return whatever the selector yields for the chosen input
 */
public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor);
|
||||||
|
/**
 * Selects every matching value; parameters are the same as for getText.
 *
 * @return the list the selector yields for the chosen input
 */
public List<String> getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor);
|
||||||
|
|
||||||
|
/** Always selects from the page's own document via page.getHtml(); html and isRaw are ignored. */
public class RawHtml implements Source {
|
||||||
|
public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
|
||||||
|
return page.getHtml().selectDocument(fieldExtractor.getSelector());
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<String> getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
|
||||||
|
return page.getHtml().selectDocumentForList(fieldExtractor.getSelector());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Selects from the page's document when isRaw is true, otherwise from the html argument. */
public class SelectedHtml implements Source {
|
||||||
|
public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
|
||||||
|
if (isRaw)
|
||||||
|
return page.getHtml().selectDocument(fieldExtractor.getSelector());
|
||||||
|
else
|
||||||
|
return fieldExtractor.getSelector().select(html);
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<String> getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
|
||||||
|
if (isRaw)
|
||||||
|
return page.getHtml().selectDocumentForList(fieldExtractor.getSelector());
|
||||||
|
else
|
||||||
|
return fieldExtractor.getSelector().selectList(html);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Selects from the string form of page.getUrl(); html and isRaw are ignored. */
public class Url implements Source {
|
||||||
|
public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
|
||||||
|
return fieldExtractor.getSelector().select(page.getUrl().toString());
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<String> getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
|
||||||
|
return fieldExtractor.getSelector().selectList(page.getUrl().toString());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Selects from page.getRawText(); html and isRaw are ignored. */
public class RawText implements Source {
|
||||||
|
public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
|
||||||
|
return fieldExtractor.getSelector().select(page.getRawText());
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<String> getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
|
||||||
|
return fieldExtractor.getSelector().selectList(page.getRawText());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Always selects from the html argument; page and isRaw are ignored. */
public class DefaultSource implements Source {
|
||||||
|
public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
|
||||||
|
return fieldExtractor.getSelector().select(html);
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<String> getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
|
||||||
|
return fieldExtractor.getSelector().selectList(html);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
@ -0,0 +1,17 @@
|
||||||
|
package us.codecraft.webmagic.model.sources;
|
||||||
|
|
||||||
|
import us.codecraft.webmagic.Page;
|
||||||
|
import us.codecraft.webmagic.model.FieldExtractor;
|
||||||
|
import us.codecraft.webmagic.model.fields.MultipleField;
|
||||||
|
import us.codecraft.webmagic.model.fields.PageField;
|
||||||
|
import us.codecraft.webmagic.model.fields.SingleField;
|
||||||
|
|
||||||
|
/**
 * Builds the {@link PageField} for a field extractor by delegating the text
 * selection to the extractor's {@link Source}.
 */
public class SourceTextExtractor {
|
||||||
|
/**
 * Wraps the source's selection result in the field type matching the extractor's multiplicity.
 *
 * @param page           passed through to the source
 * @param html           passed through to the source
 * @param isRaw          passed through to the source
 * @param fieldExtractor supplies the source and the multi flag
 * @return a MultipleField around source.getTextList(...) when fieldExtractor.isMulti()
 *         is true, otherwise a SingleField around source.getText(...)
 */
// NOTE(review): fieldExtractor.isNotNull() is not consulted here, so this method never
// returns null for an empty selection; the SingleSelection/MultipleSelection classes
// removed in this change did. Confirm PageField.operation enforces notNull.
public static PageField getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
|
||||||
|
Source source = fieldExtractor.getSource();
|
||||||
|
if (fieldExtractor.isMulti())
|
||||||
|
return new MultipleField(source.getTextList(page, html, isRaw, fieldExtractor));
|
||||||
|
else
|
||||||
|
return new SingleField(source.getText(page, html, isRaw, fieldExtractor));
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue