Changed refactor of processSingle again, this one is a better version (#1157)
* Refactor of processSingle in PageModelExtractor
* Changed my refactor of processSingle, this one is a lot better
* Changed my refactor of processSingle, this one is a lot better
master
parent
05e5eefc7d
commit
2df7dca871
|
@ -2,6 +2,8 @@ package us.codecraft.webmagic.model;
|
|||
|
||||
import lombok.Getter;
|
||||
import lombok.Setter;
|
||||
|
||||
import us.codecraft.webmagic.model.sources.Source;
|
||||
import us.codecraft.webmagic.selector.Selector;
|
||||
|
||||
/**
|
||||
|
@ -20,9 +22,7 @@ public class Extractor {
|
|||
protected final boolean notNull;
|
||||
|
||||
protected final boolean multi;
|
||||
|
||||
public static enum Source {Html, Url, RawHtml, RawText}
|
||||
|
||||
|
||||
public Extractor(Selector selector, Source source, boolean notNull, boolean multi) {
|
||||
this.selector = selector;
|
||||
this.source = source;
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
package us.codecraft.webmagic.model;
|
||||
|
||||
import us.codecraft.webmagic.model.formatter.ObjectFormatter;
|
||||
import us.codecraft.webmagic.model.sources.Source;
|
||||
import us.codecraft.webmagic.selector.Selector;
|
||||
|
||||
import java.lang.reflect.Field;
|
||||
|
|
|
@ -9,9 +9,9 @@ import us.codecraft.webmagic.Page;
|
|||
import us.codecraft.webmagic.model.annotation.*;
|
||||
import us.codecraft.webmagic.model.fields.PageField;
|
||||
import us.codecraft.webmagic.model.formatter.ObjectFormatterBuilder;
|
||||
import us.codecraft.webmagic.model.selections.MultipleSelection;
|
||||
import us.codecraft.webmagic.model.selections.Selection;
|
||||
import us.codecraft.webmagic.model.selections.SingleSelection;
|
||||
import us.codecraft.webmagic.model.sources.Source;
|
||||
import us.codecraft.webmagic.model.sources.SourceTextExtractor;
|
||||
import us.codecraft.webmagic.model.sources.Source.*;
|
||||
import us.codecraft.webmagic.selector.*;
|
||||
import us.codecraft.webmagic.utils.ClassUtils;
|
||||
import us.codecraft.webmagic.utils.ExtractorUtils;
|
||||
|
@ -95,7 +95,7 @@ class PageModelExtractor {
|
|||
regexPattern = ".*";
|
||||
}
|
||||
fieldExtractor = new FieldExtractor(field,
|
||||
new RegexSelector(regexPattern), FieldExtractor.Source.Url, extractByUrl.notNull(),
|
||||
new RegexSelector(regexPattern), new Url(), extractByUrl.notNull(),
|
||||
extractByUrl.multi() || List.class.isAssignableFrom(field.getType()));
|
||||
Method setterMethod = getSetterMethod(clazz, field);
|
||||
if (setterMethod != null) {
|
||||
|
@ -121,7 +121,7 @@ class PageModelExtractor {
|
|||
default:
|
||||
selector = new AndSelector(ExtractorUtils.getSelectors(extractBies));
|
||||
}
|
||||
fieldExtractor = new FieldExtractor(field, selector, comboExtract.source() == ComboExtract.Source.RawHtml ? FieldExtractor.Source.RawHtml : FieldExtractor.Source.Html,
|
||||
fieldExtractor = new FieldExtractor(field, selector, comboExtract.source() == ComboExtract.Source.RawHtml ? new RawHtml() : new SelectedHtml(),
|
||||
comboExtract.notNull(), comboExtract.multi() || List.class.isAssignableFrom(field.getType()));
|
||||
Method setterMethod = getSetterMethod(clazz, field);
|
||||
if (setterMethod != null) {
|
||||
|
@ -136,26 +136,23 @@ class PageModelExtractor {
|
|||
ExtractBy extractBy = field.getAnnotation(ExtractBy.class);
|
||||
if (extractBy != null) {
|
||||
Selector selector = ExtractorUtils.getSelector(extractBy);
|
||||
ExtractBy.Source source0 = extractBy.source();
|
||||
if (extractBy.type()== ExtractBy.Type.JsonPath){
|
||||
source0 = RawText;
|
||||
}
|
||||
FieldExtractor.Source source = null;
|
||||
switch (source0){
|
||||
ExtractBy.Source extractSource = extractBy.source();
|
||||
if (extractBy.type()== ExtractBy.Type.JsonPath)
|
||||
extractSource = RawText;
|
||||
Source source = null;
|
||||
switch (extractSource) {
|
||||
case RawText:
|
||||
source = FieldExtractor.Source.RawText;
|
||||
source = new RawText();
|
||||
break;
|
||||
case RawHtml:
|
||||
source = FieldExtractor.Source.RawHtml;
|
||||
source = new RawHtml();
|
||||
break;
|
||||
case SelectedHtml:
|
||||
source =FieldExtractor.Source.Html;
|
||||
source = new SelectedHtml();
|
||||
break;
|
||||
default:
|
||||
source =FieldExtractor.Source.Html;
|
||||
|
||||
source = new SelectedHtml();
|
||||
}
|
||||
|
||||
fieldExtractor = new FieldExtractor(field, selector, source,
|
||||
extractBy.notNull(), List.class.isAssignableFrom(field.getType()));
|
||||
fieldExtractor.setSetterMethod(getSetterMethod(clazz, field));
|
||||
|
@ -202,7 +199,7 @@ class PageModelExtractor {
|
|||
annotation = clazz.getAnnotation(ExtractBy.class);
|
||||
if (annotation != null) {
|
||||
ExtractBy extractBy = (ExtractBy) annotation;
|
||||
objectExtractor = new Extractor(new XpathSelector(extractBy.value()), Extractor.Source.Html, extractBy.notNull(), extractBy.multi());
|
||||
objectExtractor = new Extractor(new XpathSelector(extractBy.value()), new SelectedHtml(), extractBy.notNull(), extractBy.multi());
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -242,8 +239,7 @@ class PageModelExtractor {
|
|||
try {
|
||||
o = clazz.newInstance();
|
||||
for (FieldExtractor fieldExtractor : fieldExtractors) {
|
||||
Selection selection = fieldExtractor.isMulti() ? new MultipleSelection() : new SingleSelection();
|
||||
PageField field = selection.extractField(page, html, isRaw, fieldExtractor);
|
||||
PageField field = SourceTextExtractor.getText(page, html, isRaw, fieldExtractor);
|
||||
if (!field.operation(o, fieldExtractor, logger))
|
||||
return null;
|
||||
}
|
||||
|
|
|
@ -1,36 +0,0 @@
|
|||
package us.codecraft.webmagic.model.selections;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.model.FieldExtractor;
|
||||
import us.codecraft.webmagic.model.fields.MultipleField;
|
||||
|
||||
public class MultipleSelection implements Selection {
|
||||
public MultipleField extractField(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
|
||||
List<String> fieldsName;
|
||||
switch (fieldExtractor.getSource()) {
|
||||
case RawHtml:
|
||||
fieldsName = page.getHtml().selectDocumentForList(fieldExtractor.getSelector());
|
||||
break;
|
||||
case Html:
|
||||
if (isRaw)
|
||||
fieldsName = page.getHtml().selectDocumentForList(fieldExtractor.getSelector());
|
||||
else
|
||||
fieldsName = fieldExtractor.getSelector().selectList(html);
|
||||
break;
|
||||
case Url:
|
||||
fieldsName = fieldExtractor.getSelector().selectList(page.getUrl().toString());
|
||||
break;
|
||||
case RawText:
|
||||
fieldsName = fieldExtractor.getSelector().selectList(page.getRawText());
|
||||
break;
|
||||
default:
|
||||
fieldsName = fieldExtractor.getSelector().selectList(html);
|
||||
}
|
||||
if ((fieldsName == null || fieldsName.size() == 0) && fieldExtractor.isNotNull()) {
|
||||
return null;
|
||||
}
|
||||
return new MultipleField(fieldsName);
|
||||
}
|
||||
}
|
|
@ -1,9 +0,0 @@
|
|||
package us.codecraft.webmagic.model.selections;
|
||||
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.model.FieldExtractor;
|
||||
import us.codecraft.webmagic.model.fields.PageField;
|
||||
|
||||
public interface Selection {
|
||||
public PageField extractField(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor);
|
||||
}
|
|
@ -1,33 +0,0 @@
|
|||
package us.codecraft.webmagic.model.selections;
|
||||
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.model.FieldExtractor;
|
||||
import us.codecraft.webmagic.model.fields.SingleField;
|
||||
|
||||
public class SingleSelection implements Selection {
|
||||
public SingleField extractField(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
|
||||
String field;
|
||||
switch (fieldExtractor.getSource()) {
|
||||
case RawHtml:
|
||||
field = page.getHtml().selectDocument(fieldExtractor.getSelector());
|
||||
break;
|
||||
case Html:
|
||||
if (isRaw)
|
||||
field = page.getHtml().selectDocument(fieldExtractor.getSelector());
|
||||
else
|
||||
field = fieldExtractor.getSelector().select(html);
|
||||
break;
|
||||
case Url:
|
||||
field = fieldExtractor.getSelector().select(page.getUrl().toString());
|
||||
break;
|
||||
case RawText:
|
||||
field = fieldExtractor.getSelector().select(page.getRawText());
|
||||
break;
|
||||
default:
|
||||
field = fieldExtractor.getSelector().select(html);
|
||||
}
|
||||
if (field == null && fieldExtractor.isNotNull())
|
||||
return null;
|
||||
return new SingleField(field);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,68 @@
|
|||
package us.codecraft.webmagic.model.sources;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.model.FieldExtractor;
|
||||
|
||||
public interface Source {
|
||||
public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor);
|
||||
public List<String> getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor);
|
||||
|
||||
public class RawHtml implements Source {
|
||||
public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
|
||||
return page.getHtml().selectDocument(fieldExtractor.getSelector());
|
||||
}
|
||||
|
||||
public List<String> getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
|
||||
return page.getHtml().selectDocumentForList(fieldExtractor.getSelector());
|
||||
}
|
||||
}
|
||||
|
||||
public class SelectedHtml implements Source {
|
||||
public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
|
||||
if (isRaw)
|
||||
return page.getHtml().selectDocument(fieldExtractor.getSelector());
|
||||
else
|
||||
return fieldExtractor.getSelector().select(html);
|
||||
}
|
||||
|
||||
public List<String> getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
|
||||
if (isRaw)
|
||||
return page.getHtml().selectDocumentForList(fieldExtractor.getSelector());
|
||||
else
|
||||
return fieldExtractor.getSelector().selectList(html);
|
||||
}
|
||||
}
|
||||
|
||||
public class Url implements Source {
|
||||
public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
|
||||
return fieldExtractor.getSelector().select(page.getUrl().toString());
|
||||
}
|
||||
|
||||
public List<String> getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
|
||||
return fieldExtractor.getSelector().selectList(page.getUrl().toString());
|
||||
}
|
||||
}
|
||||
|
||||
public class RawText implements Source {
|
||||
public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
|
||||
return fieldExtractor.getSelector().select(page.getRawText());
|
||||
}
|
||||
|
||||
public List<String> getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
|
||||
return fieldExtractor.getSelector().selectList(page.getRawText());
|
||||
}
|
||||
}
|
||||
|
||||
public class DefaultSource implements Source {
|
||||
public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
|
||||
return fieldExtractor.getSelector().select(html);
|
||||
}
|
||||
|
||||
public List<String> getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
|
||||
return fieldExtractor.getSelector().selectList(html);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,17 @@
|
|||
package us.codecraft.webmagic.model.sources;
|
||||
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.model.FieldExtractor;
|
||||
import us.codecraft.webmagic.model.fields.MultipleField;
|
||||
import us.codecraft.webmagic.model.fields.PageField;
|
||||
import us.codecraft.webmagic.model.fields.SingleField;
|
||||
|
||||
public class SourceTextExtractor {
|
||||
public static PageField getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
|
||||
Source source = fieldExtractor.getSource();
|
||||
if (fieldExtractor.isMulti())
|
||||
return new MultipleField(source.getTextList(page, html, isRaw, fieldExtractor));
|
||||
else
|
||||
return new SingleField(source.getText(page, html, isRaw, fieldExtractor));
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue