add multi entity extract
parent
bfadac756a
commit
b393e38320
|
@ -51,7 +51,7 @@ public class ObjectPageProcessor implements PageProcessor {
|
||||||
public void process(Page page) {
|
public void process(Page page) {
|
||||||
for (PageModelExtractor pageModelExtractor : pageModelExtractorList) {
|
for (PageModelExtractor pageModelExtractor : pageModelExtractorList) {
|
||||||
Object process = pageModelExtractor.process(page);
|
Object process = pageModelExtractor.process(page);
|
||||||
if (process == null) {
|
if (process == null || (process instanceof List && ((List) process).size() == 0)) {
|
||||||
page.getResultItems().setSkip(true);
|
page.getResultItems().setSkip(true);
|
||||||
}
|
}
|
||||||
postProcessPageModel(pageModelExtractor.getClazz(), process);
|
postProcessPageModel(pageModelExtractor.getClazz(), process);
|
||||||
|
|
|
@ -4,6 +4,8 @@ import us.codecraft.webmagic.ResultItems;
|
||||||
import us.codecraft.webmagic.Task;
|
import us.codecraft.webmagic.Task;
|
||||||
import us.codecraft.webmagic.pipeline.Pipeline;
|
import us.codecraft.webmagic.pipeline.Pipeline;
|
||||||
|
|
||||||
|
import java.lang.annotation.Annotation;
|
||||||
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.concurrent.ConcurrentHashMap;
|
import java.util.concurrent.ConcurrentHashMap;
|
||||||
|
|
||||||
|
@ -32,8 +34,17 @@ public class ObjectPipeline implements Pipeline {
|
||||||
for (Map.Entry<Class, PageModelPipeline> classPageModelPipelineEntry : pageModelPipelines.entrySet()) {
|
for (Map.Entry<Class, PageModelPipeline> classPageModelPipelineEntry : pageModelPipelines.entrySet()) {
|
||||||
Object o = resultItems.get(classPageModelPipelineEntry.getKey().getCanonicalName());
|
Object o = resultItems.get(classPageModelPipelineEntry.getKey().getCanonicalName());
|
||||||
if (o != null) {
|
if (o != null) {
|
||||||
|
Annotation annotation = classPageModelPipelineEntry.getKey().getAnnotation(ExtractBy.class);
|
||||||
|
ExtractBy extractBy = (ExtractBy) annotation;
|
||||||
|
if (extractBy.multi()) {
|
||||||
|
List<Object> list = (List<Object>) o;
|
||||||
|
for (Object o1 : list) {
|
||||||
|
classPageModelPipelineEntry.getValue().process(o1, task);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
classPageModelPipelineEntry.getValue().process(o, task);
|
classPageModelPipelineEntry.getValue().process(o, task);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -31,6 +31,8 @@ class PageModelExtractor {
|
||||||
|
|
||||||
private List<FieldExtractor> fieldExtractors;
|
private List<FieldExtractor> fieldExtractors;
|
||||||
|
|
||||||
|
private Extractor extractor;
|
||||||
|
|
||||||
public static PageModelExtractor create(Class clazz) {
|
public static PageModelExtractor create(Class clazz) {
|
||||||
PageModelExtractor pageModelExtractor = new PageModelExtractor();
|
PageModelExtractor pageModelExtractor = new PageModelExtractor();
|
||||||
pageModelExtractor.init(clazz);
|
pageModelExtractor.init(clazz);
|
||||||
|
@ -39,7 +41,7 @@ class PageModelExtractor {
|
||||||
|
|
||||||
private void init(Class clazz) {
|
private void init(Class clazz) {
|
||||||
this.clazz = clazz;
|
this.clazz = clazz;
|
||||||
initTargetUrlPatterns();
|
initClassExtractors();
|
||||||
fieldExtractors = new ArrayList<FieldExtractor>();
|
fieldExtractors = new ArrayList<FieldExtractor>();
|
||||||
for (Field field : clazz.getDeclaredFields()) {
|
for (Field field : clazz.getDeclaredFields()) {
|
||||||
field.setAccessible(true);
|
field.setAccessible(true);
|
||||||
|
@ -107,7 +109,7 @@ class PageModelExtractor {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private void initTargetUrlPatterns() {
|
private void initClassExtractors() {
|
||||||
Annotation annotation = clazz.getAnnotation(TargetUrl.class);
|
Annotation annotation = clazz.getAnnotation(TargetUrl.class);
|
||||||
if (annotation == null) {
|
if (annotation == null) {
|
||||||
targetUrlPatterns.add(Pattern.compile(".*"));
|
targetUrlPatterns.add(Pattern.compile(".*"));
|
||||||
|
@ -115,9 +117,9 @@ class PageModelExtractor {
|
||||||
TargetUrl targetUrl = (TargetUrl) annotation;
|
TargetUrl targetUrl = (TargetUrl) annotation;
|
||||||
String[] value = targetUrl.value();
|
String[] value = targetUrl.value();
|
||||||
for (String s : value) {
|
for (String s : value) {
|
||||||
targetUrlPatterns.add(Pattern.compile("("+s.replace(".", "\\.").replace("*", "[^\"'#]*")+")"));
|
targetUrlPatterns.add(Pattern.compile("(" + s.replace(".", "\\.").replace("*", "[^\"'#]*") + ")"));
|
||||||
}
|
}
|
||||||
if (!targetUrl.sourceRegion().equals("")){
|
if (!targetUrl.sourceRegion().equals("")) {
|
||||||
targetUrlRegionSelector = new Xpath2Selector(targetUrl.sourceRegion());
|
targetUrlRegionSelector = new Xpath2Selector(targetUrl.sourceRegion());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -126,12 +128,17 @@ class PageModelExtractor {
|
||||||
HelpUrl helpUrl = (HelpUrl) annotation;
|
HelpUrl helpUrl = (HelpUrl) annotation;
|
||||||
String[] value = helpUrl.value();
|
String[] value = helpUrl.value();
|
||||||
for (String s : value) {
|
for (String s : value) {
|
||||||
helpUrlPatterns.add(Pattern.compile("("+s.replace(".", "\\.").replace("*", "[^\"'#]*")+")"));
|
helpUrlPatterns.add(Pattern.compile("(" + s.replace(".", "\\.").replace("*", "[^\"'#]*") + ")"));
|
||||||
}
|
}
|
||||||
if (!helpUrl.sourceRegion().equals("")){
|
if (!helpUrl.sourceRegion().equals("")) {
|
||||||
helpUrlRegionSelector = new Xpath2Selector(helpUrl.sourceRegion());
|
helpUrlRegionSelector = new Xpath2Selector(helpUrl.sourceRegion());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
annotation = clazz.getAnnotation(ExtractBy.class);
|
||||||
|
if (annotation != null) {
|
||||||
|
ExtractBy extractBy = (ExtractBy) annotation;
|
||||||
|
extractor = new Extractor(new Xpath2Selector(extractBy.value()), Extractor.Source.Html, extractBy.notNull(), extractBy.multi());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public Object process(Page page) {
|
public Object process(Page page) {
|
||||||
|
@ -144,6 +151,28 @@ class PageModelExtractor {
|
||||||
if (!matched) {
|
if (!matched) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
if (extractor == null) {
|
||||||
|
return processSingle(page,page.getHtml().toString());
|
||||||
|
} else {
|
||||||
|
if (extractor.multi){
|
||||||
|
List<Object> os = new ArrayList<Object>();
|
||||||
|
List<String> list = extractor.getSelector().selectList(page.getHtml().toString());
|
||||||
|
for (String s : list) {
|
||||||
|
Object o = processSingle(page, s);
|
||||||
|
if (o!=null){
|
||||||
|
os.add(o);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return os;
|
||||||
|
}else {
|
||||||
|
String select = extractor.getSelector().select(page.getHtml().toString());
|
||||||
|
Object o = processSingle(page, select);
|
||||||
|
return o;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private Object processSingle(Page page,String html) {
|
||||||
Object o = null;
|
Object o = null;
|
||||||
try {
|
try {
|
||||||
o = clazz.newInstance();
|
o = clazz.newInstance();
|
||||||
|
@ -152,38 +181,38 @@ class PageModelExtractor {
|
||||||
List<String> value;
|
List<String> value;
|
||||||
switch (fieldExtractor.getSource()) {
|
switch (fieldExtractor.getSource()) {
|
||||||
case Html:
|
case Html:
|
||||||
value = fieldExtractor.getSelector().selectList(page.getHtml().toString());
|
value = fieldExtractor.getSelector().selectList(html);
|
||||||
break;
|
break;
|
||||||
case Url:
|
case Url:
|
||||||
value = fieldExtractor.getSelector().selectList(page.getUrl().toString());
|
value = fieldExtractor.getSelector().selectList(page.getUrl().toString());
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
value = fieldExtractor.getSelector().selectList(page.getHtml().toString());
|
value = fieldExtractor.getSelector().selectList(html);
|
||||||
}
|
}
|
||||||
if ((value == null || value.size() == 0) && fieldExtractor.isNotNull()) {
|
if ((value == null || value.size() == 0) && fieldExtractor.isNotNull()) {
|
||||||
page.getResultItems().setSkip(true);
|
return null;
|
||||||
}
|
}
|
||||||
setField(o, fieldExtractor, value);
|
setField(o, fieldExtractor, value);
|
||||||
} else {
|
} else {
|
||||||
String value;
|
String value;
|
||||||
switch (fieldExtractor.getSource()) {
|
switch (fieldExtractor.getSource()) {
|
||||||
case Html:
|
case Html:
|
||||||
value = fieldExtractor.getSelector().select(page.getHtml().toString());
|
value = fieldExtractor.getSelector().select(html);
|
||||||
break;
|
break;
|
||||||
case Url:
|
case Url:
|
||||||
value = fieldExtractor.getSelector().select(page.getUrl().toString());
|
value = fieldExtractor.getSelector().select(page.getUrl().toString());
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
value = fieldExtractor.getSelector().select(page.getHtml().toString());
|
value = fieldExtractor.getSelector().select(html);
|
||||||
}
|
}
|
||||||
if (value == null && fieldExtractor.isNotNull()) {
|
if (value == null && fieldExtractor.isNotNull()) {
|
||||||
page.getResultItems().setSkip(true);
|
return null;
|
||||||
}
|
}
|
||||||
setField(o, fieldExtractor, value);
|
setField(o, fieldExtractor, value);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (AfterExtractor.class.isAssignableFrom(clazz)) {
|
if (AfterExtractor.class.isAssignableFrom(clazz)) {
|
||||||
((AfterExtractor)o).afterProcess(page);
|
((AfterExtractor) o).afterProcess(page);
|
||||||
}
|
}
|
||||||
} catch (InstantiationException e) {
|
} catch (InstantiationException e) {
|
||||||
e.printStackTrace();
|
e.printStackTrace();
|
||||||
|
|
|
@ -0,0 +1,31 @@
|
||||||
|
package us.codecraft.webmagic.oo.samples;
|
||||||
|
|
||||||
|
import us.codecraft.webmagic.Page;
|
||||||
|
import us.codecraft.webmagic.Site;
|
||||||
|
import us.codecraft.webmagic.oo.*;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author yihua.huang@dianping.com <br>
|
||||||
|
* @date: 13-8-3 <br>
|
||||||
|
* Time: 8:25 PM <br>
|
||||||
|
*/
|
||||||
|
// Sample page model: one instance per answer found on a question page.
// PageModelExtractor escapes '.' and rewrites '*' to [^"'#]* before compiling
// each URL value as a regex, so the trailing '*' here is a wildcard, not a
// regex quantifier applied to "\\d+".
@TargetUrl("http://www.oschina.net/question/\\d+_\\d+*")
|
||||||
|
// Same '.' / '*' rewriting as @TargetUrl is applied to this value.
@HelpUrl("http://www.oschina.net/question/*")
|
||||||
|
// Class-level selector with multi = true: PageModelExtractor.process selects a
// list of HTML fragments with this XPath and builds one object per fragment;
// the field selectors below are then evaluated against that fragment rather
// than the whole page.
@ExtractBy(value = "//ul[@class='list']/li[@class='Answer']", multi = true)
|
||||||
|
public class OschinaAnswer implements AfterExtractor{
|
||||||
|

||||||
|
// Taken from the title attribute of an <img> inside the answer fragment
// (presumably the author's avatar -- confirm against the page markup).
@ExtractBy("//img/@title")
|
||||||
|
private String user;
|
||||||
|

||||||
|
// notNull = false: a missing detail <div> does not cause this answer to be
// dropped (processSingle only returns null for missing notNull fields).
@ExtractBy(value="//div[@class='detail']",notNull = false)
|
||||||
|
private String content;
|
||||||
|

||||||
|
// Entry point: runs an OOSpider from a single question page, using this class
// as the page model.
public static void main(String[] args) {
|
||||||
|
OOSpider.create(Site.me().addStartUrl("http://www.oschina.net/question/567527_120597"), OschinaAnswer.class).run();
|
||||||
|
}
|
||||||
|

||||||
|
// AfterExtractor hook; PageModelExtractor.processSingle calls it once the
// fields of an instance have been set. No post-processing is done here.
@Override
|
||||||
|
public void afterProcess(Page page) {
|
||||||
|

||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue