Add multi-entity extraction

master
yihua.huang 2013-08-03 20:42:29 +08:00
parent bfadac756a
commit b393e38320
4 changed files with 86 additions and 15 deletions

View File

@ -51,7 +51,7 @@ public class ObjectPageProcessor implements PageProcessor {
public void process(Page page) {
for (PageModelExtractor pageModelExtractor : pageModelExtractorList) {
Object process = pageModelExtractor.process(page);
if (process == null) {
if (process == null || (process instanceof List && ((List) process).size() == 0)) {
page.getResultItems().setSkip(true);
}
postProcessPageModel(pageModelExtractor.getClazz(), process);

View File

@ -4,6 +4,8 @@ import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
import java.lang.annotation.Annotation;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
@ -32,8 +34,17 @@ public class ObjectPipeline implements Pipeline {
// Hand every extracted page model to the pipeline registered for its class.
for (Map.Entry<Class, PageModelPipeline> classPageModelPipelineEntry : pageModelPipelines.entrySet()) {
    Object o = resultItems.get(classPageModelPipelineEntry.getKey().getCanonicalName());
    if (o == null) {
        // Nothing was extracted for this model class on the current page.
        continue;
    }
    // A class-level @ExtractBy is optional: getAnnotation returns null when the
    // model class does not declare it, so the annotation must be null-checked
    // before multi() is read. Such classes are treated as single-entity models.
    Annotation annotation = classPageModelPipelineEntry.getKey().getAnnotation(ExtractBy.class);
    boolean multi = annotation != null && ((ExtractBy) annotation).multi();
    if (multi) {
        // Multi-entity models are stored as a List; process each entity on its own.
        for (Object o1 : (List<?>) o) {
            classPageModelPipelineEntry.getValue().process(o1, task);
        }
    } else {
        classPageModelPipelineEntry.getValue().process(o, task);
    }
}
}
}

View File

@ -31,6 +31,8 @@ class PageModelExtractor {
private List<FieldExtractor> fieldExtractors;
private Extractor extractor;
public static PageModelExtractor create(Class clazz) {
PageModelExtractor pageModelExtractor = new PageModelExtractor();
pageModelExtractor.init(clazz);
@ -39,7 +41,7 @@ class PageModelExtractor {
private void init(Class clazz) {
this.clazz = clazz;
initTargetUrlPatterns();
initClassExtractors();
fieldExtractors = new ArrayList<FieldExtractor>();
for (Field field : clazz.getDeclaredFields()) {
field.setAccessible(true);
@ -107,7 +109,7 @@ class PageModelExtractor {
}
}
private void initTargetUrlPatterns() {
private void initClassExtractors() {
Annotation annotation = clazz.getAnnotation(TargetUrl.class);
if (annotation == null) {
targetUrlPatterns.add(Pattern.compile(".*"));
@ -132,6 +134,11 @@ class PageModelExtractor {
helpUrlRegionSelector = new Xpath2Selector(helpUrl.sourceRegion());
}
}
annotation = clazz.getAnnotation(ExtractBy.class);
if (annotation != null) {
ExtractBy extractBy = (ExtractBy) annotation;
extractor = new Extractor(new Xpath2Selector(extractBy.value()), Extractor.Source.Html, extractBy.notNull(), extractBy.multi());
}
}
public Object process(Page page) {
@ -144,6 +151,28 @@ class PageModelExtractor {
if (!matched) {
return null;
}
// Without a class-level extractor the whole page is the extraction region.
if (extractor == null) {
    return processSingle(page, page.getHtml().toString());
}
// Single-entity mode: narrow the page to one region and build one object from it.
if (!extractor.multi) {
    String region = extractor.getSelector().select(page.getHtml().toString());
    Object entity = processSingle(page, region);
    return entity;
}
// Multi-entity mode: build one object per selected region and keep only the
// regions for which processSingle produced an object.
List<Object> entities = new ArrayList<Object>();
List<String> regions = extractor.getSelector().selectList(page.getHtml().toString());
for (String region : regions) {
    Object entity = processSingle(page, region);
    if (entity == null) {
        continue;
    }
    entities.add(entity);
}
return entities;
}
private Object processSingle(Page page,String html) {
Object o = null;
try {
o = clazz.newInstance();
@ -152,32 +181,32 @@ class PageModelExtractor {
List<String> value;
switch (fieldExtractor.getSource()) {
case Html:
value = fieldExtractor.getSelector().selectList(page.getHtml().toString());
value = fieldExtractor.getSelector().selectList(html);
break;
case Url:
value = fieldExtractor.getSelector().selectList(page.getUrl().toString());
break;
default:
value = fieldExtractor.getSelector().selectList(page.getHtml().toString());
value = fieldExtractor.getSelector().selectList(html);
}
if ((value == null || value.size() == 0) && fieldExtractor.isNotNull()) {
page.getResultItems().setSkip(true);
return null;
}
setField(o, fieldExtractor, value);
} else {
String value;
switch (fieldExtractor.getSource()) {
case Html:
value = fieldExtractor.getSelector().select(page.getHtml().toString());
value = fieldExtractor.getSelector().select(html);
break;
case Url:
value = fieldExtractor.getSelector().select(page.getUrl().toString());
break;
default:
value = fieldExtractor.getSelector().select(page.getHtml().toString());
value = fieldExtractor.getSelector().select(html);
}
if (value == null && fieldExtractor.isNotNull()) {
page.getResultItems().setSkip(true);
return null;
}
setField(o, fieldExtractor, value);
}

View File

@ -0,0 +1,31 @@
package us.codecraft.webmagic.oo.samples;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.oo.*;
/**
 * Sample page model for the answers on an oschina.net question page.
 * <p>
 * The class-level {@code @ExtractBy} is declared with {@code multi = true} and
 * selects the answer list items; the field-level expressions supply the values
 * for {@link #user} and {@link #content}.
 *
 * @author yihua.huang@dianping.com <br>
 * @date: 13-8-3 <br>
 * Time: 8:25 <br>
 */
@TargetUrl("http://www.oschina.net/question/\\d+_\\d+*")
@HelpUrl("http://www.oschina.net/question/*")
@ExtractBy(value = "//ul[@class='list']/li[@class='Answer']", multi = true)
public class OschinaAnswer implements AfterExtractor {

    /** Taken from the {@code title} attribute of an {@code img} element. */
    @ExtractBy("//img/@title")
    private String user;

    /** Taken from the {@code div} with class {@code detail}; declared with {@code notNull = false}. */
    @ExtractBy(value = "//div[@class='detail']", notNull = false)
    private String content;

    /**
     * Runs the sample: creates a spider for this model class, seeded with one
     * question URL, and starts it.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        Site site = Site.me().addStartUrl("http://www.oschina.net/question/567527_120597");
        OOSpider.create(site, OschinaAnswer.class).run();
    }

    /**
     * Post-extraction hook; this sample does nothing here.
     *
     * @param page the page this model was extracted from
     */
    @Override
    public void afterProcess(Page page) {
        // Intentionally empty.
    }
}