diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/ObjectPageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/ObjectPageProcessor.java index c280acd..a02e446 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/ObjectPageProcessor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/ObjectPageProcessor.java @@ -51,7 +51,7 @@ public class ObjectPageProcessor implements PageProcessor { public void process(Page page) { for (PageModelExtractor pageModelExtractor : pageModelExtractorList) { Object process = pageModelExtractor.process(page); - if (process == null) { + if (process == null || (process instanceof List && ((List) process).size() == 0)) { page.getResultItems().setSkip(true); } postProcessPageModel(pageModelExtractor.getClazz(), process); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/ObjectPipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/ObjectPipeline.java index a5f02ed..54ae2ef 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/ObjectPipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/ObjectPipeline.java @@ -4,6 +4,8 @@ import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.pipeline.Pipeline; +import java.lang.annotation.Annotation; +import java.util.List; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; @@ -32,7 +34,16 @@ public class ObjectPipeline implements Pipeline { for (Map.Entry classPageModelPipelineEntry : pageModelPipelines.entrySet()) { Object o = resultItems.get(classPageModelPipelineEntry.getKey().getCanonicalName()); if (o != null) { - classPageModelPipelineEntry.getValue().process(o, task); + Annotation annotation = classPageModelPipelineEntry.getKey().getAnnotation(ExtractBy.class); + ExtractBy extractBy = (ExtractBy) annotation; + if (extractBy.multi()) { + List list = (List) o; + for (Object o1 : list) { + 
classPageModelPipelineEntry.getValue().process(o1, task); + } + } else { + classPageModelPipelineEntry.getValue().process(o, task); + } } } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/PageModelExtractor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/PageModelExtractor.java index 8a0d81b..d3d5335 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/PageModelExtractor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/PageModelExtractor.java @@ -31,6 +31,8 @@ class PageModelExtractor { private List fieldExtractors; + private Extractor extractor; + public static PageModelExtractor create(Class clazz) { PageModelExtractor pageModelExtractor = new PageModelExtractor(); pageModelExtractor.init(clazz); @@ -39,7 +41,7 @@ class PageModelExtractor { private void init(Class clazz) { this.clazz = clazz; - initTargetUrlPatterns(); + initClassExtractors(); fieldExtractors = new ArrayList(); for (Field field : clazz.getDeclaredFields()) { field.setAccessible(true); @@ -107,7 +109,7 @@ class PageModelExtractor { } } - private void initTargetUrlPatterns() { + private void initClassExtractors() { Annotation annotation = clazz.getAnnotation(TargetUrl.class); if (annotation == null) { targetUrlPatterns.add(Pattern.compile(".*")); @@ -115,9 +117,9 @@ class PageModelExtractor { TargetUrl targetUrl = (TargetUrl) annotation; String[] value = targetUrl.value(); for (String s : value) { - targetUrlPatterns.add(Pattern.compile("("+s.replace(".", "\\.").replace("*", "[^\"'#]*")+")")); + targetUrlPatterns.add(Pattern.compile("(" + s.replace(".", "\\.").replace("*", "[^\"'#]*") + ")")); } - if (!targetUrl.sourceRegion().equals("")){ + if (!targetUrl.sourceRegion().equals("")) { targetUrlRegionSelector = new Xpath2Selector(targetUrl.sourceRegion()); } } @@ -126,12 +128,17 @@ class PageModelExtractor { HelpUrl helpUrl = (HelpUrl) annotation; String[] value = helpUrl.value(); for (String s : value) { - 
helpUrlPatterns.add(Pattern.compile("("+s.replace(".", "\\.").replace("*", "[^\"'#]*")+")")); + helpUrlPatterns.add(Pattern.compile("(" + s.replace(".", "\\.").replace("*", "[^\"'#]*") + ")")); } - if (!helpUrl.sourceRegion().equals("")){ + if (!helpUrl.sourceRegion().equals("")) { helpUrlRegionSelector = new Xpath2Selector(helpUrl.sourceRegion()); } } + annotation = clazz.getAnnotation(ExtractBy.class); + if (annotation != null) { + ExtractBy extractBy = (ExtractBy) annotation; + extractor = new Extractor(new Xpath2Selector(extractBy.value()), Extractor.Source.Html, extractBy.notNull(), extractBy.multi()); + } } public Object process(Page page) { @@ -144,6 +151,28 @@ class PageModelExtractor { if (!matched) { return null; } + if (extractor == null) { + return processSingle(page,page.getHtml().toString()); + } else { + if (extractor.multi){ + List os = new ArrayList(); + List list = extractor.getSelector().selectList(page.getHtml().toString()); + for (String s : list) { + Object o = processSingle(page, s); + if (o!=null){ + os.add(o); + } + } + return os; + }else { + String select = extractor.getSelector().select(page.getHtml().toString()); + Object o = processSingle(page, select); + return o; + } + } + } + + private Object processSingle(Page page,String html) { Object o = null; try { o = clazz.newInstance(); @@ -152,38 +181,38 @@ class PageModelExtractor { List value; switch (fieldExtractor.getSource()) { case Html: - value = fieldExtractor.getSelector().selectList(page.getHtml().toString()); + value = fieldExtractor.getSelector().selectList(html); break; case Url: value = fieldExtractor.getSelector().selectList(page.getUrl().toString()); break; default: - value = fieldExtractor.getSelector().selectList(page.getHtml().toString()); + value = fieldExtractor.getSelector().selectList(html); } if ((value == null || value.size() == 0) && fieldExtractor.isNotNull()) { - page.getResultItems().setSkip(true); + return null; } setField(o, fieldExtractor, value); } else { 
String value; switch (fieldExtractor.getSource()) { case Html: - value = fieldExtractor.getSelector().select(page.getHtml().toString()); + value = fieldExtractor.getSelector().select(html); break; case Url: value = fieldExtractor.getSelector().select(page.getUrl().toString()); break; default: - value = fieldExtractor.getSelector().select(page.getHtml().toString()); + value = fieldExtractor.getSelector().select(html); } if (value == null && fieldExtractor.isNotNull()) { - page.getResultItems().setSkip(true); + return null; } setField(o, fieldExtractor, value); } } if (AfterExtractor.class.isAssignableFrom(clazz)) { - ((AfterExtractor)o).afterProcess(page); + ((AfterExtractor) o).afterProcess(page); } } catch (InstantiationException e) { e.printStackTrace(); diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/oo/samples/OschinaAnswer.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/oo/samples/OschinaAnswer.java new file mode 100644 index 0000000..fd04b1d --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/oo/samples/OschinaAnswer.java @@ -0,0 +1,31 @@ +package us.codecraft.webmagic.oo.samples; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.oo.*; + +/** + * @author yihua.huang@dianping.com
+ * @date: 13-8-3
+ * Time: 8:25 PM
+ */ +@TargetUrl("http://www.oschina.net/question/\\d+_\\d+*") +@HelpUrl("http://www.oschina.net/question/*") +@ExtractBy(value = "//ul[@class='list']/li[@class='Answer']", multi = true) +public class OschinaAnswer implements AfterExtractor{ + + @ExtractBy("//img/@title") + private String user; + + @ExtractBy(value="//div[@class='detail']",notNull = false) + private String content; + + public static void main(String[] args) { + OOSpider.create(Site.me().addStartUrl("http://www.oschina.net/question/567527_120597"), OschinaAnswer.class).run(); + } + + @Override + public void afterProcess(Page page) { + + } +}