From f3a29d931520f893a43836364768f3fb3abd1926 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Mon, 5 Aug 2013 21:03:47 +0800 Subject: [PATCH] fix pagedmodel bug --- .../webmagic/model/PageModelExtractor.java | 12 +++++++----- .../codecraft/webmagic/pipeline/PagedPipeline.java | 11 +++++++---- .../codecraft/webmagic/model/samples/News163.java | 14 +++++--------- 3 files changed, 19 insertions(+), 18 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java index b2c2bb0..0207b7a 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java @@ -55,8 +55,10 @@ class PageModelExtractor { fieldExtractor = fieldExtractorTmp; } // ExtractBy2 & ExtractBy3 - addAnnotationExtractBy2(clazz, fieldExtractor); - addAnnotationExtractBy3(clazz, fieldExtractor); + if (fieldExtractor!=null){ + addAnnotationExtractBy2(fieldExtractor); + addAnnotationExtractBy3(fieldExtractor); + } fieldExtractorTmp = getAnnotationExtractByUrl(clazz, field); if (fieldExtractor != null && fieldExtractorTmp != null) { throw new IllegalStateException("Only one of 'ExtractBy ExtractByRaw ExtractByUrl' can be added to a field!"); @@ -69,8 +71,8 @@ class PageModelExtractor { } else if (fieldExtractor.isMulti() && !List.class.isAssignableFrom(field.getType())) { throw new IllegalStateException("Field " + field.getName() + " must be list"); } + fieldExtractors.add(fieldExtractor); } - } } @@ -122,7 +124,7 @@ class PageModelExtractor { return fieldExtractor; } - private void addAnnotationExtractBy2(Class clazz, FieldExtractor fieldExtractor) { + private void addAnnotationExtractBy2(FieldExtractor fieldExtractor) { ExtractBy2 extractBy = fieldExtractor.getField().getAnnotation(ExtractBy2.class); if (extractBy != null) { String value = extractBy.value(); @@ -147,7 +149,7 @@ class PageModelExtractor { } } - private void addAnnotationExtractBy3(Class clazz, FieldExtractor fieldExtractor) { + private void addAnnotationExtractBy3(FieldExtractor fieldExtractor) { ExtractBy3 extractBy = fieldExtractor.getField().getAnnotation(ExtractBy3.class); if (extractBy != null) { String value = extractBy.value(); diff --git a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/pipeline/PagedPipeline.java b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/pipeline/PagedPipeline.java index cc71e5c..282545f 100644 --- a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/pipeline/PagedPipeline.java +++ b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/pipeline/PagedPipeline.java @@ -33,10 +33,13 @@ public class PagedPipeline implements Pipeline { Object o = objectEntry.getValue(); if (o instanceof PagedModel) { PagedModel pagedModel = (PagedModel) o; - for (String otherPage : pagedModel.getOtherPages()) { - Boolean aBoolean = pageMap.get(pagedModel.getPageKey(), otherPage); - if (aBoolean == null) { - pageMap.put(pagedModel.getPageKey(), otherPage, Boolean.FALSE); + pageMap.put(pagedModel.getPageKey(), pagedModel.getPage(), Boolean.TRUE); + if (pagedModel.getOtherPages()!=null){ + for (String otherPage : pagedModel.getOtherPages()) { + Boolean aBoolean = pageMap.get(pagedModel.getPageKey(), otherPage); + if (aBoolean == null) { + pageMap.put(pagedModel.getPageKey(), otherPage, Boolean.FALSE); + } } } //check if all pages are processed diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java index 07b1e8e..52abe88 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java @@ -1,12 +1,10 @@ package us.codecraft.webmagic.model.samples; -import us.codecraft.webmagic.Page; import us.codecraft.webmagic.PagedModel; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.model.*; import us.codecraft.webmagic.pipeline.ConsolePipeline; import us.codecraft.webmagic.pipeline.PagedPipeline; -import us.codecraft.webmagic.selector.Selectable; import java.util.Collection; import java.util.List; @@ -17,14 +15,16 @@ import java.util.List; * Time: 下午8:17
*/ @TargetUrl("http://news.163.com/\\d+/\\d+/\\d+/\\w+*.html") -public class News163 implements PagedModel, AfterExtractor { +public class News163 implements PagedModel { - @ExtractByUrl("http://news\\.163\\.com/\\d+/\\d+/\\d+/(\\w+)*\\.html") + @ExtractByUrl("http://news\\.163\\.com/\\d+/\\d+/\\d+/([^_]*).*\\.html") private String pageKey; @ExtractByUrl(value = "http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html", notNull = false) private String page; + @ExtractBy(value = "//div[@class=\"ep-pages\"]//a/@href", multi = true) + @ExtractBy2(value = "http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html", type = ExtractBy2.Type.Regex) private List otherPage; @ExtractBy("//h1[@id=\"h1title\"]/text()") @@ -54,6 +54,7 @@ public class News163 implements PagedModel, AfterExtractor { @Override public PagedModel combine(PagedModel pagedModel) { News163 news163 = new News163(); + news163.title = this.title; News163 pagedModel1 = (News163) pagedModel; news163.content = this.content + pagedModel1.content; return news163; @@ -73,9 +74,4 @@ public class News163 implements PagedModel, AfterExtractor { .clearPipeline().pipeline(new PagedPipeline()).pipeline(new ConsolePipeline()).run(); } - @Override - public void afterProcess(Page page) { - Selectable xpath = page.getHtml().xpath("//div[@class=\"ep-pages\"]//a/@href"); - otherPage = xpath.regex("http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html").all(); - } }