Fix PagedModel bug: null-guard ExtractBy2/ExtractBy3 and getOtherPages(), record the current page in PagedPipeline

2013-08-05 21:03:47 +08:00 · 2013-08-05 21:03:47 +08:00 · f3a29d9315
parent 629f8ac2d1
commit f3a29d9315
3 changed files with 19 additions and 18 deletions
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java
@ -55,8 +55,10 @@ class PageModelExtractor {
                fieldExtractor = fieldExtractorTmp;
            }
            // ExtractBy2 & ExtractBy3
-            addAnnotationExtractBy2(clazz, fieldExtractor);
-            addAnnotationExtractBy3(clazz, fieldExtractor);
+            if (fieldExtractor!=null){
+                addAnnotationExtractBy2(fieldExtractor);
+                addAnnotationExtractBy3(fieldExtractor);
+            }
            fieldExtractorTmp = getAnnotationExtractByUrl(clazz, field);
            if (fieldExtractor != null && fieldExtractorTmp != null) {
                throw new IllegalStateException("Only one of 'ExtractBy ExtractByRaw ExtractByUrl' can be added to a field!");
@ -69,8 +71,8 @@ class PageModelExtractor {
                } else if (fieldExtractor.isMulti() && !List.class.isAssignableFrom(field.getType())) {
                    throw new IllegalStateException("Field " + field.getName() + " must be list");
                }
+                fieldExtractors.add(fieldExtractor);
            }
-
        }
    }

@ -122,7 +124,7 @@ class PageModelExtractor {
        return fieldExtractor;
    }

-    private void addAnnotationExtractBy2(Class clazz, FieldExtractor fieldExtractor) {
+    private void addAnnotationExtractBy2(FieldExtractor fieldExtractor) {
        ExtractBy2 extractBy = fieldExtractor.getField().getAnnotation(ExtractBy2.class);
        if (extractBy != null) {
            String value = extractBy.value();
@ -147,7 +149,7 @@ class PageModelExtractor {
        }
    }

-    private void addAnnotationExtractBy3(Class clazz, FieldExtractor fieldExtractor) {
+    private void addAnnotationExtractBy3(FieldExtractor fieldExtractor) {
        ExtractBy3 extractBy = fieldExtractor.getField().getAnnotation(ExtractBy3.class);
        if (extractBy != null) {
            String value = extractBy.value();
--- a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/pipeline/PagedPipeline.java
+++ b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/pipeline/PagedPipeline.java
@ -33,10 +33,13 @@ public class PagedPipeline implements Pipeline {
        Object o = objectEntry.getValue();
        if (o instanceof PagedModel) {
            PagedModel pagedModel = (PagedModel) o;
-            for (String otherPage : pagedModel.getOtherPages()) {
-                Boolean aBoolean = pageMap.get(pagedModel.getPageKey(), otherPage);
-                if (aBoolean == null) {
-                    pageMap.put(pagedModel.getPageKey(), otherPage, Boolean.FALSE);
+            pageMap.put(pagedModel.getPageKey(), pagedModel.getPage(), Boolean.TRUE);
+            if (pagedModel.getOtherPages()!=null){
+                for (String otherPage : pagedModel.getOtherPages()) {
+                    Boolean aBoolean = pageMap.get(pagedModel.getPageKey(), otherPage);
+                    if (aBoolean == null) {
+                        pageMap.put(pagedModel.getPageKey(), otherPage, Boolean.FALSE);
+                    }
                }
            }
            //check if all pages are processed
--- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java
@ -1,12 +1,10 @@
 package us.codecraft.webmagic.model.samples;

-import us.codecraft.webmagic.Page;
 import us.codecraft.webmagic.PagedModel;
 import us.codecraft.webmagic.Site;
 import us.codecraft.webmagic.model.*;
 import us.codecraft.webmagic.pipeline.ConsolePipeline;
 import us.codecraft.webmagic.pipeline.PagedPipeline;
-import us.codecraft.webmagic.selector.Selectable;

 import java.util.Collection;
 import java.util.List;
@ -17,14 +15,16 @@ import java.util.List;
 * Time: 下午8:17 <br>
 */
@TargetUrl("http://news.163.com/\\d+/\\d+/\\d+/\\w+*.html")
-public class News163 implements PagedModel, AfterExtractor {
+public class News163 implements PagedModel {

-    @ExtractByUrl("http://news\\.163\\.com/\\d+/\\d+/\\d+/(\\w+)*\\.html")
+    @ExtractByUrl("http://news\\.163\\.com/\\d+/\\d+/\\d+/([^_]*).*\\.html")
    private String pageKey;

    @ExtractByUrl(value = "http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html", notNull = false)
    private String page;

+    @ExtractBy(value = "//div[@class=\"ep-pages\"]//a/@href", multi = true)
+    @ExtractBy2(value = "http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html", type = ExtractBy2.Type.Regex)
    private List<String> otherPage;

    @ExtractBy("//h1[@id=\"h1title\"]/text()")
@ -54,6 +54,7 @@ public class News163 implements PagedModel, AfterExtractor {
    @Override
    public PagedModel combine(PagedModel pagedModel) {
        News163 news163 = new News163();
+        news163.title = this.title;
        News163 pagedModel1 = (News163) pagedModel;
        news163.content = this.content + pagedModel1.content;
        return news163;
@ -73,9 +74,4 @@ public class News163 implements PagedModel, AfterExtractor {
                .clearPipeline().pipeline(new PagedPipeline()).pipeline(new ConsolePipeline()).run();
    }

-    @Override
-    public void afterProcess(Page page) {
-        Selectable xpath = page.getHtml().xpath("//div[@class=\"ep-pages\"]//a/@href");
-        otherPage = xpath.regex("http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html").all();
-    }
 }