diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java
index b2c2bb0..0207b7a 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java
@@ -55,8 +55,10 @@ class PageModelExtractor {
fieldExtractor = fieldExtractorTmp;
}
// ExtractBy2 & ExtractBy3
- addAnnotationExtractBy2(clazz, fieldExtractor);
- addAnnotationExtractBy3(clazz, fieldExtractor);
+ if (fieldExtractor!=null){
+ addAnnotationExtractBy2(fieldExtractor);
+ addAnnotationExtractBy3(fieldExtractor);
+ }
fieldExtractorTmp = getAnnotationExtractByUrl(clazz, field);
if (fieldExtractor != null && fieldExtractorTmp != null) {
throw new IllegalStateException("Only one of 'ExtractBy ExtractByRaw ExtractByUrl' can be added to a field!");
@@ -69,8 +71,8 @@ class PageModelExtractor {
} else if (fieldExtractor.isMulti() && !List.class.isAssignableFrom(field.getType())) {
throw new IllegalStateException("Field " + field.getName() + " must be list");
}
+ fieldExtractors.add(fieldExtractor);
}
-
}
}
@@ -122,7 +124,7 @@ class PageModelExtractor {
return fieldExtractor;
}
- private void addAnnotationExtractBy2(Class clazz, FieldExtractor fieldExtractor) {
+ private void addAnnotationExtractBy2(FieldExtractor fieldExtractor) {
ExtractBy2 extractBy = fieldExtractor.getField().getAnnotation(ExtractBy2.class);
if (extractBy != null) {
String value = extractBy.value();
@@ -147,7 +149,7 @@ class PageModelExtractor {
}
}
- private void addAnnotationExtractBy3(Class clazz, FieldExtractor fieldExtractor) {
+ private void addAnnotationExtractBy3(FieldExtractor fieldExtractor) {
ExtractBy3 extractBy = fieldExtractor.getField().getAnnotation(ExtractBy3.class);
if (extractBy != null) {
String value = extractBy.value();
diff --git a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/pipeline/PagedPipeline.java b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/pipeline/PagedPipeline.java
index cc71e5c..282545f 100644
--- a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/pipeline/PagedPipeline.java
+++ b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/pipeline/PagedPipeline.java
@@ -33,10 +33,13 @@ public class PagedPipeline implements Pipeline {
Object o = objectEntry.getValue();
if (o instanceof PagedModel) {
PagedModel pagedModel = (PagedModel) o;
- for (String otherPage : pagedModel.getOtherPages()) {
- Boolean aBoolean = pageMap.get(pagedModel.getPageKey(), otherPage);
- if (aBoolean == null) {
- pageMap.put(pagedModel.getPageKey(), otherPage, Boolean.FALSE);
+ pageMap.put(pagedModel.getPageKey(), pagedModel.getPage(), Boolean.TRUE);
+ if (pagedModel.getOtherPages()!=null){
+ for (String otherPage : pagedModel.getOtherPages()) {
+ Boolean aBoolean = pageMap.get(pagedModel.getPageKey(), otherPage);
+ if (aBoolean == null) {
+ pageMap.put(pagedModel.getPageKey(), otherPage, Boolean.FALSE);
+ }
}
}
//check if all pages are processed
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java
index 07b1e8e..52abe88 100644
--- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java
@@ -1,12 +1,10 @@
package us.codecraft.webmagic.model.samples;
-import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.PagedModel;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.model.*;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.pipeline.PagedPipeline;
-import us.codecraft.webmagic.selector.Selectable;
import java.util.Collection;
import java.util.List;
@@ -17,14 +15,16 @@ import java.util.List;
* Time: 下午8:17
*/
@TargetUrl("http://news.163.com/\\d+/\\d+/\\d+/\\w+*.html")
-public class News163 implements PagedModel, AfterExtractor {
+public class News163 implements PagedModel {
- @ExtractByUrl("http://news\\.163\\.com/\\d+/\\d+/\\d+/(\\w+)*\\.html")
+ @ExtractByUrl("http://news\\.163\\.com/\\d+/\\d+/\\d+/([^_]*).*\\.html")
private String pageKey;
@ExtractByUrl(value = "http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html", notNull = false)
private String page;
+ @ExtractBy(value = "//div[@class=\"ep-pages\"]//a/@href", multi = true)
+ @ExtractBy2(value = "http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html", type = ExtractBy2.Type.Regex)
private List otherPage;
@ExtractBy("//h1[@id=\"h1title\"]/text()")
@@ -54,6 +54,7 @@ public class News163 implements PagedModel, AfterExtractor {
@Override
public PagedModel combine(PagedModel pagedModel) {
News163 news163 = new News163();
+ news163.title = this.title;
News163 pagedModel1 = (News163) pagedModel;
news163.content = this.content + pagedModel1.content;
return news163;
@@ -73,9 +74,4 @@ public class News163 implements PagedModel, AfterExtractor {
.clearPipeline().pipeline(new PagedPipeline()).pipeline(new ConsolePipeline()).run();
}
- @Override
- public void afterProcess(Page page) {
- Selectable xpath = page.getHtml().xpath("//div[@class=\"ep-pages\"]//a/@href");
- otherPage = xpath.regex("http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html").all();
- }
}