Fix PagedModel bug: add null checks for fieldExtractor and getOtherPages()
parent
629f8ac2d1
commit
f3a29d9315
|
@ -55,8 +55,10 @@ class PageModelExtractor {
|
|||
fieldExtractor = fieldExtractorTmp;
|
||||
}
|
||||
// ExtractBy2 & ExtractBy3
|
||||
addAnnotationExtractBy2(clazz, fieldExtractor);
|
||||
addAnnotationExtractBy3(clazz, fieldExtractor);
|
||||
if (fieldExtractor!=null){
|
||||
addAnnotationExtractBy2(fieldExtractor);
|
||||
addAnnotationExtractBy3(fieldExtractor);
|
||||
}
|
||||
fieldExtractorTmp = getAnnotationExtractByUrl(clazz, field);
|
||||
if (fieldExtractor != null && fieldExtractorTmp != null) {
|
||||
throw new IllegalStateException("Only one of 'ExtractBy ExtractByRaw ExtractByUrl' can be added to a field!");
|
||||
|
@ -69,8 +71,8 @@ class PageModelExtractor {
|
|||
} else if (fieldExtractor.isMulti() && !List.class.isAssignableFrom(field.getType())) {
|
||||
throw new IllegalStateException("Field " + field.getName() + " must be list");
|
||||
}
|
||||
fieldExtractors.add(fieldExtractor);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -122,7 +124,7 @@ class PageModelExtractor {
|
|||
return fieldExtractor;
|
||||
}
|
||||
|
||||
private void addAnnotationExtractBy2(Class clazz, FieldExtractor fieldExtractor) {
|
||||
private void addAnnotationExtractBy2(FieldExtractor fieldExtractor) {
|
||||
ExtractBy2 extractBy = fieldExtractor.getField().getAnnotation(ExtractBy2.class);
|
||||
if (extractBy != null) {
|
||||
String value = extractBy.value();
|
||||
|
@ -147,7 +149,7 @@ class PageModelExtractor {
|
|||
}
|
||||
}
|
||||
|
||||
private void addAnnotationExtractBy3(Class clazz, FieldExtractor fieldExtractor) {
|
||||
private void addAnnotationExtractBy3(FieldExtractor fieldExtractor) {
|
||||
ExtractBy3 extractBy = fieldExtractor.getField().getAnnotation(ExtractBy3.class);
|
||||
if (extractBy != null) {
|
||||
String value = extractBy.value();
|
||||
|
|
|
@ -33,10 +33,13 @@ public class PagedPipeline implements Pipeline {
|
|||
Object o = objectEntry.getValue();
|
||||
if (o instanceof PagedModel) {
|
||||
PagedModel pagedModel = (PagedModel) o;
|
||||
for (String otherPage : pagedModel.getOtherPages()) {
|
||||
Boolean aBoolean = pageMap.get(pagedModel.getPageKey(), otherPage);
|
||||
if (aBoolean == null) {
|
||||
pageMap.put(pagedModel.getPageKey(), otherPage, Boolean.FALSE);
|
||||
pageMap.put(pagedModel.getPageKey(), pagedModel.getPage(), Boolean.TRUE);
|
||||
if (pagedModel.getOtherPages()!=null){
|
||||
for (String otherPage : pagedModel.getOtherPages()) {
|
||||
Boolean aBoolean = pageMap.get(pagedModel.getPageKey(), otherPage);
|
||||
if (aBoolean == null) {
|
||||
pageMap.put(pagedModel.getPageKey(), otherPage, Boolean.FALSE);
|
||||
}
|
||||
}
|
||||
}
|
||||
//check if all pages are processed
|
||||
|
|
|
@ -1,12 +1,10 @@
|
|||
package us.codecraft.webmagic.model.samples;
|
||||
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.PagedModel;
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.model.*;
|
||||
import us.codecraft.webmagic.pipeline.ConsolePipeline;
|
||||
import us.codecraft.webmagic.pipeline.PagedPipeline;
|
||||
import us.codecraft.webmagic.selector.Selectable;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
|
@ -17,14 +15,16 @@ import java.util.List;
|
|||
* Time: 下午8:17 <br>
|
||||
*/
|
||||
@TargetUrl("http://news.163.com/\\d+/\\d+/\\d+/\\w+*.html")
|
||||
public class News163 implements PagedModel, AfterExtractor {
|
||||
public class News163 implements PagedModel {
|
||||
|
||||
@ExtractByUrl("http://news\\.163\\.com/\\d+/\\d+/\\d+/(\\w+)*\\.html")
|
||||
@ExtractByUrl("http://news\\.163\\.com/\\d+/\\d+/\\d+/([^_]*).*\\.html")
|
||||
private String pageKey;
|
||||
|
||||
@ExtractByUrl(value = "http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html", notNull = false)
|
||||
private String page;
|
||||
|
||||
@ExtractBy(value = "//div[@class=\"ep-pages\"]//a/@href", multi = true)
|
||||
@ExtractBy2(value = "http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html", type = ExtractBy2.Type.Regex)
|
||||
private List<String> otherPage;
|
||||
|
||||
@ExtractBy("//h1[@id=\"h1title\"]/text()")
|
||||
|
@ -54,6 +54,7 @@ public class News163 implements PagedModel, AfterExtractor {
|
|||
@Override
|
||||
public PagedModel combine(PagedModel pagedModel) {
|
||||
News163 news163 = new News163();
|
||||
news163.title = this.title;
|
||||
News163 pagedModel1 = (News163) pagedModel;
|
||||
news163.content = this.content + pagedModel1.content;
|
||||
return news163;
|
||||
|
@ -73,9 +74,4 @@ public class News163 implements PagedModel, AfterExtractor {
|
|||
.clearPipeline().pipeline(new PagedPipeline()).pipeline(new ConsolePipeline()).run();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void afterProcess(Page page) {
|
||||
Selectable xpath = page.getHtml().xpath("//div[@class=\"ep-pages\"]//a/@href");
|
||||
otherPage = xpath.regex("http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html").all();
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue