Fix PagedModel bug: null-check fieldExtractor and getOtherPages(), and record the current page in pageMap
parent
629f8ac2d1
commit
f3a29d9315
|
@ -55,8 +55,10 @@ class PageModelExtractor {
|
||||||
fieldExtractor = fieldExtractorTmp;
|
fieldExtractor = fieldExtractorTmp;
|
||||||
}
|
}
|
||||||
// ExtractBy2 & ExtractBy3
|
// ExtractBy2 & ExtractBy3
|
||||||
addAnnotationExtractBy2(clazz, fieldExtractor);
|
if (fieldExtractor!=null){
|
||||||
addAnnotationExtractBy3(clazz, fieldExtractor);
|
addAnnotationExtractBy2(fieldExtractor);
|
||||||
|
addAnnotationExtractBy3(fieldExtractor);
|
||||||
|
}
|
||||||
fieldExtractorTmp = getAnnotationExtractByUrl(clazz, field);
|
fieldExtractorTmp = getAnnotationExtractByUrl(clazz, field);
|
||||||
if (fieldExtractor != null && fieldExtractorTmp != null) {
|
if (fieldExtractor != null && fieldExtractorTmp != null) {
|
||||||
throw new IllegalStateException("Only one of 'ExtractBy ExtractByRaw ExtractByUrl' can be added to a field!");
|
throw new IllegalStateException("Only one of 'ExtractBy ExtractByRaw ExtractByUrl' can be added to a field!");
|
||||||
|
@ -69,8 +71,8 @@ class PageModelExtractor {
|
||||||
} else if (fieldExtractor.isMulti() && !List.class.isAssignableFrom(field.getType())) {
|
} else if (fieldExtractor.isMulti() && !List.class.isAssignableFrom(field.getType())) {
|
||||||
throw new IllegalStateException("Field " + field.getName() + " must be list");
|
throw new IllegalStateException("Field " + field.getName() + " must be list");
|
||||||
}
|
}
|
||||||
|
fieldExtractors.add(fieldExtractor);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -122,7 +124,7 @@ class PageModelExtractor {
|
||||||
return fieldExtractor;
|
return fieldExtractor;
|
||||||
}
|
}
|
||||||
|
|
||||||
private void addAnnotationExtractBy2(Class clazz, FieldExtractor fieldExtractor) {
|
private void addAnnotationExtractBy2(FieldExtractor fieldExtractor) {
|
||||||
ExtractBy2 extractBy = fieldExtractor.getField().getAnnotation(ExtractBy2.class);
|
ExtractBy2 extractBy = fieldExtractor.getField().getAnnotation(ExtractBy2.class);
|
||||||
if (extractBy != null) {
|
if (extractBy != null) {
|
||||||
String value = extractBy.value();
|
String value = extractBy.value();
|
||||||
|
@ -147,7 +149,7 @@ class PageModelExtractor {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private void addAnnotationExtractBy3(Class clazz, FieldExtractor fieldExtractor) {
|
private void addAnnotationExtractBy3(FieldExtractor fieldExtractor) {
|
||||||
ExtractBy3 extractBy = fieldExtractor.getField().getAnnotation(ExtractBy3.class);
|
ExtractBy3 extractBy = fieldExtractor.getField().getAnnotation(ExtractBy3.class);
|
||||||
if (extractBy != null) {
|
if (extractBy != null) {
|
||||||
String value = extractBy.value();
|
String value = extractBy.value();
|
||||||
|
|
|
@ -33,10 +33,13 @@ public class PagedPipeline implements Pipeline {
|
||||||
Object o = objectEntry.getValue();
|
Object o = objectEntry.getValue();
|
||||||
if (o instanceof PagedModel) {
|
if (o instanceof PagedModel) {
|
||||||
PagedModel pagedModel = (PagedModel) o;
|
PagedModel pagedModel = (PagedModel) o;
|
||||||
for (String otherPage : pagedModel.getOtherPages()) {
|
pageMap.put(pagedModel.getPageKey(), pagedModel.getPage(), Boolean.TRUE);
|
||||||
Boolean aBoolean = pageMap.get(pagedModel.getPageKey(), otherPage);
|
if (pagedModel.getOtherPages()!=null){
|
||||||
if (aBoolean == null) {
|
for (String otherPage : pagedModel.getOtherPages()) {
|
||||||
pageMap.put(pagedModel.getPageKey(), otherPage, Boolean.FALSE);
|
Boolean aBoolean = pageMap.get(pagedModel.getPageKey(), otherPage);
|
||||||
|
if (aBoolean == null) {
|
||||||
|
pageMap.put(pagedModel.getPageKey(), otherPage, Boolean.FALSE);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
//check if all pages are processed
|
//check if all pages are processed
|
||||||
|
|
|
@ -1,12 +1,10 @@
|
||||||
package us.codecraft.webmagic.model.samples;
|
package us.codecraft.webmagic.model.samples;
|
||||||
|
|
||||||
import us.codecraft.webmagic.Page;
|
|
||||||
import us.codecraft.webmagic.PagedModel;
|
import us.codecraft.webmagic.PagedModel;
|
||||||
import us.codecraft.webmagic.Site;
|
import us.codecraft.webmagic.Site;
|
||||||
import us.codecraft.webmagic.model.*;
|
import us.codecraft.webmagic.model.*;
|
||||||
import us.codecraft.webmagic.pipeline.ConsolePipeline;
|
import us.codecraft.webmagic.pipeline.ConsolePipeline;
|
||||||
import us.codecraft.webmagic.pipeline.PagedPipeline;
|
import us.codecraft.webmagic.pipeline.PagedPipeline;
|
||||||
import us.codecraft.webmagic.selector.Selectable;
|
|
||||||
|
|
||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
@ -17,14 +15,16 @@ import java.util.List;
|
||||||
* Time: 下午8:17 <br>
|
* Time: 下午8:17 <br>
|
||||||
*/
|
*/
|
||||||
@TargetUrl("http://news.163.com/\\d+/\\d+/\\d+/\\w+*.html")
|
@TargetUrl("http://news.163.com/\\d+/\\d+/\\d+/\\w+*.html")
|
||||||
public class News163 implements PagedModel, AfterExtractor {
|
public class News163 implements PagedModel {
|
||||||
|
|
||||||
@ExtractByUrl("http://news\\.163\\.com/\\d+/\\d+/\\d+/(\\w+)*\\.html")
|
@ExtractByUrl("http://news\\.163\\.com/\\d+/\\d+/\\d+/([^_]*).*\\.html")
|
||||||
private String pageKey;
|
private String pageKey;
|
||||||
|
|
||||||
@ExtractByUrl(value = "http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html", notNull = false)
|
@ExtractByUrl(value = "http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html", notNull = false)
|
||||||
private String page;
|
private String page;
|
||||||
|
|
||||||
|
@ExtractBy(value = "//div[@class=\"ep-pages\"]//a/@href", multi = true)
|
||||||
|
@ExtractBy2(value = "http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html", type = ExtractBy2.Type.Regex)
|
||||||
private List<String> otherPage;
|
private List<String> otherPage;
|
||||||
|
|
||||||
@ExtractBy("//h1[@id=\"h1title\"]/text()")
|
@ExtractBy("//h1[@id=\"h1title\"]/text()")
|
||||||
|
@ -54,6 +54,7 @@ public class News163 implements PagedModel, AfterExtractor {
|
||||||
@Override
|
@Override
|
||||||
public PagedModel combine(PagedModel pagedModel) {
|
public PagedModel combine(PagedModel pagedModel) {
|
||||||
News163 news163 = new News163();
|
News163 news163 = new News163();
|
||||||
|
news163.title = this.title;
|
||||||
News163 pagedModel1 = (News163) pagedModel;
|
News163 pagedModel1 = (News163) pagedModel;
|
||||||
news163.content = this.content + pagedModel1.content;
|
news163.content = this.content + pagedModel1.content;
|
||||||
return news163;
|
return news163;
|
||||||
|
@ -73,9 +74,4 @@ public class News163 implements PagedModel, AfterExtractor {
|
||||||
.clearPipeline().pipeline(new PagedPipeline()).pipeline(new ConsolePipeline()).run();
|
.clearPipeline().pipeline(new PagedPipeline()).pipeline(new ConsolePipeline()).run();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
|
||||||
public void afterProcess(Page page) {
|
|
||||||
Selectable xpath = page.getHtml().xpath("//div[@class=\"ep-pages\"]//a/@href");
|
|
||||||
otherPage = xpath.regex("http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html").all();
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue