Update AfterExtractor API
parent
aca165b132
commit
145628557d
|
@ -9,7 +9,7 @@ import us.codecraft.webmagic.Page;
|
|||
* @date: 13-8-3 <br>
|
||||
* Time: 9:42 AM <br>
|
||||
*/
|
||||
public interface AfterExtractor<T> {
|
||||
public interface AfterExtractor {
|
||||
|
||||
public void afterProcess(Page page, T t);
|
||||
public void afterProcess(Page page);
|
||||
}
|
||||
|
|
|
@ -4,11 +4,13 @@ import us.codecraft.webmagic.Page;
|
|||
import us.codecraft.webmagic.Request;
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.processor.PageProcessor;
|
||||
import us.codecraft.webmagic.selector.Selector;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/**
|
||||
|
@ -33,7 +35,7 @@ public class ObjectPageProcessor implements PageProcessor {
|
|||
}
|
||||
|
||||
|
||||
public ObjectPageProcessor addPageModel(Class clazz){
|
||||
public ObjectPageProcessor addPageModel(Class clazz) {
|
||||
PageModelExtractor pageModelExtractor = PageModelExtractor.create(clazz);
|
||||
targetUrlPatterns.addAll(pageModelExtractor.getTargetUrlPatterns());
|
||||
targetUrlPatterns.addAll(pageModelExtractor.getHelpUrlPatterns());
|
||||
|
@ -49,22 +51,34 @@ public class ObjectPageProcessor implements PageProcessor {
|
|||
public void process(Page page) {
|
||||
for (PageModelExtractor pageModelExtractor : pageModelExtractorList) {
|
||||
Object process = pageModelExtractor.process(page);
|
||||
if (process==null){
|
||||
if (process == null) {
|
||||
page.getResultItems().setSkip(true);
|
||||
}
|
||||
postProcessPageModel(pageModelExtractor.getClazz(), process);
|
||||
page.putField(pageModelExtractor.getClazz().getCanonicalName(), process);
|
||||
extractLinks(page, pageModelExtractor.getHelpUrlRegionSelector(), pageModelExtractor.getHelpUrlPatterns());
|
||||
extractLinks(page, pageModelExtractor.getTargetUrlRegionSelector(), pageModelExtractor.getTargetUrlPatterns());
|
||||
}
|
||||
for (String link : page.getHtml().links().all()) {
|
||||
for (Pattern targetUrlPattern : targetUrlPatterns) {
|
||||
if (targetUrlPattern.matcher(link).matches()){
|
||||
page.addTargetRequest(new Request(link));
|
||||
}
|
||||
|
||||
private void extractLinks(Page page, Selector urlRegionSelector, List<Pattern> urlPatterns) {
|
||||
List<String> links;
|
||||
if (urlRegionSelector == null) {
|
||||
links = page.getHtml().links().all();
|
||||
} else {
|
||||
links = urlRegionSelector.selectList(page.getHtml().toString());
|
||||
}
|
||||
for (String link : links) {
|
||||
for (Pattern targetUrlPattern : urlPatterns) {
|
||||
Matcher matcher = targetUrlPattern.matcher(link);
|
||||
if (matcher.find()) {
|
||||
page.addTargetRequest(new Request(matcher.group(1)));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
protected void postProcessPageModel(Class clazz, Object object){
|
||||
protected void postProcessPageModel(Class clazz, Object object) {
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -21,14 +21,16 @@ class PageModelExtractor {
|
|||
|
||||
private List<Pattern> targetUrlPatterns = new ArrayList<Pattern>();
|
||||
|
||||
private Selector targetUrlRegionSelector;
|
||||
|
||||
private List<Pattern> helpUrlPatterns = new ArrayList<Pattern>();
|
||||
|
||||
private Selector helpUrlRegionSelector;
|
||||
|
||||
private Class clazz;
|
||||
|
||||
private List<FieldExtractor> fieldExtractors;
|
||||
|
||||
private AfterExtractor afterExtractor;
|
||||
|
||||
public static PageModelExtractor create(Class clazz) {
|
||||
PageModelExtractor pageModelExtractor = new PageModelExtractor();
|
||||
pageModelExtractor.init(clazz);
|
||||
|
@ -39,13 +41,6 @@ class PageModelExtractor {
|
|||
this.clazz = clazz;
|
||||
initTargetUrlPatterns();
|
||||
fieldExtractors = new ArrayList<FieldExtractor>();
|
||||
if (AfterExtractor.class.isAssignableFrom(clazz)) {
|
||||
try {
|
||||
afterExtractor = (AfterExtractor) clazz.newInstance();
|
||||
} catch (Exception e) {
|
||||
throw new IllegalArgumentException(e);
|
||||
}
|
||||
}
|
||||
for (Field field : clazz.getDeclaredFields()) {
|
||||
field.setAccessible(true);
|
||||
ExtractBy extractBy = field.getAnnotation(ExtractBy.class);
|
||||
|
@ -117,16 +112,24 @@ class PageModelExtractor {
|
|||
if (annotation == null) {
|
||||
targetUrlPatterns.add(Pattern.compile(".*"));
|
||||
} else {
|
||||
String[] value = ((TargetUrl) annotation).value();
|
||||
TargetUrl targetUrl = (TargetUrl) annotation;
|
||||
String[] value = targetUrl.value();
|
||||
for (String s : value) {
|
||||
targetUrlPatterns.add(Pattern.compile(s.replace(".", "\\.").replace("*", "[^\"'#]*")));
|
||||
targetUrlPatterns.add(Pattern.compile("("+s.replace(".", "\\.").replace("*", "[^\"'#]*")+")"));
|
||||
}
|
||||
if (!targetUrl.sourceRegion().equals("")){
|
||||
targetUrlRegionSelector = new Xpath2Selector(targetUrl.sourceRegion());
|
||||
}
|
||||
}
|
||||
annotation = clazz.getAnnotation(HelpUrl.class);
|
||||
if (annotation != null) {
|
||||
String[] value = ((HelpUrl) annotation).value();
|
||||
HelpUrl helpUrl = (HelpUrl) annotation;
|
||||
String[] value = helpUrl.value();
|
||||
for (String s : value) {
|
||||
helpUrlPatterns.add(Pattern.compile(s.replace(".", "\\.").replace("*", "[^\"'#]*")));
|
||||
helpUrlPatterns.add(Pattern.compile("("+s.replace(".", "\\.").replace("*", "[^\"'#]*")+")"));
|
||||
}
|
||||
if (!helpUrl.sourceRegion().equals("")){
|
||||
helpUrlRegionSelector = new Xpath2Selector(helpUrl.sourceRegion());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -179,8 +182,8 @@ class PageModelExtractor {
|
|||
setField(o, fieldExtractor, value);
|
||||
}
|
||||
}
|
||||
if (afterExtractor != null) {
|
||||
afterExtractor.afterProcess(page, o);
|
||||
if (AfterExtractor.class.isAssignableFrom(clazz)) {
|
||||
((AfterExtractor)o).afterProcess(page);
|
||||
}
|
||||
} catch (InstantiationException e) {
|
||||
e.printStackTrace();
|
||||
|
@ -210,4 +213,12 @@ class PageModelExtractor {
|
|||
/**
 * Returns the help-URL patterns collected for the model class.
 * The internal list itself is returned, not a copy.
 *
 * @return the list of help-URL patterns
 */
List<Pattern> getHelpUrlPatterns() {
|
||||
return helpUrlPatterns;
|
||||
}
|
||||
|
||||
/**
 * Returns the selector that limits where target-URL links are read from.
 * The field is only assigned when the {@code TargetUrl} annotation declares a
 * non-empty {@code sourceRegion}.
 *
 * @return the target-URL region selector, or {@code null} if none was configured
 */
Selector getTargetUrlRegionSelector() {
|
||||
return targetUrlRegionSelector;
|
||||
}
|
||||
|
||||
/**
 * Returns the selector that limits where help-URL links are read from.
 * The field is only assigned when the {@code HelpUrl} annotation declares a
 * non-empty {@code sourceRegion}.
 *
 * @return the help-URL region selector, or {@code null} if none was configured
 */
Selector getHelpUrlRegionSelector() {
|
||||
return helpUrlRegionSelector;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -9,8 +9,8 @@ import java.util.List;
|
|||
* @date: 13-8-1 <br>
|
||||
* Time: 10:18 PM <br>
|
||||
*/
|
||||
@TargetUrl("http://my.oschina.net/flashsword/blog/*")
|
||||
public class OschinaBlog implements AfterExtractor<OschinaBlog> {
|
||||
@TargetUrl(value="http://my.oschina.net/flashsword/blog/*",sourceRegion = "//div[@class='BlogLinks']")
|
||||
public class OschinaBlog implements AfterExtractor {
|
||||
|
||||
@ExtractBy("//title")
|
||||
private String title;
|
||||
|
@ -22,7 +22,7 @@ public class OschinaBlog implements AfterExtractor<OschinaBlog> {
|
|||
private List<String> tags;
|
||||
|
||||
@Override
|
||||
public void afterProcess(Page page, OschinaBlog oschinaBlog) {
|
||||
public void afterProcess(Page page) {
|
||||
content = null;
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue