update afterextract api
parent
aca165b132
commit
145628557d
|
@ -9,7 +9,7 @@ import us.codecraft.webmagic.Page;
|
||||||
* @date: 13-8-3 <br>
|
* @date: 13-8-3 <br>
|
||||||
* Time: 上午9:42 <br>
|
* Time: 上午9:42 <br>
|
||||||
*/
|
*/
|
||||||
public interface AfterExtractor<T> {
|
public interface AfterExtractor {
|
||||||
|
|
||||||
public void afterProcess(Page page, T t);
|
public void afterProcess(Page page);
|
||||||
}
|
}
|
||||||
|
|
|
@ -4,11 +4,13 @@ import us.codecraft.webmagic.Page;
|
||||||
import us.codecraft.webmagic.Request;
|
import us.codecraft.webmagic.Request;
|
||||||
import us.codecraft.webmagic.Site;
|
import us.codecraft.webmagic.Site;
|
||||||
import us.codecraft.webmagic.processor.PageProcessor;
|
import us.codecraft.webmagic.processor.PageProcessor;
|
||||||
|
import us.codecraft.webmagic.selector.Selector;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
import java.util.regex.Matcher;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -33,7 +35,7 @@ public class ObjectPageProcessor implements PageProcessor {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public ObjectPageProcessor addPageModel(Class clazz){
|
public ObjectPageProcessor addPageModel(Class clazz) {
|
||||||
PageModelExtractor pageModelExtractor = PageModelExtractor.create(clazz);
|
PageModelExtractor pageModelExtractor = PageModelExtractor.create(clazz);
|
||||||
targetUrlPatterns.addAll(pageModelExtractor.getTargetUrlPatterns());
|
targetUrlPatterns.addAll(pageModelExtractor.getTargetUrlPatterns());
|
||||||
targetUrlPatterns.addAll(pageModelExtractor.getHelpUrlPatterns());
|
targetUrlPatterns.addAll(pageModelExtractor.getHelpUrlPatterns());
|
||||||
|
@ -49,22 +51,34 @@ public class ObjectPageProcessor implements PageProcessor {
|
||||||
public void process(Page page) {
|
public void process(Page page) {
|
||||||
for (PageModelExtractor pageModelExtractor : pageModelExtractorList) {
|
for (PageModelExtractor pageModelExtractor : pageModelExtractorList) {
|
||||||
Object process = pageModelExtractor.process(page);
|
Object process = pageModelExtractor.process(page);
|
||||||
if (process==null){
|
if (process == null) {
|
||||||
page.getResultItems().setSkip(true);
|
page.getResultItems().setSkip(true);
|
||||||
}
|
}
|
||||||
postProcessPageModel(pageModelExtractor.getClazz(), process);
|
postProcessPageModel(pageModelExtractor.getClazz(), process);
|
||||||
page.putField(pageModelExtractor.getClazz().getCanonicalName(), process);
|
page.putField(pageModelExtractor.getClazz().getCanonicalName(), process);
|
||||||
|
extractLinks(page, pageModelExtractor.getHelpUrlRegionSelector(), pageModelExtractor.getHelpUrlPatterns());
|
||||||
|
extractLinks(page, pageModelExtractor.getTargetUrlRegionSelector(), pageModelExtractor.getTargetUrlPatterns());
|
||||||
}
|
}
|
||||||
for (String link : page.getHtml().links().all()) {
|
}
|
||||||
for (Pattern targetUrlPattern : targetUrlPatterns) {
|
|
||||||
if (targetUrlPattern.matcher(link).matches()){
|
private void extractLinks(Page page, Selector urlRegionSelector, List<Pattern> urlPatterns) {
|
||||||
page.addTargetRequest(new Request(link));
|
List<String> links;
|
||||||
|
if (urlRegionSelector == null) {
|
||||||
|
links = page.getHtml().links().all();
|
||||||
|
} else {
|
||||||
|
links = urlRegionSelector.selectList(page.getHtml().toString());
|
||||||
|
}
|
||||||
|
for (String link : links) {
|
||||||
|
for (Pattern targetUrlPattern : urlPatterns) {
|
||||||
|
Matcher matcher = targetUrlPattern.matcher(link);
|
||||||
|
if (matcher.find()) {
|
||||||
|
page.addTargetRequest(new Request(matcher.group(1)));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
protected void postProcessPageModel(Class clazz, Object object){
|
protected void postProcessPageModel(Class clazz, Object object) {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -21,14 +21,16 @@ class PageModelExtractor {
|
||||||
|
|
||||||
private List<Pattern> targetUrlPatterns = new ArrayList<Pattern>();
|
private List<Pattern> targetUrlPatterns = new ArrayList<Pattern>();
|
||||||
|
|
||||||
|
private Selector targetUrlRegionSelector;
|
||||||
|
|
||||||
private List<Pattern> helpUrlPatterns = new ArrayList<Pattern>();
|
private List<Pattern> helpUrlPatterns = new ArrayList<Pattern>();
|
||||||
|
|
||||||
|
private Selector helpUrlRegionSelector;
|
||||||
|
|
||||||
private Class clazz;
|
private Class clazz;
|
||||||
|
|
||||||
private List<FieldExtractor> fieldExtractors;
|
private List<FieldExtractor> fieldExtractors;
|
||||||
|
|
||||||
private AfterExtractor afterExtractor;
|
|
||||||
|
|
||||||
public static PageModelExtractor create(Class clazz) {
|
public static PageModelExtractor create(Class clazz) {
|
||||||
PageModelExtractor pageModelExtractor = new PageModelExtractor();
|
PageModelExtractor pageModelExtractor = new PageModelExtractor();
|
||||||
pageModelExtractor.init(clazz);
|
pageModelExtractor.init(clazz);
|
||||||
|
@ -39,13 +41,6 @@ class PageModelExtractor {
|
||||||
this.clazz = clazz;
|
this.clazz = clazz;
|
||||||
initTargetUrlPatterns();
|
initTargetUrlPatterns();
|
||||||
fieldExtractors = new ArrayList<FieldExtractor>();
|
fieldExtractors = new ArrayList<FieldExtractor>();
|
||||||
if (AfterExtractor.class.isAssignableFrom(clazz)) {
|
|
||||||
try {
|
|
||||||
afterExtractor = (AfterExtractor) clazz.newInstance();
|
|
||||||
} catch (Exception e) {
|
|
||||||
throw new IllegalArgumentException(e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for (Field field : clazz.getDeclaredFields()) {
|
for (Field field : clazz.getDeclaredFields()) {
|
||||||
field.setAccessible(true);
|
field.setAccessible(true);
|
||||||
ExtractBy extractBy = field.getAnnotation(ExtractBy.class);
|
ExtractBy extractBy = field.getAnnotation(ExtractBy.class);
|
||||||
|
@ -117,16 +112,24 @@ class PageModelExtractor {
|
||||||
if (annotation == null) {
|
if (annotation == null) {
|
||||||
targetUrlPatterns.add(Pattern.compile(".*"));
|
targetUrlPatterns.add(Pattern.compile(".*"));
|
||||||
} else {
|
} else {
|
||||||
String[] value = ((TargetUrl) annotation).value();
|
TargetUrl targetUrl = (TargetUrl) annotation;
|
||||||
|
String[] value = targetUrl.value();
|
||||||
for (String s : value) {
|
for (String s : value) {
|
||||||
targetUrlPatterns.add(Pattern.compile(s.replace(".", "\\.").replace("*", "[^\"'#]*")));
|
targetUrlPatterns.add(Pattern.compile("("+s.replace(".", "\\.").replace("*", "[^\"'#]*")+")"));
|
||||||
|
}
|
||||||
|
if (!targetUrl.sourceRegion().equals("")){
|
||||||
|
targetUrlRegionSelector = new Xpath2Selector(targetUrl.sourceRegion());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
annotation = clazz.getAnnotation(HelpUrl.class);
|
annotation = clazz.getAnnotation(HelpUrl.class);
|
||||||
if (annotation != null) {
|
if (annotation != null) {
|
||||||
String[] value = ((HelpUrl) annotation).value();
|
HelpUrl helpUrl = (HelpUrl) annotation;
|
||||||
|
String[] value = helpUrl.value();
|
||||||
for (String s : value) {
|
for (String s : value) {
|
||||||
helpUrlPatterns.add(Pattern.compile(s.replace(".", "\\.").replace("*", "[^\"'#]*")));
|
helpUrlPatterns.add(Pattern.compile("("+s.replace(".", "\\.").replace("*", "[^\"'#]*")+")"));
|
||||||
|
}
|
||||||
|
if (!helpUrl.sourceRegion().equals("")){
|
||||||
|
helpUrlRegionSelector = new Xpath2Selector(helpUrl.sourceRegion());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -179,8 +182,8 @@ class PageModelExtractor {
|
||||||
setField(o, fieldExtractor, value);
|
setField(o, fieldExtractor, value);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (afterExtractor != null) {
|
if (AfterExtractor.class.isAssignableFrom(clazz)) {
|
||||||
afterExtractor.afterProcess(page, o);
|
((AfterExtractor)o).afterProcess(page);
|
||||||
}
|
}
|
||||||
} catch (InstantiationException e) {
|
} catch (InstantiationException e) {
|
||||||
e.printStackTrace();
|
e.printStackTrace();
|
||||||
|
@ -210,4 +213,12 @@ class PageModelExtractor {
|
||||||
List<Pattern> getHelpUrlPatterns() {
|
List<Pattern> getHelpUrlPatterns() {
|
||||||
return helpUrlPatterns;
|
return helpUrlPatterns;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Selector getTargetUrlRegionSelector() {
|
||||||
|
return targetUrlRegionSelector;
|
||||||
|
}
|
||||||
|
|
||||||
|
Selector getHelpUrlRegionSelector() {
|
||||||
|
return helpUrlRegionSelector;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -9,8 +9,8 @@ import java.util.List;
|
||||||
* @date: 13-8-1 <br>
|
* @date: 13-8-1 <br>
|
||||||
* Time: 下午10:18 <br>
|
* Time: 下午10:18 <br>
|
||||||
*/
|
*/
|
||||||
@TargetUrl("http://my.oschina.net/flashsword/blog/*")
|
@TargetUrl(value="http://my.oschina.net/flashsword/blog/*",sourceRegion = "//div[@class='BlogLinks']")
|
||||||
public class OschinaBlog implements AfterExtractor<OschinaBlog> {
|
public class OschinaBlog implements AfterExtractor {
|
||||||
|
|
||||||
@ExtractBy("//title")
|
@ExtractBy("//title")
|
||||||
private String title;
|
private String title;
|
||||||
|
@ -22,7 +22,7 @@ public class OschinaBlog implements AfterExtractor<OschinaBlog> {
|
||||||
private List<String> tags;
|
private List<String> tags;
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void afterProcess(Page page, OschinaBlog oschinaBlog) {
|
public void afterProcess(Page page) {
|
||||||
content = null;
|
content = null;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue