diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/AfterExtractor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/AfterExtractor.java
index cb9788b..79feaaf 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/AfterExtractor.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/AfterExtractor.java
@@ -9,7 +9,8 @@ import us.codecraft.webmagic.Page;
* @date: 13-8-3
* Time: 上午9:42
*/
-public interface AfterExtractor<T> {
+public interface AfterExtractor {
- public void afterProcess(Page page, T t);
+ /** Callback that receives the page; PageModelExtractor invokes it on the extracted object itself. */
+ public void afterProcess(Page page);
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/ObjectPageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/ObjectPageProcessor.java
index dda96b5..c280acd 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/ObjectPageProcessor.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/ObjectPageProcessor.java
@@ -4,11 +4,13 @@ import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
+import us.codecraft.webmagic.selector.Selector;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
+import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
@@ -33,7 +35,7 @@ public class ObjectPageProcessor implements PageProcessor {
}
- public ObjectPageProcessor addPageModel(Class clazz){
+ public ObjectPageProcessor addPageModel(Class clazz) {
PageModelExtractor pageModelExtractor = PageModelExtractor.create(clazz);
targetUrlPatterns.addAll(pageModelExtractor.getTargetUrlPatterns());
targetUrlPatterns.addAll(pageModelExtractor.getHelpUrlPatterns());
@@ -49,22 +51,43 @@ public class ObjectPageProcessor implements PageProcessor {
public void process(Page page) {
for (PageModelExtractor pageModelExtractor : pageModelExtractorList) {
Object process = pageModelExtractor.process(page);
- if (process==null){
+ if (process == null) {
page.getResultItems().setSkip(true);
}
postProcessPageModel(pageModelExtractor.getClazz(), process);
page.putField(pageModelExtractor.getClazz().getCanonicalName(), process);
+ extractLinks(page, pageModelExtractor.getHelpUrlRegionSelector(), pageModelExtractor.getHelpUrlPatterns());
+ extractLinks(page, pageModelExtractor.getTargetUrlRegionSelector(), pageModelExtractor.getTargetUrlPatterns());
}
- for (String link : page.getHtml().links().all()) {
- for (Pattern targetUrlPattern : targetUrlPatterns) {
- if (targetUrlPattern.matcher(link).matches()){
- page.addTargetRequest(new Request(link));
+ }
+
+ /**
+ * Adds a target request for every candidate string in which one of the given patterns is found.
+ *
+ * @param page page whose HTML supplies the candidates and which receives the new requests
+ * @param urlRegionSelector selector applied to the page HTML to produce candidates; when null, all links of the page are used
+ * @param urlPatterns patterns searched for in each candidate with find()
+ */
+ private void extractLinks(Page page, Selector urlRegionSelector, List<Pattern> urlPatterns) {
+ List<String> links;
+ if (urlRegionSelector == null) {
+ links = page.getHtml().links().all();
+ } else {
+ links = urlRegionSelector.selectList(page.getHtml().toString());
+ }
+ for (String link : links) {
+ for (Pattern targetUrlPattern : urlPatterns) {
+ Matcher matcher = targetUrlPattern.matcher(link);
+ if (matcher.find()) {
+ // group() is the whole match: identical to group(1) for the parenthesised annotation patterns,
+ // and also valid for the group-less default ".*" pattern, where group(1) would throw.
+ page.addTargetRequest(new Request(matcher.group()));
}
}
}
}
- protected void postProcessPageModel(Class clazz, Object object){
+ protected void postProcessPageModel(Class clazz, Object object) {
}
@Override
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/PageModelExtractor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/PageModelExtractor.java
index 83a4d31..8a0d81b 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/PageModelExtractor.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/PageModelExtractor.java
@@ -21,14 +21,16 @@ class PageModelExtractor {
private List targetUrlPatterns = new ArrayList();
+ private Selector targetUrlRegionSelector;
+
private List helpUrlPatterns = new ArrayList();
+ private Selector helpUrlRegionSelector;
+
private Class clazz;
private List fieldExtractors;
- private AfterExtractor afterExtractor;
-
public static PageModelExtractor create(Class clazz) {
PageModelExtractor pageModelExtractor = new PageModelExtractor();
pageModelExtractor.init(clazz);
@@ -39,13 +41,6 @@ class PageModelExtractor {
this.clazz = clazz;
initTargetUrlPatterns();
fieldExtractors = new ArrayList();
- if (AfterExtractor.class.isAssignableFrom(clazz)) {
- try {
- afterExtractor = (AfterExtractor) clazz.newInstance();
- } catch (Exception e) {
- throw new IllegalArgumentException(e);
- }
- }
for (Field field : clazz.getDeclaredFields()) {
field.setAccessible(true);
ExtractBy extractBy = field.getAnnotation(ExtractBy.class);
@@ -117,16 +112,24 @@ class PageModelExtractor {
if (annotation == null) {
targetUrlPatterns.add(Pattern.compile(".*"));
} else {
- String[] value = ((TargetUrl) annotation).value();
+ TargetUrl targetUrl = (TargetUrl) annotation;
+ String[] value = targetUrl.value();
for (String s : value) {
- targetUrlPatterns.add(Pattern.compile(s.replace(".", "\\.").replace("*", "[^\"'#]*")));
+ targetUrlPatterns.add(Pattern.compile("("+s.replace(".", "\\.").replace("*", "[^\"'#]*")+")"));
+ }
+ if (!targetUrl.sourceRegion().equals("")){
+ targetUrlRegionSelector = new Xpath2Selector(targetUrl.sourceRegion());
}
}
annotation = clazz.getAnnotation(HelpUrl.class);
if (annotation != null) {
- String[] value = ((HelpUrl) annotation).value();
+ HelpUrl helpUrl = (HelpUrl) annotation;
+ String[] value = helpUrl.value();
for (String s : value) {
- helpUrlPatterns.add(Pattern.compile(s.replace(".", "\\.").replace("*", "[^\"'#]*")));
+ helpUrlPatterns.add(Pattern.compile("("+s.replace(".", "\\.").replace("*", "[^\"'#]*")+")"));
+ }
+ if (!helpUrl.sourceRegion().equals("")){
+ helpUrlRegionSelector = new Xpath2Selector(helpUrl.sourceRegion());
}
}
}
@@ -179,8 +182,8 @@ class PageModelExtractor {
setField(o, fieldExtractor, value);
}
}
- if (afterExtractor != null) {
- afterExtractor.afterProcess(page, o);
+ if (AfterExtractor.class.isAssignableFrom(clazz)) {
+ ((AfterExtractor)o).afterProcess(page);
}
} catch (InstantiationException e) {
e.printStackTrace();
@@ -210,4 +213,14 @@ class PageModelExtractor {
List getHelpUrlPatterns() {
return helpUrlPatterns;
}
+
+ /** Selector limiting where target URLs are looked for; null if it was never assigned. */
+ Selector getTargetUrlRegionSelector() {
+ return this.targetUrlRegionSelector;
+ }
+
+ /** Selector limiting where help URLs are looked for; null if it was never assigned. */
+ Selector getHelpUrlRegionSelector() {
+ return this.helpUrlRegionSelector;
+ }
}
diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/oo/OschinaBlog.java b/webmagic-core/src/test/java/us/codecraft/webmagic/oo/OschinaBlog.java
index 85d4817..0f64aef 100644
--- a/webmagic-core/src/test/java/us/codecraft/webmagic/oo/OschinaBlog.java
+++ b/webmagic-core/src/test/java/us/codecraft/webmagic/oo/OschinaBlog.java
@@ -9,8 +9,8 @@ import java.util.List;
* @date: 13-8-1
* Time: 下午10:18
*/
-@TargetUrl("http://my.oschina.net/flashsword/blog/*")
-public class OschinaBlog implements AfterExtractor<OschinaBlog> {
+@TargetUrl(value="http://my.oschina.net/flashsword/blog/*",sourceRegion = "//div[@class='BlogLinks']")
+public class OschinaBlog implements AfterExtractor {
@ExtractBy("//title")
private String title;
@@ -22,7 +22,7 @@ public class OschinaBlog implements AfterExtractor {
private List tags;
@Override
- public void afterProcess(Page page, OschinaBlog oschinaBlog) {
+ public void afterProcess(Page page) {
content = null;
}
}