add extract by url
parent
f08ffc34fd
commit
abba3b7bff
|
@ -0,0 +1,18 @@
|
|||
package us.codecraft.webmagic.annotation;
|
||||
|
||||
import java.lang.annotation.ElementType;
|
||||
import java.lang.annotation.Retention;
|
||||
import java.lang.annotation.Target;
|
||||
|
||||
/**
* Marks a field whose value is extracted from the URL of the page instead of from its HTML. <br>
* PageModelExtractor reads this annotation reflectively, wraps {@link #value()} in a
* RegexSelector and applies that selector to {@code page.getUrl().toString()}. <br>
|
||||
* @author yihua.huang@dianping.com <br>
|
||||
* @date: 13-8-1 <br>
|
||||
* Time: 8:40 PM <br>
|
||||
*/
|
||||
// RUNTIME retention is required: the annotation is looked up with Field.getAnnotation at run time.
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
|
||||
@Target({ElementType.FIELD})
|
||||
public @interface ExtractByUrl {
|
||||
|
||||
/**
* Regular expression handed to the RegexSelector that is run against the page URL.
* A blank value (the default) is replaced with {@code ".*"} by PageModelExtractor.
*/
String value() default "";
|
||||
|
||||
}
|
|
@ -15,9 +15,20 @@ class FieldExtractor {
|
|||
|
||||
private final Selector selector;
|
||||
|
||||
FieldExtractor(Field field, Selector selector) {
|
||||
private final Source source;
|
||||
|
||||
/** Input a selector is applied to: {@code Html} selects from the page HTML text, {@code Url} from the page URL text. */
static enum Source {Html, Url}
|
||||
|
||||
/**
 * Creates an extractor whose selector is applied to the page HTML.
 *
 * @param field    the field that will receive the selected value
 * @param selector the selector used to compute that value
 */
public FieldExtractor(Field field, Selector selector) {
    // Delegate to the three-argument constructor so the assignments exist in one place only.
    this(field, selector, Source.Html);
}
|
||||
|
||||
/**
* Creates an extractor with an explicit input source.
*
* @param field the field that will receive the selected value
* @param selector the selector used to compute that value
* @param source whether the selector is applied to the page HTML or to the page URL
*/
public FieldExtractor(Field field, Selector selector, Source source) {
|
||||
this.field = field;
|
||||
this.selector = selector;
|
||||
this.source = source;
|
||||
}
|
||||
|
||||
Field getField() {
|
||||
|
@ -27,4 +38,8 @@ class FieldExtractor {
|
|||
/**
* @return the selector this extractor was constructed with
*/
Selector getSelector() {
|
||||
return selector;
|
||||
}
|
||||
|
||||
/**
* @return the input source (page HTML or page URL) this extractor was constructed with
*/
Source getSource() {
|
||||
return source;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -47,6 +47,7 @@ public class ObjectPageProcessor implements PageProcessor {
|
|||
public void process(Page page) {
|
||||
for (PageModelExtractor pageModelExtractor : pageModelExtractorList) {
|
||||
Object process = pageModelExtractor.process(page);
|
||||
postProcessPageModel(pageModelExtractor.getClazz(), process);
|
||||
page.putField(pageModelExtractor.getClazz().getCanonicalName(), process);
|
||||
}
|
||||
for (String link : page.getHtml().links().all()) {
|
||||
|
@ -58,6 +59,9 @@ public class ObjectPageProcessor implements PageProcessor {
|
|||
}
|
||||
}
|
||||
|
||||
/**
* Extension hook called by {@code process(Page)} once per page model extractor, after the
* model object has been extracted and before it is stored on the page with {@code putField}.
* This implementation is empty; subclasses may override it.
*
* @param clazz the class returned by the extractor's {@code getClazz()}
* @param object the object returned by the extractor's {@code process(page)}
*               (NOTE(review): may be null if extraction failed - confirm against PageModelExtractor)
*/
protected void postProcessPageModel(Class clazz, Object object){
|
||||
}
|
||||
|
||||
@Override
|
||||
public Site getSite() {
|
||||
return site;
|
||||
|
|
|
@ -38,22 +38,32 @@ class PageModelExtractor {
|
|||
for (Field field : clazz.getDeclaredFields()) {
|
||||
field.setAccessible(true);
|
||||
ExtractBy extractBy = field.getAnnotation(ExtractBy.class);
|
||||
String value = extractBy.value();
|
||||
Selector selector;
|
||||
switch (extractBy.type()) {
|
||||
case Css:
|
||||
selector = new CssSelector(value);
|
||||
break;
|
||||
case Regex:
|
||||
selector = new RegexSelector(value);
|
||||
break;
|
||||
case XPath:
|
||||
selector = new XpathSelector(value);
|
||||
break;
|
||||
default:
|
||||
selector = new XpathSelector(value);
|
||||
if (extractBy != null) {
|
||||
String value = extractBy.value();
|
||||
Selector selector;
|
||||
switch (extractBy.type()) {
|
||||
case Css:
|
||||
selector = new CssSelector(value);
|
||||
break;
|
||||
case Regex:
|
||||
selector = new RegexSelector(value);
|
||||
break;
|
||||
case XPath:
|
||||
selector = new XpathSelector(value);
|
||||
break;
|
||||
default:
|
||||
selector = new XpathSelector(value);
|
||||
}
|
||||
fieldExtractors.add(new FieldExtractor(field, selector));
|
||||
}
|
||||
ExtractByUrl extractByUrl = field.getAnnotation(ExtractByUrl.class);
|
||||
if (extractByUrl != null) {
|
||||
String regexPattern = extractByUrl.value();
|
||||
if (regexPattern.trim().equals("")) {
|
||||
regexPattern = ".*";
|
||||
}
|
||||
fieldExtractors.add(new FieldExtractor(field, new RegexSelector(regexPattern), FieldExtractor.Source.Url));
|
||||
}
|
||||
fieldExtractors.add(new FieldExtractor(field, selector));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -65,7 +75,7 @@ class PageModelExtractor {
|
|||
} else {
|
||||
String[] value = ((TargetUrl) annotation).value();
|
||||
for (String s : value) {
|
||||
targetUrlPatterns.add(Pattern.compile(s.replace(".","\\.").replace("*","[^\"'#]*")));
|
||||
targetUrlPatterns.add(Pattern.compile(s.replace(".", "\\.").replace("*", "[^\"'#]*")));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -84,7 +94,15 @@ class PageModelExtractor {
|
|||
try {
|
||||
o = clazz.newInstance();
|
||||
for (FieldExtractor fieldExtractor : fieldExtractors) {
|
||||
fieldExtractor.getField().set(o, fieldExtractor.getSelector().select(page.getHtml().toString()));
|
||||
switch (fieldExtractor.getSource()) {
|
||||
case Html:
|
||||
fieldExtractor.getField().set(o, fieldExtractor.getSelector().select(page.getHtml().toString()));
|
||||
break;
|
||||
case Url:
|
||||
fieldExtractor.getField().set(o, fieldExtractor.getSelector().select(page.getUrl().toString()));
|
||||
break;
|
||||
}
|
||||
|
||||
}
|
||||
} catch (InstantiationException e) {
|
||||
e.printStackTrace();
|
||||
|
|
|
@ -6,7 +6,7 @@ package us.codecraft.webmagic.annotation;
|
|||
* Time: 10:18 PM <br>
|
||||
*/
|
||||
@TargetUrl("http://my.oschina.net/flashsword/blog/*")
|
||||
public class Blog {
|
||||
public class OschinaBlog {
|
||||
|
||||
@ExtractBy("//title")
|
||||
private String title;
|
||||
|
@ -16,7 +16,7 @@ public class Blog {
|
|||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "Blog{" +
|
||||
return "OschinaBlog{" +
|
||||
"title='" + title + '\'' +
|
||||
", content='" + content + '\'' +
|
||||
'}';
|
|
@ -15,7 +15,7 @@ public class TestFetcher {
|
|||
@Ignore("takes long")
|
||||
@Test
|
||||
public void test() {
|
||||
Spider.create(ObjectPageProcessor.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog/145796"), Blog.class)).run();
|
||||
Spider.create(ObjectPageProcessor.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog/145796"), OschinaBlog.class)).run();
|
||||
|
||||
}
|
||||
|
||||
|
|
|
@ -1168,7 +1168,7 @@ public class XpathSelectorTest {
|
|||
+ " var location = window.location;\n"
|
||||
+ " source_url = location.protocol + \"//\" + location.host + location.pathname + location.search;\n"
|
||||
+ " pre.writeAttribute('codeable_id', post_id);\n"
|
||||
+ " pre.writeAttribute('codeable_type', \"Blog\");\n"
|
||||
+ " pre.writeAttribute('codeable_type', \"OschinaBlog\");\n"
|
||||
+ " pre.writeAttribute('source_url', source_url);\n"
|
||||
+ " pre.writeAttribute('pre_index', index);\n"
|
||||
+ " pre.writeAttribute('title', 'jsoup 解析页面商品信息');\n"
|
||||
|
|
Loading…
Reference in New Issue