diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/Fetcher.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/Fetcher.java new file mode 100644 index 0000000..86f78db --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/Fetcher.java @@ -0,0 +1,21 @@ +package us.codecraft.webmagic.annotation; + +import java.lang.annotation.ElementType; +import java.lang.annotation.Retention; +import java.lang.annotation.Target; + +/** + * @author yihua.huang@dianping.com
+ * @date: 13-8-1
+ * Time: 8:40 PM
+ */
+@Target({ElementType.FIELD})
+@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
+public @interface Fetcher {
+    /** Selector expression; how it is interpreted is decided by {@link #type()}. */
+    String value();
+    /** Kinds of selector expression a field may declare. */
+    enum Type { XPath, Regex, Css }
+    /** Kind of expression held in {@link #value()}; XPath when not specified. */
+    Type type() default Type.XPath;
+}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/FieldFetcher.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/FieldFetcher.java
new file mode 100644
index 0000000..ee9962b
--- /dev/null
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/FieldFetcher.java
@@ -0,0 +1,30 @@
+package us.codecraft.webmagic.annotation;
+
+import us.codecraft.webmagic.selector.Selector;
+
+import java.lang.reflect.Field;
+
+/**
+ * @author yihua.huang@dianping.com
+ * @date: 13-8-1
+ * Time: 9:48 PM
+ */
+class FieldFetcher {
+    /** Reflective handle of the model field that receives the selected value. */
+    private final Field field;
+    /** Selector whose result is written into {@link #field}. */
+    private final Selector selector;
+    /** Pairs a model field with the selector used to fill it. */
+    FieldFetcher(Field field, Selector selector) {
+        this.selector = selector;
+        this.field = field;
+    }
+
+    Field getField() {
+        return this.field;
+    }
+
+    Selector getSelector() {
+        return this.selector;
+    }
+}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ObjectPageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ObjectPageProcessor.java
new file mode 100644
index 0000000..98c969e
--- /dev/null
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ObjectPageProcessor.java
@@ -0,0 +1,65 @@
+package us.codecraft.webmagic.annotation;
+
+import us.codecraft.webmagic.Page;
+import us.codecraft.webmagic.Request;
+import us.codecraft.webmagic.Site;
+import us.codecraft.webmagic.processor.PageProcessor;
+
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import java.util.regex.Pattern;
+
+/**
+ * @author yihua.huang@dianping.com
+ * @date: 13-8-1
+ * Time: 8:46 PM
+ */
+public class ObjectPageProcessor implements PageProcessor {
+    private final Site site;
+    /** One fetcher per model class, in the order the classes were passed to create. */
+    private final List<PageModelFetcher> pageModelFetcherList;
+    /** Union of the target URL patterns of all model fetchers. */
+    private final Set<Pattern> targetUrlPatterns;
+
+    /** Creates a processor holding one PageModelFetcher for each given model class. */
+    public static ObjectPageProcessor create(Site site, Class... clazzs) {
+        List<PageModelFetcher> fetchers = new ArrayList<PageModelFetcher>();
+        for (Class clazz : clazzs) {
+            fetchers.add(PageModelFetcher.create(clazz));
+        }
+        return new ObjectPageProcessor(site, fetchers);
+    }
+
+    private ObjectPageProcessor(Site site, List<PageModelFetcher> pageModelFetcherList) {
+        this.site = site;
+        this.pageModelFetcherList = pageModelFetcherList;
+        this.targetUrlPatterns = new HashSet<Pattern>();
+        for (PageModelFetcher pageModelFetcher : pageModelFetcherList) {
+            targetUrlPatterns.addAll(pageModelFetcher.getTargetUrlPatterns());
+        }
+    }
+
+    /** Stores each fetcher's result under its model's canonical class name, then queues links that fully match a target pattern. */
+    @Override
+    public void process(Page page) {
+        for (PageModelFetcher pageModelFetcher : pageModelFetcherList) {
+            Object model = pageModelFetcher.process(page);
+            page.putField(pageModelFetcher.getClazz().getCanonicalName(), model);
+        }
+        for (String link : page.getHtml().links().all()) {
+            for (Pattern targetUrlPattern : targetUrlPatterns) {
+                if (targetUrlPattern.matcher(link).matches()) {
+                    page.addTargetRequest(new Request(link));
+                    break; // queue a link once, even when several patterns match it
+                }
+            }
+        }
+    }
+
+    @Override
+    public Site getSite() {
+        return site;
+    }
+}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModelFetcher.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModelFetcher.java
new file mode 100644
index 0000000..097f1af
--- /dev/null
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModelFetcher.java
@@ -0,0 +1,104 @@
+package us.codecraft.webmagic.annotation;
+
+import us.codecraft.webmagic.Page;
+import us.codecraft.webmagic.selector.CssSelector;
+import us.codecraft.webmagic.selector.RegexSelector;
+import us.codecraft.webmagic.selector.Selector;
+import us.codecraft.webmagic.selector.XpathSelector; + +import java.lang.annotation.Annotation; +import java.lang.reflect.Field; +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Pattern; + +/** + * @author yihua.huang@dianping.com
+ * @date: 13-8-1
+ * Time: 9:33 PM
+ */
+class PageModelFetcher {
+    private Class clazz;
+    /** Compiled URL patterns; a page is processed only if its URL fully matches one of them. */
+    private List<Pattern> targetUrlPatterns;
+    /** Fields annotated with @Fetcher, each paired with the selector that fills it. */
+    private List<FieldFetcher> fieldFetchers;
+
+    /** Creates a fetcher configured from the annotations of the given model class. */
+    public static PageModelFetcher create(Class clazz) {
+        PageModelFetcher pageModelFetcher = new PageModelFetcher();
+        pageModelFetcher.init(clazz);
+        return pageModelFetcher;
+    }
+
+    /** Reads the class-level URL patterns, then builds one FieldFetcher per annotated field. */
+    private void init(Class clazz) {
+        this.clazz = clazz;
+        initTargetUrlPatterns();
+        fieldFetchers = new ArrayList<FieldFetcher>();
+        for (Field field : clazz.getDeclaredFields()) {
+            Fetcher fetcher = field.getAnnotation(Fetcher.class);
+            if (fetcher == null) {
+                continue; // unannotated field: nothing to fetch, and fetcher.value() would throw
+            }
+            field.setAccessible(true);
+            Selector selector;
+            switch (fetcher.type()) {
+                case Css:
+                    selector = new CssSelector(fetcher.value());
+                    break;
+                case Regex:
+                    selector = new RegexSelector(fetcher.value());
+                    break;
+                default: // XPath, which is also the annotation's default type
+                    selector = new XpathSelector(fetcher.value());
+            }
+            fieldFetchers.add(new FieldFetcher(field, selector));
+        }
+    }
+
+    /** Compiles @TargetUrl values ("." escaped, "*" widened to a character class); without the annotation every URL matches. */
+    private void initTargetUrlPatterns() {
+        targetUrlPatterns = new ArrayList<Pattern>();
+        Annotation annotation = clazz.getAnnotation(TargetUrl.class);
+        if (annotation == null) {
+            targetUrlPatterns.add(Pattern.compile(".*"));
+        } else {
+            for (String s : ((TargetUrl) annotation).value()) {
+                targetUrlPatterns.add(Pattern.compile(s.replace(".","\\.").replace("*","[^\"'#]*")));
+            }
+        }
+    }
+
+    /** Returns a model filled from the page, or null if the URL matches no target pattern; reflection failures are printed and the object built so far (possibly null) is returned. */
+    public Object process(Page page) {
+        boolean matched = false;
+        for (Pattern targetPattern : targetUrlPatterns) {
+            if (targetPattern.matcher(page.getUrl().toString()).matches()) {
+                matched = true;
+            }
+        }
+        if (!matched) {
+            return null;
+        }
+        Object o = null;
+        try {
+            o = clazz.newInstance();
+            for (FieldFetcher fieldFetcher : fieldFetchers) {
+                fieldFetcher.getField().set(o, fieldFetcher.getSelector().select(page.getHtml().toString()));
+            }
+        } catch (InstantiationException e) {
+            e.printStackTrace();
+        } catch (IllegalAccessException e) {
+            e.printStackTrace();
+        }
+        return o;
+    }
+
+    Class getClazz() {
+        return clazz;
+    }
+    List<Pattern> getTargetUrlPatterns() {
+        return targetUrlPatterns;
+    }
+}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/TargetUrl.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/TargetUrl.java
new file mode 100644
index 0000000..f4f58ed
--- /dev/null
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/TargetUrl.java
@@ -0,0 +1,17 @@
+package us.codecraft.webmagic.annotation;
+
+import java.lang.annotation.ElementType;
+import java.lang.annotation.Retention;
+import java.lang.annotation.Target;
+
+/**
+ * @author yihua.huang@dianping.com
+ * @date: 13-8-1
+ * Time: 8:40 PM
+ */
+@Target({ElementType.TYPE})
+@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
+public @interface TargetUrl {
+    /** URL patterns of the pages the annotated model applies to; PageModelFetcher escapes "." and widens "*" before compiling them. */
+    String[] value();
+}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selector.java
index 845c0b6..4af2b44 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selector.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selector.java
@@ -8,7 +8,7 @@ import java.util.List;
  * Date: 13-4-20
  * Time: 下午8:02
  */
-interface Selector {
+public interface Selector {
 
     public String select(String text);
 
diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/annotation/Blog.java b/webmagic-core/src/test/java/us/codecraft/webmagic/annotation/Blog.java
new file mode 100644
index 0000000..6c6e88c
--- /dev/null
+++ b/webmagic-core/src/test/java/us/codecraft/webmagic/annotation/Blog.java
@@ -0,0 +1,24 @@
+package us.codecraft.webmagic.annotation;
+
+/**
+ * @author yihua.huang@dianping.com
+ * @date: 13-8-1
+ * Time: 10:18 PM
+ */
+@TargetUrl("http://djjchobits.iteye.com/blog/\\d+")
+public class Blog {
+    /** Filled from the XPath expression {@code //title}. */
+    @Fetcher("//title")
+    private String title;
+    /** Filled from the CSS selector {@code div#main}. */
+    @Fetcher(type = Fetcher.Type.Css, value = "div#main")
+    private String content;
+    /** Renders both fields in the form {@code Blog{title='...', content='...'}}. */
+    @Override
+    public String toString() {
+        StringBuilder text = new StringBuilder("Blog{");
+        text.append("title='").append(title).append('\'');
+        text.append(", content='").append(content).append('\'');
+        return text.append('}').toString();
+    }
+}
diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/annotation/TestFetcher.java b/webmagic-core/src/test/java/us/codecraft/webmagic/annotation/TestFetcher.java
new file mode 100644
index 0000000..5318703
--- /dev/null
+++ b/webmagic-core/src/test/java/us/codecraft/webmagic/annotation/TestFetcher.java
@@ -0,0 +1,20 @@
+package us.codecraft.webmagic.annotation;
+
+import org.junit.Test;
+import us.codecraft.webmagic.Site;
+import us.codecraft.webmagic.Spider;
+
+/**
+ * @author yihua.huang@dianping.com
+ * @date: 13-8-1
+ * Time: 8:42 PM
+ */
+public class TestFetcher {
+    /** Runs a spider over one blog URL with a processor built from the Blog model; nothing is asserted. */
+    @Test
+    public void test() {
+        Site site = Site.me().addStartUrl("http://djjchobits.iteye.com/blog/569000");
+        ObjectPageProcessor processor = ObjectPageProcessor.create(site, Blog.class);
+        Spider.create(processor).run();
+    }
+}