complete objectpipeline

master
yihua.huang 2013-08-03 15:55:54 +08:00
parent 866ab0a056
commit f84b53514f
5 changed files with 72 additions and 31 deletions

View File

@ -0,0 +1,16 @@
package us.codecraft.webmagic.annotation;
import org.apache.commons.lang3.builder.ToStringBuilder;
import us.codecraft.webmagic.Task;
/**
 * A {@link PageModelPipeline} that writes every object it receives to standard output.
 *
 * @author yihua.huang@dianping.com <br>
 * @date: 13-8-3 <br>
 * Time: 3:41 <br>
 */
public class ConsolePageModelPipeline implements PageModelPipeline {

    /**
     * Prints a reflection-based string rendering of the given object to {@code System.out}.
     *
     * @param o    the object to render and print
     * @param task the task supplied by the caller; this implementation does not read it
     */
    @Override
    public void process(Object o, Task task) {
        // Render first, then print, so the output is emitted as one complete line.
        final String rendered = ToStringBuilder.reflectionToString(o);
        System.out.println(rendered);
    }
}

View File

@ -2,28 +2,57 @@ package us.codecraft.webmagic.annotation;
import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.pipeline.Pipeline;
/** /**
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* @date: 13-8-3 <br> * @date: 13-8-3 <br>
* Time: 9:51 <br> * Time: 9:51 <br>
*/ */
public class OOSpider extends Spider{ public class OOSpider extends Spider {
/** /**
* 使Spider * OOSpiderObjectPageProcessor
* *
* @param pageProcessor * @param pageProcessor
*/ */
public OOSpider(PageProcessor pageProcessor) {
super(pageProcessor); private ObjectPageProcessor objectPageProcessor;
private ObjectPipeline objectPipeline;
protected OOSpider(ObjectPageProcessor objectPageProcessor) {
super(objectPageProcessor);
this.objectPageProcessor = objectPageProcessor;
} }
public static OOSpider create(Site site,Class... pageModels) { public OOSpider(Site site, PageModelPipeline pageModelPipeline, Class... pageModels) {
OOSpider ooSpider = new OOSpider(ObjectPageProcessor.create(site, pageModels)); this(ObjectPageProcessor.create(site, pageModels));
ooSpider.pipeline(new ObjectPipeline()); this.objectPipeline = new ObjectPipeline();
return ooSpider; super.pipeline(objectPipeline);
for (Class pageModel : pageModels) {
this.objectPipeline.put(pageModel, pageModelPipeline);
}
}
public static OOSpider create(Site site, Class... pageModels) {
return new OOSpider(site, new ConsolePageModelPipeline(), pageModels);
}
public static OOSpider create(Site site, PageModelPipeline pageModelPipeline, Class... pageModels) {
return new OOSpider(site, pageModelPipeline, pageModels);
}
public OOSpider addPageModel(PageModelPipeline pageModelPipeline, Class... pageModels) {
for (Class pageModel : pageModels) {
objectPageProcessor.addPageModel(pageModel);
objectPipeline.put(pageModel, pageModelPipeline);
}
return this;
}
public Spider pipeline(Pipeline pipeline) {
throw new UnsupportedOperationException("Sorry, OOSpider can only use ObjectPipeline");
} }
} }

View File

@ -18,30 +18,31 @@ import java.util.regex.Pattern;
*/ */
public class ObjectPageProcessor implements PageProcessor { public class ObjectPageProcessor implements PageProcessor {
private List<PageModelExtractor> pageModelExtractorList; private List<PageModelExtractor> pageModelExtractorList = new ArrayList<PageModelExtractor>();
private Site site; private Site site;
private Set<Pattern> targetUrlPatterns; private Set<Pattern> targetUrlPatterns = new HashSet<Pattern>();
public static ObjectPageProcessor create(Site site, Class... clazzs) { public static ObjectPageProcessor create(Site site, Class... clazzs) {
List<PageModelExtractor> pageModelExtractorList = new ArrayList<PageModelExtractor>(); ObjectPageProcessor objectPageProcessor = new ObjectPageProcessor(site);
for (Class clazz : clazzs) { for (Class clazz : clazzs) {
PageModelExtractor pageModelExtractor = PageModelExtractor.create(clazz); objectPageProcessor.addPageModel(clazz);
pageModelExtractorList.add(pageModelExtractor);
} }
ObjectPageProcessor objectPageProcessor = new ObjectPageProcessor(site, pageModelExtractorList);
return objectPageProcessor; return objectPageProcessor;
} }
private ObjectPageProcessor(Site site, List<PageModelExtractor> pageModelExtractorList) {
this.site = site; public ObjectPageProcessor addPageModel(Class clazz){
this.pageModelExtractorList = pageModelExtractorList; PageModelExtractor pageModelExtractor = PageModelExtractor.create(clazz);
targetUrlPatterns = new HashSet<Pattern>();
for (PageModelExtractor pageModelExtractor : pageModelExtractorList) {
targetUrlPatterns.addAll(pageModelExtractor.getTargetUrlPatterns()); targetUrlPatterns.addAll(pageModelExtractor.getTargetUrlPatterns());
targetUrlPatterns.addAll(pageModelExtractor.getHelpUrlPatterns()); targetUrlPatterns.addAll(pageModelExtractor.getHelpUrlPatterns());
pageModelExtractorList.add(pageModelExtractor);
return this;
} }
private ObjectPageProcessor(Site site) {
this.site = site;
} }
@Override @Override

View File

@ -22,9 +22,9 @@ import java.util.regex.Pattern;
*/ */
class PageModelExtractor { class PageModelExtractor {
private List<Pattern> targetUrlPatterns; private List<Pattern> targetUrlPatterns = new ArrayList<Pattern>();
private List<Pattern> helpUrlPatterns; private List<Pattern> helpUrlPatterns = new ArrayList<Pattern>();
private Class clazz; private Class clazz;
@ -106,7 +106,6 @@ class PageModelExtractor {
} }
private void initTargetUrlPatterns() { private void initTargetUrlPatterns() {
targetUrlPatterns = new ArrayList<Pattern>();
Annotation annotation = clazz.getAnnotation(TargetUrl.class); Annotation annotation = clazz.getAnnotation(TargetUrl.class);
if (annotation == null) { if (annotation == null) {
targetUrlPatterns.add(Pattern.compile(".*")); targetUrlPatterns.add(Pattern.compile(".*"));
@ -116,7 +115,6 @@ class PageModelExtractor {
targetUrlPatterns.add(Pattern.compile(s.replace(".", "\\.").replace("*", "[^\"'#]*"))); targetUrlPatterns.add(Pattern.compile(s.replace(".", "\\.").replace("*", "[^\"'#]*")));
} }
} }
helpUrlPatterns = new ArrayList<Pattern>();
annotation = clazz.getAnnotation(HelpUrl.class); annotation = clazz.getAnnotation(HelpUrl.class);
if (annotation != null) { if (annotation != null) {
String[] value = ((HelpUrl) annotation).value(); String[] value = ((HelpUrl) annotation).value();

View File

@ -1,6 +1,5 @@
package us.codecraft.webmagic.annotation; package us.codecraft.webmagic.annotation;
import org.junit.Ignore;
import org.junit.Test; import org.junit.Test;
import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Site;
@ -11,13 +10,11 @@ import us.codecraft.webmagic.Site;
*/ */
public class TestFetcher { public class TestFetcher {
@Ignore("takes long") // @Ignore("takes long")
@Test @Test
public void test() { public void test() {
ObjectPipeline objectPipeline = new ObjectPipeline();
OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog/145796"), OschinaBlog.class) OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog/145796"), OschinaBlog.class)
.pipeline(objectPipeline); .run();
OschinaBlog oschinaBlog = null;
} }