update email

master
yihua.huang 2013-08-03 14:01:18 +08:00
parent 7c9e9ce869
commit 866ab0a056
17 changed files with 110 additions and 29 deletions

View File

@ -5,7 +5,7 @@ import java.util.Map;
/** /**
* PageProcessor{@link us.codecraft.webmagic.pipeline.Pipeline}<br> * PageProcessor{@link us.codecraft.webmagic.pipeline.Pipeline}<br>
* @author yihua.huang@dianping.com <br> * @author code4crafter@gmail.com <br>
* @date: 13-7-25 <br> * @date: 13-7-25 <br>
* Time: 12:20 <br> * Time: 12:20 <br>
*/ */

View File

@ -90,10 +90,6 @@ public class Spider implements Runnable, Task {
return new Spider(pageProcessor); return new Spider(pageProcessor);
} }
public static Spider create(Site site,Class... pageModels) {
return new Spider(ObjectPageProcessor.create(site,pageModels));
}
/** /**
* startUrlsSitestartUrls * startUrlsSitestartUrls
* *

View File

@ -0,0 +1,15 @@
package us.codecraft.webmagic.annotation;
import us.codecraft.webmagic.Page;
/**
 * Callback for page models that need extra work once their fields have been extracted.<br>
 *
 * @param <T> type of the extracted page model object
 * @author code4crafter@gmail.com <br>
 * @date: 13-8-3 <br>
 * Time: 9:42 <br>
 */
public interface AfterExtractor<T> {

    /**
     * Invoked with the page and the model object after the model's fields have been set.
     *
     * @param page the page the model object was extracted from
     * @param t    the extracted model object
     */
    void afterProcess(Page page, T t);
}

View File

@ -5,7 +5,7 @@ import java.lang.annotation.Retention;
import java.lang.annotation.Target; import java.lang.annotation.Target;
/** /**
* @author yihua.huang@dianping.com <br> * @author code4crafter@gmail.com <br>
* @date: 13-8-1 <br> * @date: 13-8-1 <br>
* Time: 8:40 <br> * Time: 8:40 <br>
*/ */

View File

@ -5,7 +5,7 @@ import java.lang.annotation.Retention;
import java.lang.annotation.Target; import java.lang.annotation.Target;
/** /**
* @author yihua.huang@dianping.com <br> * @author code4crafter@gmail.com <br>
* @date: 13-8-1 <br> * @date: 13-8-1 <br>
* Time: 8:40 <br> * Time: 8:40 <br>
*/ */

View File

@ -6,7 +6,7 @@ import java.lang.reflect.Field;
import java.lang.reflect.Method; import java.lang.reflect.Method;
/** /**
* @author yihua.huang@dianping.com <br> * @author code4crafter@gmail.com <br>
* @date: 13-8-1 <br> * @date: 13-8-1 <br>
* Time: 9:48 <br> * Time: 9:48 <br>
*/ */

View File

@ -5,7 +5,7 @@ import java.lang.annotation.Retention;
import java.lang.annotation.Target; import java.lang.annotation.Target;
/** /**
* @author yihua.huang@dianping.com <br> * @author code4crafter@gmail.com <br>
* @date: 13-8-1 <br> * @date: 13-8-1 <br>
* Time: 8:40 <br> * Time: 8:40 <br>
*/ */

View File

@ -0,0 +1,29 @@
package us.codecraft.webmagic.annotation;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
/**
 * Spider variant for annotated page-model classes: it is wired with an
 * {@code ObjectPageProcessor} and an {@code ObjectPipeline} by {@link #create}.
 *
 * @author code4crafter@gmail.com <br>
 * @date: 13-8-3 <br>
 * Time: 9:51 <br>
 */
public class OOSpider extends Spider {

    /**
     * Creates a spider that uses the given page processor.
     *
     * @param pageProcessor processor passed straight to the {@code Spider} constructor
     */
    public OOSpider(PageProcessor pageProcessor) {
        super(pageProcessor);
    }

    /**
     * Builds an {@code OOSpider} for the given site and page-model classes.
     * The processor comes from {@code ObjectPageProcessor.create(site, pageModels)} and a
     * new {@code ObjectPipeline} is registered on the spider before it is returned.
     *
     * @param site       site configuration handed to the page processor factory
     * @param pageModels page-model classes handed to the page processor factory
     * @return the configured spider
     */
    public static OOSpider create(Site site, Class... pageModels) {
        final PageProcessor modelProcessor = ObjectPageProcessor.create(site, pageModels);
        final OOSpider spider = new OOSpider(modelProcessor);
        spider.pipeline(new ObjectPipeline());
        return spider;
    }
}

View File

@ -12,7 +12,7 @@ import java.util.Set;
import java.util.regex.Pattern; import java.util.regex.Pattern;
/** /**
* @author yihua.huang@dianping.com <br> * @author code4crafter@gmail.com <br>
* @date: 13-8-1 <br> * @date: 13-8-1 <br>
* Time: 8:46 <br> * Time: 8:46 <br>
*/ */

View File

@ -4,18 +4,36 @@ import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task; import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline; import us.codecraft.webmagic.pipeline.Pipeline;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
/** /**
* @author yihua.huang@dianping.com <br> * @author code4crafter@gmail.com <br>
* @date: 13-8-2 <br> * @date: 13-8-2 <br>
* Time: 10:47 <br> * Time: 10:47 <br>
*/ */
public class ObjectPipeline implements Pipeline { public class ObjectPipeline implements Pipeline {
private Map<Class, PageModelPipeline> pageModelPipelines = new ConcurrentHashMap<Class, PageModelPipeline>();
public ObjectPipeline() {
}
public ObjectPipeline put(Class clazz, PageModelPipeline pageModelPipeline) {
pageModelPipelines.put(clazz, pageModelPipeline);
return this;
}
@Override @Override
public void process(ResultItems resultItems, Task task) { public void process(ResultItems resultItems, Task task) {
if (resultItems.isSkip()) {
} return;
}
public <T> T read() { for (Map.Entry<Class, PageModelPipeline> classPageModelPipelineEntry : pageModelPipelines.entrySet()) {
return null; Object o = resultItems.get(classPageModelPipelineEntry.getKey().getCanonicalName());
if (o != null) {
classPageModelPipelineEntry.getValue().process(o, task);
}
}
} }
} }

View File

@ -16,7 +16,7 @@ import java.util.List;
import java.util.regex.Pattern; import java.util.regex.Pattern;
/** /**
* @author yihua.huang@dianping.com <br> * @author code4crafter@gmail.com <br>
* @date: 13-8-1 <br> * @date: 13-8-1 <br>
* Time: 9:33 <br> * Time: 9:33 <br>
*/ */
@ -30,6 +30,8 @@ class PageModelExtractor {
private List<FieldExtractor> fieldExtractors; private List<FieldExtractor> fieldExtractors;
private AfterExtractor afterExtractor;
public static PageModelExtractor create(Class clazz) { public static PageModelExtractor create(Class clazz) {
PageModelExtractor pageModelExtractor = new PageModelExtractor(); PageModelExtractor pageModelExtractor = new PageModelExtractor();
pageModelExtractor.init(clazz); pageModelExtractor.init(clazz);
@ -40,6 +42,13 @@ class PageModelExtractor {
this.clazz = clazz; this.clazz = clazz;
initTargetUrlPatterns(); initTargetUrlPatterns();
fieldExtractors = new ArrayList<FieldExtractor>(); fieldExtractors = new ArrayList<FieldExtractor>();
// Install the post-extraction hook only for model classes that implement AfterExtractor.
// Class.isAssignableFrom reads "the receiver is a supertype of the argument", so the
// interface must be the receiver. The reversed call was true only when clazz was
// AfterExtractor itself or Object, so the hook was never set for a real model class.
if (AfterExtractor.class.isAssignableFrom(clazz)) {
    try {
        // A separate instance of the model class serves as the callback target.
        afterExtractor = (AfterExtractor) clazz.newInstance();
    } catch (Exception e) {
        // Any instantiation failure is reported as a bad model class, with the cause preserved.
        throw new IllegalArgumentException(
                "Cannot instantiate AfterExtractor model class " + clazz.getName(), e);
    }
}
for (Field field : clazz.getDeclaredFields()) { for (Field field : clazz.getDeclaredFields()) {
field.setAccessible(true); field.setAccessible(true);
if (!field.getType().isAssignableFrom(String.class)){ if (!field.getType().isAssignableFrom(String.class)){
@ -147,6 +156,9 @@ class PageModelExtractor {
} }
setField(o, fieldExtractor, value); setField(o, fieldExtractor, value);
} }
if (afterExtractor!=null){
afterExtractor.afterProcess(page,o);
}
} catch (InstantiationException e) { } catch (InstantiationException e) {
e.printStackTrace(); e.printStackTrace();
} catch (IllegalAccessException e) { } catch (IllegalAccessException e) {

View File

@ -0,0 +1,14 @@
package us.codecraft.webmagic.annotation;
import us.codecraft.webmagic.Task;
/**
 * Consumer of extracted page-model objects of a single type.
 *
 * @param <T> type of the page-model object this pipeline handles
 * @author code4crafter@gmail.com <br>
 * @date: 13-8-3 <br>
 * Time: 9:34 <br>
 */
public interface PageModelPipeline<T> {

    /**
     * Handles one extracted model object.
     *
     * @param t    the extracted model object
     * @param task the task the object was produced under
     */
    void process(T t, Task task);
}

View File

@ -5,7 +5,7 @@ import java.lang.annotation.Retention;
import java.lang.annotation.Target; import java.lang.annotation.Target;
/** /**
* @author yihua.huang@dianping.com <br> * @author code4crafter@gmail.com <br>
* @date: 13-8-1 <br> * @date: 13-8-1 <br>
* Time: 8:40 <br> * Time: 8:40 <br>
*/ */
@ -14,4 +14,5 @@ import java.lang.annotation.Target;
public @interface TargetUrl { public @interface TargetUrl {
String[] value(); String[] value();
} }

View File

@ -2,7 +2,7 @@ package us.codecraft.webmagic.downloader;
/** /**
* Spiderdestroy()<br> * Spiderdestroy()<br>
* @author yihua.huang@dianping.com <br> * @author code4crafter@gmail.com <br>
* @date: 13-7-26 <br> * @date: 13-7-26 <br>
* Time: 3:10 <br> * Time: 3:10 <br>
*/ */

View File

@ -3,7 +3,6 @@ package us.codecraft.webmagic.annotation;
import org.junit.Ignore; import org.junit.Ignore;
import org.junit.Test; import org.junit.Test;
import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
/** /**
* @author yihua.huang@dianping.com <br> * @author yihua.huang@dianping.com <br>
@ -16,12 +15,9 @@ public class TestFetcher {
@Test @Test
public void test() { public void test() {
ObjectPipeline objectPipeline = new ObjectPipeline(); ObjectPipeline objectPipeline = new ObjectPipeline();
Spider.create(ObjectPageProcessor.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog/145796"), OschinaBlog.class)) OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog/145796"), OschinaBlog.class)
.pipeline(objectPipeline).runAsync(); .pipeline(objectPipeline);
OschinaBlog oschinaBlog = null; OschinaBlog oschinaBlog = null;
while ((oschinaBlog = objectPipeline.read()) != null) {
System.out.println(oschinaBlog);
}
} }

View File

@ -1,8 +1,8 @@
package us.codecraft.webmagic.annotation.samples; package us.codecraft.webmagic.annotation.samples;
import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.annotation.ExtractBy; import us.codecraft.webmagic.annotation.ExtractBy;
import us.codecraft.webmagic.annotation.OOSpider;
import us.codecraft.webmagic.annotation.TargetUrl; import us.codecraft.webmagic.annotation.TargetUrl;
/** /**
@ -28,7 +28,7 @@ public class IteyeBlog implements Blog{
} }
public static void main(String[] args) { public static void main(String[] args) {
Spider.create(Site.me().addStartUrl("http://dengminhui.iteye.com/blog"),IteyeBlog.class).run(); OOSpider.create(Site.me().addStartUrl("http://dengminhui.iteye.com/blog"), IteyeBlog.class).run();
} }
public String getTitle() { public String getTitle() {

View File

@ -1,8 +1,8 @@
package us.codecraft.webmagic.annotation.samples; package us.codecraft.webmagic.annotation.samples;
import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.annotation.ExtractBy; import us.codecraft.webmagic.annotation.ExtractBy;
import us.codecraft.webmagic.annotation.OOSpider;
import us.codecraft.webmagic.annotation.TargetUrl; import us.codecraft.webmagic.annotation.TargetUrl;
/** /**
@ -28,7 +28,7 @@ public class OschinaBlog implements Blog{
} }
public static void main(String[] args) { public static void main(String[] args) {
Spider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog"),OschinaBlog.class).run(); OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog"), OschinaBlog.class).run();
} }
public String getTitle() { public String getTitle() {