diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepo.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/GithubRepo.java
similarity index 66%
rename from webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepo.java
rename to webmagic-extension/src/main/java/us/codecraft/webmagic/example/GithubRepo.java
index a9e049b..d950115 100644
--- a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepo.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/GithubRepo.java
@@ -1,10 +1,6 @@
-package us.codecraft.webmagic.model;
+package us.codecraft.webmagic.example;
 
-import junit.framework.Assert;
-import org.junit.Test;
-import us.codecraft.webmagic.MockDownloader;
-import us.codecraft.webmagic.Site;
-import us.codecraft.webmagic.Task;
+import us.codecraft.webmagic.model.HasKey;
 import us.codecraft.webmagic.model.annotation.ExtractBy;
 import us.codecraft.webmagic.model.annotation.ExtractByUrl;
 import us.codecraft.webmagic.model.annotation.HelpUrl;
@@ -25,10 +21,10 @@ public class GithubRepo implements HasKey {
     @ExtractByUrl("https://github\\.com/(\\w+)/.*")
     private String author;
 
-    @ExtractBy("//div[@id='readme']")
+    @ExtractBy("//div[@id='readme']/tidyText()")
     private String readme;
 
-    @ExtractBy(value = "//div[@class='repository-lang-stats']//li//span[@class='lang']", multi = true)
+    @ExtractBy(value = "//div[@class='repository-lang-stats']//li//span[@class='lang']/text()", multi = true)
     private List language;
 
     @ExtractBy("//ul[@class='pagehead-actions']/li[2]//a[@class='social-count js-social-count']/text()")
@@ -40,18 +36,6 @@ public class GithubRepo implements HasKey {
     @ExtractByUrl
     private String url;
 
-    @Test
-    public void test() {
-        OOSpider.create(Site.me().addStartUrl("https://github.com/code4craft/webmagic").setSleepTime(0)
-                , new PageModelPipeline() {
-            @Override
-            public void process(GithubRepo o, Task task) {
-                Assert.assertEquals(78, o.getStar());
-                Assert.assertEquals(65, o.getFork());
-            }
-        }, GithubRepo.class).setDownloader(new MockDownloader()).test("https://github.com/code4craft/webmagic");
-    }
-
     @Override
     public String key() {
         return author + ":" + name;
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java
index 370b0fb..cd3e72b 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java
@@ -105,15 +105,15 @@ class PageModelExtractor {
         Formatter formatter = field.getAnnotation(Formatter.class);
         if (formatter != null) {
             if (!formatter.formatter().equals(ObjectFormatter.class)) {
-                return initFormatter(formatter);
+                return initFormatter(formatter.formatter());
             }
         }
-        return ObjectFormatters.get(fieldClazz);
+        return initFormatter(ObjectFormatters.get(fieldClazz));
     }
 
-    private ObjectFormatter initFormatter(Formatter formatter) {
+    private ObjectFormatter initFormatter(Class<? extends ObjectFormatter> formatterClazz) {
         try {
-            return formatter.formatter().newInstance();
+            return formatterClazz.newInstance();
         } catch (InstantiationException e) {
             logger.error("init ObjectFormatter fail", e);
         } catch (IllegalAccessException e) {
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicTypeFormatter.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicTypeFormatter.java
index 2669582..f9d76a8 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicTypeFormatter.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicTypeFormatter.java
@@ -25,9 +25,9 @@ public abstract class BasicTypeFormatter implements ObjectFormatter {
 
     protected abstract T formatTrimmed(String raw) throws Exception;
 
-    public static final List basicTypeFormatters = Arrays.asList(new IntegerFormatter(),
-            new LongFormatter(), new DoubleFormatter(), new FloatFormatter(), new ShortFormatter(),
-            new CharactorFormatter(), new ByteFormatter(), new BooleanFormatter());
+    public static final List<Class<? extends ObjectFormatter>> basicTypeFormatters = Arrays.<Class<? extends ObjectFormatter>>asList(IntegerFormatter.class,
+            LongFormatter.class, DoubleFormatter.class, FloatFormatter.class, ShortFormatter.class,
+            CharactorFormatter.class, ByteFormatter.class, BooleanFormatter.class);
 
     public static Class detectBasicClass(Class type) {
         if (type.equals(Integer.TYPE) || type.equals(Integer.class)) {
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/ObjectFormatters.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/ObjectFormatters.java
index 6dedc3c..7534e5e 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/ObjectFormatters.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/ObjectFormatters.java
@@ -9,19 +9,26 @@ import java.util.concurrent.ConcurrentHashMap;
  */
 public class ObjectFormatters {
 
-    private static Map formatterMap = new ConcurrentHashMap();
+    private static Map<Class, Class<? extends ObjectFormatter>> formatterMap = new ConcurrentHashMap<Class, Class<? extends ObjectFormatter>>();
 
     static {
-        for (ObjectFormatter basicTypeFormatter : BasicTypeFormatter.basicTypeFormatters) {
+        for (Class<? extends ObjectFormatter> basicTypeFormatter : BasicTypeFormatter.basicTypeFormatters) {
             put(basicTypeFormatter);
         }
+        put(DateFormatter.class);
+    }
+
+    public static void put(Class<? extends ObjectFormatter> objectFormatter) {
+        try {
+            formatterMap.put(objectFormatter.newInstance().clazz(), objectFormatter);
+        } catch (InstantiationException e) {
+            e.printStackTrace();
+        } catch (IllegalAccessException e) {
+            e.printStackTrace();
+        }
     }
 
-    public static void put(ObjectFormatter objectFormatter) {
-        formatterMap.put(objectFormatter.clazz(), objectFormatter);
-    }
-
-    public static ObjectFormatter get(Class clazz){
+    public static Class<? extends ObjectFormatter> get(Class clazz){
         return formatterMap.get(clazz);
     }
 }
diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepoTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepoTest.java
new file mode 100644
index 0000000..9755550
--- /dev/null
+++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepoTest.java
@@ -0,0 +1,26 @@
+package us.codecraft.webmagic.model;
+
+import junit.framework.Assert;
+import org.junit.Test;
+import us.codecraft.webmagic.MockDownloader;
+import us.codecraft.webmagic.Site;
+import us.codecraft.webmagic.Task;
+import us.codecraft.webmagic.example.GithubRepo;
+
+/**
+ * @author code4crafter@gmail.com
+ */
+public class GithubRepoTest {
+
+    @Test
+    public void test() {
+        OOSpider.create(Site.me().addStartUrl("https://github.com/code4craft/webmagic").setSleepTime(0)
+                , new PageModelPipeline<GithubRepo>() {
+            @Override
+            public void process(GithubRepo o, Task task) {
+                Assert.assertEquals(78, o.getStar());
+                Assert.assertEquals(65, o.getFork());
+            }
+        }, GithubRepo.class).setDownloader(new MockDownloader()).test("https://github.com/code4craft/webmagic");
+    }
+}
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java
index 7819b44..8e6602c 100644
--- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java
@@ -1,19 +1,19 @@
 package us.codecraft.webmagic.model.samples;
 
 import us.codecraft.webmagic.Site;
-import us.codecraft.webmagic.model.HasKey;
 import us.codecraft.webmagic.model.OOSpider;
 import us.codecraft.webmagic.model.annotation.ExtractBy;
 import us.codecraft.webmagic.model.annotation.TargetUrl;
 import us.codecraft.webmagic.pipeline.JsonFilePageModelPipeline;
 
+import java.util.Date;
 import java.util.List;
 
 /**
  * @author code4crafter@gmail.com
  */
 @TargetUrl("http://my.oschina.net/flashsword/blog/\\d+")
-public class OschinaBlog implements HasKey{
+public class OschinaBlog{
 
     @ExtractBy("//title")
     private String title;
@@ -24,16 +24,14 @@ public class OschinaBlog implements HasKey{
     @ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true)
     private List tags;
 
+    @ExtractBy("//div[@class='BlogStat']/regex('\\d{4}-\\d{1,2}-\\d{1,2} \\d{1,2}:\\d{1,2}')")
+    private Date date;
+
     public static void main(String[] args) {
         OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog")
                 ,new JsonFilePageModelPipeline(), OschinaBlog.class).run();
     }
 
-    @Override
-    public String key() {
-        return title;
-    }
-
     public String getTitle() {
         return title;
     }