diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java index 876c48a..db5326b 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java @@ -83,14 +83,13 @@ class PageModelExtractor { return; } if (!formatter.formatter().equals(Formatter.DEFAULT_FORMATTER)) { - ObjectFormatter objectFormatter = initFormatter(formatter.formatter()); - objectFormatter.initParam(formatter.value()); + ObjectFormatter objectFormatter = initFormatter(formatter.formatter(), formatter.value()); fieldExtractor.setObjectFormatter(objectFormatter); return; } if (!fieldExtractor.isMulti() && !String.class.isAssignableFrom(field.getType())) { Class fieldClazz = BasicTypeFormatter.detectBasicClass(field.getType()); - ObjectFormatter objectFormatter = getObjectFormatter(field, fieldClazz, formatter); + ObjectFormatter objectFormatter = initFormatter(ObjectFormatters.get(fieldClazz), formatter.value()); if (objectFormatter == null) { throw new IllegalStateException("Can't find formatter for field " + field.getName() + " of type " + fieldClazz); } else { @@ -100,30 +99,22 @@ class PageModelExtractor { if (!List.class.isAssignableFrom(field.getType())) { throw new IllegalStateException("Field " + field.getName() + " must be list"); } - if (formatter != null) { - if (!formatter.subClazz().equals(Void.class)) { - ObjectFormatter objectFormatter = getObjectFormatter(field, formatter.subClazz(), formatter); - if (objectFormatter == null) { - throw new IllegalStateException("Can't find formatter for field " + field.getName() + " of type " + formatter.subClazz()); - } else { - fieldExtractor.setObjectFormatter(objectFormatter); - } + if (!formatter.subClazz().equals(Void.class)) { + ObjectFormatter objectFormatter = initFormatter(ObjectFormatters.get(formatter.subClazz()), formatter.value()); + if (objectFormatter == null) { + throw new IllegalStateException("Can't find formatter for field " + field.getName() + " of type " + formatter.subClazz()); + } else { + fieldExtractor.setObjectFormatter(objectFormatter); } } } } - private ObjectFormatter getObjectFormatter(Field field, Class fieldClazz, Formatter formatter) { - ObjectFormatter objectFormatter = initFormatter(ObjectFormatters.get(fieldClazz)); - if(formatter != null && formatter.value() != null){ - objectFormatter.initParam(formatter.value()); - } - return objectFormatter; - } - - private ObjectFormatter initFormatter(Class formatterClazz) { + private ObjectFormatter initFormatter(Class formatterClazz, String[] params) { try { - return formatterClazz.newInstance(); + ObjectFormatter objectFormatter = formatterClazz.newInstance(); + objectFormatter.initParam(params); + return objectFormatter; } catch (InstantiationException e) { throw new RuntimeException(e); } catch (IllegalAccessException e) { diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepoApi.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepoApi.java new file mode 100644 index 0000000..3750645 --- /dev/null +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepoApi.java @@ -0,0 +1,18 @@ +package us.codecraft.webmagic.model; + +import us.codecraft.webmagic.model.annotation.ExtractBy; + +/** + * @author code4crafter@gmail.com + * Date: 2017/6/3 + * Time: 下午9:07 + */ +public class GithubRepoApi { + + @ExtractBy(type = ExtractBy.Type.JsonPath, value = "$.name",source = ExtractBy.Source.RawText) + private String name; + + public String getName() { + return name; + } +} diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepoTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepoTest.java index 1f3b2df..632dd86 100644 --- a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepoTest.java +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepoTest.java @@ -1,12 +1,11 @@ package us.codecraft.webmagic.model; import org.junit.Test; -import us.codecraft.webmagic.SimpleHttpClient; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.downloader.MockGithubDownloader; -import us.codecraft.webmagic.pipeline.PageModelPipeline; import us.codecraft.webmagic.example.GithubRepo; +import us.codecraft.webmagic.pipeline.PageModelPipeline; import static org.assertj.core.api.Assertions.assertThat; @@ -27,11 +26,4 @@ public class GithubRepoTest { }, GithubRepo.class).addUrl("https://github.com/code4craft/webmagic").setDownloader(new MockGithubDownloader()).test("https://github.com/code4craft/webmagic"); } - @Test - public void test1() throws Exception { - SimpleHttpClient simpleHttpClient = new SimpleHttpClient(); - GithubRepo model = simpleHttpClient.get("https://github.com/code4craft/webmagic",GithubRepo.class); - System.out.println(model); - - } } diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/MockModel.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/MockModel.java deleted file mode 100644 index 6531053..0000000 --- a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/MockModel.java +++ /dev/null @@ -1,13 +0,0 @@ -package us.codecraft.webmagic.model; - -import us.codecraft.webmagic.model.annotation.HelpUrl; -import us.codecraft.webmagic.model.annotation.TargetUrl; - -/** - * @author code4crafer@gmail.com - */ -@TargetUrl(value = "http://webmagic.io/post/\\d+",sourceRegion = "//li[@class='post']") -@HelpUrl(value = "http://webmagic.io/list/\\d+",sourceRegion = "//li[@class='list']") -public class MockModel { - -} diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/ModelPageProcessorTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/ModelPageProcessorTest.java index 48f6073..0fb71c8 100644 --- a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/ModelPageProcessorTest.java +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/ModelPageProcessorTest.java @@ -1,19 +1,13 @@ package us.codecraft.webmagic.model; -import org.apache.commons.io.IOUtils; -import org.apache.commons.lang3.time.DateFormatUtils; import org.junit.Test; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.model.annotation.ExtractBy; -import us.codecraft.webmagic.model.annotation.Formatter; +import us.codecraft.webmagic.model.annotation.HelpUrl; import us.codecraft.webmagic.model.annotation.TargetUrl; -import us.codecraft.webmagic.model.formatter.DateFormatter; import us.codecraft.webmagic.selector.PlainText; -import java.io.IOException; -import java.util.Date; - import static org.assertj.core.api.Assertions.assertThat; /** @@ -22,6 +16,8 @@ import static org.assertj.core.api.Assertions.assertThat; */ public class ModelPageProcessorTest { + private PageMocker pageMocker = new PageMocker(); + @TargetUrl("http://codecraft.us/foo") public static class ModelFoo { @@ -38,15 +34,10 @@ public class ModelPageProcessorTest { } - public static class ModelDate { + @TargetUrl(value = "http://webmagic.io/post/\\d+",sourceRegion = "//li[@class='post']") + @HelpUrl(value = "http://webmagic.io/list/\\d+",sourceRegion = "//li[@class='list']") + public static class MockModel { - @Formatter(value = "yyyyMMdd", formatter = DateFormatter.class) - @ExtractBy(value = "//div[@class='date']/text()", notNull = true) - private Date date; - - public Date getDate() { - return date; - } } @Test @@ -63,26 +54,11 @@ public class ModelPageProcessorTest { @Test public void testExtractLinks() throws Exception { ModelPageProcessor modelPageProcessor = ModelPageProcessor.create(null, MockModel.class); - Page page = getMockPage(); + Page page = pageMocker.getMockPage(); modelPageProcessor.process(page); assertThat(page.getTargetRequests()).containsExactly(new Request("http://webmagic.io/list/1"), new Request("http://webmagic.io/list/2"), new Request("http://webmagic.io/post/1"), new Request("http://webmagic.io/post/2")); } - @Test - public void testExtractDate() throws Exception { - ModelPageProcessor modelPageProcessor = ModelPageProcessor.create(null, ModelDate.class); - Page page = getMockPage(); - modelPageProcessor.process(page); - ModelDate modelDate = (ModelDate) page.getResultItems().get(ModelDate.class.getCanonicalName()); - assertThat(DateFormatUtils.format(modelDate.getDate(),"yyyyMMdd")).isEqualTo("20170603"); - } - private Page getMockPage() throws IOException { - Page page = new Page(); - page.setRawText(IOUtils.toString(getClass().getClassLoader().getResourceAsStream("html/mock-webmagic.html"))); - page.setRequest(new Request("http://webmagic.io/list/0")); - page.setUrl(new PlainText("http://webmagic.io/list/0")); - return page; - } } diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/PageMapperTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/PageMapperTest.java index 8588365..45938d6 100644 --- a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/PageMapperTest.java +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/PageMapperTest.java @@ -1,13 +1,6 @@ package us.codecraft.webmagic.model; -import org.apache.commons.io.IOUtils; import org.junit.Test; -import us.codecraft.webmagic.Page; -import us.codecraft.webmagic.Request; -import us.codecraft.webmagic.model.annotation.ExtractBy; -import us.codecraft.webmagic.selector.PlainText; - -import java.io.IOException; import static org.assertj.core.api.Assertions.assertThat; @@ -18,29 +11,13 @@ import static org.assertj.core.api.Assertions.assertThat; */ public class PageMapperTest { - public static class GithubRepo { - - @ExtractBy(type = ExtractBy.Type.JsonPath, value = "$.name",source = ExtractBy.Source.RawText) - private String name; - - public String getName() { - return name; - } - } + private PageMocker pageMocker = new PageMocker(); @Test public void test_get() throws Exception { - PageMapper pageMapper = new PageMapper(GithubRepo.class); - GithubRepo githubRepo = pageMapper.get(getMockJsonPage()); + PageMapper pageMapper = new PageMapper(GithubRepoApi.class); + GithubRepoApi githubRepo = pageMapper.get(pageMocker.getMockJsonPage()); assertThat(githubRepo.getName()).isEqualTo("webmagic"); } - private Page getMockJsonPage() throws IOException { - Page page = new Page(); - page.setRawText(IOUtils.toString(getClass().getClassLoader().getResourceAsStream("json/mock-githubrepo.json"))); - page.setRequest(new Request("https://api.github.com/repos/code4craft/webmagic")); - page.setUrl(new PlainText("https://api.github.com/repos/code4craft/webmagic")); - return page; - } - } diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/PageMocker.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/PageMocker.java new file mode 100644 index 0000000..4b0c133 --- /dev/null +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/PageMocker.java @@ -0,0 +1,32 @@ +package us.codecraft.webmagic.model; + +import org.apache.commons.io.IOUtils; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.selector.PlainText; + +import java.io.IOException; + +/** + * @author code4crafter@gmail.com + * Date: 2017/6/3 + * Time: 下午9:08 + */ +public class PageMocker { + + public Page getMockJsonPage() throws IOException { + Page page = new Page(); + page.setRawText(IOUtils.toString(PageMocker.class.getClassLoader().getResourceAsStream("json/mock-githubrepo.json"))); + page.setRequest(new Request("https://api.github.com/repos/code4craft/webmagic")); + page.setUrl(new PlainText("https://api.github.com/repos/code4craft/webmagic")); + return page; + } + + public Page getMockPage() throws IOException { + Page page = new Page(); + page.setRawText(IOUtils.toString(PageMocker.class.getClassLoader().getResourceAsStream("html/mock-webmagic.html"))); + page.setRequest(new Request("http://webmagic.io/list/0")); + page.setUrl(new PlainText("http://webmagic.io/list/0")); + return page; + } +} diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/PageModelExtractorTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/PageModelExtractorTest.java new file mode 100644 index 0000000..77739e5 --- /dev/null +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/PageModelExtractorTest.java @@ -0,0 +1,103 @@ +package us.codecraft.webmagic.model; + +import org.apache.commons.lang3.time.DateFormatUtils; +import org.junit.Test; +import us.codecraft.webmagic.model.annotation.ExtractBy; +import us.codecraft.webmagic.model.annotation.Formatter; +import us.codecraft.webmagic.model.formatter.DateFormatter; + +import java.util.Date; +import java.util.List; + +import static org.assertj.core.api.Assertions.assertThat; + +/** + * @author code4crafter@gmail.com + * Date: 2017/6/3 + * Time: 下午9:06 + */ +public class PageModelExtractorTest { + + private PageMocker pageMocker = new PageMocker(); + + public static class ModelDateStr { + + @ExtractBy(value = "//div[@class='date']/text()", notNull = true) + private String dateStr; + + } + + public static class ModelDate { + + @Formatter(value = "yyyyMMdd", formatter = DateFormatter.class) + @ExtractBy(value = "//div[@class='date']/text()", notNull = true) + private Date date; + + } + + public static class ModelInt { + + @ExtractBy(value = "//div[@class='number']/text()", notNull = true) + private int number; + + } + + public static class ModelStringList { + + @ExtractBy("//a/@href") + private List links; + + } + + public static class ModelIntList { + + @Formatter(subClazz = Integer.class) + @ExtractBy("//li[@class='numbers']/text()") + private List numbers; + + } + + public static class ModelDateList { + + @Formatter(subClazz = Date.class, value = "yyyyMMdd") + @ExtractBy("//li[@class='dates']/text()") + private List dates; + + } + + @Test + public void testXpath() throws Exception { + ModelDateStr modelDate = (ModelDateStr) PageModelExtractor.create(ModelDateStr.class).process(pageMocker.getMockPage()); + assertThat(modelDate.dateStr).isEqualTo("20170603"); + } + + @Test + public void testExtractDate() throws Exception { + ModelDate modelDate = (ModelDate) PageModelExtractor.create(ModelDate.class).process(pageMocker.getMockPage()); + assertThat(DateFormatUtils.format(modelDate.date,"yyyyMMdd")).isEqualTo("20170603"); + } + + @Test + public void testExtractInt() throws Exception { + ModelInt modelDate = (ModelInt) PageModelExtractor.create(ModelInt.class).process(pageMocker.getMockPage()); + assertThat(modelDate.number).isEqualTo(12); + } + + @Test + public void testExtractList() throws Exception { + ModelStringList modelDate = (ModelStringList) PageModelExtractor.create(ModelStringList.class).process(pageMocker.getMockPage()); + assertThat(modelDate.links).hasSize(8); + } + + @Test + public void testExtractIntList() throws Exception { + ModelIntList modelDate = (ModelIntList) PageModelExtractor.create(ModelIntList.class).process(pageMocker.getMockPage()); + assertThat(modelDate.numbers).hasSize(4); + } + + @Test + public void testExtractDateList() throws Exception { + ModelDateList modelDate = (ModelDateList) PageModelExtractor.create(ModelDateList.class).process(pageMocker.getMockPage()); + assertThat(modelDate.dates).hasSize(4); + } +} diff --git a/webmagic-extension/src/test/resources/html/mock-webmagic.html b/webmagic-extension/src/test/resources/html/mock-webmagic.html index d6039f9..ed09a13 100644 --- a/webmagic-extension/src/test/resources/html/mock-webmagic.html +++ b/webmagic-extension/src/test/resources/html/mock-webmagic.html @@ -6,6 +6,7 @@
20170603
+
12
  • @@ -18,6 +19,17 @@
- +
    +
  • 1
  • +
  • 2
  • +
  • 3
  • +
  • 4
  • +
+
    +
  • 20170601
  • +
  • 20170602
  • +
  • 20170603
  • +
  • 20170604
  • +
\ No newline at end of file