change formatter to class
parent
b18216245b
commit
250cc5e662
|
@ -1,10 +1,6 @@
|
||||||
package us.codecraft.webmagic.model;
|
package us.codecraft.webmagic.example;
|
||||||
|
|
||||||
import junit.framework.Assert;
|
import us.codecraft.webmagic.model.HasKey;
|
||||||
import org.junit.Test;
|
|
||||||
import us.codecraft.webmagic.MockDownloader;
|
|
||||||
import us.codecraft.webmagic.Site;
|
|
||||||
import us.codecraft.webmagic.Task;
|
|
||||||
import us.codecraft.webmagic.model.annotation.ExtractBy;
|
import us.codecraft.webmagic.model.annotation.ExtractBy;
|
||||||
import us.codecraft.webmagic.model.annotation.ExtractByUrl;
|
import us.codecraft.webmagic.model.annotation.ExtractByUrl;
|
||||||
import us.codecraft.webmagic.model.annotation.HelpUrl;
|
import us.codecraft.webmagic.model.annotation.HelpUrl;
|
||||||
|
@ -25,10 +21,10 @@ public class GithubRepo implements HasKey {
|
||||||
@ExtractByUrl("https://github\\.com/(\\w+)/.*")
|
@ExtractByUrl("https://github\\.com/(\\w+)/.*")
|
||||||
private String author;
|
private String author;
|
||||||
|
|
||||||
@ExtractBy("//div[@id='readme']")
|
@ExtractBy("//div[@id='readme']/tidyText()")
|
||||||
private String readme;
|
private String readme;
|
||||||
|
|
||||||
@ExtractBy(value = "//div[@class='repository-lang-stats']//li//span[@class='lang']", multi = true)
|
@ExtractBy(value = "//div[@class='repository-lang-stats']//li//span[@class='lang']/text()", multi = true)
|
||||||
private List<String> language;
|
private List<String> language;
|
||||||
|
|
||||||
@ExtractBy("//ul[@class='pagehead-actions']/li[2]//a[@class='social-count js-social-count']/text()")
|
@ExtractBy("//ul[@class='pagehead-actions']/li[2]//a[@class='social-count js-social-count']/text()")
|
||||||
|
@ -40,18 +36,6 @@ public class GithubRepo implements HasKey {
|
||||||
@ExtractByUrl
|
@ExtractByUrl
|
||||||
private String url;
|
private String url;
|
||||||
|
|
||||||
@Test
|
|
||||||
public void test() {
|
|
||||||
OOSpider.create(Site.me().addStartUrl("https://github.com/code4craft/webmagic").setSleepTime(0)
|
|
||||||
, new PageModelPipeline<GithubRepo>() {
|
|
||||||
@Override
|
|
||||||
public void process(GithubRepo o, Task task) {
|
|
||||||
Assert.assertEquals(78, o.getStar());
|
|
||||||
Assert.assertEquals(65, o.getFork());
|
|
||||||
}
|
|
||||||
}, GithubRepo.class).setDownloader(new MockDownloader()).test("https://github.com/code4craft/webmagic");
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String key() {
|
public String key() {
|
||||||
return author + ":" + name;
|
return author + ":" + name;
|
|
@ -105,15 +105,15 @@ class PageModelExtractor {
|
||||||
Formatter formatter = field.getAnnotation(Formatter.class);
|
Formatter formatter = field.getAnnotation(Formatter.class);
|
||||||
if (formatter != null) {
|
if (formatter != null) {
|
||||||
if (!formatter.formatter().equals(ObjectFormatter.class)) {
|
if (!formatter.formatter().equals(ObjectFormatter.class)) {
|
||||||
return initFormatter(formatter);
|
return initFormatter(formatter.formatter());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return ObjectFormatters.get(fieldClazz);
|
return initFormatter(ObjectFormatters.get(fieldClazz));
|
||||||
}
|
}
|
||||||
|
|
||||||
private ObjectFormatter initFormatter(Formatter formatter) {
|
private ObjectFormatter initFormatter(Class<? extends ObjectFormatter> formatterClazz) {
|
||||||
try {
|
try {
|
||||||
return formatter.formatter().newInstance();
|
return formatterClazz.newInstance();
|
||||||
} catch (InstantiationException e) {
|
} catch (InstantiationException e) {
|
||||||
logger.error("init ObjectFormatter fail", e);
|
logger.error("init ObjectFormatter fail", e);
|
||||||
} catch (IllegalAccessException e) {
|
} catch (IllegalAccessException e) {
|
||||||
|
|
|
@ -25,9 +25,9 @@ public abstract class BasicTypeFormatter<T> implements ObjectFormatter<T> {
|
||||||
|
|
||||||
protected abstract T formatTrimmed(String raw) throws Exception;
|
protected abstract T formatTrimmed(String raw) throws Exception;
|
||||||
|
|
||||||
public static final List<ObjectFormatter> basicTypeFormatters = Arrays.<ObjectFormatter>asList(new IntegerFormatter(),
|
public static final List<Class<? extends ObjectFormatter>> basicTypeFormatters = Arrays.<Class<? extends ObjectFormatter>>asList(IntegerFormatter.class,
|
||||||
new LongFormatter(), new DoubleFormatter(), new FloatFormatter(), new ShortFormatter(),
|
LongFormatter.class, DoubleFormatter.class, FloatFormatter.class, ShortFormatter.class,
|
||||||
new CharactorFormatter(), new ByteFormatter(), new BooleanFormatter());
|
CharactorFormatter.class, ByteFormatter.class, BooleanFormatter.class);
|
||||||
|
|
||||||
public static Class<?> detectBasicClass(Class<?> type) {
|
public static Class<?> detectBasicClass(Class<?> type) {
|
||||||
if (type.equals(Integer.TYPE) || type.equals(Integer.class)) {
|
if (type.equals(Integer.TYPE) || type.equals(Integer.class)) {
|
||||||
|
|
|
@ -9,19 +9,26 @@ import java.util.concurrent.ConcurrentHashMap;
|
||||||
*/
|
*/
|
||||||
public class ObjectFormatters {
|
public class ObjectFormatters {
|
||||||
|
|
||||||
private static Map<Class, ObjectFormatter> formatterMap = new ConcurrentHashMap<Class, ObjectFormatter>();
|
private static Map<Class, Class<? extends ObjectFormatter>> formatterMap = new ConcurrentHashMap<Class, Class<? extends ObjectFormatter>>();
|
||||||
|
|
||||||
static {
|
static {
|
||||||
for (ObjectFormatter basicTypeFormatter : BasicTypeFormatter.basicTypeFormatters) {
|
for (Class<? extends ObjectFormatter> basicTypeFormatter : BasicTypeFormatter.basicTypeFormatters) {
|
||||||
put(basicTypeFormatter);
|
put(basicTypeFormatter);
|
||||||
}
|
}
|
||||||
|
put(DateFormatter.class);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Registers a formatter class in {@code formatterMap}.
 * <p>
 * A temporary instance is created through the no-argument constructor only to ask it,
 * via {@code clazz()}, which target type it handles. That type becomes the map key and the
 * formatter class itself (not the instance) is stored as the value.
 * If the instance cannot be created, the stack trace is printed and nothing is registered.
 *
 * @param objectFormatter formatter class to register; {@code newInstance()} requires it to
 *                        have an accessible no-argument constructor
 */
public static void put(Class<? extends ObjectFormatter> objectFormatter) {
|
||||||
|
try {
|
||||||
|
// Instantiate once just to learn the handled type; the instance is discarded afterwards.
formatterMap.put(objectFormatter.newInstance().clazz(), objectFormatter);
|
||||||
|
} catch (InstantiationException e) {
|
||||||
|
// NOTE(review): failure is only printed to stderr and the formatter stays unregistered;
// consider reporting it through a logger if one is available in this class (confirm).
e.printStackTrace();
|
||||||
|
} catch (IllegalAccessException e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public static void put(ObjectFormatter objectFormatter) {
|
public static Class<? extends ObjectFormatter> get(Class<?> clazz){
|
||||||
formatterMap.put(objectFormatter.clazz(), objectFormatter);
|
|
||||||
}
|
|
||||||
|
|
||||||
public static <T> ObjectFormatter<T> get(Class<T> clazz){
|
|
||||||
return formatterMap.get(clazz);
|
return formatterMap.get(clazz);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,26 @@
|
||||||
|
package us.codecraft.webmagic.model;
|
||||||
|
|
||||||
|
import junit.framework.Assert;
|
||||||
|
import org.junit.Test;
|
||||||
|
import us.codecraft.webmagic.MockDownloader;
|
||||||
|
import us.codecraft.webmagic.Site;
|
||||||
|
import us.codecraft.webmagic.Task;
|
||||||
|
import us.codecraft.webmagic.example.GithubRepo;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author code4crafter@gmail.com <br>
|
||||||
|
*/
|
||||||
|
public class GithubRepoTest {
|
||||||
|
|
||||||
|
/**
 * Feeds the webmagic GitHub project page through {@code OOSpider} into a
 * {@code GithubRepo} model and checks the extracted star and fork counts.
 * A {@code MockDownloader} is installed via {@code setDownloader}
 * (presumably serving a canned copy of the page so the counts stay fixed; TODO confirm).
 */
@Test
|
||||||
|
public void test() {
|
||||||
|
OOSpider.create(Site.me().addStartUrl("https://github.com/code4craft/webmagic").setSleepTime(0)
|
||||||
|
, new PageModelPipeline<GithubRepo>() {
|
||||||
|
@Override
|
||||||
|
public void process(GithubRepo o, Task task) {
|
||||||
|
// Hard-coded expectations; presumably they mirror the page content MockDownloader supplies.
Assert.assertEquals(78, o.getStar());
|
||||||
|
Assert.assertEquals(65, o.getFork());
|
||||||
|
}
|
||||||
|
}, GithubRepo.class).setDownloader(new MockDownloader()).test("https://github.com/code4craft/webmagic");
|
||||||
|
}
|
||||||
|
}
|
|
@ -1,19 +1,19 @@
|
||||||
package us.codecraft.webmagic.model.samples;
|
package us.codecraft.webmagic.model.samples;
|
||||||
|
|
||||||
import us.codecraft.webmagic.Site;
|
import us.codecraft.webmagic.Site;
|
||||||
import us.codecraft.webmagic.model.HasKey;
|
|
||||||
import us.codecraft.webmagic.model.OOSpider;
|
import us.codecraft.webmagic.model.OOSpider;
|
||||||
import us.codecraft.webmagic.model.annotation.ExtractBy;
|
import us.codecraft.webmagic.model.annotation.ExtractBy;
|
||||||
import us.codecraft.webmagic.model.annotation.TargetUrl;
|
import us.codecraft.webmagic.model.annotation.TargetUrl;
|
||||||
import us.codecraft.webmagic.pipeline.JsonFilePageModelPipeline;
|
import us.codecraft.webmagic.pipeline.JsonFilePageModelPipeline;
|
||||||
|
|
||||||
|
import java.util.Date;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
*/
|
*/
|
||||||
@TargetUrl("http://my.oschina.net/flashsword/blog/\\d+")
|
@TargetUrl("http://my.oschina.net/flashsword/blog/\\d+")
|
||||||
public class OschinaBlog implements HasKey{
|
public class OschinaBlog{
|
||||||
|
|
||||||
@ExtractBy("//title")
|
@ExtractBy("//title")
|
||||||
private String title;
|
private String title;
|
||||||
|
@ -24,16 +24,14 @@ public class OschinaBlog implements HasKey{
|
||||||
@ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true)
|
@ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true)
|
||||||
private List<String> tags;
|
private List<String> tags;
|
||||||
|
|
||||||
|
/**
 * Date/time taken from the div whose {@code class} attribute is {@code BlogStat}; the regex
 * keeps the first token shaped like {@code 2013-8-12 9:05}.
 * The predicate must test the attribute ({@code @class}), as the {@code BlogTags} selector in
 * this class does. A bare {@code class} would look for a child element named "class" and
 * match nothing.
 * NOTE(review): conversion of the matched text to {@link Date} presumably relies on the
 * DateFormatter registered in ObjectFormatters; confirm its accepted pattern.
 */
@ExtractBy("//div[@class='BlogStat']/regex('\\d{4}-\\d{1,2}-\\d{1,2} \\d{1,2}:\\d{1,2}')")
|
||||||
|
private Date date;
|
||||||
|
|
||||||
public static void main(String[] args) {
|
public static void main(String[] args) {
|
||||||
OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog")
|
OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog")
|
||||||
,new JsonFilePageModelPipeline(), OschinaBlog.class).run();
|
,new JsonFilePageModelPipeline(), OschinaBlog.class).run();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
|
||||||
public String key() {
|
|
||||||
return title;
|
|
||||||
}
|
|
||||||
|
|
||||||
public String getTitle() {
|
public String getTitle() {
|
||||||
return title;
|
return title;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue