diff --git a/pom.xml b/pom.xml index a7eb02b..8ba03ce 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ 7 us.codecraft - 0.4.3-SNAPSHOT + 0.5.0-SNAPSHOT 4.0.0 pom @@ -76,6 +76,16 @@ guava 15.0 + + org.slf4j + slf4j-api + 1.7.6 + + + org.slf4j + slf4j-log4j12 + 1.7.6 + us.codecraft xsoup diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 914bfda..e64b865 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.4.3-SNAPSHOT + 0.5.0-SNAPSHOT 4.0.0 @@ -23,7 +23,6 @@ com.google.guava guava - 15.0 @@ -37,8 +36,13 @@ - log4j - log4j + org.slf4j + slf4j-api + + + + org.slf4j + slf4j-log4j12 diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index a6b8dac..6a6b956 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -2,7 +2,8 @@ package us.codecraft.webmagic; import com.google.common.collect.Lists; import org.apache.commons.collections.CollectionUtils; -import org.apache.log4j.Logger; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import us.codecraft.webmagic.downloader.Downloader; import us.codecraft.webmagic.downloader.HttpClientDownloader; import us.codecraft.webmagic.pipeline.CollectorPipeline; @@ -18,7 +19,10 @@ import us.codecraft.webmagic.utils.UrlUtils; import java.io.Closeable; import java.io.IOException; -import java.util.*; +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; +import java.util.UUID; import java.util.concurrent.ExecutorService; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicLong; @@ -72,7 +76,7 @@ public class Spider implements Runnable, Task { protected Scheduler scheduler = new QueueScheduler(); - protected Logger logger = Logger.getLogger(getClass()); + protected Logger logger = LoggerFactory.getLogger(getClass()); protected ExecutorService executorService; diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index c11f0f1..cd8c12f 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.4.3-SNAPSHOT + 0.5.0-SNAPSHOT 4.0.0 diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/Inject.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/Inject.java new file mode 100644 index 0000000..c6608ae --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/Inject.java @@ -0,0 +1,15 @@ +package us.codecraft.webmagic.configurable; + +import java.lang.annotation.ElementType; +import java.lang.annotation.Retention; +import java.lang.annotation.Target; + +/** + * @author yihua.huang@dianping.com + */ +@Retention(java.lang.annotation.RetentionPolicy.RUNTIME) +@Target({ElementType.FIELD}) +public @interface Inject { + + String value() default ""; +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/ConfigurableBlogPageProcesser.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/ConfigurableBlogPageProcesser.java new file mode 100644 index 0000000..f5992a4 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/ConfigurableBlogPageProcesser.java @@ -0,0 +1,51 @@ +package us.codecraft.webmagic.example; + +import java.util.List; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.configurable.Inject; +import us.codecraft.webmagic.processor.PageProcessor; + +/** + * @author code4crafter@gmail.com
+ */ +public class ConfigurableBlogPageProcesser implements PageProcessor { + + private Site site = Site.me().setDomain("my.oschina.net"); + + @Inject("linkRegex") + private String linkRegex; + + @Inject("titleXpath") + private String titleXpath; + + @Inject("contentXpath") + private String contentXpath; + + @Inject("tagsXpath") + private String tagsXpath; + + @Override + public void process(Page page) { + List links = page.getHtml().links().regex(linkRegex).all(); + page.addTargetRequests(links); + page.putField("title", page.getHtml().xpath(titleXpath).toString()); + if (page.getResultItems().get("title") == null) { + //skip this page + page.setSkip(true); + } + page.putField("content", page.getHtml().smartContent().toString()); + page.putField("tags", page.getHtml().xpath(tagsXpath).all()); + } + + @Override + public Site getSite() { + return site; + + } + + public static void main(String[] args) { + Spider.create(new ConfigurableBlogPageProcesser()).addUrl("http://my.oschina.net/flashsword/blog").thread(2).run(); + } +} diff --git a/webmagic-lucene/pom.xml b/webmagic-lucene/pom.xml index 5502ca3..d8b8bc9 100644 --- a/webmagic-lucene/pom.xml +++ b/webmagic-lucene/pom.xml @@ -5,7 +5,7 @@ webmagic-parent us.codecraft - 0.4.3-SNAPSHOT + 0.5.0-SNAPSHOT 4.0.0 diff --git a/webmagic-panel/pom.xml b/webmagic-panel/pom.xml index 98f9fb3..3b0b682 100644 --- a/webmagic-panel/pom.xml +++ b/webmagic-panel/pom.xml @@ -5,19 +5,18 @@ webmagic-parent us.codecraft - 0.4.3-SNAPSHOT + 0.5.0-SNAPSHOT 4.0.0 us.codecraft webmagic-panel - 0.4.3-SNAPSHOT us.codecraft webmagic-scripts - 0.4.3-SNAPSHOT + ${project.version} diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index 58f79de..c2b4b93 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -5,7 +5,7 @@ webmagic-parent us.codecraft - 0.4.3-SNAPSHOT + 0.5.0-SNAPSHOT 4.0.0 diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index d17335e..94bb1d0 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -5,7 +5,7 @@ webmagic-parent us.codecraft - 0.4.3-SNAPSHOT + 0.5.0-SNAPSHOT 4.0.0 diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index 1c65513..5c21160 100644 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.4.3-SNAPSHOT + 0.5.0-SNAPSHOT 4.0.0 diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index fb33895..d73417b 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -5,7 +5,7 @@ webmagic-parent us.codecraft - 0.4.3-SNAPSHOT + 0.5.0-SNAPSHOT 4.0.0 diff --git a/webmagic-worker/pom.xml b/webmagic-worker/pom.xml index f3bddf8..f1059db 100644 --- a/webmagic-worker/pom.xml +++ b/webmagic-worker/pom.xml @@ -5,20 +5,19 @@ webmagic-parent us.codecraft - 0.4.3-SNAPSHOT + 0.5.0-SNAPSHOT 4.0.0 us.codecraft webmagic-worker - 0.4.3-SNAPSHOT war us.codecraft webmagic-scripts - 0.4.3-SNAPSHOT + ${project.version}