diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml new file mode 100644 index 0000000..607eb13 --- /dev/null +++ b/webmagic-core/pom.xml @@ -0,0 +1,105 @@ + + + us.codecraft + 0.0.1-SNAPSHOT + 4.0.0 + + webmagic-core + + + + org.apache.httpcomponents + httpclient + 4.2.1 + + + + junit + junit + 4.7 + test + + + + com.google.guava + guava + 13.0.1 + + + + org.apache.commons + commons-lang3 + 3.1 + + + + log4j + log4j + 1.2.17 + + + + commons-collections + commons-collections + 3.2.1 + + + + net.sourceforge.htmlcleaner + htmlcleaner + 2.4 + + + + org.apache.commons + commons-io + 1.3.2 + + + + + + + + org.apache.maven.plugins + maven-resources-plugin + + UTF-8 + + + + org.apache.maven.plugins + maven-source-plugin + + + attach-sources + + jar + + + + + + org.apache.maven.plugins + maven-javadoc-plugin + + + attach-javadocs + + jar + + + + + + org.apache.maven.plugins + maven-release-plugin + 2.0-beta-7 + + + + + + \ No newline at end of file diff --git a/src/main/java/us/codecraft/spider/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java similarity index 94% rename from src/main/java/us/codecraft/spider/Page.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/Page.java index 1f96e58..8f1a4c7 100644 --- a/src/main/java/us/codecraft/spider/Page.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java @@ -1,8 +1,8 @@ -package us.codecraft.spider; +package us.codecraft.webmagic; import org.apache.commons.lang3.StringUtils; -import us.codecraft.spider.selector.Selectable; -import us.codecraft.spider.utils.UrlUtils; +import us.codecraft.webmagic.selector.Selectable; +import us.codecraft.webmagic.utils.UrlUtils; import java.util.ArrayList; import java.util.List; diff --git a/src/main/java/us/codecraft/spider/Request.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java similarity index 80% rename from src/main/java/us/codecraft/spider/Request.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/Request.java index 4446c16..ccefc7f 100644 --- a/src/main/java/us/codecraft/spider/Request.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java @@ -1,8 +1,4 @@ -package us.codecraft.spider; - -import us.codecraft.spider.Site; - -import java.util.List; +package us.codecraft.webmagic; /** * User: cairne diff --git a/src/main/java/us/codecraft/spider/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java similarity index 98% rename from src/main/java/us/codecraft/spider/Site.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/Site.java index 6f27a22..4c032aa 100644 --- a/src/main/java/us/codecraft/spider/Site.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java @@ -1,4 +1,4 @@ -package us.codecraft.spider; +package us.codecraft.webmagic; import java.util.HashSet; import java.util.Set; diff --git a/src/main/java/us/codecraft/spider/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java similarity index 81% rename from src/main/java/us/codecraft/spider/Spider.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 81ecd9d..835bdf4 100644 --- a/src/main/java/us/codecraft/spider/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -1,17 +1,14 @@ -package us.codecraft.spider; +package us.codecraft.webmagic; import org.apache.commons.collections.CollectionUtils; import org.apache.log4j.Logger; -import us.codecraft.spider.downloader.Downloader; -import us.codecraft.spider.downloader.HttpClientDownloader; -import us.codecraft.spider.pipeline.ConsolePipeline; -import us.codecraft.spider.pipeline.Pipeline; -import us.codecraft.spider.processor.PageProcessor; -import us.codecraft.spider.schedular.QueueSchedular; -import us.codecraft.spider.schedular.Schedular; - -import java.util.Map; -import java.util.concurrent.ConcurrentHashMap; +import us.codecraft.webmagic.downloader.Downloader; +import us.codecraft.webmagic.downloader.HttpClientDownloader; +import us.codecraft.webmagic.pipeline.ConsolePipeline; +import us.codecraft.webmagic.pipeline.Pipeline; +import us.codecraft.webmagic.processor.PageProcessor; +import us.codecraft.webmagic.schedular.QueueSchedular; +import us.codecraft.webmagic.schedular.Schedular; /** * User: cairne diff --git a/src/main/java/us/codecraft/spider/downloader/Downloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java similarity index 50% rename from src/main/java/us/codecraft/spider/downloader/Downloader.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java index d20bfbb..e847584 100644 --- a/src/main/java/us/codecraft/spider/downloader/Downloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java @@ -1,8 +1,8 @@ -package us.codecraft.spider.downloader; +package us.codecraft.webmagic.downloader; -import us.codecraft.spider.Page; -import us.codecraft.spider.Request; -import us.codecraft.spider.Site; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Site; /** * User: cairne diff --git a/src/main/java/us/codecraft/spider/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java similarity index 85% rename from src/main/java/us/codecraft/spider/downloader/HttpClientDownloader.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index c817fb6..269ba6d 100644 --- a/src/main/java/us/codecraft/spider/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -1,16 +1,16 @@ -package us.codecraft.spider.downloader; +package us.codecraft.webmagic.downloader; import org.apache.commons.io.IOUtils; import org.apache.http.HttpResponse; import org.apache.http.client.HttpClient; import org.apache.http.client.methods.HttpGet; import org.apache.log4j.Logger; -import us.codecraft.spider.Page; -import us.codecraft.spider.Request; -import us.codecraft.spider.Site; -import us.codecraft.spider.selector.Html; -import us.codecraft.spider.selector.PlainText; -import us.codecraft.spider.utils.UrlUtils; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.selector.Html; +import us.codecraft.webmagic.selector.PlainText; +import us.codecraft.webmagic.utils.UrlUtils; /** diff --git a/src/main/java/us/codecraft/spider/downloader/HttpClientPool.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java similarity index 96% rename from src/main/java/us/codecraft/spider/downloader/HttpClientPool.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java index 90696a6..4fdf421 100644 --- a/src/main/java/us/codecraft/spider/downloader/HttpClientPool.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java @@ -1,4 +1,4 @@ -package us.codecraft.spider.downloader; +package us.codecraft.webmagic.downloader; import org.apache.http.HttpVersion; import org.apache.http.client.HttpClient; @@ -10,7 +10,7 @@ import org.apache.http.conn.scheme.SchemeRegistry; import org.apache.http.impl.client.DefaultHttpClient; import org.apache.http.impl.conn.PoolingClientConnectionManager; import org.apache.http.params.*; -import us.codecraft.spider.Site; +import us.codecraft.webmagic.Site; /** * User: cairne diff --git a/src/main/java/us/codecraft/spider/pipeline/ConsolePipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java similarity index 73% rename from src/main/java/us/codecraft/spider/pipeline/ConsolePipeline.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java index 6aa8f09..4115b8c 100644 --- a/src/main/java/us/codecraft/spider/pipeline/ConsolePipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java @@ -1,8 +1,8 @@ -package us.codecraft.spider.pipeline; +package us.codecraft.webmagic.pipeline; -import us.codecraft.spider.Page; -import us.codecraft.spider.Site; -import us.codecraft.spider.selector.Selectable; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.selector.Selectable; import java.util.Map; diff --git a/src/main/java/us/codecraft/spider/pipeline/FilePipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java similarity index 87% rename from src/main/java/us/codecraft/spider/pipeline/FilePipeline.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java index 564a9fa..508b00e 100644 --- a/src/main/java/us/codecraft/spider/pipeline/FilePipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java @@ -1,10 +1,10 @@ -package us.codecraft.spider.pipeline; +package us.codecraft.webmagic.pipeline; import org.apache.commons.codec.digest.DigestUtils; import org.apache.commons.lang3.StringUtils; -import us.codecraft.spider.Page; -import us.codecraft.spider.Site; -import us.codecraft.spider.selector.Selectable; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.selector.Selectable; import java.io.File; import java.io.FileWriter; @@ -19,7 +19,7 @@ import java.util.Map; */ public class FilePipeline implements Pipeline { - private String path = "/data/temp/spider/"; + private String path = "/data/temp/webmagic/"; public FilePipeline(){ diff --git a/src/main/java/us/codecraft/spider/pipeline/Pipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/Pipeline.java similarity index 56% rename from src/main/java/us/codecraft/spider/pipeline/Pipeline.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/Pipeline.java index 549c70d..ef27cda 100644 --- a/src/main/java/us/codecraft/spider/pipeline/Pipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/Pipeline.java @@ -1,7 +1,7 @@ -package us.codecraft.spider.pipeline; +package us.codecraft.webmagic.pipeline; -import us.codecraft.spider.Page; -import us.codecraft.spider.Site; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; /** * User: cairne diff --git a/src/main/java/us/codecraft/spider/processor/PageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/PageProcessor.java similarity index 74% rename from src/main/java/us/codecraft/spider/processor/PageProcessor.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/processor/PageProcessor.java index b2617a9..22a24c9 100644 --- a/src/main/java/us/codecraft/spider/processor/PageProcessor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/PageProcessor.java @@ -1,7 +1,7 @@ -package us.codecraft.spider.processor; +package us.codecraft.webmagic.processor; -import us.codecraft.spider.Page; -import us.codecraft.spider.Site; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; /** * User: cairne diff --git a/src/main/java/us/codecraft/spider/processor/SimplePageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java similarity index 86% rename from src/main/java/us/codecraft/spider/processor/SimplePageProcessor.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java index 197ca87..a8165bb 100644 --- a/src/main/java/us/codecraft/spider/processor/SimplePageProcessor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java @@ -1,8 +1,8 @@ -package us.codecraft.spider.processor; +package us.codecraft.webmagic.processor; -import us.codecraft.spider.Page; -import us.codecraft.spider.Site; -import us.codecraft.spider.utils.UrlUtils; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.utils.UrlUtils; import java.util.List; diff --git a/src/main/java/us/codecraft/spider/schedular/FileCacheQueueSchedular.java b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueSchedular.java similarity index 97% rename from src/main/java/us/codecraft/spider/schedular/FileCacheQueueSchedular.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueSchedular.java index 3f3cf3c..0372d0e 100644 --- a/src/main/java/us/codecraft/spider/schedular/FileCacheQueueSchedular.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueSchedular.java @@ -1,9 +1,9 @@ -package us.codecraft.spider.schedular; +package us.codecraft.webmagic.schedular; import org.apache.commons.lang3.math.NumberUtils; import org.apache.log4j.Logger; -import us.codecraft.spider.Site; -import us.codecraft.spider.Request; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Request; import java.io.*; import java.util.LinkedHashSet; diff --git a/src/main/java/us/codecraft/spider/schedular/QueueSchedular.java b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/QueueSchedular.java similarity index 88% rename from src/main/java/us/codecraft/spider/schedular/QueueSchedular.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/schedular/QueueSchedular.java index 8182963..071f708 100644 --- a/src/main/java/us/codecraft/spider/schedular/QueueSchedular.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/QueueSchedular.java @@ -1,8 +1,8 @@ -package us.codecraft.spider.schedular; +package us.codecraft.webmagic.schedular; import org.apache.log4j.Logger; -import us.codecraft.spider.Request; -import us.codecraft.spider.Site; +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Site; import java.util.HashSet; import java.util.Set; diff --git a/src/main/java/us/codecraft/spider/schedular/Schedular.java b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/Schedular.java similarity index 61% rename from src/main/java/us/codecraft/spider/schedular/Schedular.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/schedular/Schedular.java index 246afb2..a5b71f5 100644 --- a/src/main/java/us/codecraft/spider/schedular/Schedular.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/Schedular.java @@ -1,7 +1,7 @@ -package us.codecraft.spider.schedular; +package us.codecraft.webmagic.schedular; -import us.codecraft.spider.Request; -import us.codecraft.spider.Site; +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Site; /** * User: cairne diff --git a/src/main/java/us/codecraft/spider/selector/Html.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java similarity index 95% rename from src/main/java/us/codecraft/spider/selector/Html.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java index 7bbb64a..c385ff9 100644 --- a/src/main/java/us/codecraft/spider/selector/Html.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java @@ -1,6 +1,4 @@ -package us.codecraft.spider.selector; - -import org.apache.commons.collections.CollectionUtils; +package us.codecraft.webmagic.selector; import java.util.ArrayList; import java.util.List; diff --git a/src/main/java/us/codecraft/spider/selector/PlainText.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java similarity index 98% rename from src/main/java/us/codecraft/spider/selector/PlainText.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java index 055cbda..91ab7ab 100644 --- a/src/main/java/us/codecraft/spider/selector/PlainText.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java @@ -1,4 +1,4 @@ -package us.codecraft.spider.selector; +package us.codecraft.webmagic.selector; import org.apache.commons.collections.CollectionUtils; diff --git a/src/main/java/us/codecraft/spider/selector/RegexResult.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexResult.java similarity index 91% rename from src/main/java/us/codecraft/spider/selector/RegexResult.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexResult.java index f3ab585..8b14e8b 100644 --- a/src/main/java/us/codecraft/spider/selector/RegexResult.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexResult.java @@ -1,4 +1,4 @@ -package us.codecraft.spider.selector; +package us.codecraft.webmagic.selector; /** * User: cairne diff --git a/src/main/java/us/codecraft/spider/selector/RegexSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java similarity index 98% rename from src/main/java/us/codecraft/spider/selector/RegexSelector.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java index 692c45e..a2e8b3d 100644 --- a/src/main/java/us/codecraft/spider/selector/RegexSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java @@ -1,4 +1,4 @@ -package us.codecraft.spider.selector; +package us.codecraft.webmagic.selector; import org.apache.commons.lang3.StringUtils; diff --git a/src/main/java/us/codecraft/spider/selector/ReplaceSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/ReplaceSelector.java similarity index 96% rename from src/main/java/us/codecraft/spider/selector/ReplaceSelector.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/selector/ReplaceSelector.java index ddf887e..5f78898 100644 --- a/src/main/java/us/codecraft/spider/selector/ReplaceSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/ReplaceSelector.java @@ -1,4 +1,4 @@ -package us.codecraft.spider.selector; +package us.codecraft.webmagic.selector; import java.util.List; import java.util.regex.Matcher; diff --git a/src/main/java/us/codecraft/spider/selector/Selectable.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java similarity index 97% rename from src/main/java/us/codecraft/spider/selector/Selectable.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java index 9f44c3c..0fcc420 100644 --- a/src/main/java/us/codecraft/spider/selector/Selectable.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java @@ -1,4 +1,4 @@ -package us.codecraft.spider.selector; +package us.codecraft.webmagic.selector; import java.util.List; diff --git a/src/main/java/us/codecraft/spider/selector/Selector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selector.java similarity index 83% rename from src/main/java/us/codecraft/spider/selector/Selector.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selector.java index f44ed0f..914e8ab 100644 --- a/src/main/java/us/codecraft/spider/selector/Selector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selector.java @@ -1,4 +1,4 @@ -package us.codecraft.spider.selector; +package us.codecraft.webmagic.selector; import java.util.List; diff --git a/src/main/java/us/codecraft/spider/selector/SelectorFactory.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SelectorFactory.java similarity index 98% rename from src/main/java/us/codecraft/spider/selector/SelectorFactory.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/selector/SelectorFactory.java index d479706..af19969 100644 --- a/src/main/java/us/codecraft/spider/selector/SelectorFactory.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SelectorFactory.java @@ -1,4 +1,4 @@ -package us.codecraft.spider.selector; +package us.codecraft.webmagic.selector; import org.apache.commons.lang3.StringUtils; diff --git a/src/main/java/us/codecraft/spider/selector/SmartContentSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java similarity index 98% rename from src/main/java/us/codecraft/spider/selector/SmartContentSelector.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java index b87a0a3..c2e36df 100644 --- a/src/main/java/us/codecraft/spider/selector/SmartContentSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java @@ -1,4 +1,4 @@ -package us.codecraft.spider.selector; +package us.codecraft.webmagic.selector; import org.apache.log4j.Logger; import org.htmlcleaner.HtmlCleaner; diff --git a/src/main/java/us/codecraft/spider/selector/XpathSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java similarity index 98% rename from src/main/java/us/codecraft/spider/selector/XpathSelector.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java index 6d9a109..6de2f08 100644 --- a/src/main/java/us/codecraft/spider/selector/XpathSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java @@ -1,4 +1,4 @@ -package us.codecraft.spider.selector; +package us.codecraft.webmagic.selector; import org.htmlcleaner.*; diff --git a/src/main/java/us/codecraft/spider/utils/UrlUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java similarity index 98% rename from src/main/java/us/codecraft/spider/utils/UrlUtils.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java index fdaa419..124ca64 100644 --- a/src/main/java/us/codecraft/spider/utils/UrlUtils.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java @@ -1,4 +1,4 @@ -package us.codecraft.spider.utils; +package us.codecraft.webmagic.utils; import org.apache.commons.lang3.StringUtils; diff --git a/src/main/resources/log4j.xml b/webmagic-core/src/main/resources/log4j.xml similarity index 100% rename from src/main/resources/log4j.xml rename to webmagic-core/src/main/resources/log4j.xml diff --git a/src/test/java/us/codecraft/spider/HtmlTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/HtmlTest.java similarity index 81% rename from src/test/java/us/codecraft/spider/HtmlTest.java rename to webmagic-core/src/test/java/us/codecraft/webmagic/HtmlTest.java index 0612d81..f799098 100644 --- a/src/test/java/us/codecraft/spider/HtmlTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/HtmlTest.java @@ -1,8 +1,8 @@ -package us.codecraft.spider; +package us.codecraft.webmagic; import org.junit.Assert; import org.junit.Test; -import us.codecraft.spider.selector.Html; +import us.codecraft.webmagic.selector.Html; /** * User: cairne diff --git a/src/test/java/us/codecraft/spider/SpiderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/SpiderTest.java similarity index 88% rename from src/test/java/us/codecraft/spider/SpiderTest.java rename to webmagic-core/src/test/java/us/codecraft/webmagic/SpiderTest.java index 5c08b84..5cb9848 100644 --- a/src/test/java/us/codecraft/spider/SpiderTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/SpiderTest.java @@ -1,11 +1,11 @@ -package us.codecraft.spider; +package us.codecraft.webmagic; import org.junit.Ignore; import org.junit.Test; -import us.codecraft.spider.pipeline.FilePipeline; -import us.codecraft.spider.processor.SimplePageProcessor; -import us.codecraft.spider.samples.HuxiuProcessor; -import us.codecraft.spider.schedular.FileCacheQueueSchedular; +import us.codecraft.webmagic.pipeline.FilePipeline; +import us.codecraft.webmagic.processor.SimplePageProcessor; +import us.codecraft.webmagic.samples.HuxiuProcessor; +import us.codecraft.webmagic.schedular.FileCacheQueueSchedular; /** * User: cairne @@ -24,12 +24,12 @@ public class SpiderTest { @Test public void testGlobalSpider(){ // PageProcessor pageProcessor = new MeicanProcessor(); -// Spider.me().pipeline(new FilePipeline()).schedular(new FileCacheQueueSchedular(pageProcessor.getSite(),"/data/temp/spider/cache/")). +// Spider.me().pipeline(new FilePipeline()).schedular(new FileCacheQueueSchedular(pageProcessor.getSite(),"/data/temp/webmagic/cache/")). // processor(pageProcessor).run(); SimplePageProcessor pageProcessor2 = new SimplePageProcessor("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space", "http://www.diaoyuweng.com/thread-*-1-1.html"); pageProcessor2.getSite().setEncoding("GBK"); System.out.println(pageProcessor2.getSite().getEncoding()); - Spider.me().pipeline(new FilePipeline()).schedular(new FileCacheQueueSchedular(pageProcessor2.getSite(),"/data/temp/spider/cache/")). + Spider.me().pipeline(new FilePipeline()).schedular(new FileCacheQueueSchedular(pageProcessor2.getSite(),"/data/temp/webmagic/cache/")). processor(pageProcessor2).run(); diff --git a/src/test/java/us/codecraft/spider/samples/DiandianBlogProcessor.java b/webmagic-core/src/test/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java similarity index 84% rename from src/test/java/us/codecraft/spider/samples/DiandianBlogProcessor.java rename to webmagic-core/src/test/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java index c735cda..efd1ff7 100644 --- a/src/test/java/us/codecraft/spider/samples/DiandianBlogProcessor.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java @@ -1,8 +1,8 @@ -package us.codecraft.spider.samples; +package us.codecraft.webmagic.samples; -import us.codecraft.spider.Site; -import us.codecraft.spider.Page; -import us.codecraft.spider.processor.PageProcessor; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.processor.PageProcessor; import java.util.List; diff --git a/src/test/java/us/codecraft/spider/samples/DianpingBlogProcessor.java b/webmagic-core/src/test/java/us/codecraft/webmagic/samples/DianpingBlogProcessor.java similarity index 86% rename from src/test/java/us/codecraft/spider/samples/DianpingBlogProcessor.java rename to webmagic-core/src/test/java/us/codecraft/webmagic/samples/DianpingBlogProcessor.java index f041a32..dd601ad 100644 --- a/src/test/java/us/codecraft/spider/samples/DianpingBlogProcessor.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/samples/DianpingBlogProcessor.java @@ -1,8 +1,8 @@ -package us.codecraft.spider.samples; +package us.codecraft.webmagic.samples; -import us.codecraft.spider.Site; -import us.codecraft.spider.Page; -import us.codecraft.spider.processor.PageProcessor; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.processor.PageProcessor; import java.util.List; diff --git a/src/test/java/us/codecraft/spider/samples/DiaoyuwengProcessor.java b/webmagic-core/src/test/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java similarity index 86% rename from src/test/java/us/codecraft/spider/samples/DiaoyuwengProcessor.java rename to webmagic-core/src/test/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java index 586ec01..05b68b6 100644 --- a/src/test/java/us/codecraft/spider/samples/DiaoyuwengProcessor.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java @@ -1,8 +1,8 @@ -package us.codecraft.spider.samples; +package us.codecraft.webmagic.samples; -import us.codecraft.spider.Page; -import us.codecraft.spider.Site; -import us.codecraft.spider.processor.PageProcessor; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.processor.PageProcessor; import java.util.List; diff --git a/src/test/java/us/codecraft/spider/samples/F58PageProcesser.java b/webmagic-core/src/test/java/us/codecraft/webmagic/samples/F58PageProcesser.java similarity index 81% rename from src/test/java/us/codecraft/spider/samples/F58PageProcesser.java rename to webmagic-core/src/test/java/us/codecraft/webmagic/samples/F58PageProcesser.java index a5ce6cb..78211c4 100644 --- a/src/test/java/us/codecraft/spider/samples/F58PageProcesser.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/samples/F58PageProcesser.java @@ -1,8 +1,8 @@ -package us.codecraft.spider.samples; +package us.codecraft.webmagic.samples; -import us.codecraft.spider.Site; -import us.codecraft.spider.Page; -import us.codecraft.spider.processor.PageProcessor; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.processor.PageProcessor; import java.util.List; diff --git a/src/test/java/us/codecraft/spider/samples/HuxiuProcessor.java b/webmagic-core/src/test/java/us/codecraft/webmagic/samples/HuxiuProcessor.java similarity index 84% rename from src/test/java/us/codecraft/spider/samples/HuxiuProcessor.java rename to webmagic-core/src/test/java/us/codecraft/webmagic/samples/HuxiuProcessor.java index d7d1a6e..82552f9 100644 --- a/src/test/java/us/codecraft/spider/samples/HuxiuProcessor.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/samples/HuxiuProcessor.java @@ -1,8 +1,8 @@ -package us.codecraft.spider.samples; +package us.codecraft.webmagic.samples; -import us.codecraft.spider.Site; -import us.codecraft.spider.Page; -import us.codecraft.spider.processor.PageProcessor; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.processor.PageProcessor; import java.util.List; diff --git a/src/test/java/us/codecraft/spider/samples/KaichibaProcessor.java b/webmagic-core/src/test/java/us/codecraft/webmagic/samples/KaichibaProcessor.java similarity index 84% rename from src/test/java/us/codecraft/spider/samples/KaichibaProcessor.java rename to webmagic-core/src/test/java/us/codecraft/webmagic/samples/KaichibaProcessor.java index 5985803..58a2cb8 100644 --- a/src/test/java/us/codecraft/spider/samples/KaichibaProcessor.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/samples/KaichibaProcessor.java @@ -1,8 +1,8 @@ -package us.codecraft.spider.samples; +package us.codecraft.webmagic.samples; -import us.codecraft.spider.Page; -import us.codecraft.spider.Site; -import us.codecraft.spider.processor.PageProcessor; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.processor.PageProcessor; /** * User: cairne diff --git a/src/test/java/us/codecraft/spider/samples/MeicanProcessor.java b/webmagic-core/src/test/java/us/codecraft/webmagic/samples/MeicanProcessor.java similarity index 88% rename from src/test/java/us/codecraft/spider/samples/MeicanProcessor.java rename to webmagic-core/src/test/java/us/codecraft/webmagic/samples/MeicanProcessor.java index 3d15cd2..637aec1 100644 --- a/src/test/java/us/codecraft/spider/samples/MeicanProcessor.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/samples/MeicanProcessor.java @@ -1,8 +1,8 @@ -package us.codecraft.spider.samples; +package us.codecraft.webmagic.samples; -import us.codecraft.spider.Page; -import us.codecraft.spider.Site; -import us.codecraft.spider.processor.PageProcessor; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.processor.PageProcessor; import java.util.List; diff --git a/src/test/java/us/codecraft/spider/samples/NjuBBSProcessor.java b/webmagic-core/src/test/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java similarity index 83% rename from src/test/java/us/codecraft/spider/samples/NjuBBSProcessor.java rename to webmagic-core/src/test/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java index fa22eed..ca46de6 100644 --- a/src/test/java/us/codecraft/spider/samples/NjuBBSProcessor.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java @@ -1,8 +1,8 @@ -package us.codecraft.spider.samples; +package us.codecraft.webmagic.samples; -import us.codecraft.spider.Site; -import us.codecraft.spider.Page; -import us.codecraft.spider.processor.PageProcessor; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.processor.PageProcessor; import java.util.List; diff --git a/src/test/java/us/codecraft/spider/samples/OschinaBlogPageProcesser.java b/webmagic-core/src/test/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java similarity index 84% rename from src/test/java/us/codecraft/spider/samples/OschinaBlogPageProcesser.java rename to webmagic-core/src/test/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java index 97ced9b..2166d9b 100644 --- a/src/test/java/us/codecraft/spider/samples/OschinaBlogPageProcesser.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java @@ -1,8 +1,8 @@ -package us.codecraft.spider.samples; +package us.codecraft.webmagic.samples; -import us.codecraft.spider.Site; -import us.codecraft.spider.Page; -import us.codecraft.spider.processor.PageProcessor; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.processor.PageProcessor; import java.util.List; diff --git a/src/test/java/us/codecraft/spider/samples/OschinaPageProcesser.java b/webmagic-core/src/test/java/us/codecraft/webmagic/samples/OschinaPageProcesser.java similarity index 84% rename from src/test/java/us/codecraft/spider/samples/OschinaPageProcesser.java rename to webmagic-core/src/test/java/us/codecraft/webmagic/samples/OschinaPageProcesser.java index 0ebaab6..cdfbc1e 100644 --- a/src/test/java/us/codecraft/spider/samples/OschinaPageProcesser.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/samples/OschinaPageProcesser.java @@ -1,8 +1,8 @@ -package us.codecraft.spider.samples; +package us.codecraft.webmagic.samples; -import us.codecraft.spider.Site; -import us.codecraft.spider.Page; -import us.codecraft.spider.processor.PageProcessor; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.processor.PageProcessor; import java.util.List; diff --git a/src/test/java/us/codecraft/spider/samples/QzoneBlogProcessor.java b/webmagic-core/src/test/java/us/codecraft/webmagic/samples/QzoneBlogProcessor.java similarity index 88% rename from src/test/java/us/codecraft/spider/samples/QzoneBlogProcessor.java rename to webmagic-core/src/test/java/us/codecraft/webmagic/samples/QzoneBlogProcessor.java index 320494d..67ef671 100644 --- a/src/test/java/us/codecraft/spider/samples/QzoneBlogProcessor.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/samples/QzoneBlogProcessor.java @@ -1,8 +1,8 @@ -package us.codecraft.spider.samples; +package us.codecraft.webmagic.samples; -import us.codecraft.spider.Page; -import us.codecraft.spider.Site; -import us.codecraft.spider.processor.PageProcessor; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.processor.PageProcessor; import java.util.List; diff --git a/src/test/java/us/codecraft/spider/samples/SinaBlogProcesser.java b/webmagic-core/src/test/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java similarity index 85% rename from src/test/java/us/codecraft/spider/samples/SinaBlogProcesser.java rename to webmagic-core/src/test/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java index f1de3cc..b86fff8 100644 --- a/src/test/java/us/codecraft/spider/samples/SinaBlogProcesser.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java @@ -1,8 +1,8 @@ -package us.codecraft.spider.samples; +package us.codecraft.webmagic.samples; -import us.codecraft.spider.Site; -import us.codecraft.spider.Page; -import us.codecraft.spider.processor.PageProcessor; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.processor.PageProcessor; /** * User: cairne diff --git a/src/test/java/us/codecraft/spider/samples/TianyaPageProcesser.java b/webmagic-core/src/test/java/us/codecraft/webmagic/samples/TianyaPageProcesser.java similarity index 82% rename from src/test/java/us/codecraft/spider/samples/TianyaPageProcesser.java rename to webmagic-core/src/test/java/us/codecraft/webmagic/samples/TianyaPageProcesser.java index e39abc8..7a8920b 100644 --- a/src/test/java/us/codecraft/spider/samples/TianyaPageProcesser.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/samples/TianyaPageProcesser.java @@ -1,8 +1,8 @@ -package us.codecraft.spider.samples; +package us.codecraft.webmagic.samples; -import us.codecraft.spider.Site; -import us.codecraft.spider.Page; -import us.codecraft.spider.processor.PageProcessor; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.processor.PageProcessor; import java.util.List; diff --git a/src/test/java/us/codecraft/spider/selector/HtmlCleanerTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/HtmlCleanerTest.java similarity index 89% rename from src/test/java/us/codecraft/spider/selector/HtmlCleanerTest.java rename to webmagic-core/src/test/java/us/codecraft/webmagic/selector/HtmlCleanerTest.java index b3931ad..7aa2fc7 100644 --- a/src/test/java/us/codecraft/spider/selector/HtmlCleanerTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/HtmlCleanerTest.java @@ -1,4 +1,4 @@ -package us.codecraft.spider.selector; +package us.codecraft.webmagic.selector; import org.htmlcleaner.CleanerProperties; import org.htmlcleaner.HtmlCleaner; @@ -6,7 +6,6 @@ import org.htmlcleaner.TagNode; import org.junit.Test; import java.io.IOException; -import java.net.MalformedURLException; import java.net.URL; /** diff --git a/src/test/java/us/codecraft/spider/selector/RegexSelectorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/RegexSelectorTest.java similarity index 90% rename from src/test/java/us/codecraft/spider/selector/RegexSelectorTest.java rename to webmagic-core/src/test/java/us/codecraft/webmagic/selector/RegexSelectorTest.java index a53b5a9..6128f17 100644 --- a/src/test/java/us/codecraft/spider/selector/RegexSelectorTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/RegexSelectorTest.java @@ -1,4 +1,4 @@ -package us.codecraft.spider.selector; +package us.codecraft.webmagic.selector; import junit.framework.Assert; import org.junit.Test; diff --git a/src/test/java/us/codecraft/spider/selector/SmartConentSelectorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/SmartConentSelectorTest.java similarity index 99% rename from src/test/java/us/codecraft/spider/selector/SmartConentSelectorTest.java rename to webmagic-core/src/test/java/us/codecraft/webmagic/selector/SmartConentSelectorTest.java index 06c56b3..4620a24 100644 --- a/src/test/java/us/codecraft/spider/selector/SmartConentSelectorTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/SmartConentSelectorTest.java @@ -1,12 +1,8 @@ -package us.codecraft.spider.selector; +package us.codecraft.webmagic.selector; -import org.htmlcleaner.CleanerProperties; -import org.htmlcleaner.HtmlCleaner; -import org.htmlcleaner.TagNode; import org.junit.Test; import java.io.IOException; -import java.net.URL; /** * User: cairne diff --git a/src/test/java/us/codecraft/spider/selector/XpathSelectorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java similarity index 99% rename from src/test/java/us/codecraft/spider/selector/XpathSelectorTest.java rename to webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java index 24988f7..96ea6e8 100644 --- a/src/test/java/us/codecraft/spider/selector/XpathSelectorTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java @@ -1,4 +1,4 @@ -package us.codecraft.spider.selector; +package us.codecraft.webmagic.selector; import org.junit.Assert; import org.junit.Test; diff --git a/src/test/java/us/codecraft/spider/utils/UrlUtilsTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java similarity index 99% rename from src/test/java/us/codecraft/spider/utils/UrlUtilsTest.java rename to webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java index 305bad7..d424005 100644 --- a/src/test/java/us/codecraft/spider/utils/UrlUtilsTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java @@ -1,4 +1,4 @@ -package us.codecraft.spider.utils; +package us.codecraft.webmagic.utils; import org.junit.Assert; import org.junit.Test; diff --git a/src/test/resources/log4j.xml b/webmagic-core/src/test/resources/log4j.xml similarity index 100% rename from src/test/resources/log4j.xml rename to webmagic-core/src/test/resources/log4j.xml diff --git a/webmagic-core/target/classes/log4j.xml b/webmagic-core/target/classes/log4j.xml new file mode 100644 index 0000000..a6630f8 --- /dev/null +++ b/webmagic-core/target/classes/log4j.xml @@ -0,0 +1,26 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/webmagic-core/target/classes/us/codecraft/webmagic/Page.class b/webmagic-core/target/classes/us/codecraft/webmagic/Page.class new file mode 100644 index 0000000..c9abd01 Binary files /dev/null and b/webmagic-core/target/classes/us/codecraft/webmagic/Page.class differ diff --git a/webmagic-core/target/classes/us/codecraft/webmagic/Request.class b/webmagic-core/target/classes/us/codecraft/webmagic/Request.class new file mode 100644 index 0000000..e55141d Binary files /dev/null and b/webmagic-core/target/classes/us/codecraft/webmagic/Request.class differ diff --git a/webmagic-core/target/classes/us/codecraft/webmagic/Site.class b/webmagic-core/target/classes/us/codecraft/webmagic/Site.class new file mode 100644 index 0000000..902294a Binary files /dev/null and b/webmagic-core/target/classes/us/codecraft/webmagic/Site.class differ diff --git a/webmagic-core/target/classes/us/codecraft/webmagic/Spider.class b/webmagic-core/target/classes/us/codecraft/webmagic/Spider.class new file mode 100644 index 0000000..d9d30f9 Binary files /dev/null and b/webmagic-core/target/classes/us/codecraft/webmagic/Spider.class differ diff --git a/webmagic-core/target/classes/us/codecraft/webmagic/downloader/Downloader.class b/webmagic-core/target/classes/us/codecraft/webmagic/downloader/Downloader.class new file mode 100644 index 0000000..eb31726 Binary files /dev/null and b/webmagic-core/target/classes/us/codecraft/webmagic/downloader/Downloader.class differ diff --git a/webmagic-core/target/classes/us/codecraft/webmagic/downloader/HttpClientDownloader.class b/webmagic-core/target/classes/us/codecraft/webmagic/downloader/HttpClientDownloader.class new file mode 100644 index 0000000..6e86a55 Binary files /dev/null and b/webmagic-core/target/classes/us/codecraft/webmagic/downloader/HttpClientDownloader.class differ diff --git a/webmagic-core/target/classes/us/codecraft/webmagic/downloader/HttpClientPool.class b/webmagic-core/target/classes/us/codecraft/webmagic/downloader/HttpClientPool.class new file mode 100644 index 0000000..ea93641 Binary files /dev/null and b/webmagic-core/target/classes/us/codecraft/webmagic/downloader/HttpClientPool.class differ diff --git a/webmagic-core/target/classes/us/codecraft/webmagic/pipeline/ConsolePipeline.class b/webmagic-core/target/classes/us/codecraft/webmagic/pipeline/ConsolePipeline.class new file mode 100644 index 0000000..dddce50 Binary files /dev/null and b/webmagic-core/target/classes/us/codecraft/webmagic/pipeline/ConsolePipeline.class differ diff --git a/webmagic-core/target/classes/us/codecraft/webmagic/pipeline/FilePipeline.class b/webmagic-core/target/classes/us/codecraft/webmagic/pipeline/FilePipeline.class new file mode 100644 index 0000000..6a28b83 Binary files /dev/null and b/webmagic-core/target/classes/us/codecraft/webmagic/pipeline/FilePipeline.class differ diff --git a/webmagic-core/target/classes/us/codecraft/webmagic/pipeline/Pipeline.class b/webmagic-core/target/classes/us/codecraft/webmagic/pipeline/Pipeline.class new file mode 100644 index 0000000..8f97c76 Binary files /dev/null and b/webmagic-core/target/classes/us/codecraft/webmagic/pipeline/Pipeline.class differ diff --git a/webmagic-core/target/classes/us/codecraft/webmagic/processor/PageProcessor.class b/webmagic-core/target/classes/us/codecraft/webmagic/processor/PageProcessor.class new file mode 100644 index 0000000..536ac09 Binary files /dev/null and b/webmagic-core/target/classes/us/codecraft/webmagic/processor/PageProcessor.class differ diff --git a/webmagic-core/target/classes/us/codecraft/webmagic/processor/SimplePageProcessor.class b/webmagic-core/target/classes/us/codecraft/webmagic/processor/SimplePageProcessor.class new file mode 100644 index 0000000..8050be4 Binary files /dev/null and b/webmagic-core/target/classes/us/codecraft/webmagic/processor/SimplePageProcessor.class differ diff --git a/webmagic-core/target/classes/us/codecraft/webmagic/schedular/FileCacheQueueSchedular$1.class b/webmagic-core/target/classes/us/codecraft/webmagic/schedular/FileCacheQueueSchedular$1.class new file mode 100644 index 0000000..5dd49cb Binary files /dev/null and b/webmagic-core/target/classes/us/codecraft/webmagic/schedular/FileCacheQueueSchedular$1.class differ diff --git a/webmagic-core/target/classes/us/codecraft/webmagic/schedular/FileCacheQueueSchedular.class b/webmagic-core/target/classes/us/codecraft/webmagic/schedular/FileCacheQueueSchedular.class new file mode 100644 index 0000000..34b5b7c Binary files /dev/null and b/webmagic-core/target/classes/us/codecraft/webmagic/schedular/FileCacheQueueSchedular.class differ diff --git a/webmagic-core/target/classes/us/codecraft/webmagic/schedular/QueueSchedular.class b/webmagic-core/target/classes/us/codecraft/webmagic/schedular/QueueSchedular.class new file mode 100644 index 0000000..e2a9188 Binary files /dev/null and b/webmagic-core/target/classes/us/codecraft/webmagic/schedular/QueueSchedular.class differ diff --git a/webmagic-core/target/classes/us/codecraft/webmagic/schedular/Schedular.class b/webmagic-core/target/classes/us/codecraft/webmagic/schedular/Schedular.class new file mode 100644 index 0000000..8674d22 Binary files /dev/null and b/webmagic-core/target/classes/us/codecraft/webmagic/schedular/Schedular.class differ diff --git a/webmagic-core/target/classes/us/codecraft/webmagic/selector/Html.class b/webmagic-core/target/classes/us/codecraft/webmagic/selector/Html.class new file mode 100644 index 0000000..de75e23 Binary files /dev/null and b/webmagic-core/target/classes/us/codecraft/webmagic/selector/Html.class differ diff --git a/webmagic-core/target/classes/us/codecraft/webmagic/selector/PlainText.class b/webmagic-core/target/classes/us/codecraft/webmagic/selector/PlainText.class new file mode 100644 index 0000000..3482d90 Binary files /dev/null and b/webmagic-core/target/classes/us/codecraft/webmagic/selector/PlainText.class differ diff --git a/webmagic-core/target/classes/us/codecraft/webmagic/selector/RegexResult.class b/webmagic-core/target/classes/us/codecraft/webmagic/selector/RegexResult.class new file mode 100644 index 0000000..29cda62 Binary files /dev/null and b/webmagic-core/target/classes/us/codecraft/webmagic/selector/RegexResult.class differ diff --git a/webmagic-core/target/classes/us/codecraft/webmagic/selector/RegexSelector.class b/webmagic-core/target/classes/us/codecraft/webmagic/selector/RegexSelector.class new file mode 100644 index 0000000..4330e0c Binary files /dev/null and b/webmagic-core/target/classes/us/codecraft/webmagic/selector/RegexSelector.class differ diff --git a/webmagic-core/target/classes/us/codecraft/webmagic/selector/ReplaceSelector.class b/webmagic-core/target/classes/us/codecraft/webmagic/selector/ReplaceSelector.class new file mode 100644 index 0000000..ea3f06b Binary files /dev/null and b/webmagic-core/target/classes/us/codecraft/webmagic/selector/ReplaceSelector.class differ diff --git a/webmagic-core/target/classes/us/codecraft/webmagic/selector/Selectable.class b/webmagic-core/target/classes/us/codecraft/webmagic/selector/Selectable.class new file mode 100644 index 0000000..422b787 Binary files /dev/null and b/webmagic-core/target/classes/us/codecraft/webmagic/selector/Selectable.class differ diff --git a/webmagic-core/target/classes/us/codecraft/webmagic/selector/Selector.class b/webmagic-core/target/classes/us/codecraft/webmagic/selector/Selector.class new file mode 100644 index 0000000..5d13e99 Binary files /dev/null and b/webmagic-core/target/classes/us/codecraft/webmagic/selector/Selector.class differ diff --git a/webmagic-core/target/classes/us/codecraft/webmagic/selector/SelectorFactory.class b/webmagic-core/target/classes/us/codecraft/webmagic/selector/SelectorFactory.class new file mode 100644 index 0000000..5fd14be Binary files /dev/null and b/webmagic-core/target/classes/us/codecraft/webmagic/selector/SelectorFactory.class differ diff --git a/webmagic-core/target/classes/us/codecraft/webmagic/selector/SmartContentSelector$1.class b/webmagic-core/target/classes/us/codecraft/webmagic/selector/SmartContentSelector$1.class new file mode 100644 index 0000000..f706e3b Binary files /dev/null and b/webmagic-core/target/classes/us/codecraft/webmagic/selector/SmartContentSelector$1.class differ diff --git a/webmagic-core/target/classes/us/codecraft/webmagic/selector/SmartContentSelector.class b/webmagic-core/target/classes/us/codecraft/webmagic/selector/SmartContentSelector.class new file mode 100644 index 0000000..0dc2f42 Binary files /dev/null and b/webmagic-core/target/classes/us/codecraft/webmagic/selector/SmartContentSelector.class differ diff --git a/webmagic-core/target/classes/us/codecraft/webmagic/selector/XpathSelector.class b/webmagic-core/target/classes/us/codecraft/webmagic/selector/XpathSelector.class new file mode 100644 index 0000000..932326c Binary files /dev/null and b/webmagic-core/target/classes/us/codecraft/webmagic/selector/XpathSelector.class differ diff --git a/webmagic-core/target/classes/us/codecraft/webmagic/utils/UrlUtils.class b/webmagic-core/target/classes/us/codecraft/webmagic/utils/UrlUtils.class new file mode 100644 index 0000000..8b7ad07 Binary files /dev/null and b/webmagic-core/target/classes/us/codecraft/webmagic/utils/UrlUtils.class differ diff --git a/webmagic-core/target/test-classes/log4j.xml b/webmagic-core/target/test-classes/log4j.xml new file mode 100644 index 0000000..a58e889 --- /dev/null +++ b/webmagic-core/target/test-classes/log4j.xml @@ -0,0 +1,31 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/webmagic-core/target/test-classes/us/codecraft/webmagic/HtmlTest.class b/webmagic-core/target/test-classes/us/codecraft/webmagic/HtmlTest.class new file mode 100644 index 0000000..7e93aef Binary files /dev/null and b/webmagic-core/target/test-classes/us/codecraft/webmagic/HtmlTest.class differ diff --git a/webmagic-core/target/test-classes/us/codecraft/webmagic/SpiderTest.class b/webmagic-core/target/test-classes/us/codecraft/webmagic/SpiderTest.class new file mode 100644 index 0000000..f41ca61 Binary files /dev/null and b/webmagic-core/target/test-classes/us/codecraft/webmagic/SpiderTest.class differ diff --git a/webmagic-core/target/test-classes/us/codecraft/webmagic/samples/DiandianBlogProcessor.class b/webmagic-core/target/test-classes/us/codecraft/webmagic/samples/DiandianBlogProcessor.class new file mode 100644 index 0000000..a9908a8 Binary files /dev/null and b/webmagic-core/target/test-classes/us/codecraft/webmagic/samples/DiandianBlogProcessor.class differ diff --git a/webmagic-core/target/test-classes/us/codecraft/webmagic/samples/DianpingBlogProcessor.class b/webmagic-core/target/test-classes/us/codecraft/webmagic/samples/DianpingBlogProcessor.class new file mode 100644 index 0000000..91b7eca Binary files /dev/null and b/webmagic-core/target/test-classes/us/codecraft/webmagic/samples/DianpingBlogProcessor.class differ diff --git a/webmagic-core/target/test-classes/us/codecraft/webmagic/samples/DiaoyuwengProcessor.class b/webmagic-core/target/test-classes/us/codecraft/webmagic/samples/DiaoyuwengProcessor.class new file mode 100644 index 0000000..ef7cecc Binary files /dev/null and b/webmagic-core/target/test-classes/us/codecraft/webmagic/samples/DiaoyuwengProcessor.class differ diff --git a/webmagic-core/target/test-classes/us/codecraft/webmagic/samples/F58PageProcesser.class b/webmagic-core/target/test-classes/us/codecraft/webmagic/samples/F58PageProcesser.class new file mode 100644 index 0000000..7ae98cf Binary files /dev/null and b/webmagic-core/target/test-classes/us/codecraft/webmagic/samples/F58PageProcesser.class differ diff --git a/webmagic-core/target/test-classes/us/codecraft/webmagic/samples/HuxiuProcessor.class b/webmagic-core/target/test-classes/us/codecraft/webmagic/samples/HuxiuProcessor.class new file mode 100644 index 0000000..b3dae1d Binary files /dev/null and b/webmagic-core/target/test-classes/us/codecraft/webmagic/samples/HuxiuProcessor.class differ diff --git a/webmagic-core/target/test-classes/us/codecraft/webmagic/samples/KaichibaProcessor.class b/webmagic-core/target/test-classes/us/codecraft/webmagic/samples/KaichibaProcessor.class new file mode 100644 index 0000000..fbbfb6a Binary files /dev/null and b/webmagic-core/target/test-classes/us/codecraft/webmagic/samples/KaichibaProcessor.class differ diff --git a/webmagic-core/target/test-classes/us/codecraft/webmagic/samples/MeicanProcessor.class b/webmagic-core/target/test-classes/us/codecraft/webmagic/samples/MeicanProcessor.class new file mode 100644 index 0000000..aba96f2 Binary files /dev/null and b/webmagic-core/target/test-classes/us/codecraft/webmagic/samples/MeicanProcessor.class differ diff --git a/webmagic-core/target/test-classes/us/codecraft/webmagic/samples/NjuBBSProcessor.class b/webmagic-core/target/test-classes/us/codecraft/webmagic/samples/NjuBBSProcessor.class new file mode 100644 index 0000000..d0bbc96 Binary files /dev/null and b/webmagic-core/target/test-classes/us/codecraft/webmagic/samples/NjuBBSProcessor.class differ diff --git a/webmagic-core/target/test-classes/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.class b/webmagic-core/target/test-classes/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.class new file mode 100644 index 0000000..8472ce3 Binary files /dev/null and b/webmagic-core/target/test-classes/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.class differ diff --git a/webmagic-core/target/test-classes/us/codecraft/webmagic/samples/OschinaPageProcesser.class b/webmagic-core/target/test-classes/us/codecraft/webmagic/samples/OschinaPageProcesser.class new file mode 100644 index 0000000..a7daed0 Binary files /dev/null and b/webmagic-core/target/test-classes/us/codecraft/webmagic/samples/OschinaPageProcesser.class differ diff --git a/webmagic-core/target/test-classes/us/codecraft/webmagic/samples/QzoneBlogProcessor.class b/webmagic-core/target/test-classes/us/codecraft/webmagic/samples/QzoneBlogProcessor.class new file mode 100644 index 0000000..a25a4a7 Binary files /dev/null and b/webmagic-core/target/test-classes/us/codecraft/webmagic/samples/QzoneBlogProcessor.class differ diff --git a/webmagic-core/target/test-classes/us/codecraft/webmagic/samples/SinaBlogProcesser.class b/webmagic-core/target/test-classes/us/codecraft/webmagic/samples/SinaBlogProcesser.class new file mode 100644 index 0000000..5e8a5f0 Binary files /dev/null and b/webmagic-core/target/test-classes/us/codecraft/webmagic/samples/SinaBlogProcesser.class differ diff --git a/webmagic-core/target/test-classes/us/codecraft/webmagic/samples/TianyaPageProcesser.class b/webmagic-core/target/test-classes/us/codecraft/webmagic/samples/TianyaPageProcesser.class new file mode 100644 index 0000000..d5ed6c7 Binary files /dev/null and b/webmagic-core/target/test-classes/us/codecraft/webmagic/samples/TianyaPageProcesser.class differ diff --git a/webmagic-core/target/test-classes/us/codecraft/webmagic/selector/HtmlCleanerTest.class b/webmagic-core/target/test-classes/us/codecraft/webmagic/selector/HtmlCleanerTest.class new file mode 100644 index 0000000..c8f064c Binary files /dev/null and b/webmagic-core/target/test-classes/us/codecraft/webmagic/selector/HtmlCleanerTest.class differ diff --git a/webmagic-core/target/test-classes/us/codecraft/webmagic/selector/RegexSelectorTest.class b/webmagic-core/target/test-classes/us/codecraft/webmagic/selector/RegexSelectorTest.class new file mode 100644 index 0000000..aac7ca1 Binary files /dev/null and b/webmagic-core/target/test-classes/us/codecraft/webmagic/selector/RegexSelectorTest.class differ diff --git a/webmagic-core/target/test-classes/us/codecraft/webmagic/selector/SmartConentSelectorTest.class b/webmagic-core/target/test-classes/us/codecraft/webmagic/selector/SmartConentSelectorTest.class new file mode 100644 index 0000000..aaa9adf Binary files /dev/null and b/webmagic-core/target/test-classes/us/codecraft/webmagic/selector/SmartConentSelectorTest.class differ diff --git a/webmagic-core/target/test-classes/us/codecraft/webmagic/selector/XpathSelectorTest.class b/webmagic-core/target/test-classes/us/codecraft/webmagic/selector/XpathSelectorTest.class new file mode 100644 index 0000000..d8598eb Binary files /dev/null and b/webmagic-core/target/test-classes/us/codecraft/webmagic/selector/XpathSelectorTest.class differ diff --git a/webmagic-core/target/test-classes/us/codecraft/webmagic/utils/UrlUtilsTest.class b/webmagic-core/target/test-classes/us/codecraft/webmagic/utils/UrlUtilsTest.class new file mode 100644 index 0000000..734ea86 Binary files /dev/null and b/webmagic-core/target/test-classes/us/codecraft/webmagic/utils/UrlUtilsTest.class differ diff --git a/webmagic-plugin/pom.xml b/webmagic-plugin/pom.xml new file mode 100644 index 0000000..6a554bf --- /dev/null +++ b/webmagic-plugin/pom.xml @@ -0,0 +1,67 @@ + + + us.codecraft + 0.0.1-SNAPSHOT + 4.0.0 + + webmagic-plugin + + + + us.codecraft + webmagic-core + 0.0.1-SNAPSHOT + + + junit + junit + 4.7 + test + + + + + + + org.apache.maven.plugins + maven-resources-plugin + + UTF-8 + + + + org.apache.maven.plugins + maven-source-plugin + + + attach-sources + + jar + + + + + + org.apache.maven.plugins + maven-javadoc-plugin + + + attach-javadocs + + jar + + + + + + org.apache.maven.plugins + maven-release-plugin + 2.0-beta-7 + + + + + + \ No newline at end of file diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml new file mode 100644 index 0000000..e3a846a --- /dev/null +++ b/webmagic-samples/pom.xml @@ -0,0 +1,68 @@ + + + + us.codecraft + 0.0.1-SNAPSHOT + 4.0.0 + + webmagic-samples + + + + us.codecraft + webmagic-core + 0.0.1-SNAPSHOT + + + junit + junit + 4.7 + test + + + + + + + org.apache.maven.plugins + maven-resources-plugin + + UTF-8 + + + + org.apache.maven.plugins + maven-source-plugin + + + attach-sources + + jar + + + + + + org.apache.maven.plugins + maven-javadoc-plugin + + + attach-javadocs + + jar + + + + + + org.apache.maven.plugins + maven-release-plugin + 2.0-beta-7 + + + + + + \ No newline at end of file diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java new file mode 100644 index 0000000..efd1ff7 --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java @@ -0,0 +1,29 @@ +package us.codecraft.webmagic.samples; + +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.processor.PageProcessor; + +import java.util.List; + +/** + * User: cairne + * Date: 13-4-21 + * Time: 下午8:08 + */ +public class DiandianBlogProcessor implements PageProcessor { + @Override + public void process(Page page) { + //http://progressdaily.diandian.com/post/2013-01-24/40046867275 + List requests = page.getHtml().rs("]*href=[\"']{1}(http://17dujingdian\\.com/post/[^#]*?)[\"']{1}").toStrings(); + page.addTargetRequests(requests); + page.putField("title",page.getHtml().x("//div[@id='content']//h2/a")); + page.putField("content",page.getHtml().sc()); + } + + @Override + public Site getSite() { + return Site.me().setDomain("www.diandian.com").setStartUrl("http://17dujingdian.com/"). + setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); + } +} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingBlogProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingBlogProcessor.java new file mode 100644 index 0000000..dd601ad --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingBlogProcessor.java @@ -0,0 +1,33 @@ +package us.codecraft.webmagic.samples; + +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.processor.PageProcessor; + +import java.util.List; + +/** + * User: cairne + * Date: 13-4-21 + * Time: 下午8:08 + */ +public class DianpingBlogProcessor implements PageProcessor { + @Override + public void process(Page page) { + //http://progressdaily.diandian.com/post/2013-01-24/40046867275 + List requests = page.getHtml().rs("]*href=[\"']{1}(/shop/.*?)[\"']{1}").toStrings(); + page.addTargetRequests(requests); + requests = page.getHtml().rs("]*href=[\"']{1}(/search/category/.*?)[\"']{1}").toStrings(); + page.addTargetRequests(requests); + if (page.getUrl().toString().contains("shop")){ + page.putField("title", page.getHtml().x("//h1[@class='shop-title']")); + page.putField("content", page.getHtml().sc()); + } + } + + @Override + public Site getSite() { + return Site.me().setDomain("www.dianping.com").setStartUrl("http://www.dianping.com/"). + setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); + } +} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java new file mode 100644 index 0000000..05b68b6 --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java @@ -0,0 +1,33 @@ +package us.codecraft.webmagic.samples; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.processor.PageProcessor; + +import java.util.List; + +/** + * User: cairne + * Date: 13-4-21 + * Time: 下午8:08 + */ +public class DiaoyuwengProcessor implements PageProcessor { + @Override + public void process(Page page) { + //http://progressdaily.diandian.com/post/2013-01-24/40046867275 + List requests = page.getHtml().rs("]*href=[\"']{1}(/shop/.*?)[\"']{1}").toStrings(); + page.addTargetRequests(requests); + requests = page.getHtml().rs("]*href=[\"']{1}(/search/category/.*?)[\"']{1}").toStrings(); + page.addTargetRequests(requests); + if (page.getUrl().toString().contains("shop")){ + page.putField("title", page.getHtml().x("//h1[@class='shop-title']")); + page.putField("content", page.getHtml().sc()); + } + } + + @Override + public Site getSite() { + return Site.me().setDomain("www.dianping.com").setStartUrl("http://www.dianping.com/"). + setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); + } +} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java new file mode 100644 index 0000000..78211c4 --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java @@ -0,0 +1,28 @@ +package us.codecraft.webmagic.samples; + +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.processor.PageProcessor; + +import java.util.List; + +/** + * User: cairne + * Date: 13-4-21 + * Time: 下午1:48 + */ +public class F58PageProcesser implements PageProcessor { + + @Override + public void process(Page page) { + List strings = page.getHtml().rs("]*href=[\"']{1}(/yewu/.*?)[\"']{1}").toStrings(); + page.addTargetRequests(strings); + page.putField("title",page.getHtml().r("(.*)")); + page.putField("body",page.getHtml().x("//dd[@class='w133']")); + } + + @Override + public Site getSite() { + return Site.me().setDomain("sh.58.com").setStartUrl("http://sh.58.com/"); //To change body of implemented methods use File | Settings | File Templates. + } +} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java new file mode 100644 index 0000000..82552f9 --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java @@ -0,0 +1,29 @@ +package us.codecraft.webmagic.samples; + +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.processor.PageProcessor; + +import java.util.List; + +/** + * User: cairne + * Date: 13-4-21 + * Time: 下午8:08 + */ +public class HuxiuProcessor implements PageProcessor { + @Override + public void process(Page page) { + //http://progressdaily.diandian.com/post/2013-01-24/40046867275 + List requests = page.getHtml().rs("\"']*href=[\"']{1}([/]{0,1}article[^<>#\"']*?)[\"']{1}").toStrings(); + page.addTargetRequests(requests); + page.putField("title",page.getHtml().x("//div[@class='neirong']//h1[@class='ph xs5']")); + page.putField("content",page.getHtml().sc()); + } + + @Override + public Site getSite() { + return Site.me().setDomain("www.huxiu.com").setStartUrl("http://www.huxiu.com/"). + setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); + } +} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java new file mode 100644 index 0000000..58a2cb8 --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java @@ -0,0 +1,27 @@ +package us.codecraft.webmagic.samples; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.processor.PageProcessor; + +/** + * User: cairne + * Date: 13-5-20 + * Time: 下午5:31 + */ +public class KaichibaProcessor implements PageProcessor { + @Override + public void process(Page page) { + //http://progressdaily.diandian.com/post/2013-01-24/40046867275 + int i = Integer.valueOf(page.getUrl().r("shop/(\\d+)").toString()) + 1; + page.addTargetRequests("http://kaichiba.com/shop/"+i); + page.putField("title",page.getHtml().x("//Title")); + page.putField("items", page.getHtml().xs("//li[@class=\"foodTitle\"]").rp("^\\s+", "").rp("\\s+$", "").rp(".*?", "")); + } + + @Override + public Site getSite() { + return Site.me().setDomain("kaichiba.com").setStartUrl("http://kaichiba.com/shop/41725781").setEncoding("utf-8"). + setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); + } +} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java new file mode 100644 index 0000000..637aec1 --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java @@ -0,0 +1,33 @@ +package us.codecraft.webmagic.samples; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.processor.PageProcessor; + +import java.util.List; + +/** + * User: cairne + * Date: 13-5-20 + * Time: 下午5:31 + */ +public class MeicanProcessor implements PageProcessor { + @Override + public void process(Page page) { + //http://progressdaily.diandian.com/post/2013-01-24/40046867275 + List requests = page.getHtml().xs("//a[@class=\"area_link flat_btn\"]/@href").toStrings(); + if (requests.size() > 2) { + requests = requests.subList(0, 2); + } + page.addTargetRequests(requests); + page.addTargetRequests(page.getHtml().as().rs("(.*/restaurant/[^#]+)").toStrings()); + page.putField("items", page.getHtml().xs("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"name\"]")); + page.putField("prices", page.getHtml().xs("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"price_outer\"]/span[@class=\"price\"]")); + } + + @Override + public Site getSite() { + return Site.me().setDomain("meican.com").setStartUrl("http://www.meican.com/shanghai/districts").setEncoding("utf-8"). + setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); + } +} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java new file mode 100644 index 0000000..ca46de6 --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java @@ -0,0 +1,28 @@ +package us.codecraft.webmagic.samples; + +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.processor.PageProcessor; + +import java.util.List; + +/** + * User: cairne + * Date: 13-4-21 + * Time: 下午8:08 + */ +public class NjuBBSProcessor implements PageProcessor { + @Override + public void process(Page page) { + List requests = page.getHtml().rs("]*href=(bbstcon\\?board=Pictures&file=[^>]*)").toStrings(); + page.addTargetRequests(requests); + page.putField("title",page.getHtml().x("//div[@id='content']//h2/a")); + page.putField("content",page.getHtml().sc()); + } + + @Override + public Site getSite() { + return Site.me().setDomain("bbs.nju.edu.cn").setStartUrl("http://bbs.nju.edu.cn/board?board=Pictures"). + setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); + } +} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java new file mode 100644 index 0000000..2166d9b --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java @@ -0,0 +1,30 @@ +package us.codecraft.webmagic.samples; + +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.processor.PageProcessor; + +import java.util.List; + +/** + * User: cairne + * Date: 13-4-21 + * Time: 下午1:48 + */ +public class OschinaBlogPageProcesser implements PageProcessor { + + @Override + public void process(Page page) { + List strings = page.getHtml().as().r("(http://my\\.oschina\\.net)").toStrings(); + page.addTargetRequests(strings); + page.putField("title", page.getHtml().xs("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1")); + page.putField("content", page.getHtml().sc()); + page.putField("author", page.getUrl().r("my\\.oschina\\.net/(\\w+)/blog/\\d+")); + } + + @Override + public Site getSite() { + return Site.me().setDomain("my.oschina.net").setStartUrl("http://www.oschina.net/"). + setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); + } +} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaPageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaPageProcesser.java new file mode 100644 index 0000000..cdfbc1e --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaPageProcesser.java @@ -0,0 +1,29 @@ +package us.codecraft.webmagic.samples; + +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.processor.PageProcessor; + +import java.util.List; + +/** + * User: cairne + * Date: 13-4-21 + * Time: 下午1:48 + */ +public class OschinaPageProcesser implements PageProcessor { + + @Override + public void process(Page page) { + List strings = page.getHtml().rs("]*href=[\"']{1}(http://www\\.oschina\\.net/question/[\\w]+)[\"']{1}").toStrings(); + page.addTargetRequests(strings); + page.putField("title", page.getHtml().x("//div[@class='QTitle']/h1/a")); + page.putField("content", page.getHtml().xs("//div[@class='Question']//div[@class='Content']/div[@class='detail']")); + } + + @Override + public Site getSite() { + return Site.me().setDomain("www.oschina.net").setStartUrl("http://www.oschina.net/"). + setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); + } +} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/QzoneBlogProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/QzoneBlogProcessor.java new file mode 100644 index 0000000..67ef671 --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/QzoneBlogProcessor.java @@ -0,0 +1,32 @@ +package us.codecraft.webmagic.samples; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.processor.PageProcessor; + +import java.util.List; + +/** + * User: cairne + * Date: 13-4-21 + * Time: 下午8:08 + */ +public class QzoneBlogProcessor implements PageProcessor { + @Override + public void process(Page page) { + //http://progressdaily.diandian.com/post/2013-01-24/40046867275 + + //http://b1.cnc.qzone.qq.com/cgi-bin/blognew/get_abs?hostUin=233017404&uin=233017404&blogType=0&statYear=2013&source=0&statYear=2013&g_tk=291639571&g_tk=291639571&reqInfo=7&pos=0&num=15&source=0&rand=0.46480297949165106 + // &cateName=&cateHex=&statYear=2013&reqInfo=7&pos=0&num=15&sortType=0&source=0&rand=0.46480297949165106&g_tk=291639571&verbose=1&ref=qzone + List requests = page.getHtml().rs("]*href=[\"']{1}(http://17dujingdian\\.com/post/[^#]*?)[\"']{1}").toStrings(); + page.addTargetRequests(requests); + page.putField("title",page.getHtml().x("//div[@id='content']//h2/a")); + page.putField("content",page.getHtml().sc()); + } + + @Override + public Site getSite() { + return Site.me().setDomain("www.diandian.com").setStartUrl("http://17dujingdian.com/"). + setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); + } +} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java new file mode 100644 index 0000000..b86fff8 --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java @@ -0,0 +1,29 @@ +package us.codecraft.webmagic.samples; + +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.processor.PageProcessor; + +/** + * User: cairne + * Date: 13-4-21 + * Time: 下午1:48 + */ +public class SinaBlogProcesser implements PageProcessor { + + @Override + public void process(Page page) { + page.addTargetRequests(page.getHtml().rs("]*href=[\"']{1}(http://blog\\.sina\\.com\\.cn/s/blog_.*?)[\"']{1}").toStrings()); + page.putField("title", page.getHtml().x("//div[@class='articalTitle']/h2")); + page.putField("body",page.getHtml().sc()); + //x("//dd[@class='w133']") + page.putField("date",page.getHtml().x("//div[@id='articlebody']//span[@class='time SG_txtc']").r("\\((.*)\\)")); + page.putField("tags",page.getHtml().xs("//td[@class='blog_tag']/h3/a")); + } + + @Override + public Site getSite() { + return Site.me().setDomain("blog.sina.com.cn").setStartUrl("http://blog.sina.com.cn/"). + setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); + } +} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/TianyaPageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/TianyaPageProcesser.java new file mode 100644 index 0000000..7a8920b --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/TianyaPageProcesser.java @@ -0,0 +1,28 @@ +package us.codecraft.webmagic.samples; + +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.processor.PageProcessor; + +import java.util.List; + +/** + * User: cairne + * Date: 13-4-21 + * Time: 下午1:48 + */ +public class TianyaPageProcesser implements PageProcessor { + + @Override + public void process(Page page) { + List strings = page.getHtml().rs("]*href=[\"']{1}(/post-free.*?\\.shtml)[\"']{1}").toStrings(); + page.addTargetRequests(strings); + page.putField("title", page.getHtml().x("//div[@id='post_head']//span[@class='s_title']//b")); + page.putField("body",page.getHtml().sc()); + } + + @Override + public Site getSite() { + return Site.me().setDomain("http://bbs.tianya.cn/").setStartUrl("http://bbs.tianya.cn/"); //To change body of implemented methods use File | Settings | File Templates. + } +} diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/HtmlTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/HtmlTest.java new file mode 100644 index 0000000..f799098 --- /dev/null +++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/HtmlTest.java @@ -0,0 +1,20 @@ +package us.codecraft.webmagic; + +import org.junit.Assert; +import org.junit.Test; +import us.codecraft.webmagic.selector.Html; + +/** + * User: cairne + * Date: 13-4-21 + * Time: 上午8:42 + */ +public class HtmlTest { + + @Test + public void testRegexSelector() { + Html selectable = new Html("aaaaaaab"); + Assert.assertEquals("abbabbab", (selectable.r("(.*)").rp("aa(a)", "$1bb").toString())); + + } +} diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java new file mode 100644 index 0000000..5cb9848 --- /dev/null +++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java @@ -0,0 +1,131 @@ +package us.codecraft.webmagic; + +import org.junit.Ignore; +import org.junit.Test; +import us.codecraft.webmagic.pipeline.FilePipeline; +import us.codecraft.webmagic.processor.SimplePageProcessor; +import us.codecraft.webmagic.samples.HuxiuProcessor; +import us.codecraft.webmagic.schedular.FileCacheQueueSchedular; + +/** + * User: cairne + * Date: 13-4-20 + * Time: 下午7:46 + */ +public class SpiderTest { + + + @Test + public void testSpider() throws InterruptedException { + Spider me = Spider.me().pipeline(new FilePipeline()).processor(new HuxiuProcessor()); + me.run(); + } + + @Test + public void testGlobalSpider(){ +// PageProcessor pageProcessor = new MeicanProcessor(); +// Spider.me().pipeline(new FilePipeline()).schedular(new FileCacheQueueSchedular(pageProcessor.getSite(),"/data/temp/webmagic/cache/")). +// processor(pageProcessor).run(); + SimplePageProcessor pageProcessor2 = new SimplePageProcessor("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space", "http://www.diaoyuweng.com/thread-*-1-1.html"); + pageProcessor2.getSite().setEncoding("GBK"); + System.out.println(pageProcessor2.getSite().getEncoding()); + Spider.me().pipeline(new FilePipeline()).schedular(new FileCacheQueueSchedular(pageProcessor2.getSite(),"/data/temp/webmagic/cache/")). + processor(pageProcessor2).run(); + + + } + + @Test + public void test(){ + System.out.println(System.getProperty("java.io.tmpdir")); + } + + + @Ignore + @Test + public void languageSchema() { + + + /** + * + * _hrefs = rs("]*href=[\"']{1}(/yewu/.*?)[\"']{1}") + * title = r(""(.*)"") + * body = x("//dd[@class='w133']") + * + * site.domain = "sh.58.com" + * site.ua="" + * site.cookie="aa:bb" + * + */ + + /** + * + * + * if (page == r('') && refer(1) == 1) { + * + * type = _refer(1) + * content = _text.t().c() + * title = x("asd@asd").r("",1) + * body[r(_currentUrl).g(1)] = body[r(_currentUrl).g(1)] + (x("").r("",1,2).c()) + * + * body=body[r(_currentUrl).g(1)] + * tags[%] = (tags[%] + xs('')) . r('') + * + * _targetUrls.add('' + x('').r('')) + * _sourceUrls.add() + * _header.put("",""); + * _cookie.add("asdsadasdsa"); + * + * + * } + * + * _cookie.add(_cookie['']) + * + * if (page == r('') && refer(1) == 1) + * ( + * _targetUrl = '' + x('') & r('') + * _sourceUrl = '' + * ) + * + */ + + /** + * + * + * + * + * + * + * + * + * + * + */ + + /** + * + * if (model.url('') && model.refer(1) == 1) + * ( + * + * model.set(type, model.refer(1)) + * content = t(_html) > c() + * title = x(_html, 'asd@asd') > r('',1) + * body[r(_currentUrl).g(1)] = body[r(_currentUrl).g(1)] + (x('') > r('',1,2) > c()) | x('') + * tags[%] = tags + xs('') > r('') + * model.setTargetUrl(); + * + * _targetUrl = '' + x('') & r('') + * _sourceUrl = '' + * ) + * + * _cookie.add(_cookie['']) + * + * if (page == r('') && refer(1) == 1) + * ( + * _targetUrl = '' + x('') & r('') + * _sourceUrl = '' + * ) + * + */ + } +} diff --git a/webmagic-samples/target/classes/us/codecraft/webmagic/samples/DiandianBlogProcessor.class b/webmagic-samples/target/classes/us/codecraft/webmagic/samples/DiandianBlogProcessor.class new file mode 100644 index 0000000..a9908a8 Binary files /dev/null and b/webmagic-samples/target/classes/us/codecraft/webmagic/samples/DiandianBlogProcessor.class differ diff --git a/webmagic-samples/target/classes/us/codecraft/webmagic/samples/DianpingBlogProcessor.class b/webmagic-samples/target/classes/us/codecraft/webmagic/samples/DianpingBlogProcessor.class new file mode 100644 index 0000000..91b7eca Binary files /dev/null and b/webmagic-samples/target/classes/us/codecraft/webmagic/samples/DianpingBlogProcessor.class differ diff --git a/webmagic-samples/target/classes/us/codecraft/webmagic/samples/DiaoyuwengProcessor.class b/webmagic-samples/target/classes/us/codecraft/webmagic/samples/DiaoyuwengProcessor.class new file mode 100644 index 0000000..ef7cecc Binary files /dev/null and b/webmagic-samples/target/classes/us/codecraft/webmagic/samples/DiaoyuwengProcessor.class differ diff --git a/webmagic-samples/target/classes/us/codecraft/webmagic/samples/F58PageProcesser.class b/webmagic-samples/target/classes/us/codecraft/webmagic/samples/F58PageProcesser.class new file mode 100644 index 0000000..7ae98cf Binary files /dev/null and b/webmagic-samples/target/classes/us/codecraft/webmagic/samples/F58PageProcesser.class differ diff --git a/webmagic-samples/target/classes/us/codecraft/webmagic/samples/HuxiuProcessor.class b/webmagic-samples/target/classes/us/codecraft/webmagic/samples/HuxiuProcessor.class new file mode 100644 index 0000000..b3dae1d Binary files /dev/null and b/webmagic-samples/target/classes/us/codecraft/webmagic/samples/HuxiuProcessor.class differ diff --git a/webmagic-samples/target/classes/us/codecraft/webmagic/samples/KaichibaProcessor.class b/webmagic-samples/target/classes/us/codecraft/webmagic/samples/KaichibaProcessor.class new file mode 100644 index 0000000..fbbfb6a Binary files /dev/null and b/webmagic-samples/target/classes/us/codecraft/webmagic/samples/KaichibaProcessor.class differ diff --git a/webmagic-samples/target/classes/us/codecraft/webmagic/samples/MeicanProcessor.class b/webmagic-samples/target/classes/us/codecraft/webmagic/samples/MeicanProcessor.class new file mode 100644 index 0000000..aba96f2 Binary files /dev/null and b/webmagic-samples/target/classes/us/codecraft/webmagic/samples/MeicanProcessor.class differ diff --git a/webmagic-samples/target/classes/us/codecraft/webmagic/samples/NjuBBSProcessor.class b/webmagic-samples/target/classes/us/codecraft/webmagic/samples/NjuBBSProcessor.class new file mode 100644 index 0000000..d0bbc96 Binary files /dev/null and b/webmagic-samples/target/classes/us/codecraft/webmagic/samples/NjuBBSProcessor.class differ diff --git a/webmagic-samples/target/classes/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.class b/webmagic-samples/target/classes/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.class new file mode 100644 index 0000000..8472ce3 Binary files /dev/null and b/webmagic-samples/target/classes/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.class differ diff --git a/webmagic-samples/target/classes/us/codecraft/webmagic/samples/OschinaPageProcesser.class b/webmagic-samples/target/classes/us/codecraft/webmagic/samples/OschinaPageProcesser.class new file mode 100644 index 0000000..a7daed0 Binary files /dev/null and b/webmagic-samples/target/classes/us/codecraft/webmagic/samples/OschinaPageProcesser.class differ diff --git a/webmagic-samples/target/classes/us/codecraft/webmagic/samples/QzoneBlogProcessor.class b/webmagic-samples/target/classes/us/codecraft/webmagic/samples/QzoneBlogProcessor.class new file mode 100644 index 0000000..a25a4a7 Binary files /dev/null and b/webmagic-samples/target/classes/us/codecraft/webmagic/samples/QzoneBlogProcessor.class differ diff --git a/webmagic-samples/target/classes/us/codecraft/webmagic/samples/SinaBlogProcesser.class b/webmagic-samples/target/classes/us/codecraft/webmagic/samples/SinaBlogProcesser.class new file mode 100644 index 0000000..5e8a5f0 Binary files /dev/null and b/webmagic-samples/target/classes/us/codecraft/webmagic/samples/SinaBlogProcesser.class differ diff --git a/webmagic-samples/target/classes/us/codecraft/webmagic/samples/TianyaPageProcesser.class b/webmagic-samples/target/classes/us/codecraft/webmagic/samples/TianyaPageProcesser.class new file mode 100644 index 0000000..d5ed6c7 Binary files /dev/null and b/webmagic-samples/target/classes/us/codecraft/webmagic/samples/TianyaPageProcesser.class differ diff --git a/webmagic-samples/target/test-classes/us/codecraft/webmagic/HtmlTest.class b/webmagic-samples/target/test-classes/us/codecraft/webmagic/HtmlTest.class new file mode 100644 index 0000000..7e93aef Binary files /dev/null and b/webmagic-samples/target/test-classes/us/codecraft/webmagic/HtmlTest.class differ diff --git a/webmagic-samples/target/test-classes/us/codecraft/webmagic/SpiderTest.class b/webmagic-samples/target/test-classes/us/codecraft/webmagic/SpiderTest.class new file mode 100644 index 0000000..f41ca61 Binary files /dev/null and b/webmagic-samples/target/test-classes/us/codecraft/webmagic/SpiderTest.class differ