diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java index 5c208dd..f9e0fd6 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java @@ -3,10 +3,12 @@ package us.codecraft.webmagic; import java.util.*; /** - * Site定义一个待抓取的站点的各种信息。 + * Site定义一个待抓取的站点的各种信息。
+ * 这个类的所有getter方法,一般都只会被爬虫框架内部进行调用。
+ * * @author code4crafter@gmail.com
- * Date: 13-4-21 - * Time: 下午12:13 + * Date: 13-4-21 + * Time: 下午12:13 */ public class Site { @@ -30,73 +32,157 @@ public class Site { DEFAULT_STATUS_CODE_SET.add(200); } + /** + * 创建一个Site对象,等价于new Site() + * + * @return 新建的对象 + */ public static Site me() { return new Site(); } - public Site setCookie(String name, String value) { + /** + * 为这个站点添加一个cookie,可用于抓取某些需要登录访问的站点。这个cookie的域名与{@link #getDomain()}是一致的 + * + * @param name cookie的名称 + * @param value cookie的值 + * @return this + */ + public Site addCookie(String name, String value) { cookies.put(name, value); return this; } + /** + * 为这个站点设置user-agent,很多网站都对user-agent进行了限制,不设置此选项可能会得到期望之外的结果。 + * + * @param userAgent userAgent + * @return this + */ public Site setUserAgent(String userAgent) { this.userAgent = userAgent; return this; } + /** + * 获取已经设置的所有cookie + * + * @return 已经设置的所有cookie + */ public Map getCookies() { return cookies; } + /** + * 获取已设置的user-agent + * + * @return 已设置的user-agent + */ public String getUserAgent() { return userAgent; } + /** + * 获取已设置的domain + * + * @return 已设置的domain + */ public String getDomain() { return domain; } + /** + * 设置这个站点所在域名,必须项。
+ * 目前不支持多个域名的抓取。抓取多个域名请新建一个Spider。 + * + * @param domain 爬虫会抓取的域名 + * @return this + */ public Site setDomain(String domain) { this.domain = domain; return this; } - public String getEncoding() { - return encoding; - } - + /** + * 设置页面编码,若不设置则自动根据Html meta信息获取。
+ * 一般无需设置encoding,如果发现下载的结果是乱码,则可以设置此项。
+ * + * @param encoding 编码格式,主要是"utf-8"、"gbk"两种 + * @return this + */ public Site setEncoding(String encoding) { this.encoding = encoding; return this; } - public Set getAcceptStatCode() { - return acceptStatCode; + /** + * 获取已设置的编码 + * + * @return 已设置的编码 + */ + public String getEncoding() { + return encoding; } + /** + * 设置可接受的http状态码,仅当状态码在这个集合中时,才会读取页面内容。
+ * 默认为200,正常情况下,无须设置此项。
+ * 某些站点会错误的返回状态码,此时可以对这个选项进行设置。
+ * + * @param acceptStatCode 可接受的状态码 + * @return this + */ public Site setAcceptStatCode(Set acceptStatCode) { this.acceptStatCode = acceptStatCode; return this; } + /** + * 获取可接受的状态码 + * + * @return 可接受的状态码 + */ + public Set getAcceptStatCode() { + return acceptStatCode; + } + + /** + * 获取初始页面的地址列表 + * @return 初始页面的地址列表 + */ public List getStartUrls() { return startUrls; } + /** + * 增加初始页面的地址,可反复调用此方法增加多个初始地址。 + * @param startUrl 初始页面的地址 + * @return this + */ public Site addStartUrl(String startUrl) { this.startUrls.add(startUrl); return this; } - public int getSleepTime() { - return sleepTime; - } - + /** + * 设置两次抓取之间的间隔,避免对目标站点压力过大(或者避免被防火墙屏蔽...)。 + * + * @param sleepTime 单位毫秒 + * @return this + */ public Site setSleepTime(int sleepTime) { this.sleepTime = sleepTime; return this; } + /** + * 获取两次抓取之间的间隔 + * @return 两次抓取之间的间隔,单位毫秒 + */ + public int getSleepTime() { + return sleepTime; + } + @Override public boolean equals(Object o) { if (this == o) return true; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 67e9c94..8c662eb 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -7,13 +7,18 @@ import us.codecraft.webmagic.downloader.HttpClientDownloader; import us.codecraft.webmagic.pipeline.ConsolePipeline; import us.codecraft.webmagic.pipeline.Pipeline; import us.codecraft.webmagic.processor.PageProcessor; -import us.codecraft.webmagic.schedular.QueueSchedular; -import us.codecraft.webmagic.schedular.Schedular; +import us.codecraft.webmagic.schedular.QueueScheduler; +import us.codecraft.webmagic.schedular.Scheduler; import java.util.ArrayList; import java.util.List; /** + *
+ * webmagic爬虫的入口类。
+ *      示例:
+ *      Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*")).run();
+ * 
* @author code4crafter@gmail.com
* Date: 13-4-21 * Time: 上午6:53 @@ -32,18 +37,17 @@ public class Spider implements Runnable, Task { private String uuid; - private Schedular schedular = new QueueSchedular(); + private Scheduler scheduler = new QueueScheduler(); private Logger logger = Logger.getLogger(getClass()); - public static Spider me() { - return new Spider(); - } - - public Spider processor(PageProcessor pageProcessor) { + public Spider(PageProcessor pageProcessor){ this.pageProcessor = pageProcessor; this.site = pageProcessor.getSite(); - return this; + } + + public static Spider create(PageProcessor pageProcessor) { + return new Spider(pageProcessor); } public Spider startUrls(List startUrls) { @@ -57,8 +61,13 @@ public class Spider implements Runnable, Task { return this; } - public Spider schedular(Schedular schedular) { - this.schedular = schedular; + public Spider setUUID(String uuid) { + this.uuid = uuid; + return this; + } + + public Spider schedular(Scheduler scheduler) { + this.scheduler = scheduler; return this; } @@ -71,9 +80,9 @@ public class Spider implements Runnable, Task { @Override public void run() { for (String startUrl : startUrls) { - schedular.push(new Request(startUrl), this); + scheduler.push(new Request(startUrl), this); } - Request request = schedular.poll(this); + Request request = scheduler.poll(this); if (pipelines.isEmpty()) { pipelines.add(new ConsolePipeline()); } @@ -89,16 +98,10 @@ public class Spider implements Runnable, Task { pipeline.process(page, this); } sleep(site.getSleepTime()); - request = schedular.poll(this); + request = scheduler.poll(this); } } - public Spider setUUID(String uuid) { - this.uuid = uuid; - return this; - } - - private void sleep(int time) { try { Thread.sleep(time); @@ -110,7 +113,7 @@ public class Spider implements Runnable, Task { private void addRequest(Page page) { if (CollectionUtils.isNotEmpty(page.getTargetRequests())) { for (Request request : page.getTargetRequests()) { - schedular.push(request, this); + 
scheduler.push(request, this); } } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueSchedular.java b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueScheduler.java similarity index 97% rename from webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueSchedular.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueScheduler.java index 0a93e52..246f7e0 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueSchedular.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueScheduler.java @@ -20,7 +20,7 @@ import java.util.concurrent.atomic.AtomicInteger; * Date: 13-4-21 * Time: 下午1:13 */ -public class FileCacheQueueSchedular implements Schedular { +public class FileCacheQueueScheduler implements Scheduler { private Logger logger = Logger.getLogger(getClass()); @@ -44,7 +44,7 @@ public class FileCacheQueueSchedular implements Schedular { private Set urls; - public FileCacheQueueSchedular(String filePath) { + public FileCacheQueueScheduler(String filePath) { this.filePath = filePath; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/QueueSchedular.java b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/QueueScheduler.java similarity index 94% rename from webmagic-core/src/main/java/us/codecraft/webmagic/schedular/QueueSchedular.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/schedular/QueueScheduler.java index 20576fc..6976885 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/QueueSchedular.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/QueueScheduler.java @@ -14,7 +14,7 @@ import java.util.concurrent.LinkedBlockingQueue; * Date: 13-4-21 * Time: 下午1:13 */ -public class QueueSchedular implements Schedular { +public class QueueScheduler implements Scheduler { private Logger logger = 
Logger.getLogger(getClass()); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/Schedular.java b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/Scheduler.java similarity index 90% rename from webmagic-core/src/main/java/us/codecraft/webmagic/schedular/Schedular.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/schedular/Scheduler.java index 8df7760..7e02132 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/Schedular.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/Scheduler.java @@ -8,7 +8,7 @@ import us.codecraft.webmagic.Task; * Date: 13-4-21 * Time: 下午1:12 */ -public interface Schedular { +public interface Scheduler { public void push(Request request,Task task); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java index 3cc84f7..0b36372 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java @@ -5,8 +5,8 @@ import java.util.List; /** * @author code4crafter@gmail.com
- * Date: 13-4-21 - * Time: 上午7:54 + * Date: 13-4-21 + * Time: 上午7:54 */ public class Html extends PlainText { @@ -18,12 +18,16 @@ public class Html extends PlainText { super(text); } + public static Html create(String text) { + return new Html(text); + } + @Override protected Selectable select(Selector selector, List strings) { List results = new ArrayList(); for (String string : strings) { String result = selector.select(string); - if (result!=null){ + if (result != null) { results.add(result); } } @@ -43,13 +47,13 @@ public class Html extends PlainText { @Override public Selectable smartContent() { SmartContentSelector smartContentSelector = SelectorFactory.getInstatnce().newSmartContentSelector(); - return select(smartContentSelector,strings); + return select(smartContentSelector, strings); } @Override public Selectable links() { XpathSelector xpathSelector = SelectorFactory.getInstatnce().newXpathSelector("//a/@href"); - return selectList(xpathSelector,strings); + return selectList(xpathSelector, strings); } @Override diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java index 935abab..cedee63 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java @@ -24,6 +24,10 @@ public class PlainText implements Selectable { this.strings = results; } + public static PlainText create(String text) { + return new PlainText(text); + } + @Override public Selectable xpath(String xpath) { throw new UnsupportedOperationException(); diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java index 7f00e17..b2bcca2 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java +++ 
b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java @@ -15,7 +15,7 @@ public class HttpClientDownloaderTest { @Test public void testCookie() { - Site site = Site.me().setDomain("www.diandian.com").setCookie("t", "yct7q7e6v319wpg4cpxqduu5m77lcgix"); + Site site = Site.me().setDomain("www.diandian.com").addCookie("t", "yct7q7e6v319wpg4cpxqduu5m77lcgix"); HttpClientDownloader httpClientDownloader = new HttpClientDownloader(); Page download = httpClientDownloader.download(new Request("http://www.diandian.com"), site); Assert.assertTrue(download.getHtml().toString().contains("flashsword30")); diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingProcessor.java index 7a21188..c7233e8 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingProcessor.java @@ -33,6 +33,6 @@ public class DianpingProcessor implements PageProcessor { public static void main(String[] args) { DianpingProcessor dianpingProcessor = new DianpingProcessor(); - Spider.me().processor(dianpingProcessor).startUrl("http://www.dianping.com/shanghai/food").run(); + Spider.create(dianpingProcessor).startUrl("http://www.dianping.com/shanghai/food").run(); } } diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java index 681aac7..39018d9 100644 --- a/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java +++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java @@ -5,7 +5,7 @@ import org.junit.Test; import us.codecraft.webmagic.pipeline.FilePipeline; import us.codecraft.webmagic.processor.SimplePageProcessor; import us.codecraft.webmagic.samples.HuxiuProcessor; -import 
us.codecraft.webmagic.schedular.FileCacheQueueSchedular; +import us.codecraft.webmagic.schedular.FileCacheQueueScheduler; /** * @author code4crafter@gmail.com
@@ -18,7 +18,7 @@ public class SpiderTest { @Ignore @Test public void testSpider() throws InterruptedException { - Spider me = Spider.me().pipeline(new FilePipeline()).processor(new HuxiuProcessor()); + Spider me = Spider.create(new HuxiuProcessor()).pipeline(new FilePipeline()); me.run(); } @@ -26,13 +26,13 @@ public class SpiderTest { @Test public void testGlobalSpider(){ // PageProcessor pageProcessor = new MeicanProcessor(); -// Spider.me().pipeline(new FilePipeline()).schedular(new FileCacheQueueSchedular(pageProcessor.getSite(),"/data/temp/webmagic/cache/")). +// Spider.me().pipeline(new FilePipeline()).schedular(new FileCacheQueueScheduler(pageProcessor.getSite(),"/data/temp/webmagic/cache/")). // processor(pageProcessor).run(); SimplePageProcessor pageProcessor2 = new SimplePageProcessor("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space", "http://www.diaoyuweng.com/thread-*-1-1.html"); System.out.println(pageProcessor2.getSite().getEncoding()); pageProcessor2.getSite().setSleepTime(500); - Spider.me().pipeline(new FilePipeline()).schedular(new FileCacheQueueSchedular("/data/temp/webmagic/cache/")). - processor(pageProcessor2).run(); + Spider.create(pageProcessor2).pipeline(new FilePipeline()).schedular(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")). 
+ run(); } diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiandianProcessorTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiandianProcessorTest.java index b87815c..00491d9 100644 --- a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiandianProcessorTest.java +++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiandianProcessorTest.java @@ -6,7 +6,7 @@ import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.pipeline.ConsolePipeline; import us.codecraft.webmagic.pipeline.FreemarkerPipeline; import us.codecraft.webmagic.samples.DiandianBlogProcessor; -import us.codecraft.webmagic.schedular.FileCacheQueueSchedular; +import us.codecraft.webmagic.schedular.FileCacheQueueScheduler; import java.io.IOException; @@ -30,7 +30,7 @@ public class DiandianProcessorTest { //ConsolePipeline输出结果到控制台 //FileCacheQueueSchedular保存url,支持断点续传,临时文件输出到/data/temp/webmagic/cache目录 //Spider.run()执行 - Spider.me().pipeline(new ConsolePipeline()).pipeline(pipeline).schedular(new FileCacheQueueSchedular("/data/temp/webmagic/cache/")). - processor(diaoyuwengProcessor).run(); + Spider.create(diaoyuwengProcessor).pipeline(new ConsolePipeline()).pipeline(pipeline).schedular(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")). 
+ run(); } } diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java index 2b2caac..a189126 100644 --- a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java +++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java @@ -6,14 +6,14 @@ import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.pipeline.FilePipeline; import us.codecraft.webmagic.pipeline.FreemarkerPipeline; import us.codecraft.webmagic.samples.DiaoyuwengProcessor; -import us.codecraft.webmagic.schedular.FileCacheQueueSchedular; +import us.codecraft.webmagic.schedular.FileCacheQueueScheduler; import java.io.IOException; /** * @author code4crafter@gmail.com
- * Date: 13-6-9 - * Time: 上午8:02 + * Date: 13-6-9 + * Time: 上午8:02 */ public class DiaoyuwengProcessorTest { @@ -22,7 +22,7 @@ public class DiaoyuwengProcessorTest { public void test() throws IOException { DiaoyuwengProcessor diaoyuwengProcessor = new DiaoyuwengProcessor(); FreemarkerPipeline pipeline = new FreemarkerPipeline("wordpress.ftl"); - Spider.me().pipeline(new FilePipeline()).pipeline(pipeline).schedular(new FileCacheQueueSchedular("/data/temp/webmagic/cache/")). - processor(diaoyuwengProcessor).run(); + Spider.create(diaoyuwengProcessor).pipeline(new FilePipeline()).pipeline(pipeline).schedular(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")). + run(); } } diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/SinablogProcessorTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/SinablogProcessorTest.java index 9613c9e..4a26383 100644 --- a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/SinablogProcessorTest.java +++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/SinablogProcessorTest.java @@ -6,14 +6,14 @@ import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.pipeline.FilePipeline; import us.codecraft.webmagic.pipeline.FreemarkerPipeline; import us.codecraft.webmagic.samples.SinaBlogProcesser; -import us.codecraft.webmagic.schedular.FileCacheQueueSchedular; +import us.codecraft.webmagic.schedular.FileCacheQueueScheduler; import java.io.IOException; /** * @author code4crafter@gmail.com
- * Date: 13-6-9 - * Time: 上午8:02 + * Date: 13-6-9 + * Time: 上午8:02 */ public class SinablogProcessorTest { @@ -30,7 +30,7 @@ public class SinablogProcessorTest { //ConsolePipeline输出结果到控制台 //FileCacheQueueSchedular保存url,支持断点续传,临时文件输出到/data/temp/webmagic/cache目录 //Spider.run()执行 - Spider.me().pipeline(new FilePipeline()).pipeline(pipeline).schedular(new FileCacheQueueSchedular("/data/temp/webmagic/cache/")). - processor(sinaBlogProcesser).run(); + Spider.create(sinaBlogProcesser).pipeline(new FilePipeline()).pipeline(pipeline).schedular(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")). + run(); } }