getStartUrls() {
return startUrls;
}
+ /**
+ * 增加初始页面的地址,可反复调用此方法增加多个初始地址。
+ * @param startUrl 初始页面的地址
+ * @return this
+ */
public Site addStartUrl(String startUrl) {
this.startUrls.add(startUrl);
return this;
}
- public int getSleepTime() {
- return sleepTime;
- }
-
+ /**
+ * 设置两次抓取之间的间隔,避免对目标站点压力过大(或者避免被防火墙屏蔽...)。
+ *
+ * @param sleepTime 单位毫秒
+ * @return this
+ */
public Site setSleepTime(int sleepTime) {
this.sleepTime = sleepTime;
return this;
}
+ /**
+ * 获取两次抓取之间的间隔
+ * @return 两次抓取之间的间隔,单位毫秒
+ */
+ public int getSleepTime() {
+ return sleepTime;
+ }
+
@Override
public boolean equals(Object o) {
if (this == o) return true;
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
index 67e9c94..8c662eb 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
@@ -7,13 +7,18 @@ import us.codecraft.webmagic.downloader.HttpClientDownloader;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.pipeline.Pipeline;
import us.codecraft.webmagic.processor.PageProcessor;
-import us.codecraft.webmagic.schedular.QueueSchedular;
-import us.codecraft.webmagic.schedular.Schedular;
+import us.codecraft.webmagic.schedular.QueueScheduler;
+import us.codecraft.webmagic.schedular.Scheduler;
import java.util.ArrayList;
import java.util.List;
/**
+ * webmagic爬虫的入口类。
+ * 示例:
+ * <pre>
+ * Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*")).run();
+ * </pre>
* @author code4crafter@gmail.com
* Date: 13-4-21
* Time: 上午6:53
@@ -32,18 +37,17 @@ public class Spider implements Runnable, Task {
private String uuid;
- private Schedular schedular = new QueueSchedular();
+ private Scheduler scheduler = new QueueScheduler();
private Logger logger = Logger.getLogger(getClass());
- public static Spider me() {
- return new Spider();
- }
-
- public Spider processor(PageProcessor pageProcessor) {
+ public Spider(PageProcessor pageProcessor){
this.pageProcessor = pageProcessor;
this.site = pageProcessor.getSite();
- return this;
+ }
+
+ public static Spider create(PageProcessor pageProcessor) {
+ return new Spider(pageProcessor);
}
public Spider startUrls(List startUrls) {
@@ -57,8 +61,13 @@ public class Spider implements Runnable, Task {
return this;
}
- public Spider schedular(Schedular schedular) {
- this.schedular = schedular;
+ public Spider setUUID(String uuid) {
+ this.uuid = uuid;
+ return this;
+ }
+
+ public Spider schedular(Scheduler scheduler) {
+ this.scheduler = scheduler;
return this;
}
@@ -71,9 +80,9 @@ public class Spider implements Runnable, Task {
@Override
public void run() {
for (String startUrl : startUrls) {
- schedular.push(new Request(startUrl), this);
+ scheduler.push(new Request(startUrl), this);
}
- Request request = schedular.poll(this);
+ Request request = scheduler.poll(this);
if (pipelines.isEmpty()) {
pipelines.add(new ConsolePipeline());
}
@@ -89,16 +98,10 @@ public class Spider implements Runnable, Task {
pipeline.process(page, this);
}
sleep(site.getSleepTime());
- request = schedular.poll(this);
+ request = scheduler.poll(this);
}
}
- public Spider setUUID(String uuid) {
- this.uuid = uuid;
- return this;
- }
-
-
private void sleep(int time) {
try {
Thread.sleep(time);
@@ -110,7 +113,7 @@ public class Spider implements Runnable, Task {
private void addRequest(Page page) {
if (CollectionUtils.isNotEmpty(page.getTargetRequests())) {
for (Request request : page.getTargetRequests()) {
- schedular.push(request, this);
+ scheduler.push(request, this);
}
}
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueSchedular.java b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueScheduler.java
similarity index 97%
rename from webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueSchedular.java
rename to webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueScheduler.java
index 0a93e52..246f7e0 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueSchedular.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueScheduler.java
@@ -20,7 +20,7 @@ import java.util.concurrent.atomic.AtomicInteger;
* Date: 13-4-21
* Time: 下午1:13
*/
-public class FileCacheQueueSchedular implements Schedular {
+public class FileCacheQueueScheduler implements Scheduler {
private Logger logger = Logger.getLogger(getClass());
@@ -44,7 +44,7 @@ public class FileCacheQueueSchedular implements Schedular {
private Set urls;
- public FileCacheQueueSchedular(String filePath) {
+ public FileCacheQueueScheduler(String filePath) {
this.filePath = filePath;
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/QueueSchedular.java b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/QueueScheduler.java
similarity index 94%
rename from webmagic-core/src/main/java/us/codecraft/webmagic/schedular/QueueSchedular.java
rename to webmagic-core/src/main/java/us/codecraft/webmagic/schedular/QueueScheduler.java
index 20576fc..6976885 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/QueueSchedular.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/QueueScheduler.java
@@ -14,7 +14,7 @@ import java.util.concurrent.LinkedBlockingQueue;
* Date: 13-4-21
* Time: 下午1:13
*/
-public class QueueSchedular implements Schedular {
+public class QueueScheduler implements Scheduler {
private Logger logger = Logger.getLogger(getClass());
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/Schedular.java b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/Scheduler.java
similarity index 90%
rename from webmagic-core/src/main/java/us/codecraft/webmagic/schedular/Schedular.java
rename to webmagic-core/src/main/java/us/codecraft/webmagic/schedular/Scheduler.java
index 8df7760..7e02132 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/Schedular.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/Scheduler.java
@@ -8,7 +8,7 @@ import us.codecraft.webmagic.Task;
* Date: 13-4-21
* Time: 下午1:12
*/
-public interface Schedular {
+public interface Scheduler {
public void push(Request request,Task task);
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java
index 3cc84f7..0b36372 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java
@@ -5,8 +5,8 @@ import java.util.List;
/**
* @author code4crafter@gmail.com
- * Date: 13-4-21
- * Time: 上午7:54
+ * Date: 13-4-21
+ * Time: 上午7:54
*/
public class Html extends PlainText {
@@ -18,12 +18,16 @@ public class Html extends PlainText {
super(text);
}
+ public static Html create(String text) {
+ return new Html(text);
+ }
+
@Override
protected Selectable select(Selector selector, List strings) {
List results = new ArrayList();
for (String string : strings) {
String result = selector.select(string);
- if (result!=null){
+ if (result != null) {
results.add(result);
}
}
@@ -43,13 +47,13 @@ public class Html extends PlainText {
@Override
public Selectable smartContent() {
SmartContentSelector smartContentSelector = SelectorFactory.getInstatnce().newSmartContentSelector();
- return select(smartContentSelector,strings);
+ return select(smartContentSelector, strings);
}
@Override
public Selectable links() {
XpathSelector xpathSelector = SelectorFactory.getInstatnce().newXpathSelector("//a/@href");
- return selectList(xpathSelector,strings);
+ return selectList(xpathSelector, strings);
}
@Override
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java
index 935abab..cedee63 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java
@@ -24,6 +24,10 @@ public class PlainText implements Selectable {
this.strings = results;
}
+ public static PlainText create(String text) {
+ return new PlainText(text);
+ }
+
@Override
public Selectable xpath(String xpath) {
throw new UnsupportedOperationException();
diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java
index 7f00e17..b2bcca2 100644
--- a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java
+++ b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java
@@ -15,7 +15,7 @@ public class HttpClientDownloaderTest {
@Test
public void testCookie() {
- Site site = Site.me().setDomain("www.diandian.com").setCookie("t", "yct7q7e6v319wpg4cpxqduu5m77lcgix");
+ Site site = Site.me().setDomain("www.diandian.com").addCookie("t", "yct7q7e6v319wpg4cpxqduu5m77lcgix");
HttpClientDownloader httpClientDownloader = new HttpClientDownloader();
Page download = httpClientDownloader.download(new Request("http://www.diandian.com"), site);
Assert.assertTrue(download.getHtml().toString().contains("flashsword30"));
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingProcessor.java
index 7a21188..c7233e8 100644
--- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingProcessor.java
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingProcessor.java
@@ -33,6 +33,6 @@ public class DianpingProcessor implements PageProcessor {
public static void main(String[] args) {
DianpingProcessor dianpingProcessor = new DianpingProcessor();
- Spider.me().processor(dianpingProcessor).startUrl("http://www.dianping.com/shanghai/food").run();
+ Spider.create(dianpingProcessor).startUrl("http://www.dianping.com/shanghai/food").run();
}
}
diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java
index 681aac7..39018d9 100644
--- a/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java
+++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java
@@ -5,7 +5,7 @@ import org.junit.Test;
import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.processor.SimplePageProcessor;
import us.codecraft.webmagic.samples.HuxiuProcessor;
-import us.codecraft.webmagic.schedular.FileCacheQueueSchedular;
+import us.codecraft.webmagic.schedular.FileCacheQueueScheduler;
/**
* @author code4crafter@gmail.com
@@ -18,7 +18,7 @@ public class SpiderTest {
@Ignore
@Test
public void testSpider() throws InterruptedException {
- Spider me = Spider.me().pipeline(new FilePipeline()).processor(new HuxiuProcessor());
+ Spider me = Spider.create(new HuxiuProcessor()).pipeline(new FilePipeline());
me.run();
}
@@ -26,13 +26,13 @@ public class SpiderTest {
@Test
public void testGlobalSpider(){
// PageProcessor pageProcessor = new MeicanProcessor();
-// Spider.me().pipeline(new FilePipeline()).schedular(new FileCacheQueueSchedular(pageProcessor.getSite(),"/data/temp/webmagic/cache/")).
+// Spider.me().pipeline(new FilePipeline()).schedular(new FileCacheQueueScheduler(pageProcessor.getSite(),"/data/temp/webmagic/cache/")).
// processor(pageProcessor).run();
SimplePageProcessor pageProcessor2 = new SimplePageProcessor("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space", "http://www.diaoyuweng.com/thread-*-1-1.html");
System.out.println(pageProcessor2.getSite().getEncoding());
pageProcessor2.getSite().setSleepTime(500);
- Spider.me().pipeline(new FilePipeline()).schedular(new FileCacheQueueSchedular("/data/temp/webmagic/cache/")).
- processor(pageProcessor2).run();
+ Spider.create(pageProcessor2).pipeline(new FilePipeline()).schedular(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).
+ run();
}
diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiandianProcessorTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiandianProcessorTest.java
index b87815c..00491d9 100644
--- a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiandianProcessorTest.java
+++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiandianProcessorTest.java
@@ -6,7 +6,7 @@ import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.pipeline.FreemarkerPipeline;
import us.codecraft.webmagic.samples.DiandianBlogProcessor;
-import us.codecraft.webmagic.schedular.FileCacheQueueSchedular;
+import us.codecraft.webmagic.schedular.FileCacheQueueScheduler;
import java.io.IOException;
@@ -30,7 +30,7 @@ public class DiandianProcessorTest {
//ConsolePipeline输出结果到控制台
- //FileCacheQueueSchedular保存url,支持断点续传,临时文件输出到/data/temp/webmagic/cache目录
+ //FileCacheQueueScheduler保存url,支持断点续传,临时文件输出到/data/temp/webmagic/cache目录
//Spider.run()执行
- Spider.me().pipeline(new ConsolePipeline()).pipeline(pipeline).schedular(new FileCacheQueueSchedular("/data/temp/webmagic/cache/")).
- processor(diaoyuwengProcessor).run();
+ Spider.create(diaoyuwengProcessor).pipeline(new ConsolePipeline()).pipeline(pipeline).schedular(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).
+ run();
}
}
diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java
index 2b2caac..a189126 100644
--- a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java
+++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java
@@ -6,14 +6,14 @@ import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.pipeline.FreemarkerPipeline;
import us.codecraft.webmagic.samples.DiaoyuwengProcessor;
-import us.codecraft.webmagic.schedular.FileCacheQueueSchedular;
+import us.codecraft.webmagic.schedular.FileCacheQueueScheduler;
import java.io.IOException;
/**
* @author code4crafter@gmail.com
- * Date: 13-6-9
- * Time: 上午8:02
+ * Date: 13-6-9
+ * Time: 上午8:02
*/
public class DiaoyuwengProcessorTest {
@@ -22,7 +22,7 @@ public class DiaoyuwengProcessorTest {
public void test() throws IOException {
DiaoyuwengProcessor diaoyuwengProcessor = new DiaoyuwengProcessor();
FreemarkerPipeline pipeline = new FreemarkerPipeline("wordpress.ftl");
- Spider.me().pipeline(new FilePipeline()).pipeline(pipeline).schedular(new FileCacheQueueSchedular("/data/temp/webmagic/cache/")).
- processor(diaoyuwengProcessor).run();
+ Spider.create(diaoyuwengProcessor).pipeline(new FilePipeline()).pipeline(pipeline).schedular(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).
+ run();
}
}
diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/SinablogProcessorTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/SinablogProcessorTest.java
index 9613c9e..4a26383 100644
--- a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/SinablogProcessorTest.java
+++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/SinablogProcessorTest.java
@@ -6,14 +6,14 @@ import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.pipeline.FreemarkerPipeline;
import us.codecraft.webmagic.samples.SinaBlogProcesser;
-import us.codecraft.webmagic.schedular.FileCacheQueueSchedular;
+import us.codecraft.webmagic.schedular.FileCacheQueueScheduler;
import java.io.IOException;
/**
* @author code4crafter@gmail.com
- * Date: 13-6-9
- * Time: 上午8:02
+ * Date: 13-6-9
+ * Time: 上午8:02
*/
public class SinablogProcessorTest {
@@ -30,7 +30,7 @@ public class SinablogProcessorTest {
//ConsolePipeline输出结果到控制台
- //FileCacheQueueSchedular保存url,支持断点续传,临时文件输出到/data/temp/webmagic/cache目录
+ //FileCacheQueueScheduler保存url,支持断点续传,临时文件输出到/data/temp/webmagic/cache目录
//Spider.run()执行
- Spider.me().pipeline(new FilePipeline()).pipeline(pipeline).schedular(new FileCacheQueueSchedular("/data/temp/webmagic/cache/")).
- processor(sinaBlogProcesser).run();
+ Spider.create(sinaBlogProcesser).pipeline(new FilePipeline()).pipeline(pipeline).schedular(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).
+ run();
}
}