diff --git a/README.md b/README.md
index e5dc333..421443f 100644
--- a/README.md
+++ b/README.md
@@ -90,8 +90,8 @@ webmagic遵循[Apache 2.0协议](http://opensource.org/licenses/Apache-2.0)
webmagic的架构和设计参考了以下两个项目,感谢以下两个项目的作者:
-python爬虫 **scrapy**[https://github.com/scrapy/scrapy](https://github.com/scrapy/scrapy)
+python爬虫 **scrapy** [https://github.com/scrapy/scrapy](https://github.com/scrapy/scrapy)
-Java爬虫 **Spiderman**[https://gitcafe.com/laiweiwei/Spiderman](https://gitcafe.com/laiweiwei/Spiderman)
+Java爬虫 **Spiderman** [https://gitcafe.com/laiweiwei/Spiderman](https://gitcafe.com/laiweiwei/Spiderman)
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
index f7f560c..6464d61 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
@@ -57,10 +57,6 @@ public class Spider implements Runnable, Task {
return this;
}
- public Thread thread() {
- return new Thread(this);
- }
-
public Spider schedular(Schedular schedular) {
this.schedular = schedular;
return this;
@@ -74,7 +70,7 @@ public class Spider implements Runnable, Task {
@Override
public void run() {
- for (String startUrl : pageProcessor.getSite().getStartUrls()) {
+ for (String startUrl : startUrls) {
schedular.push(new Request(startUrl), this);
}
Request request = schedular.poll(this);
diff --git a/webmagic-plugin/src/main/java/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java b/webmagic-plugin/src/main/java/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java
index 1121971..218276d 100644
--- a/webmagic-plugin/src/main/java/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java
+++ b/webmagic-plugin/src/main/java/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java
@@ -30,7 +30,7 @@ public class FreemarkerPipeline implements Pipeline {
configuration.setDirectoryForTemplateLoading(new File(this.getClass().getClassLoader().getResource("ftl/").getFile()));
this.template = configuration.getTemplate(template);
this.path = path;
- File file = new File(path);
+ new File(path);
}
public FreemarkerPipeline(String template) throws IOException {
diff --git a/webmagic-plugin/src/test/java/us/codecraft/webmagic/FreemarkerPipelineTest.java b/webmagic-plugin/src/test/java/us/codecraft/webmagic/FreemarkerPipelineTest.java
index 610edf5..9e6b995 100644
--- a/webmagic-plugin/src/test/java/us/codecraft/webmagic/FreemarkerPipelineTest.java
+++ b/webmagic-plugin/src/test/java/us/codecraft/webmagic/FreemarkerPipelineTest.java
@@ -13,7 +13,7 @@ import java.io.IOException;
public class FreemarkerPipelineTest {
@Test
- public void test() throws IOException {
+ public void testTemplateLoad() throws IOException {
FreemarkerPipeline freemarkerPipeline = new FreemarkerPipeline("wordpress.ftl");
}
}
diff --git a/webmagic-plugin/src/test/resources/ftl/wordpress.ftl b/webmagic-plugin/src/test/resources/ftl/wordpress.ftl
deleted file mode 100644
index 61820b7..0000000
--- a/webmagic-plugin/src/test/resources/ftl/wordpress.ftl
+++ /dev/null
@@ -1,23 +0,0 @@
--
- $it.Title
- http://127.0.0.1/wordpress/?p=$it.Id
- ${date}
- admin
- http://127.0.0.1/wordpress/?p=$it.Id
-
-
-
- <#--$it.Id-->
- ${date}
- ${date}
- open
- open
- ${title}
- publish
- 0
- 0
- post
-
- 0
- $tags
-
\ No newline at end of file
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingBlogProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingProcessor.java
similarity index 61%
rename from webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingBlogProcessor.java
rename to webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingProcessor.java
index fafb7de..63aa0f0 100644
--- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingBlogProcessor.java
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingProcessor.java
@@ -2,6 +2,7 @@ package us.codecraft.webmagic.samples;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Page;
+import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import java.util.List;
@@ -11,15 +12,14 @@ import java.util.List;
* Date: 13-4-21
* Time: 下午8:08
*/
-public class DianpingBlogProcessor implements PageProcessor {
+public class DianpingProcessor implements PageProcessor {
@Override
public void process(Page page) {
- //http://progressdaily.diandian.com/post/2013-01-24/40046867275
- List requests = page.getHtml().rs("]*href=[\"']{1}(/shop/.*?)[\"']{1}").toStrings();
+ List requests = page.getHtml().as().rs(".*shop.*").toStrings();
page.addTargetRequests(requests);
- requests = page.getHtml().rs("]*href=[\"']{1}(/search/category/.*?)[\"']{1}").toStrings();
+ requests = page.getHtml().rs(".*search/category/.*").toStrings();
page.addTargetRequests(requests);
- if (page.getUrl().toString().contains("shop")){
+ if (page.getUrl().toString().contains("shop")) {
page.putField("title", page.getHtml().x("//h1[@class='shop-title']"));
page.putField("content", page.getHtml().sc());
}
@@ -30,4 +30,9 @@ public class DianpingBlogProcessor implements PageProcessor {
return Site.me().setDomain("www.dianping.com").addStartUrl("http://www.dianping.com/").
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
}
+
+ public static void main(String[] args) {
+ DianpingProcessor dianpingProcessor = new DianpingProcessor();
+ Spider.me().processor(dianpingProcessor).startUrl("http://www.dianping.com/shanghai/food").run();
+ }
}