diff --git a/README.md b/README.md index e5dc333..421443f 100644 --- a/README.md +++ b/README.md @@ -90,8 +90,8 @@ webmagic遵循[Apache 2.0协议](http://opensource.org/licenses/Apache-2.0) webmagic的架构和设计参考了以下两个项目,感谢以下两个项目的作者: -python爬虫 **scrapy**[https://github.com/scrapy/scrapy](https://github.com/scrapy/scrapy) +python爬虫 **scrapy** [https://github.com/scrapy/scrapy](https://github.com/scrapy/scrapy) -Java爬虫 **Spiderman**[https://gitcafe.com/laiweiwei/Spiderman](https://gitcafe.com/laiweiwei/Spiderman) +Java爬虫 **Spiderman** [https://gitcafe.com/laiweiwei/Spiderman](https://gitcafe.com/laiweiwei/Spiderman) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index f7f560c..6464d61 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -57,10 +57,6 @@ public class Spider implements Runnable, Task { return this; } - public Thread thread() { - return new Thread(this); - } - public Spider schedular(Schedular schedular) { this.schedular = schedular; return this; @@ -74,7 +70,7 @@ public class Spider implements Runnable, Task { @Override public void run() { - for (String startUrl : pageProcessor.getSite().getStartUrls()) { + for (String startUrl : startUrls) { schedular.push(new Request(startUrl), this); } Request request = schedular.poll(this); diff --git a/webmagic-plugin/src/main/java/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java b/webmagic-plugin/src/main/java/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java index 1121971..218276d 100644 --- a/webmagic-plugin/src/main/java/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java +++ b/webmagic-plugin/src/main/java/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java @@ -30,7 +30,7 @@ public class FreemarkerPipeline implements Pipeline { configuration.setDirectoryForTemplateLoading(new File(this.getClass().getClassLoader().getResource("ftl/").getFile())); this.template = configuration.getTemplate(template); this.path = path; - File file = new File(path); + new File(path); } public FreemarkerPipeline(String template) throws IOException { diff --git a/webmagic-plugin/src/test/java/us/codecraft/webmagic/FreemarkerPipelineTest.java b/webmagic-plugin/src/test/java/us/codecraft/webmagic/FreemarkerPipelineTest.java index 610edf5..9e6b995 100644 --- a/webmagic-plugin/src/test/java/us/codecraft/webmagic/FreemarkerPipelineTest.java +++ b/webmagic-plugin/src/test/java/us/codecraft/webmagic/FreemarkerPipelineTest.java @@ -13,7 +13,7 @@ import java.io.IOException; public class FreemarkerPipelineTest { @Test - public void test() throws IOException { + public void testTemplateLoad() throws IOException { FreemarkerPipeline freemarkerPipeline = new FreemarkerPipeline("wordpress.ftl"); } } diff --git a/webmagic-plugin/src/test/resources/ftl/wordpress.ftl b/webmagic-plugin/src/test/resources/ftl/wordpress.ftl deleted file mode 100644 index 61820b7..0000000 --- a/webmagic-plugin/src/test/resources/ftl/wordpress.ftl +++ /dev/null @@ -1,23 +0,0 @@ - - $it.Title - http://127.0.0.1/wordpress/?p=$it.Id - ${date} - admin - http://127.0.0.1/wordpress/?p=$it.Id - - - - <#--$it.Id--> - ${date} - ${date} - open - open - ${title} - publish - 0 - 0 - post - - 0 - $tags - \ No newline at end of file diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingBlogProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingProcessor.java similarity index 61% rename from webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingBlogProcessor.java rename to webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingProcessor.java index fafb7de..63aa0f0 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingBlogProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingProcessor.java @@ -2,6 +2,7 @@ package us.codecraft.webmagic.samples; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; import java.util.List; @@ -11,15 +12,14 @@ import java.util.List; * Date: 13-4-21 * Time: 下午8:08 */ -public class DianpingBlogProcessor implements PageProcessor { +public class DianpingProcessor implements PageProcessor { @Override public void process(Page page) { - //http://progressdaily.diandian.com/post/2013-01-24/40046867275 - List requests = page.getHtml().rs("]*href=[\"']{1}(/shop/.*?)[\"']{1}").toStrings(); + List requests = page.getHtml().as().rs(".*shop.*").toStrings(); page.addTargetRequests(requests); - requests = page.getHtml().rs("]*href=[\"']{1}(/search/category/.*?)[\"']{1}").toStrings(); + requests = page.getHtml().rs(".*search/category/.*").toStrings(); page.addTargetRequests(requests); - if (page.getUrl().toString().contains("shop")){ + if (page.getUrl().toString().contains("shop")) { page.putField("title", page.getHtml().x("//h1[@class='shop-title']")); page.putField("content", page.getHtml().sc()); } @@ -30,4 +30,9 @@ public class DianpingBlogProcessor implements PageProcessor { return Site.me().setDomain("www.dianping.com").addStartUrl("http://www.dianping.com/"). setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); } + + public static void main(String[] args) { + DianpingProcessor dianpingProcessor = new DianpingProcessor(); + Spider.me().processor(dianpingProcessor).startUrl("http://www.dianping.com/shanghai/food").run(); + } }