fix samples
parent
312e1bce87
commit
8f954c7997
|
@ -90,8 +90,8 @@ webmagic遵循[Apache 2.0协议](http://opensource.org/licenses/Apache-2.0)
|
|||
|
||||
webmagic的架构和设计参考了以下两个项目,感谢以下两个项目的作者:
|
||||
|
||||
python爬虫 **scrapy**[https://github.com/scrapy/scrapy](https://github.com/scrapy/scrapy)
|
||||
python爬虫 **scrapy** [https://github.com/scrapy/scrapy](https://github.com/scrapy/scrapy)
|
||||
|
||||
Java爬虫 **Spiderman**[https://gitcafe.com/laiweiwei/Spiderman](https://gitcafe.com/laiweiwei/Spiderman)
|
||||
Java爬虫 **Spiderman** [https://gitcafe.com/laiweiwei/Spiderman](https://gitcafe.com/laiweiwei/Spiderman)
|
||||
|
||||
|
||||
|
|
|
@ -57,10 +57,6 @@ public class Spider implements Runnable, Task {
|
|||
return this;
|
||||
}
|
||||
|
||||
public Thread thread() {
|
||||
return new Thread(this);
|
||||
}
|
||||
|
||||
public Spider schedular(Schedular schedular) {
|
||||
this.schedular = schedular;
|
||||
return this;
|
||||
|
@ -74,7 +70,7 @@ public class Spider implements Runnable, Task {
|
|||
|
||||
@Override
|
||||
public void run() {
|
||||
for (String startUrl : pageProcessor.getSite().getStartUrls()) {
|
||||
for (String startUrl : startUrls) {
|
||||
schedular.push(new Request(startUrl), this);
|
||||
}
|
||||
Request request = schedular.poll(this);
|
||||
|
|
|
@ -30,7 +30,7 @@ public class FreemarkerPipeline implements Pipeline {
|
|||
configuration.setDirectoryForTemplateLoading(new File(this.getClass().getClassLoader().getResource("ftl/").getFile()));
|
||||
this.template = configuration.getTemplate(template);
|
||||
this.path = path;
|
||||
File file = new File(path);
|
||||
new File(path);
|
||||
}
|
||||
|
||||
public FreemarkerPipeline(String template) throws IOException {
|
||||
|
|
|
@ -13,7 +13,7 @@ import java.io.IOException;
|
|||
public class FreemarkerPipelineTest {
|
||||
|
||||
@Test
|
||||
public void test() throws IOException {
|
||||
public void testTemplateLoad() throws IOException {
|
||||
FreemarkerPipeline freemarkerPipeline = new FreemarkerPipeline("wordpress.ftl");
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,23 +0,0 @@
|
|||
<item>
|
||||
<title>$it.Title</title>
|
||||
<link>http://127.0.0.1/wordpress/?p=$it.Id</link>
|
||||
<pubDate>${date}</pubDate>
|
||||
<dc:creator>admin</dc:creator>
|
||||
<guid isPermaLink="false">http://127.0.0.1/wordpress/?p=$it.Id</guid>
|
||||
<description></description>
|
||||
<content:encoded><![CDATA[${text}]]></content:encoded>
|
||||
<excerpt:encoded><![CDATA[]]></excerpt:encoded>
|
||||
<#--<wp:post_id>$it.Id</wp:post_id>-->
|
||||
<wp:post_date>${date}</wp:post_date>
|
||||
<wp:post_date_gmt>${date}</wp:post_date_gmt>
|
||||
<wp:comment_status>open</wp:comment_status>
|
||||
<wp:ping_status>open</wp:ping_status>
|
||||
<wp:post_name>${title}</wp:post_name>
|
||||
<wp:status>publish</wp:status>
|
||||
<wp:post_parent>0</wp:post_parent>
|
||||
<wp:menu_order>0</wp:menu_order>
|
||||
<wp:post_type>post</wp:post_type>
|
||||
<wp:post_password></wp:post_password>
|
||||
<wp:is_sticky>0</wp:is_sticky>
|
||||
$tags
|
||||
</item>
|
|
@ -2,6 +2,7 @@ package us.codecraft.webmagic.samples;
|
|||
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.Spider;
|
||||
import us.codecraft.webmagic.processor.PageProcessor;
|
||||
|
||||
import java.util.List;
|
||||
|
@ -11,15 +12,14 @@ import java.util.List;
|
|||
* Date: 13-4-21
|
||||
* Time: 下午8:08
|
||||
*/
|
||||
public class DianpingBlogProcessor implements PageProcessor {
|
||||
public class DianpingProcessor implements PageProcessor {
|
||||
@Override
|
||||
public void process(Page page) {
|
||||
//http://progressdaily.diandian.com/post/2013-01-24/40046867275
|
||||
List<String> requests = page.getHtml().rs("<a[^<>]*href=[\"']{1}(/shop/.*?)[\"']{1}").toStrings();
|
||||
List<String> requests = page.getHtml().as().rs(".*shop.*").toStrings();
|
||||
page.addTargetRequests(requests);
|
||||
requests = page.getHtml().rs("<a[^<>]*href=[\"']{1}(/search/category/.*?)[\"']{1}").toStrings();
|
||||
requests = page.getHtml().rs(".*search/category/.*").toStrings();
|
||||
page.addTargetRequests(requests);
|
||||
if (page.getUrl().toString().contains("shop")){
|
||||
if (page.getUrl().toString().contains("shop")) {
|
||||
page.putField("title", page.getHtml().x("//h1[@class='shop-title']"));
|
||||
page.putField("content", page.getHtml().sc());
|
||||
}
|
||||
|
@ -30,4 +30,9 @@ public class DianpingBlogProcessor implements PageProcessor {
|
|||
return Site.me().setDomain("www.dianping.com").addStartUrl("http://www.dianping.com/").
|
||||
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
DianpingProcessor dianpingProcessor = new DianpingProcessor();
|
||||
Spider.me().processor(dianpingProcessor).startUrl("http://www.dianping.com/shanghai/food").run();
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue