update pipeline
parent
755b9aa84e
commit
ecb61d1385
|
@ -10,6 +10,9 @@ import us.codecraft.webmagic.processor.PageProcessor;
|
||||||
import us.codecraft.webmagic.schedular.QueueSchedular;
|
import us.codecraft.webmagic.schedular.QueueSchedular;
|
||||||
import us.codecraft.webmagic.schedular.Schedular;
|
import us.codecraft.webmagic.schedular.Schedular;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* User: cairne
|
* User: cairne
|
||||||
* Date: 13-4-21
|
* Date: 13-4-21
|
||||||
|
@ -19,7 +22,7 @@ public class Spider implements Runnable {
|
||||||
|
|
||||||
private Downloader downloader = new HttpClientDownloader();
|
private Downloader downloader = new HttpClientDownloader();
|
||||||
|
|
||||||
private Pipeline pipeline = new ConsolePipeline();
|
private List<Pipeline> pipelines = new ArrayList<Pipeline>();
|
||||||
|
|
||||||
private PageProcessor pageProcessor;
|
private PageProcessor pageProcessor;
|
||||||
|
|
||||||
|
@ -47,7 +50,7 @@ public class Spider implements Runnable {
|
||||||
}
|
}
|
||||||
|
|
||||||
public Spider pipeline(Pipeline pipeline) {
|
public Spider pipeline(Pipeline pipeline) {
|
||||||
this.pipeline = pipeline;
|
this.pipelines.add(pipeline);
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -56,6 +59,9 @@ public class Spider implements Runnable {
|
||||||
public void run() {
|
public void run() {
|
||||||
Site site = pageProcessor.getSite();
|
Site site = pageProcessor.getSite();
|
||||||
Request request = schedular.poll(site);
|
Request request = schedular.poll(site);
|
||||||
|
if (pipelines.isEmpty()){
|
||||||
|
pipelines.add(new ConsolePipeline());
|
||||||
|
}
|
||||||
while (request != null) {
|
while (request != null) {
|
||||||
Page page = downloader.download(request,site);
|
Page page = downloader.download(request,site);
|
||||||
if (page == null) {
|
if (page == null) {
|
||||||
|
@ -64,7 +70,9 @@ public class Spider implements Runnable {
|
||||||
}
|
}
|
||||||
pageProcessor.process(page);
|
pageProcessor.process(page);
|
||||||
addRequest(page);
|
addRequest(page);
|
||||||
pipeline.process(page,site);
|
for (Pipeline pipeline : pipelines) {
|
||||||
|
pipeline.process(page,site);
|
||||||
|
}
|
||||||
sleep(site.getSleepTime());
|
sleep(site.getSleepTime());
|
||||||
request = schedular.poll(site);
|
request = schedular.poll(site);
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,10 +1,10 @@
|
||||||
package us.codecraft.webmagic.pipeline;
|
package us.codecraft.webmagic.pipeline;
|
||||||
|
|
||||||
import org.apache.commons.codec.digest.DigestUtils;
|
import org.apache.commons.codec.digest.DigestUtils;
|
||||||
import org.apache.commons.lang3.StringUtils;
|
|
||||||
import us.codecraft.webmagic.Page;
|
import us.codecraft.webmagic.Page;
|
||||||
import us.codecraft.webmagic.Site;
|
import us.codecraft.webmagic.Site;
|
||||||
import us.codecraft.webmagic.selector.Selectable;
|
import us.codecraft.webmagic.selector.Selectable;
|
||||||
|
import us.codecraft.webmagic.utils.UrlUtils;
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.FileWriter;
|
import java.io.FileWriter;
|
||||||
|
@ -21,7 +21,7 @@ public class FilePipeline implements Pipeline {
|
||||||
|
|
||||||
private String path = "/data/temp/webmagic/";
|
private String path = "/data/temp/webmagic/";
|
||||||
|
|
||||||
public FilePipeline(){
|
public FilePipeline() {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -30,11 +30,9 @@ public class FilePipeline implements Pipeline {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void process(Page page,Site site) {
|
public void process(Page page, Site site) {
|
||||||
String domain = site.getDomain();
|
String domain = site.getDomain();
|
||||||
domain = StringUtils.removeStart(domain, "http://");
|
domain = UrlUtils.getDomain(domain);
|
||||||
domain = StringUtils.removeStart(domain, "https://");
|
|
||||||
domain = StringUtils.replace(domain, "/", "");
|
|
||||||
String path = this.path + "" + domain + "/";
|
String path = this.path + "" + domain + "/";
|
||||||
File file = new File(path);
|
File file = new File(path);
|
||||||
if (!file.exists()) {
|
if (!file.exists()) {
|
||||||
|
|
|
@ -23,6 +23,9 @@ public class RegexSelector implements Selector {
|
||||||
if (StringUtils.isBlank(regexStr)){
|
if (StringUtils.isBlank(regexStr)){
|
||||||
throw new IllegalArgumentException("regex must not be empty");
|
throw new IllegalArgumentException("regex must not be empty");
|
||||||
}
|
}
|
||||||
|
if (!StringUtils.contains(regexStr,"(")&&!StringUtils.contains(regexStr,")")){
|
||||||
|
regexStr="("+regexStr+")";
|
||||||
|
}
|
||||||
if (!StringUtils.contains(regexStr,"(")||!StringUtils.contains(regexStr,")")){
|
if (!StringUtils.contains(regexStr,"(")||!StringUtils.contains(regexStr,")")){
|
||||||
throw new IllegalArgumentException("regex must have capture group 1");
|
throw new IllegalArgumentException("regex must have capture group 1");
|
||||||
}
|
}
|
||||||
|
|
|
@ -20,6 +20,11 @@
|
||||||
<version>4.7</version>
|
<version>4.7</version>
|
||||||
<scope>test</scope>
|
<scope>test</scope>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.freemarker</groupId>
|
||||||
|
<artifactId>freemarker</artifactId>
|
||||||
|
<version>2.3.19</version>
|
||||||
|
</dependency>
|
||||||
</dependencies>
|
</dependencies>
|
||||||
|
|
||||||
<build>
|
<build>
|
||||||
|
|
|
@ -0,0 +1,57 @@
|
||||||
|
package us.codecraft.webmagic.pipeline;
|
||||||
|
|
||||||
|
import freemarker.template.Configuration;
|
||||||
|
import freemarker.template.Template;
|
||||||
|
import freemarker.template.TemplateException;
|
||||||
|
import org.apache.commons.codec.digest.DigestUtils;
|
||||||
|
import us.codecraft.webmagic.Page;
|
||||||
|
import us.codecraft.webmagic.Site;
|
||||||
|
import us.codecraft.webmagic.utils.UrlUtils;
|
||||||
|
|
||||||
|
import java.io.*;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* User: cairne
|
||||||
|
* Date: 13-6-8
|
||||||
|
* Time: 下午9:00
|
||||||
|
*/
|
||||||
|
public class FreemarkerPipeline implements Pipeline {
|
||||||
|
|
||||||
|
private Configuration configuration;
|
||||||
|
|
||||||
|
private Template template;
|
||||||
|
|
||||||
|
private String path = "/data/temp/webmagic/ftl/";
|
||||||
|
|
||||||
|
public FreemarkerPipeline(String template, String path) throws IOException {
|
||||||
|
configuration = new Configuration();
|
||||||
|
configuration.setDirectoryForTemplateLoading(new File(this.getClass().getClassLoader().getResource("ftl/").getFile()));
|
||||||
|
this.template = configuration.getTemplate(template);
|
||||||
|
this.path = path;
|
||||||
|
File file = new File(path);
|
||||||
|
if (!file.exists()) {
|
||||||
|
file.mkdir();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public FreemarkerPipeline(String template) throws IOException {
|
||||||
|
this(template, "/data/temp/webmagic/ftl/");
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void process(Page page, Site site) {
|
||||||
|
String domain = site.getDomain();
|
||||||
|
domain = UrlUtils.getDomain(domain);
|
||||||
|
String path = this.path + "" + domain + "/";
|
||||||
|
try {
|
||||||
|
PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(page.getUrl().toString()) + ".html"));
|
||||||
|
template.process(page.getFields(), printWriter);
|
||||||
|
printWriter.close();
|
||||||
|
} catch (TemplateException e) {
|
||||||
|
e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates.
|
||||||
|
} catch (IOException e) {
|
||||||
|
e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates.
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,23 @@
|
||||||
|
<item>
|
||||||
|
<title>$it.Title</title>
|
||||||
|
<link>http://127.0.0.1/wordpress/?p=$it.Id</link>
|
||||||
|
<pubDate>${date}</pubDate>
|
||||||
|
<dc:creator>admin</dc:creator>
|
||||||
|
<guid isPermaLink="false">http://127.0.0.1/wordpress/?p=$it.Id</guid>
|
||||||
|
<description></description>
|
||||||
|
<content:encoded><![CDATA[${text}]]></content:encoded>
|
||||||
|
<excerpt:encoded><![CDATA[]]></excerpt:encoded>
|
||||||
|
<#--<wp:post_id>$it.Id</wp:post_id>-->
|
||||||
|
<wp:post_date>${date}</wp:post_date>
|
||||||
|
<wp:post_date_gmt>${date}</wp:post_date_gmt>
|
||||||
|
<wp:comment_status>open</wp:comment_status>
|
||||||
|
<wp:ping_status>open</wp:ping_status>
|
||||||
|
<wp:post_name>${title}</wp:post_name>
|
||||||
|
<wp:status>publish</wp:status>
|
||||||
|
<wp:post_parent>0</wp:post_parent>
|
||||||
|
<wp:menu_order>0</wp:menu_order>
|
||||||
|
<wp:post_type>post</wp:post_type>
|
||||||
|
<wp:post_password></wp:post_password>
|
||||||
|
<wp:is_sticky>0</wp:is_sticky>
|
||||||
|
$tags
|
||||||
|
</item>
|
|
@ -0,0 +1,19 @@
|
||||||
|
package us.codecraft.webmagic;
|
||||||
|
|
||||||
|
import org.junit.Test;
|
||||||
|
import us.codecraft.webmagic.pipeline.FreemarkerPipeline;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* User: cairne
|
||||||
|
* Date: 13-6-9
|
||||||
|
* Time: 上午7:14
|
||||||
|
*/
|
||||||
|
public class FreemarkerPipelineTest {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void test() throws IOException {
|
||||||
|
FreemarkerPipeline freemarkerPipeline = new FreemarkerPipeline("wordpress.ftl");
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,23 @@
|
||||||
|
<item>
|
||||||
|
<title>$it.Title</title>
|
||||||
|
<link>http://127.0.0.1/wordpress/?p=$it.Id</link>
|
||||||
|
<pubDate>${date}</pubDate>
|
||||||
|
<dc:creator>admin</dc:creator>
|
||||||
|
<guid isPermaLink="false">http://127.0.0.1/wordpress/?p=$it.Id</guid>
|
||||||
|
<description></description>
|
||||||
|
<content:encoded><![CDATA[${text}]]></content:encoded>
|
||||||
|
<excerpt:encoded><![CDATA[]]></excerpt:encoded>
|
||||||
|
<#--<wp:post_id>$it.Id</wp:post_id>-->
|
||||||
|
<wp:post_date>${date}</wp:post_date>
|
||||||
|
<wp:post_date_gmt>${date}</wp:post_date_gmt>
|
||||||
|
<wp:comment_status>open</wp:comment_status>
|
||||||
|
<wp:ping_status>open</wp:ping_status>
|
||||||
|
<wp:post_name>${title}</wp:post_name>
|
||||||
|
<wp:status>publish</wp:status>
|
||||||
|
<wp:post_parent>0</wp:post_parent>
|
||||||
|
<wp:menu_order>0</wp:menu_order>
|
||||||
|
<wp:post_type>post</wp:post_type>
|
||||||
|
<wp:post_password></wp:post_password>
|
||||||
|
<wp:is_sticky>0</wp:is_sticky>
|
||||||
|
$tags
|
||||||
|
</item>
|
|
@ -15,6 +15,11 @@
|
||||||
<artifactId>webmagic-core</artifactId>
|
<artifactId>webmagic-core</artifactId>
|
||||||
<version>0.0.1-SNAPSHOT</version>
|
<version>0.0.1-SNAPSHOT</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>us.codecraft</groupId>
|
||||||
|
<artifactId>webmagic-plugin</artifactId>
|
||||||
|
<version>0.0.1-SNAPSHOT</version>
|
||||||
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>junit</groupId>
|
<groupId>junit</groupId>
|
||||||
<artifactId>junit</artifactId>
|
<artifactId>junit</artifactId>
|
||||||
|
|
|
@ -3,6 +3,7 @@ package us.codecraft.webmagic.samples;
|
||||||
import us.codecraft.webmagic.Page;
|
import us.codecraft.webmagic.Page;
|
||||||
import us.codecraft.webmagic.Site;
|
import us.codecraft.webmagic.Site;
|
||||||
import us.codecraft.webmagic.processor.PageProcessor;
|
import us.codecraft.webmagic.processor.PageProcessor;
|
||||||
|
import us.codecraft.webmagic.selector.PlainText;
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
|
@ -14,20 +15,21 @@ import java.util.List;
|
||||||
public class DiaoyuwengProcessor implements PageProcessor {
|
public class DiaoyuwengProcessor implements PageProcessor {
|
||||||
@Override
|
@Override
|
||||||
public void process(Page page) {
|
public void process(Page page) {
|
||||||
//http://progressdaily.diandian.com/post/2013-01-24/40046867275
|
List<String> requests = page.getHtml().as().rs("(http://www\\.diaoyuweng\\.com/home\\.php\\?mod=space&uid=88304&do=thread&view=me&type=thread&order=dateline&from=space&page=\\d+)").toStrings();
|
||||||
List<String> requests = page.getHtml().rs("<a[^<>]*href=[\"']{1}(/shop/.*?)[\"']{1}").toStrings();
|
|
||||||
page.addTargetRequests(requests);
|
page.addTargetRequests(requests);
|
||||||
requests = page.getHtml().rs("<a[^<>]*href=[\"']{1}(/search/category/.*?)[\"']{1}").toStrings();
|
requests = page.getHtml().as().rs("(http://www\\.diaoyuweng\\.com/thread-\\d+-1-1.html)").toStrings();
|
||||||
page.addTargetRequests(requests);
|
page.addTargetRequests(requests);
|
||||||
if (page.getUrl().toString().contains("shop")){
|
if (page.getUrl().toString().contains("thread")){
|
||||||
page.putField("title", page.getHtml().x("//h1[@class='shop-title']"));
|
page.putField("title", page.getHtml().x("//a[@id='thread_subject']"));
|
||||||
page.putField("content", page.getHtml().sc());
|
page.putField("content", page.getHtml().x("//div[@class='pcb']//tbody"));
|
||||||
|
page.putField("date",page.getHtml().r("发表于 (\\d{4}-\\d+-\\d+ \\d+:\\d+:\\d+)"));
|
||||||
|
page.putField("id",new PlainText("1000"+page.getUrl().r("http://www\\.diaoyuweng\\.com/thread-(\\d+)-1-1.html").toString()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Site getSite() {
|
public Site getSite() {
|
||||||
return Site.me().setDomain("www.dianping.com").setStartUrl("http://www.dianping.com/").
|
return Site.me().setDomain("www.diaoyuweng.com").setStartUrl("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space").
|
||||||
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
|
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31").setEncoding("GBK").setSleepTime(500);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,26 @@
|
||||||
|
package us.codecraft.webmagic.processor;
|
||||||
|
|
||||||
|
import org.junit.Test;
|
||||||
|
import us.codecraft.webmagic.Spider;
|
||||||
|
import us.codecraft.webmagic.pipeline.FilePipeline;
|
||||||
|
import us.codecraft.webmagic.pipeline.FreemarkerPipeline;
|
||||||
|
import us.codecraft.webmagic.samples.DiaoyuwengProcessor;
|
||||||
|
import us.codecraft.webmagic.schedular.FileCacheQueueSchedular;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* User: cairne
|
||||||
|
* Date: 13-6-9
|
||||||
|
* Time: 上午8:02
|
||||||
|
*/
|
||||||
|
public class DiaoyuwengProcessorTest {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void test() throws IOException {
|
||||||
|
DiaoyuwengProcessor diaoyuwengProcessor = new DiaoyuwengProcessor();
|
||||||
|
FreemarkerPipeline pipeline = new FreemarkerPipeline("wordpress.ftl");
|
||||||
|
Spider.me().pipeline(new FilePipeline()).pipeline(pipeline).schedular(new FileCacheQueueSchedular(diaoyuwengProcessor.getSite(), "/data/temp/webmagic/cache/")).
|
||||||
|
processor(diaoyuwengProcessor).run();
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,22 @@
|
||||||
|
<item>
|
||||||
|
<title>${title}</title>
|
||||||
|
<link>http://127.0.0.1/wordpress/?p=${id}</link>
|
||||||
|
<pubDate>${date}</pubDate>
|
||||||
|
<dc:creator>admin</dc:creator>
|
||||||
|
<guid isPermaLink="false">http://127.0.0.1/wordpress/?p=${id}</guid>
|
||||||
|
<description></description>
|
||||||
|
<content:encoded><![CDATA[${content}]]></content:encoded>
|
||||||
|
<excerpt:encoded><![CDATA[]]></excerpt:encoded>
|
||||||
|
<#--<wp:post_id>$it.Id</wp:post_id>-->
|
||||||
|
<wp:post_date>${date}</wp:post_date>
|
||||||
|
<wp:post_date_gmt>${date}</wp:post_date_gmt>
|
||||||
|
<wp:comment_status>open</wp:comment_status>
|
||||||
|
<wp:ping_status>open</wp:ping_status>
|
||||||
|
<wp:post_name>${title}</wp:post_name>
|
||||||
|
<wp:status>publish</wp:status>
|
||||||
|
<wp:post_parent>0</wp:post_parent>
|
||||||
|
<wp:menu_order>0</wp:menu_order>
|
||||||
|
<wp:post_type>post</wp:post_type>
|
||||||
|
<wp:post_password></wp:post_password>
|
||||||
|
<wp:is_sticky>0</wp:is_sticky>
|
||||||
|
</item>
|
Loading…
Reference in New Issue