diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 835bdf4..f3ec5f8 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -10,6 +10,9 @@ import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.schedular.QueueSchedular; import us.codecraft.webmagic.schedular.Schedular; +import java.util.ArrayList; +import java.util.List; + /** * User: cairne * Date: 13-4-21 @@ -19,7 +22,7 @@ public class Spider implements Runnable { private Downloader downloader = new HttpClientDownloader(); - private Pipeline pipeline = new ConsolePipeline(); + private List pipelines = new ArrayList(); private PageProcessor pageProcessor; @@ -47,7 +50,7 @@ public class Spider implements Runnable { } public Spider pipeline(Pipeline pipeline) { - this.pipeline = pipeline; + this.pipelines.add(pipeline); return this; } @@ -56,6 +59,9 @@ public class Spider implements Runnable { public void run() { Site site = pageProcessor.getSite(); Request request = schedular.poll(site); + if (pipelines.isEmpty()){ + pipelines.add(new ConsolePipeline()); + } while (request != null) { Page page = downloader.download(request,site); if (page == null) { @@ -64,7 +70,9 @@ public class Spider implements Runnable { } pageProcessor.process(page); addRequest(page); - pipeline.process(page,site); + for (Pipeline pipeline : pipelines) { + pipeline.process(page,site); + } sleep(site.getSleepTime()); request = schedular.poll(site); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java index 508b00e..0f81dba 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java @@ -1,10 +1,10 @@ package us.codecraft.webmagic.pipeline; import org.apache.commons.codec.digest.DigestUtils; -import org.apache.commons.lang3.StringUtils; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.selector.Selectable; +import us.codecraft.webmagic.utils.UrlUtils; import java.io.File; import java.io.FileWriter; @@ -21,7 +21,7 @@ public class FilePipeline implements Pipeline { private String path = "/data/temp/webmagic/"; - public FilePipeline(){ + public FilePipeline() { } @@ -30,11 +30,9 @@ public class FilePipeline implements Pipeline { } @Override - public void process(Page page,Site site) { + public void process(Page page, Site site) { String domain = site.getDomain(); - domain = StringUtils.removeStart(domain, "http://"); - domain = StringUtils.removeStart(domain, "https://"); - domain = StringUtils.replace(domain, "/", ""); + domain = UrlUtils.getDomain(domain); String path = this.path + "" + domain + "/"; File file = new File(path); if (!file.exists()) { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java index a2e8b3d..671cbe7 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java @@ -23,6 +23,9 @@ public class RegexSelector implements Selector { if (StringUtils.isBlank(regexStr)){ throw new IllegalArgumentException("regex must not be empty"); } + if (!StringUtils.contains(regexStr,"(")&&!StringUtils.contains(regexStr,")")){ + regexStr="("+regexStr+")"; + } if (!StringUtils.contains(regexStr,"(")||!StringUtils.contains(regexStr,")")){ throw new IllegalArgumentException("regex must have capture group 1"); } diff --git a/webmagic-plugin/pom.xml b/webmagic-plugin/pom.xml index 6a554bf..1128f7a 100644 --- a/webmagic-plugin/pom.xml +++ b/webmagic-plugin/pom.xml @@ -20,6 +20,11 @@ 4.7 test + + org.freemarker + freemarker + 2.3.19 + diff --git a/webmagic-plugin/src/main/java/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java b/webmagic-plugin/src/main/java/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java new file mode 100644 index 0000000..32fec16 --- /dev/null +++ b/webmagic-plugin/src/main/java/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java @@ -0,0 +1,57 @@ +package us.codecraft.webmagic.pipeline; + +import freemarker.template.Configuration; +import freemarker.template.Template; +import freemarker.template.TemplateException; +import org.apache.commons.codec.digest.DigestUtils; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.utils.UrlUtils; + +import java.io.*; + +/** + * User: cairne + * Date: 13-6-8 + * Time: 下午9:00 + */ +public class FreemarkerPipeline implements Pipeline { + + private Configuration configuration; + + private Template template; + + private String path = "/data/temp/webmagic/ftl/"; + + public FreemarkerPipeline(String template, String path) throws IOException { + configuration = new Configuration(); + configuration.setDirectoryForTemplateLoading(new File(this.getClass().getClassLoader().getResource("ftl/").getFile())); + this.template = configuration.getTemplate(template); + this.path = path; + File file = new File(path); + if (!file.exists()) { + file.mkdir(); + } + } + + public FreemarkerPipeline(String template) throws IOException { + this(template, "/data/temp/webmagic/ftl/"); + } + + + @Override + public void process(Page page, Site site) { + String domain = site.getDomain(); + domain = UrlUtils.getDomain(domain); + String path = this.path + "" + domain + "/"; + try { + PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(page.getUrl().toString()) + ".html")); + template.process(page.getFields(), printWriter); + printWriter.close(); + } catch (TemplateException e) { + e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates. + } catch (IOException e) { + e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates. + } + } +} diff --git a/webmagic-plugin/src/main/resources/ftl/wordpress.ftl b/webmagic-plugin/src/main/resources/ftl/wordpress.ftl new file mode 100644 index 0000000..61820b7 --- /dev/null +++ b/webmagic-plugin/src/main/resources/ftl/wordpress.ftl @@ -0,0 +1,23 @@ + + $it.Title + http://127.0.0.1/wordpress/?p=$it.Id + ${date} + admin + http://127.0.0.1/wordpress/?p=$it.Id + + + + <#--$it.Id--> + ${date} + ${date} + open + open + ${title} + publish + 0 + 0 + post + + 0 + $tags + \ No newline at end of file diff --git a/webmagic-plugin/src/test/java/us/codecraft/webmagic/FreemarkerPipelineTest.java b/webmagic-plugin/src/test/java/us/codecraft/webmagic/FreemarkerPipelineTest.java new file mode 100644 index 0000000..d52154f --- /dev/null +++ b/webmagic-plugin/src/test/java/us/codecraft/webmagic/FreemarkerPipelineTest.java @@ -0,0 +1,19 @@ +package us.codecraft.webmagic; + +import org.junit.Test; +import us.codecraft.webmagic.pipeline.FreemarkerPipeline; + +import java.io.IOException; + +/** + * User: cairne + * Date: 13-6-9 + * Time: 上午7:14 + */ +public class FreemarkerPipelineTest { + + @Test + public void test() throws IOException { + FreemarkerPipeline freemarkerPipeline = new FreemarkerPipeline("wordpress.ftl"); + } +} diff --git a/webmagic-plugin/src/test/resources/ftl/wordpress.ftl b/webmagic-plugin/src/test/resources/ftl/wordpress.ftl new file mode 100644 index 0000000..61820b7 --- /dev/null +++ b/webmagic-plugin/src/test/resources/ftl/wordpress.ftl @@ -0,0 +1,23 @@ + + $it.Title + http://127.0.0.1/wordpress/?p=$it.Id + ${date} + admin + http://127.0.0.1/wordpress/?p=$it.Id + + + + <#--$it.Id--> + ${date} + ${date} + open + open + ${title} + publish + 0 + 0 + post + + 0 + $tags + \ No newline at end of file diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index e3a846a..bfa1bfa 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -15,6 +15,11 @@ webmagic-core 0.0.1-SNAPSHOT + + us.codecraft + webmagic-plugin + 0.0.1-SNAPSHOT + junit junit diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java index 05b68b6..33b86bb 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java @@ -3,6 +3,7 @@ package us.codecraft.webmagic.samples; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.processor.PageProcessor; +import us.codecraft.webmagic.selector.PlainText; import java.util.List; @@ -14,20 +15,21 @@ import java.util.List; public class DiaoyuwengProcessor implements PageProcessor { @Override public void process(Page page) { - //http://progressdaily.diandian.com/post/2013-01-24/40046867275 - List requests = page.getHtml().rs("]*href=[\"']{1}(/shop/.*?)[\"']{1}").toStrings(); + List requests = page.getHtml().as().rs("(http://www\\.diaoyuweng\\.com/home\\.php\\?mod=space&uid=88304&do=thread&view=me&type=thread&order=dateline&from=space&page=\\d+)").toStrings(); page.addTargetRequests(requests); - requests = page.getHtml().rs("]*href=[\"']{1}(/search/category/.*?)[\"']{1}").toStrings(); + requests = page.getHtml().as().rs("(http://www\\.diaoyuweng\\.com/thread-\\d+-1-1.html)").toStrings(); page.addTargetRequests(requests); - if (page.getUrl().toString().contains("shop")){ - page.putField("title", page.getHtml().x("//h1[@class='shop-title']")); - page.putField("content", page.getHtml().sc()); + if (page.getUrl().toString().contains("thread")){ + page.putField("title", page.getHtml().x("//a[@id='thread_subject']")); + page.putField("content", page.getHtml().x("//div[@class='pcb']//tbody")); + page.putField("date",page.getHtml().r("发表于 (\\d{4}-\\d+-\\d+ \\d+:\\d+:\\d+)")); + page.putField("id",new PlainText("1000"+page.getUrl().r("http://www\\.diaoyuweng\\.com/thread-(\\d+)-1-1.html").toString())); } } @Override public Site getSite() { - return Site.me().setDomain("www.dianping.com").setStartUrl("http://www.dianping.com/"). - setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); + return Site.me().setDomain("www.diaoyuweng.com").setStartUrl("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space"). + setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31").setEncoding("GBK").setSleepTime(500); } } diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java new file mode 100644 index 0000000..1e77c7c --- /dev/null +++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java @@ -0,0 +1,26 @@ +package us.codecraft.webmagic.processor; + +import org.junit.Test; +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.pipeline.FilePipeline; +import us.codecraft.webmagic.pipeline.FreemarkerPipeline; +import us.codecraft.webmagic.samples.DiaoyuwengProcessor; +import us.codecraft.webmagic.schedular.FileCacheQueueSchedular; + +import java.io.IOException; + +/** + * User: cairne + * Date: 13-6-9 + * Time: 上午8:02 + */ +public class DiaoyuwengProcessorTest { + + @Test + public void test() throws IOException { + DiaoyuwengProcessor diaoyuwengProcessor = new DiaoyuwengProcessor(); + FreemarkerPipeline pipeline = new FreemarkerPipeline("wordpress.ftl"); + Spider.me().pipeline(new FilePipeline()).pipeline(pipeline).schedular(new FileCacheQueueSchedular(diaoyuwengProcessor.getSite(), "/data/temp/webmagic/cache/")). + processor(diaoyuwengProcessor).run(); + } +} diff --git a/webmagic-samples/src/test/resources/ftl/wordpress.ftl b/webmagic-samples/src/test/resources/ftl/wordpress.ftl new file mode 100644 index 0000000..e58ae79 --- /dev/null +++ b/webmagic-samples/src/test/resources/ftl/wordpress.ftl @@ -0,0 +1,22 @@ + + ${title} + http://127.0.0.1/wordpress/?p=${id} + ${date} + admin + http://127.0.0.1/wordpress/?p=${id} + + + + <#--$it.Id--> + ${date} + ${date} + open + open + ${title} + publish + 0 + 0 + post + + 0 + \ No newline at end of file