From 5b254e446b0a616bc91f5d9526fc83d0d2bc54cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=84=A4=E6=80=92=E7=9A=84=E7=95=AA=E8=8C=84?= Date: Sat, 12 Apr 2014 22:08:53 +0800 Subject: [PATCH 1/5] =?UTF-8?q?=E6=9B=B4=E6=96=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../us/codecraft/webmagic/downloader/selenium/WebDriverPool.java | 1 + 1 file changed, 1 insertion(+) diff --git a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java index 98b93a9..f628ede 100644 --- a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java +++ b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java @@ -87,4 +87,5 @@ class WebDriverPool { webDriver.quit(); } } + } From 610ac42c070ea639ea5bad574916b90963232fac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=84=A4=E6=80=92=E7=9A=84=E7=95=AA=E8=8C=84?= Date: Sat, 12 Apr 2014 22:22:07 +0800 Subject: [PATCH 2/5] =?UTF-8?q?=E6=9B=B4=E6=96=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../webmagic/samples/SinaBlogProcesser.java | 37 -------------- .../webmagic/samples/SinaBlogProcessor.java | 48 +++++++++++++++++++ 2 files changed, 48 insertions(+), 37 deletions(-) delete mode 100644 webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java create mode 100644 webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcessor.java diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java deleted file mode 100644 index dcb6eff..0000000 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java +++ /dev/null @@ -1,37 +0,0 @@ -package 
us.codecraft.webmagic.samples; - -import us.codecraft.webmagic.Page; -import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.Spider; -import us.codecraft.webmagic.processor.PageProcessor; - -/** - * @author code4crafter@gmail.com
- */ -public class SinaBlogProcesser implements PageProcessor { - - private Site site; - - @Override - public void process(Page page) { - page.addTargetRequests(page.getHtml().xpath("//div[@class='articalfrontback SG_j_linedot1 clearfix']").links().all()); - page.putField("title", page.getHtml().xpath("//div[@class='articalTitle']/h2")); - page.putField("content",page.getHtml().xpath("//div[@id='articlebody']//div[@class='articalContent']")); - page.putField("id",page.getUrl().regex("http://blog\\.sina\\.com\\.cn/s/blog_(\\w+)")); - page.putField("date",page.getHtml().xpath("//div[@id='articlebody']//span[@class='time SG_txtc']").regex("\\((.*)\\)")); -// page.putField("tags",page.getHtml().xpath("//td[@class='blog_tag']/h3/a")); - } - - @Override - public Site getSite() { - if (site==null){ - site = Site.me().setDomain("blog.sina.com.cn").addStartUrl("http://blog.sina.com.cn/s/blog_4701280b0102egl0.html").setSleepTime(3000). - setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); - } - return site; - } - - public static void main(String[] args) { - Spider.create(new SinaBlogProcesser()).run(); - } -} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcessor.java new file mode 100644 index 0000000..2872e02 --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcessor.java @@ -0,0 +1,48 @@ +package us.codecraft.webmagic.samples; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.processor.PageProcessor; + +/** + * @author code4crafter@gmail.com
+ */ +public class SinaBlogProcessor implements PageProcessor { + + public static final String URL_LIST = "http://blog\\.sina\\.com\\.cn/s/articlelist_1487828712_0_\\d+\\.html"; + + public static final String URL_POST = "http://blog\\.sina\\.com\\.cn/s/blog_\\w+\\.html"; + + private Site site = Site + .me() + .setDomain("blog.sina.com.cn") + .setSleepTime(3000) + .setUserAgent( + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); + + @Override + public void process(Page page) { + //列表页 + if (page.getUrl().regex(URL_LIST).match()) { + page.addTargetRequests(page.getHtml().xpath("//div[@class=\"articleList\"]").links().regex(URL_POST).all()); + page.addTargetRequests(page.getHtml().links().regex(URL_LIST).all()); + //文章页 + } else { + page.putField("title", page.getHtml().xpath("//div[@class='articalTitle']/h2")); + page.putField("content", page.getHtml().xpath("//div[@id='articlebody']//div[@class='articalContent']")); + page.putField("date", + page.getHtml().xpath("//div[@id='articlebody']//span[@class='time SG_txtc']").regex("\\((.*)\\)")); + } + } + + @Override + public Site getSite() { + return site; + } + + public static void main(String[] args) { + Spider.create(new SinaBlogProcessor()).addUrl("http://blog.sina.com.cn/s/articlelist_1487828712_0_1.html") + .run(); + } +} From 644e8d1f72c08c83348e5c31a42f0f0dfa32f07d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=84=A4=E6=80=92=E7=9A=84=E7=95=AA=E8=8C=84?= Date: Sat, 12 Apr 2014 22:32:22 +0800 Subject: [PATCH 3/5] =?UTF-8?q?=E5=90=8C=E6=AD=A5=E5=AE=98=E6=96=B9?= =?UTF-8?q?=E6=BA=90=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 5 +- {asserts => assets}/data.plist | 0 {asserts => assets}/image1.pdf | Bin {asserts => assets}/logo-simple.jpg | Bin {asserts => assets}/logo.graffle | 0 {asserts => assets}/logo.jpg | Bin {asserts => assets}/logo2.graffle/data.plist | 0 {asserts => 
assets}/logo2.graffle/image1.tiff | Bin {asserts => assets}/logo3.graffle/data.plist | 0 {asserts => assets}/logo3.graffle/image1.tiff | Bin {asserts => assets}/logo3.graffle/image2.tiff | Bin {asserts => assets}/logo3.graffle/image4.tiff | Bin {asserts => assets}/logo3.graffle/image5.tiff | Bin {asserts => assets}/logo3.png | Bin {asserts => assets}/logo4.png | Bin assets/page-extract-rule.bmml | 9 + .../webmagic-create-spider.bmml | 0 .../webmagic-create-spider.png | Bin .../webmagic-spider-manage.bmml | 0 .../webmagic-spider-manage.png | Bin {asserts => assets}/webmagic.psd | Bin pom.xml | 9 + .../webapp/WEB-INF/pages/create_spider.ftl | 14 ++ .../downloader/HttpClientDownloader.java | 2 +- .../LocalDuplicatedRemovedScheduler.java | 3 +- .../us/codecraft/webmagic/selector/Html.java | 1 + webmagic-core/src/test/resources/log4j.xml | 10 - .../ConfigurablePageProcessor.java | 49 ++++ .../webmagic/configurable/ExpressionType.java | 11 + .../webmagic/configurable/ExtractRule.java | 113 +++++++++ .../webmagic/configurable/Inject.java | 15 -- .../webmagic/configurable/PropertyLoader.java | 18 -- .../ConfigurableBlogPageProcessor.java | 51 ---- .../example/PatternProcessorDemo.java | 53 +++++ .../handler/CompositePageProcessor.java | 49 ++++ .../webmagic/handler/PatternHandler.java | 113 +++++++++ .../webmagic/handler/SubPageProcessor.java | 33 +++ .../webmagic/model/ModelPageProcessor.java | 9 +- .../webmagic/model/PageModelExtractor.java | 4 +- .../webmagic/pipeline/PatternPipeline.java | 44 ++++ .../processor/PatternPageProcessor.java | 78 +++++++ .../scheduler/FileCacheQueueScheduler.java | 14 +- .../ConfigurablePageProcessorTest.java | 39 ++++ .../model/ModelPageProcessorTest.java | 45 ++++ .../webmagic/model/samples/BaiduNews.java | 43 ++++ .../webmagic/model/samples/News163.java | 10 +- .../webmagic/model/samples/QQMeishi.java | 27 +++ .../processor/SinablogProcessorTest.java | 6 +- webmagic-scripts/README.md | 0 webmagic-scripts/deploy.sh | 0 
webmagic-scripts/pom.xml | 4 + .../codecraft/webmagic/scripts/Language.java | 4 +- .../webmagic/scripts/ScriptConsole.java | 0 .../webmagic/scripts/ScriptEnginePool.java | 0 .../webmagic/scripts/ScriptProcessor.java | 46 ++-- .../scripts/ScriptProcessorBuilder.java | 0 .../src/main/resources/js/defines.js | 0 .../src/main/resources/js/github.js | 0 .../src/main/resources/js/oschina.js | 1 + webmagic-scripts/src/main/resources/log4j.xml | 0 .../src/main/resources/python/defines.py | 13 ++ .../src/main/resources/python/oschina.py | 4 + .../src/main/resources/ruby/defines.rb | 0 .../src/main/resources/ruby/github.rb | 0 .../src/main/resources/ruby/oschina.rb | 5 +- .../webmagic/scripts/ScriptProcessorTest.java | 8 + webmagic-scripts/src/test/resouces/log4j.xml | 0 zh_docs/user-manual-new.md | 221 ++++++++++++++++-- 68 files changed, 1024 insertions(+), 159 deletions(-) rename {asserts => assets}/data.plist (100%) rename {asserts => assets}/image1.pdf (100%) rename {asserts => assets}/logo-simple.jpg (100%) rename {asserts => assets}/logo.graffle (100%) rename {asserts => assets}/logo.jpg (100%) rename {asserts => assets}/logo2.graffle/data.plist (100%) rename {asserts => assets}/logo2.graffle/image1.tiff (100%) rename {asserts => assets}/logo3.graffle/data.plist (100%) rename {asserts => assets}/logo3.graffle/image1.tiff (100%) rename {asserts => assets}/logo3.graffle/image2.tiff (100%) rename {asserts => assets}/logo3.graffle/image4.tiff (100%) rename {asserts => assets}/logo3.graffle/image5.tiff (100%) rename {asserts => assets}/logo3.png (100%) rename {asserts => assets}/logo4.png (100%) create mode 100644 assets/page-extract-rule.bmml rename {asserts => assets}/webmagic-create-spider.bmml (100%) rename {asserts => assets}/webmagic-create-spider.png (100%) rename {asserts => assets}/webmagic-spider-manage.bmml (100%) rename {asserts => assets}/webmagic-spider-manage.png (100%) rename {asserts => assets}/webmagic.psd (100%) create mode 100644 
webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ConfigurablePageProcessor.java create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExpressionType.java create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExtractRule.java delete mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/Inject.java delete mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/PropertyLoader.java delete mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/example/ConfigurableBlogPageProcessor.java create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/example/PatternProcessorDemo.java create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/handler/CompositePageProcessor.java create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/handler/PatternHandler.java create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/handler/SubPageProcessor.java create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/PatternPipeline.java create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/processor/PatternPageProcessor.java create mode 100644 webmagic-extension/src/test/java/us/codecraft/webmagic/configurable/ConfigurablePageProcessorTest.java create mode 100644 webmagic-extension/src/test/java/us/codecraft/webmagic/model/ModelPageProcessorTest.java create mode 100644 webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/BaiduNews.java create mode 100644 webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/QQMeishi.java mode change 100644 => 100755 webmagic-scripts/README.md mode change 100644 => 100755 webmagic-scripts/deploy.sh mode change 100644 => 100755 webmagic-scripts/pom.xml mode change 100644 => 100755 webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/Language.java mode change 100644 => 
100755 webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptConsole.java mode change 100644 => 100755 webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptEnginePool.java mode change 100644 => 100755 webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessor.java mode change 100644 => 100755 webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessorBuilder.java mode change 100644 => 100755 webmagic-scripts/src/main/resources/js/defines.js mode change 100644 => 100755 webmagic-scripts/src/main/resources/js/github.js mode change 100644 => 100755 webmagic-scripts/src/main/resources/js/oschina.js mode change 100644 => 100755 webmagic-scripts/src/main/resources/log4j.xml create mode 100755 webmagic-scripts/src/main/resources/python/defines.py create mode 100755 webmagic-scripts/src/main/resources/python/oschina.py mode change 100644 => 100755 webmagic-scripts/src/main/resources/ruby/defines.rb mode change 100644 => 100755 webmagic-scripts/src/main/resources/ruby/github.rb mode change 100644 => 100755 webmagic-scripts/src/test/java/us/codecraft/webmagic/scripts/ScriptProcessorTest.java mode change 100644 => 100755 webmagic-scripts/src/test/resouces/log4j.xml diff --git a/README.md b/README.md index 1f4bc13..2056fba 100644 --- a/README.md +++ b/README.md @@ -78,7 +78,6 @@ public class GithubRepoPageProcessor implements PageProcessor { Spider.create(new GithubRepoPageProcessor()).addUrl("https://github.com/code4craft").thread(5).run(); } } - ``` * `page.addTargetRequests(links)` @@ -164,6 +163,10 @@ To write webmagic, I refered to the projects below : [https://groups.google.com/forum/#!forum/webmagic-java](https://groups.google.com/forum/#!forum/webmagic-java) +[http://list.qq.com/cgi-bin/qf_invite?id=023a01f505246785f77c5a5a9aff4e57ab20fcdde871e988](http://list.qq.com/cgi-bin/qf_invite?id=023a01f505246785f77c5a5a9aff4e57ab20fcdde871e988) + +QQ Group: 330192938 + [![Bitdeli 
Badge](https://d2weczhvl823v0.cloudfront.net/code4craft/webmagic/trend.png)](https://bitdeli.com/free "Bitdeli Badge") diff --git a/asserts/data.plist b/assets/data.plist similarity index 100% rename from asserts/data.plist rename to assets/data.plist diff --git a/asserts/image1.pdf b/assets/image1.pdf similarity index 100% rename from asserts/image1.pdf rename to assets/image1.pdf diff --git a/asserts/logo-simple.jpg b/assets/logo-simple.jpg similarity index 100% rename from asserts/logo-simple.jpg rename to assets/logo-simple.jpg diff --git a/asserts/logo.graffle b/assets/logo.graffle similarity index 100% rename from asserts/logo.graffle rename to assets/logo.graffle diff --git a/asserts/logo.jpg b/assets/logo.jpg similarity index 100% rename from asserts/logo.jpg rename to assets/logo.jpg diff --git a/asserts/logo2.graffle/data.plist b/assets/logo2.graffle/data.plist similarity index 100% rename from asserts/logo2.graffle/data.plist rename to assets/logo2.graffle/data.plist diff --git a/asserts/logo2.graffle/image1.tiff b/assets/logo2.graffle/image1.tiff similarity index 100% rename from asserts/logo2.graffle/image1.tiff rename to assets/logo2.graffle/image1.tiff diff --git a/asserts/logo3.graffle/data.plist b/assets/logo3.graffle/data.plist similarity index 100% rename from asserts/logo3.graffle/data.plist rename to assets/logo3.graffle/data.plist diff --git a/asserts/logo3.graffle/image1.tiff b/assets/logo3.graffle/image1.tiff similarity index 100% rename from asserts/logo3.graffle/image1.tiff rename to assets/logo3.graffle/image1.tiff diff --git a/asserts/logo3.graffle/image2.tiff b/assets/logo3.graffle/image2.tiff similarity index 100% rename from asserts/logo3.graffle/image2.tiff rename to assets/logo3.graffle/image2.tiff diff --git a/asserts/logo3.graffle/image4.tiff b/assets/logo3.graffle/image4.tiff similarity index 100% rename from asserts/logo3.graffle/image4.tiff rename to assets/logo3.graffle/image4.tiff diff --git 
a/asserts/logo3.graffle/image5.tiff b/assets/logo3.graffle/image5.tiff similarity index 100% rename from asserts/logo3.graffle/image5.tiff rename to assets/logo3.graffle/image5.tiff diff --git a/asserts/logo3.png b/assets/logo3.png similarity index 100% rename from asserts/logo3.png rename to assets/logo3.png diff --git a/asserts/logo4.png b/assets/logo4.png similarity index 100% rename from asserts/logo4.png rename to assets/logo4.png diff --git a/assets/page-extract-rule.bmml b/assets/page-extract-rule.bmml new file mode 100644 index 0000000..fec8d3e --- /dev/null +++ b/assets/page-extract-rule.bmml @@ -0,0 +1,9 @@ + + + + + A%20Web%20Page%0Ahttp%3A// + + + + \ No newline at end of file diff --git a/asserts/webmagic-create-spider.bmml b/assets/webmagic-create-spider.bmml similarity index 100% rename from asserts/webmagic-create-spider.bmml rename to assets/webmagic-create-spider.bmml diff --git a/asserts/webmagic-create-spider.png b/assets/webmagic-create-spider.png similarity index 100% rename from asserts/webmagic-create-spider.png rename to assets/webmagic-create-spider.png diff --git a/asserts/webmagic-spider-manage.bmml b/assets/webmagic-spider-manage.bmml similarity index 100% rename from asserts/webmagic-spider-manage.bmml rename to assets/webmagic-spider-manage.bmml diff --git a/asserts/webmagic-spider-manage.png b/assets/webmagic-spider-manage.png similarity index 100% rename from asserts/webmagic-spider-manage.png rename to assets/webmagic-spider-manage.png diff --git a/asserts/webmagic.psd b/assets/webmagic.psd similarity index 100% rename from asserts/webmagic.psd rename to assets/webmagic.psd diff --git a/pom.xml b/pom.xml index 085e94e..b277b38 100644 --- a/pom.xml +++ b/pom.xml @@ -182,6 +182,15 @@ UTF-8 + + org.apache.maven.plugins + maven-jar-plugin + + + log4j.xml + + + org.apache.maven.plugins maven-source-plugin diff --git a/webmagic-avalon/webmagic-admin/src/main/webapp/WEB-INF/pages/create_spider.ftl 
b/webmagic-avalon/webmagic-admin/src/main/webapp/WEB-INF/pages/create_spider.ftl index e69de29..4cd838c 100644 --- a/webmagic-avalon/webmagic-admin/src/main/webapp/WEB-INF/pages/create_spider.ftl +++ b/webmagic-avalon/webmagic-admin/src/main/webapp/WEB-INF/pages/create_spider.ftl @@ -0,0 +1,14 @@ + + + + + +
+ +
+ +
+ +
+ + \ No newline at end of file diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index bcf4a53..30c561b 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -74,7 +74,7 @@ public class HttpClientDownloader extends AbstractDownloader { } else { acceptStatCode = Sets.newHashSet(200); } - logger.info("downloading page " + request.getUrl()); + logger.info("downloading page {}" , request.getUrl()); RequestBuilder requestBuilder = RequestBuilder.get().setUri(request.getUrl()); if (headers != null) { for (Map.Entry headerEntry : headers.entrySet()) { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/LocalDuplicatedRemovedScheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/LocalDuplicatedRemovedScheduler.java index c4b08f3..397199c 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/LocalDuplicatedRemovedScheduler.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/LocalDuplicatedRemovedScheduler.java @@ -23,8 +23,9 @@ public abstract class LocalDuplicatedRemovedScheduler implements Scheduler { @Override public void push(Request request, Task task) { - logger.debug("push to queue " + request.getUrl()); + logger.trace("get a candidate url {}", request.getUrl()); if (request.getExtra(Request.CYCLE_TRIED_TIMES) != null || urls.add(request.getUrl())) { + logger.debug("push to queue {}", request.getUrl()); pushWhenNoDuplicate(request, task); } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java index 614b111..34386b5 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java +++ 
b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java @@ -131,6 +131,7 @@ public class Html extends PlainText { } public Document getDocument() { + initDocument(); return document; } diff --git a/webmagic-core/src/test/resources/log4j.xml b/webmagic-core/src/test/resources/log4j.xml index 9084694..c2b5a2f 100644 --- a/webmagic-core/src/test/resources/log4j.xml +++ b/webmagic-core/src/test/resources/log4j.xml @@ -8,21 +8,11 @@ - - - - - - - - - - diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ConfigurablePageProcessor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ConfigurablePageProcessor.java new file mode 100644 index 0000000..36615d8 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ConfigurablePageProcessor.java @@ -0,0 +1,49 @@ +package us.codecraft.webmagic.configurable; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.processor.PageProcessor; + +import java.util.List; + +/** + * @author code4crafter@gmail.com
+ */ +public class ConfigurablePageProcessor implements PageProcessor { + + private Site site; + + private List extractRules; + + public ConfigurablePageProcessor(Site site, List extractRules) { + this.site = site; + this.extractRules = extractRules; + } + + @Override + public void process(Page page) { + for (ExtractRule extractRule : extractRules) { + if (extractRule.isMulti()) { + List results = page.getHtml().selectDocumentForList(extractRule.getSelector()); + if (extractRule.isNotNull() && results.size() == 0) { + page.setSkip(true); + } else { + page.getResultItems().put(extractRule.getFieldName(), results); + } + } else { + String result = page.getHtml().selectDocument(extractRule.getSelector()); + if (extractRule.isNotNull() && result == null) { + page.setSkip(true); + } else { + page.getResultItems().put(extractRule.getFieldName(), result); + } + } + } + } + + @Override + public Site getSite() { + return site; + } + +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExpressionType.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExpressionType.java new file mode 100644 index 0000000..bd84be3 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExpressionType.java @@ -0,0 +1,11 @@ +package us.codecraft.webmagic.configurable; + +/** + * @author code4crafter@gmail.com + * @date 14-4-5 + */ +public enum ExpressionType { + + XPath, Regex, Css, JsonPath; + +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExtractRule.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExtractRule.java new file mode 100644 index 0000000..82337c4 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExtractRule.java @@ -0,0 +1,113 @@ +package us.codecraft.webmagic.configurable; + +import us.codecraft.webmagic.selector.JsonPathSelector; +import us.codecraft.webmagic.selector.Selector; + +import static 
us.codecraft.webmagic.selector.Selectors.*; + +/** + * @author code4crafter@gmail.com + * @date 14-4-5 + */ +public class ExtractRule { + + private String fieldName; + + private ExpressionType expressionType; + + private String expressionValue; + + private String[] expressionParams; + + private boolean multi = false; + + private volatile Selector selector; + + private boolean notNull = false; + + public String getFieldName() { + return fieldName; + } + + public void setFieldName(String fieldName) { + this.fieldName = fieldName; + } + + public ExpressionType getExpressionType() { + return expressionType; + } + + public void setExpressionType(ExpressionType expressionType) { + this.expressionType = expressionType; + } + + public String getExpressionValue() { + return expressionValue; + } + + public void setExpressionValue(String expressionValue) { + this.expressionValue = expressionValue; + } + + public String[] getExpressionParams() { + return expressionParams; + } + + public void setExpressionParams(String[] expressionParams) { + this.expressionParams = expressionParams; + } + + public boolean isMulti() { + return multi; + } + + public void setMulti(boolean multi) { + this.multi = multi; + } + + public Selector getSelector() { + if (selector == null) { + synchronized (this) { + if (selector == null) { + selector = compileSelector(); + } + } + } + return selector; + } + + private Selector compileSelector() { + switch (expressionType) { + case Css: + if (expressionParams.length >= 1) { + return $(expressionValue, expressionParams[0]); + } else { + return $(expressionValue); + } + case XPath: + return xpath(expressionValue); + case Regex: + if (expressionParams.length >= 1) { + return regex(expressionValue, Integer.parseInt(expressionParams[0])); + } else { + return regex(expressionValue); + } + case JsonPath: + return new JsonPathSelector(expressionValue); + default: + return xpath(expressionValue); + } + } + + public void setSelector(Selector selector) { + 
this.selector = selector; + } + + public boolean isNotNull() { + return notNull; + } + + public void setNotNull(boolean notNull) { + this.notNull = notNull; + } +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/Inject.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/Inject.java deleted file mode 100644 index c6608ae..0000000 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/Inject.java +++ /dev/null @@ -1,15 +0,0 @@ -package us.codecraft.webmagic.configurable; - -import java.lang.annotation.ElementType; -import java.lang.annotation.Retention; -import java.lang.annotation.Target; - -/** - * @author yihua.huang@dianping.com - */ -@Retention(java.lang.annotation.RetentionPolicy.RUNTIME) -@Target({ElementType.FIELD}) -public @interface Inject { - - String value() default ""; -} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/PropertyLoader.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/PropertyLoader.java deleted file mode 100644 index bffbcf2..0000000 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/PropertyLoader.java +++ /dev/null @@ -1,18 +0,0 @@ -package us.codecraft.webmagic.configurable; - -import us.codecraft.webmagic.processor.PageProcessor; - -import java.util.Map; - -/** - * Inject property to object by {@link Inject} annotation. 
- * - * @author yihua.huang@dianping.com - */ -public class PropertyLoader { - - public T load(T object, Map properties) { - return object; - } - -} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/ConfigurableBlogPageProcessor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/ConfigurableBlogPageProcessor.java deleted file mode 100644 index 28d3ab0..0000000 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/ConfigurableBlogPageProcessor.java +++ /dev/null @@ -1,51 +0,0 @@ -package us.codecraft.webmagic.example; - -import java.util.List; -import us.codecraft.webmagic.Page; -import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.Spider; -import us.codecraft.webmagic.configurable.Inject; -import us.codecraft.webmagic.processor.PageProcessor; - -/** - * @author code4crafter@gmail.com
- */ -public class ConfigurableBlogPageProcessor implements PageProcessor { - - private Site site = Site.me().setDomain("my.oschina.net"); - - @Inject("linkRegex") - private String linkRegex; - - @Inject("titleXpath") - private String titleXpath; - - @Inject("contentXpath") - private String contentXpath; - - @Inject("tagsXpath") - private String tagsXpath; - - @Override - public void process(Page page) { - List links = page.getHtml().links().regex(linkRegex).all(); - page.addTargetRequests(links); - page.putField("title", page.getHtml().xpath(titleXpath).toString()); - if (page.getResultItems().get("title") == null) { - //skip this page - page.setSkip(true); - } - page.putField("content", page.getHtml().smartContent().toString()); - page.putField("tags", page.getHtml().xpath(tagsXpath).all()); - } - - @Override - public Site getSite() { - return site; - - } - - public static void main(String[] args) { - Spider.create(new ConfigurableBlogPageProcessor()).addUrl("http://my.oschina.net/flashsword/blog").thread(2).run(); - } -} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/PatternProcessorDemo.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/PatternProcessorDemo.java new file mode 100644 index 0000000..51a9484 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/PatternProcessorDemo.java @@ -0,0 +1,53 @@ +package us.codecraft.webmagic.example; + +import org.apache.log4j.Logger; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.ResultItems; +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.handler.PatternHandler; +import us.codecraft.webmagic.pipeline.PatternPipeline; +import us.codecraft.webmagic.processor.PatternPageProcessor; + +/** + * Created with IntelliJ IDEA. 
+ * User: Sebastian MA + * Date: April 04, 2014 + * Time: 21:23 + */ +public class PatternProcessorDemo { + + private static Logger log = Logger.getLogger(PatternProcessorDemo.class); + + public static void main(String... args) { + + PatternPageProcessor processor + = new PatternPageProcessor("http://item.jd.com/981821.html", + PatternPageProcessor.TARGET_PATTERN_ALL + ); + + PatternPipeline pipeline = new PatternPipeline(); + + // define a handler which handles only "http://item.jd.com/.*" + PatternHandler handler = new PatternHandler("http://item.jd.com/.*") { + + @Override + public void onExtract(Page page) { + + log.info("Extracting from " + page.getUrl()); + page.putField("test", "hello world:)"); + } + + @Override + public void onHandle(ResultItems result, Task task) { + + log.info("Handling " + result.getRequest().getUrl()); + log.info("Retrieved test=" + result.get("test")); + } + }; + + handler.register(processor, pipeline); + + Spider.create(processor).thread(5).addPipeline(pipeline).runAsync(); + } +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/CompositePageProcessor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/CompositePageProcessor.java new file mode 100644 index 0000000..ecf4aa1 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/CompositePageProcessor.java @@ -0,0 +1,49 @@ +package us.codecraft.webmagic.handler; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.processor.PageProcessor; + +import java.util.ArrayList; +import java.util.List; + +/** + * @author code4crafter@gmail.com + * @date 14-4-5 + */ +public class CompositePageProcessor implements PageProcessor { + + private Site site; + + private List subPageProcessors; + + @Override + public void process(Page page) { + for (SubPageProcessor subPageProcessor : subPageProcessors) { + if (subPageProcessor.match(page)) { + SubPageProcessor.MatchOtherProcessor 
matchOtherProcessorProcessor = subPageProcessor.process(page); + if (matchOtherProcessorProcessor == null || matchOtherProcessorProcessor != SubPageProcessor.MatchOtherProcessor.YES) { + return; + } + } + } + } + + public CompositePageProcessor setSite(Site site) { + this.site = site; + return this; + } + + public CompositePageProcessor setSubPageProcessors(SubPageProcessor... subPageProcessors) { + this.subPageProcessors = new ArrayList(); + for (SubPageProcessor subPageProcessor : subPageProcessors) { + this.subPageProcessors.add(subPageProcessor); + } + return this; + } + + @Override + public Site getSite() { + return site; + } +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/PatternHandler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/PatternHandler.java new file mode 100644 index 0000000..51e44e0 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/PatternHandler.java @@ -0,0 +1,113 @@ +package us.codecraft.webmagic.handler; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.ResultItems; +import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.pipeline.PatternPipeline; +import us.codecraft.webmagic.processor.PatternPageProcessor; + +import java.util.UUID; + +/** + * Created with IntelliJ IDEA. + * User: Sebastian MA + * Date: April 03, 2014 + * Time: 10:00 + *

+ * A PatternHandler is in charge of both page extraction and data processing by implementing + * its two abstract methods. + */ +public abstract class PatternHandler { + + /** + * identity of the handler. + */ + protected String id; + + /** + * match pattern. only matched page should be handled. + */ + protected String pattern; + + /** + * @param pattern + * url pattern to handle + */ + protected PatternHandler(String pattern) { + + this.pattern = pattern; + this.id = UUID.randomUUID().toString(); + } + + /** + * determine if the page should be handled. + */ + public boolean match(String url) { + + return url.matches(pattern); + } + + /** + * registers to both the page processor and the pipeline so the handler could take charge of + * both end of procedure. + * + * @param processor + * the processor to handle + * @param pipeline + * the pipeline to handle + */ + public void register(PatternPageProcessor processor, PatternPipeline pipeline) { + + processor.addHandler(this); + pipeline.addHandler(this); + } + + public void unregister(PatternPageProcessor processor, PatternPipeline pipeline) { + + processor.removeHandler(this); + pipeline.removeHandler(this); + } + + public boolean process(Page page) { + + if(match(page.getUrl().toString())) { + page.putField(id, true); + onExtract(page); + return true; + } else { + return false; + } + } + + public boolean process(ResultItems resultItems, Task task) { + + if(resultItems.isSkip()) { + return false; + } + + if(match(resultItems.getRequest().getUrl()) && resultItems.get(id) != null) { + onHandle(resultItems, task); + return true; + } else { + return false; + } + } + + /** + * implements this method to extract from page. + * + * @param page + * the page to extract + */ + public abstract void onExtract(Page page); + + /** + * implements this method to handle the extraction result. 
+ * + * @param result + * extraction result + * @param task + */ + public abstract void onHandle(ResultItems result, Task task); + +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/SubPageProcessor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/SubPageProcessor.java new file mode 100644 index 0000000..c880500 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/SubPageProcessor.java @@ -0,0 +1,33 @@ +package us.codecraft.webmagic.handler; + +import us.codecraft.webmagic.Page; + +/** + * @author code4crafter@gmail.com + * @date 14-4-5 + */ +public interface SubPageProcessor { + + /** + * Check whether the SubPageProcessor can process the page.

+ * Please DO NOT change page status in this method. + * + * @param page + * @return + */ + public boolean match(Page page); + + /** + * + * process the page, extract urls to fetch, extract the data and store + * + * @param page + * @return whether continue to match + */ + public MatchOtherProcessor process(Page page); + + public enum MatchOtherProcessor { + YES, NO; + } + +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPageProcessor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPageProcessor.java index 8a40dae..3a97e1d 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPageProcessor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPageProcessor.java @@ -25,8 +25,6 @@ class ModelPageProcessor implements PageProcessor { private Site site; - private Set targetUrlPatterns = new HashSet(); - public static ModelPageProcessor create(Site site, Class... clazzs) { ModelPageProcessor modelPageProcessor = new ModelPageProcessor(site); for (Class clazz : clazzs) { @@ -38,8 +36,6 @@ class ModelPageProcessor implements PageProcessor { public ModelPageProcessor addPageModel(Class clazz) { PageModelExtractor pageModelExtractor = PageModelExtractor.create(clazz); - targetUrlPatterns.addAll(pageModelExtractor.getTargetUrlPatterns()); - targetUrlPatterns.addAll(pageModelExtractor.getHelpUrlPatterns()); pageModelExtractorList.add(pageModelExtractor); return this; } @@ -55,11 +51,14 @@ class ModelPageProcessor implements PageProcessor { extractLinks(page, pageModelExtractor.getTargetUrlRegionSelector(), pageModelExtractor.getTargetUrlPatterns()); Object process = pageModelExtractor.process(page); if (process == null || (process instanceof List && ((List) process).size() == 0)) { - page.getResultItems().setSkip(true); + continue; } postProcessPageModel(pageModelExtractor.getClazz(), process); page.putField(pageModelExtractor.getClazz().getCanonicalName(), process); } + 
if (page.getResultItems().getAll().size() == 0) { + page.getResultItems().setSkip(true); + } } private void extractLinks(Page page, Selector urlRegionSelector, List urlPatterns) { diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java index 5e4da11..b7b7900 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java @@ -340,9 +340,7 @@ class PageModelExtractor { private Object convert(String value, ObjectFormatter objectFormatter) { try { Object format = objectFormatter.format(value); - if (logger.isDebugEnabled()) { - logger.debug("String " + value + " is converted to " + format); - } + logger.debug("String {} is converted to {}", value, format); return format; } catch (Exception e) { logger.error("convert " + value + " to " + objectFormatter.clazz() + " error!", e); diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/PatternPipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/PatternPipeline.java new file mode 100644 index 0000000..582b162 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/PatternPipeline.java @@ -0,0 +1,44 @@ +package us.codecraft.webmagic.pipeline; + +import us.codecraft.webmagic.ResultItems; +import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.handler.PatternHandler; + +import java.util.ArrayList; + +/** + * Created with IntelliJ IDEA. + * User: Sebastian MA + * Date: April 04, 2014 + * Time: 20:44 + */ +public class PatternPipeline implements Pipeline { + + protected ArrayList handlers = new ArrayList(); + + /** + * A handler works only if it is added to BOTH the page processor and the pipeline. + * Uses PatternHandler's register instead. 
+ * + * @param handler the pattern handler + * + * @see PatternHandler#register + */ + public void addHandler(PatternHandler handler) { + + handlers.add(handler); + } + + public void removeHandler(PatternHandler handler) { + + handlers.remove(handler); + } + + @Override + public void process(ResultItems resultItems, Task task) { + + for(PatternHandler handler : handlers) { + handler.process(resultItems, task); + } + } +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/processor/PatternPageProcessor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/processor/PatternPageProcessor.java new file mode 100644 index 0000000..d7d909c --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/processor/PatternPageProcessor.java @@ -0,0 +1,78 @@ +package us.codecraft.webmagic.processor; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.handler.PatternHandler; +import us.codecraft.webmagic.utils.UrlUtils; + +import java.util.ArrayList; +import java.util.List; + +/** + * Created with IntelliJ IDEA. + * User: Sebastian MA + * Date: April 04, 2014 + * Time: 15:36 + *

+ * A PatternPageProcessor uses PatternHandler to setup extraction rules for specific url pattern. + * + * @see us.codecraft.webmagic.handler.PatternHandler + */ +public class PatternPageProcessor implements PageProcessor { + + public static final String TARGET_PATTERN_ALL = "http://*"; + + protected Site site; + + protected String targetPattern; + + protected ArrayList handlers = new ArrayList(); + + public PatternPageProcessor(String startUrl, String targetPattern) { + + this.targetPattern = targetPattern; + + this.site = Site.me().addStartUrl(startUrl).setDomain(UrlUtils.getDomain(startUrl)); + this.targetPattern = "(" + targetPattern.replace(".", "\\.").replace("*", + "[^\"'#]*") + ")"; + + site.setUserAgent("Chrome/5.0.354.0"); + } + + @Override + public void process(Page page) { + + + List requests = page.getHtml().links().regex(targetPattern).all(); + page.addTargetRequests(requests); + for(PatternHandler handler : handlers) { + if(handler.match(page.getUrl().toString())) { + handler.process(page); + } + } + } + + /** + * A handler works only if it is added to BOTH the page processor and the pipeline. + * Uses PatternHandler's register instead. 
+ * + * @param handler the pattern handler + * + * @see PatternHandler#register + */ + public void addHandler(PatternHandler handler) { + + handlers.add(handler); + } + + public void removeHandler(PatternHandler handler) { + + handlers.remove(handler); + } + + @Override + public Site getSite() { + + return site; + } +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java index 38e8a79..79f3b8b 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java @@ -23,7 +23,7 @@ import java.util.concurrent.atomic.AtomicInteger; * @author code4crafter@gmail.com
* @since 0.2.0 */ -public class FileCacheQueueScheduler implements Scheduler { +public class FileCacheQueueScheduler extends LocalDuplicatedRemovedScheduler { private Logger logger = LoggerFactory.getLogger(getClass()); @@ -145,18 +145,12 @@ public class FileCacheQueueScheduler implements Scheduler { } @Override - public synchronized void push(Request request, Task task) { + protected void pushWhenNoDuplicate(Request request, Task task) { if (!inited.get()) { init(task); } - if (logger.isDebugEnabled()) { - logger.debug("push to queue " + request.getUrl()); - } - if (urls.add(request.getUrl())) { - queue.add(request); - fileUrlWriter.println(request.getUrl()); - } - + queue.add(request); + fileUrlWriter.println(request.getUrl()); } @Override diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/configurable/ConfigurablePageProcessorTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/configurable/ConfigurablePageProcessorTest.java new file mode 100644 index 0000000..a35fffa --- /dev/null +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/configurable/ConfigurablePageProcessorTest.java @@ -0,0 +1,39 @@ +package us.codecraft.webmagic.configurable; + +import org.junit.Test; +import us.codecraft.webmagic.ResultItems; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.downloader.MockGithubDownloader; + +import java.util.ArrayList; +import java.util.List; + +import static org.assertj.core.api.Assertions.assertThat; + +/** + * @author code4crafter@gmail.com + * @date 14-4-5 + */ +public class ConfigurablePageProcessorTest { + + @Test + public void test() throws Exception { + List extractRules = new ArrayList(); + ExtractRule extractRule = new ExtractRule(); + extractRule.setExpressionType(ExpressionType.XPath); + extractRule.setExpressionValue("//title"); + extractRule.setFieldName("title"); + extractRules.add(extractRule); + extractRule = new ExtractRule(); + 
extractRule.setExpressionType(ExpressionType.XPath); + extractRule.setExpressionValue("//ul[@class='pagehead-actions']/li[1]//a[@class='social-count js-social-count']/text()"); + extractRule.setFieldName("star"); + extractRules.add(extractRule); + ResultItems resultItems = Spider.create(new ConfigurablePageProcessor(Site.me(), extractRules)) + .setDownloader(new MockGithubDownloader()).get("https://github.com/code4craft/webmagic"); + assertThat(resultItems.getAll()).containsEntry("title", "code4craft/webmagic · GitHub"); + assertThat(resultItems.getAll()).containsEntry("star", " 86 "); + + } +} diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/ModelPageProcessorTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/ModelPageProcessorTest.java new file mode 100644 index 0000000..74f3f6a --- /dev/null +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/ModelPageProcessorTest.java @@ -0,0 +1,45 @@ +package us.codecraft.webmagic.model; + +import org.junit.Test; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.model.annotation.ExtractBy; +import us.codecraft.webmagic.model.annotation.TargetUrl; +import us.codecraft.webmagic.selector.PlainText; + +import static org.assertj.core.api.Assertions.assertThat; + +/** + * @author code4crafter@gmail.com + * @date 14-4-4 + */ +public class ModelPageProcessorTest { + + @TargetUrl("http://codecraft.us/foo") + public static class ModelFoo { + + @ExtractBy(value = "//div/@foo", notNull = true) + private String foo; + + } + + @TargetUrl("http://codecraft.us/bar") + public static class ModelBar { + + @ExtractBy(value = "//div/@bar", notNull = true) + private String bar; + + } + + @Test + public void testMultiModel_should_not_skip_when_match() throws Exception { + Page page = new Page(); + page.setRawText("
"); + page.setRequest(new Request("http://codecraft.us/foo")); + page.setUrl(PlainText.create("http://codecraft.us/foo")); + ModelPageProcessor modelPageProcessor = ModelPageProcessor.create(null, ModelFoo.class, ModelBar.class); + modelPageProcessor.process(page); + assertThat(page.getResultItems().isSkip()).isFalse(); + + } +} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/BaiduNews.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/BaiduNews.java new file mode 100644 index 0000000..4795662 --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/BaiduNews.java @@ -0,0 +1,43 @@ +package us.codecraft.webmagic.model.samples; + +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.model.OOSpider; +import us.codecraft.webmagic.model.annotation.ExtractBy; + +/** + * @author code4crafter@gmail.com + * @date 14-4-9 + */ +public class BaiduNews { + + @ExtractBy("//h3[@class='c-title']/a/text()") + private String name; + + @ExtractBy("//div[@class='c-summary']/text()") + private String description; + + @Override + public String toString() { + return "BaiduNews{" + + "name='" + name + '\'' + + ", description='" + description + '\'' + + '}'; + } + + public static void main(String[] args) { + OOSpider ooSpider = OOSpider.create(Site.me().setSleepTime(0), BaiduNews.class); + //single download + BaiduNews baike = ooSpider.get("http://news.baidu.com/ns?tn=news&cl=2&rn=20&ct=1&fr=bks0000&ie=utf-8&word=httpclient"); + System.out.println(baike); + + ooSpider.close(); + } + + public String getName() { + return name; + } + + public String getDescription() { + return description; + } +} \ No newline at end of file diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java index e9dfb26..45bee2f 100644 --- 
a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java @@ -3,7 +3,6 @@ package us.codecraft.webmagic.model.samples; import us.codecraft.webmagic.MultiPageModel; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.model.OOSpider; -import us.codecraft.webmagic.model.annotation.ComboExtract; import us.codecraft.webmagic.model.annotation.ExtractBy; import us.codecraft.webmagic.model.annotation.ExtractByUrl; import us.codecraft.webmagic.model.annotation.TargetUrl; @@ -26,9 +25,8 @@ public class News163 implements MultiPageModel { @ExtractByUrl(value = "http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html", notNull = false) private String page; - @ComboExtract(value = {@ExtractBy("//div[@class=\"ep-pages\"]//a/@href"), - @ExtractBy(value = "http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html", type = ExtractBy.Type.Regex)}, - multi = true, notNull = false) + @ExtractBy(value = "//div[@class=\"ep-pages\"]//a/regex('http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html',1)" + , multi = true, notNull = false) private List otherPage; @ExtractBy("//h1[@id=\"h1title\"]/text()") @@ -74,8 +72,8 @@ public class News163 implements MultiPageModel { } public static void main(String[] args) { - OOSpider.create(Site.me().addStartUrl("http://news.163.com/13/0802/05/958I1E330001124J_2.html"), News163.class) - .scheduler(new RedisScheduler("localhost")).clearPipeline().pipeline(new MultiPagePipeline()).pipeline(new ConsolePipeline()).run(); + OOSpider.create(Site.me(), News163.class).addUrl("http://news.163.com/13/0802/05/958I1E330001124J_2.html") + .scheduler(new RedisScheduler("localhost")).addPipeline(new MultiPagePipeline()).addPipeline(new ConsolePipeline()).run(); } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/QQMeishi.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/QQMeishi.java 
new file mode 100644 index 0000000..f4f8591 --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/QQMeishi.java @@ -0,0 +1,27 @@ +package us.codecraft.webmagic.model.samples; + +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.model.ConsolePageModelPipeline; +import us.codecraft.webmagic.model.OOSpider; +import us.codecraft.webmagic.model.annotation.ExtractBy; +import us.codecraft.webmagic.model.annotation.TargetUrl; + +/** + * @author code4crafter@gmail.com + * @date 14-4-11 + */ +@TargetUrl("http://meishi.qq.com/beijing/c/all[\\-p2]*") +@ExtractBy(value = "//ul[@id=\"promos_list2\"]/li",multi = true) +public class QQMeishi { + + @ExtractBy("//div[@class=info]/a[@class=title]/h4/text()") + private String shopName; + + @ExtractBy("//div[@class=info]/a[@class=title]/text()") + private String promo; + + public static void main(String[] args) { + OOSpider.create(Site.me(), new ConsolePageModelPipeline(), QQMeishi.class).addUrl("http://meishi.qq.com/beijing/c/all").thread(4).run(); + } + +} diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/SinablogProcessorTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/SinablogProcessorTest.java index 026f8d5..d7cd5d5 100644 --- a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/SinablogProcessorTest.java +++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/SinablogProcessorTest.java @@ -5,7 +5,7 @@ import org.junit.Test; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.pipeline.FilePipeline; import us.codecraft.webmagic.pipeline.JsonFilePipeline; -import us.codecraft.webmagic.samples.SinaBlogProcesser; +import us.codecraft.webmagic.samples.SinaBlogProcessor; import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler; import java.io.IOException; @@ -20,7 +20,7 @@ public class SinablogProcessorTest { @Ignore @Test public void test() throws IOException { - SinaBlogProcesser 
sinaBlogProcesser = new SinaBlogProcesser(); + SinaBlogProcessor sinaBlogProcessor = new SinaBlogProcessor(); //pipeline是抓取结束后的处理 //默认放到/data/webmagic/ftl/[domain]目录下 JsonFilePipeline pipeline = new JsonFilePipeline("/data/webmagic/"); @@ -29,7 +29,7 @@ public class SinablogProcessorTest { //ConsolePipeline输出结果到控制台 //FileCacheQueueSchedular保存url,支持断点续传,临时文件输出到/data/temp/webmagic/cache目录 //Spider.run()执行 - Spider.create(sinaBlogProcesser).pipeline(new FilePipeline()).pipeline(pipeline).scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")). + Spider.create(sinaBlogProcessor).pipeline(new FilePipeline()).pipeline(pipeline).scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")). run(); } } diff --git a/webmagic-scripts/README.md b/webmagic-scripts/README.md old mode 100644 new mode 100755 diff --git a/webmagic-scripts/deploy.sh b/webmagic-scripts/deploy.sh old mode 100644 new mode 100755 diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml old mode 100644 new mode 100755 index 5c21160..41c79ea --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -16,6 +16,10 @@ jruby 1.7.6 + org.python + jython + 2.5.3 + commons-cli commons-cli diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/Language.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/Language.java old mode 100644 new mode 100755 index c7ddcda..2f9d22d --- a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/Language.java +++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/Language.java @@ -7,7 +7,9 @@ public enum Language { JavaScript("javascript","js/defines.js",""), - JRuby("jruby","ruby/defines.rb",""); + JRuby("jruby","ruby/defines.rb",""), + + Jython("jython","python/defines.py",""); private String engineName; diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptConsole.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptConsole.java old mode 
100644 new mode 100755 diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptEnginePool.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptEnginePool.java old mode 100644 new mode 100755 diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessor.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessor.java old mode 100644 new mode 100755 index 5801851..1822318 --- a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessor.java +++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessor.java @@ -1,6 +1,8 @@ package us.codecraft.webmagic.scripts; import org.apache.commons.io.IOUtils; +import org.jruby.RubyHash; +import org.python.core.PyDictionary; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.processor.PageProcessor; @@ -10,6 +12,8 @@ import javax.script.ScriptEngine; import javax.script.ScriptException; import java.io.IOException; import java.io.InputStream; +import java.util.Iterator; +import java.util.Map; /** * @author code4crafter@gmail.com @@ -50,20 +54,35 @@ public class ScriptProcessor implements PageProcessor { context.setAttribute("page", page, ScriptContext.ENGINE_SCOPE); context.setAttribute("config", site, ScriptContext.ENGINE_SCOPE); try { - engine.eval(defines + "\n" + script, context); -// switch (language) { -// case JavaScript: -// NativeObject o = (NativeObject) engine.get("result"); -// if (o != null) { -// for (Map.Entry objectObjectEntry : o.entrySet()) { -// page.getResultItems().put(objectObjectEntry.getKey().toString(), objectObjectEntry.getValue()); + switch (language) { + case JavaScript: + engine.eval(defines + "\n" + script, context); +// NativeObject o = (NativeObject) engine.get("result"); +// if (o != null) { +// for (Object o1 : o.getIds()) { +// String key = String.valueOf(o1); +// page.getResultItems().put(key, 
NativeObject.getProperty(o, key)); +// } // } -// } -// break; -// case JRuby: -// Object o1 = engine.get("result"); -// break; -// } + break; + case JRuby: + RubyHash oRuby = (RubyHash) engine.eval(defines + "\n" + script, context); + Iterator itruby = oRuby.entrySet().iterator(); + while (itruby.hasNext()) { + Map.Entry pairs = (Map.Entry) itruby.next(); + page.getResultItems().put(pairs.getKey().toString(), pairs.getValue()); + } + break; + case Jython: + engine.eval(defines + "\n" + script, context); + PyDictionary oJython = (PyDictionary) engine.get("result"); + Iterator it = oJython.entrySet().iterator(); + while (it.hasNext()) { + Map.Entry pairs = (Map.Entry) it.next(); + page.getResultItems().put(pairs.getKey().toString(), pairs.getValue()); + } + break; + } } catch (ScriptException e) { e.printStackTrace(); } @@ -72,6 +91,7 @@ public class ScriptProcessor implements PageProcessor { } } + @Override public Site getSite() { return site; diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessorBuilder.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessorBuilder.java old mode 100644 new mode 100755 diff --git a/webmagic-scripts/src/main/resources/js/defines.js b/webmagic-scripts/src/main/resources/js/defines.js old mode 100644 new mode 100755 diff --git a/webmagic-scripts/src/main/resources/js/github.js b/webmagic-scripts/src/main/resources/js/github.js old mode 100644 new mode 100755 diff --git a/webmagic-scripts/src/main/resources/js/oschina.js b/webmagic-scripts/src/main/resources/js/oschina.js old mode 100644 new mode 100755 index 305682e..02191c3 --- a/webmagic-scripts/src/main/resources/js/oschina.js +++ b/webmagic-scripts/src/main/resources/js/oschina.js @@ -9,3 +9,4 @@ var config = { title = $("div.BlogTitle h1"), content = $("div.BlogContent") urls("http://my\\.oschina\\.net/flashsword/blog/\\d+") +config; diff --git a/webmagic-scripts/src/main/resources/log4j.xml 
b/webmagic-scripts/src/main/resources/log4j.xml old mode 100644 new mode 100755 diff --git a/webmagic-scripts/src/main/resources/python/defines.py b/webmagic-scripts/src/main/resources/python/defines.py new file mode 100755 index 0000000..913a4b4 --- /dev/null +++ b/webmagic-scripts/src/main/resources/python/defines.py @@ -0,0 +1,13 @@ +def xpath(str): + return page.getHtml().xpath(str).toString() + +def css(str): + return page.getHtml().css(str).toString() + +def urls(str): + links=page.getHtml().links().regex(str).all() + page.addTargetRequests(links); + +def tomap(key,value): + return "hello world" + diff --git a/webmagic-scripts/src/main/resources/python/oschina.py b/webmagic-scripts/src/main/resources/python/oschina.py new file mode 100755 index 0000000..51a188b --- /dev/null +++ b/webmagic-scripts/src/main/resources/python/oschina.py @@ -0,0 +1,4 @@ +title=xpath("div[@class=BlogTitle]") +urls="http://my\\.oschina\\.net/flashsword/blog/\\d+" + +result={"title":title,"urls":urls} diff --git a/webmagic-scripts/src/main/resources/ruby/defines.rb b/webmagic-scripts/src/main/resources/ruby/defines.rb old mode 100644 new mode 100755 diff --git a/webmagic-scripts/src/main/resources/ruby/github.rb b/webmagic-scripts/src/main/resources/ruby/github.rb old mode 100644 new mode 100755 diff --git a/webmagic-scripts/src/main/resources/ruby/oschina.rb b/webmagic-scripts/src/main/resources/ruby/oschina.rb index cbced0b..dbea13b 100644 --- a/webmagic-scripts/src/main/resources/ruby/oschina.rb +++ b/webmagic-scripts/src/main/resources/ruby/oschina.rb @@ -1,3 +1,6 @@ +urls "http://my\\.oschina\\.net/flashsword/blog/\\d+" title = css "div.BlogTitle h1" content = css "div.BlogContent" -urls "http://my\\.oschina\\.net/flashsword/blog/\\d+" \ No newline at end of file + +return {"title"=>title,"content"=>content} + diff --git a/webmagic-scripts/src/test/java/us/codecraft/webmagic/scripts/ScriptProcessorTest.java 
b/webmagic-scripts/src/test/java/us/codecraft/webmagic/scripts/ScriptProcessorTest.java old mode 100644 new mode 100755 index ec3f674..23fe093 --- a/webmagic-scripts/src/test/java/us/codecraft/webmagic/scripts/ScriptProcessorTest.java +++ b/webmagic-scripts/src/test/java/us/codecraft/webmagic/scripts/ScriptProcessorTest.java @@ -22,4 +22,12 @@ public class ScriptProcessorTest { pageProcessor.getSite().setSleepTime(0); Spider.create(pageProcessor).addUrl("http://my.oschina.net/flashsword/blog").setSpawnUrl(false).run(); } + + + @Test + public void testPythonProcessor() { + ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().language(Language.Jython).scriptFromClassPathFile("python/oschina.py").build(); + pageProcessor.getSite().setSleepTime(0); + Spider.create(pageProcessor).addUrl("http://my.oschina.net/flashsword/blog").setSpawnUrl(false).run(); + } } diff --git a/webmagic-scripts/src/test/resouces/log4j.xml b/webmagic-scripts/src/test/resouces/log4j.xml old mode 100644 new mode 100755 diff --git a/zh_docs/user-manual-new.md b/zh_docs/user-manual-new.md index 229c9a6..a8ae5c2 100644 --- a/zh_docs/user-manual-new.md +++ b/zh_docs/user-manual-new.md @@ -1,11 +1,11 @@ -WebMagic文档2.0版 +WebMagic in Action ======== WebMagic是一个简单灵活、便于二次开发的爬虫框架。除了可以便捷的实现一个爬虫,WebMagic还提供多线程功能,以及基本的分布式功能。 你可以直接使用WebMagic进行爬虫开发,也可以定制WebMagic以适应复杂项目的需要。 -## 1. 使用WebMagic +## 1. 在项目中使用WebMagic WebMagic主要包含两个jar包:`webmagic-core-{version}.jar`和`webmagic-extension-{version}.jar`。在项目中添加这两个包的依赖,即可使用WebMagic。 @@ -88,6 +88,8 @@ public class GithubRepoPageProcessor implements PageProcessor { ![runlog](http://static.oschina.net/uploads/space/2014/0403/103741_3Gf5_190591.png) +
+ ## 2.下载和编译源码 WebMagic是一个纯Java项目,如果你熟悉Maven,那么下载并编译源码是非常简单的。如果不熟悉Maven也没关系,这部分会介绍如何在Eclipse里导入这个项目。 @@ -158,21 +160,200 @@ Intellij Idea默认自带Maven支持,import项目时选择Maven项目即可。 ## 3. 基本的爬虫 -### 3.1 抽取内容(xpath, regex, css selector, jsonpath) +### 3.1 实现PageProcessor -### 3.2 发现链接 +在WebMagic里,实现一个基本的爬虫只需要编写一个类,实现`PageProcessor`接口即可。这个类基本上包含了抓取一个网站,你需要写的所有代码。 -### 3.3 处理多个页面 +以之前的`GithubRepoPageProcessor`为例,我将PageProcessor的定制分为三个部分,分别是爬虫的配置、页面元素的抽取和链接的发现。 -## 4. 使用注解 +```java +public class GithubRepoPageProcessor implements PageProcessor { -### 4.1 抽取内容(xpath, regex, css selector, jsonpath) + // 部分一:抓取网站的相关配置,包括编码、抓取间隔、重试次数等 + private Site site = Site.me().setRetryTimes(3).setSleepTime(1000); -### 4.2 发现链接 + @Override + // process是定制爬虫逻辑的核心接口,在这里编写抽取逻辑 + public void process(Page page) { + // 部分二:定义如何抽取页面信息,并保存下来 + page.putField("author", page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString()); + page.putField("name", page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString()); + if (page.getResultItems().get("name") == null) { + //skip this page + page.setSkip(true); + } + page.putField("readme", page.getHtml().xpath("//div[@id='readme']/tidyText()")); -### 4.3 处理多个页面 + // 部分三:从页面发现后续的url地址来抓取 + page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all()); + } -### 4.4 在POJO中实现复杂逻辑 + @Override + public Site getSite() { + return site; + } + + public static void main(String[] args) { + + Spider.create(new GithubRepoPageProcessor()) + //从"https://github.com/code4craft"开始抓 + .addUrl("https://github.com/code4craft") + //开启5个线程抓取 + .thread(5) + //启动爬虫 + .run(); + } +} +``` + +#### 3.1.1 爬虫的配置 + +第一部分关于爬虫的配置,包括编码、抓取间隔、超时时间、重试次数等,也包括一些模拟的参数,例如User Agent、cookie,以及代理的设置,我们会在第5章-“爬虫的配置”里进行介绍。在这里我们先简单设置一下:重试次数为3次,抓取间隔为一秒。 + +#### 3.1.2 页面元素的抽取 + +第二部分是爬虫的核心部分:对于下载到的Html页面,你如何从中抽取到你想要的信息?WebMagic里主要使用了三种抽取技术:XPath、正则表达式和CSS选择器。 + +1. 
XPath + + XPath本来是用于XML中获取元素的一种查询语言,但是用于Html也是比较方便的。例如: + + ```java + page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()") + ``` + 这段代码使用了XPath,它的意思是“查找所有class属性为'entry-title public'的h1元素,并找到它的strong子节点的a子节点,并提取a节点的文本信息”。 +对应的Html是这样子的: + + ![xpath-html](http://static.oschina.net/uploads/space/2014/0404/104607_Aqq8_190591.png) + +2. CSS选择器 + + CSS选择器是与XPath类似的语言。如果大家做过前端开发,肯定知道$('h1.entry-title')这种写法的含义。客观地说,它比XPath写起来要简单一些,但是如果写复杂一点的抽取规则,就相对要麻烦一点。 + +3. 正则表达式 + + 正则表达式则是一种通用的文本抽取语言。 + + ```java + page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all()); + ``` + + 这段代码就用到了正则表达式,它表示匹配所有"https://github.com/code4craft/webmagic"这样的链接。 + +XPath、CSS选择器和正则表达式的具体用法会在第4章“抽取工具详解”中讲到。 + +#### 3.1.3 链接的发现 + +有了处理页面的逻辑,我们的爬虫就接近完工了! + +但是现在还有一个问题:一个站点的页面是很多的,一开始我们不可能全部列举出来,于是如何发现后续的链接,是一个爬虫不可缺少的一部分。 + +```java +page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all()); +``` + +这段代码分为两部分,`page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all()`用于获取所有满足"(https://github\\.com/\\w+/\\w+)"这个正则表达式的链接,`page.addTargetRequests()`则将这些链接加入到待抓取的队列中去。 + +### 3.2 使用Selectable的链式API + +`Selectable`相关的链式API是WebMagic的一个核心功能。使用Selectable接口,你可以直接完成页面元素的链式抽取,也无需去关心抽取的细节。 + +在刚才的例子中可以看到,page.getHtml()返回的是一个`Html`对象,它实现了`Selectable`接口。这个接口包含一些重要的方法,我将它分为两类:抽取部分和获取结果部分。 + +#### 3.2.1 抽取部分API: + +| 方法 | 说明 | 示例 | +| ------------ | ------------- | ------------ | +| xpath(String xpath) | 使用XPath选择 | html.xpath("//div[@class='title']") | +| \$(String selector) | 使用Css选择器选择 | html.\$("div.title") | +| \$(String selector,String attr) | 使用Css选择器选择 | html.\$("div.title","text") | +| css(String selector) | 功能同$(),使用Css选择器选择 | html.css("div.title") | +| links() | 选择所有链接 | html.links() | +| regex(String regex) | 使用正则表达式抽取 | html.regex("\(.\*?)\") | +| regex(String regex,int group) | 使用正则表达式抽取,并指定捕获组 | html.regex("\(.\*?)\",1) | +| replace(String regex, String replacement) | 替换内容| html.replace("\","")| 
+ +这部分抽取API返回的都是一个`Selectable`接口,意思是说,抽取是支持链式调用的。下面我用一个实例来讲解链式API的使用。 + +例如,我现在要抓取github上所有的Java项目,这些项目可以在[https://github.com/search?l=Java&p=1&q=stars%3A%3E1&s=stars&type=Repositories](https://github.com/search?l=Java&p=1&q=stars%3A%3E1&s=stars&type=Repositories)搜索结果中看到。 + +为了避免抓取范围太宽,我指定只从分页部分抓取链接。这个抓取规则是比较复杂的,我要怎么写呢? + +![selectable-chain-ui](http://static.oschina.net/uploads/space/2014/0404/151454_2T01_190591.png) + +首先看到页面的html结构是这个样子的: + +![selectable-chain](http://static.oschina.net/uploads/space/2014/0404/151632_88Oq_190591.png) + +那么我可以先用CSS选择器提取出这个div,然后再取到所有的链接。为了保险起见,我再使用正则表达式限定一下提取出的URL的格式,那么最终的写法是这样子的: + +```java +List<String> urls = page.getHtml().css("div.pagination").links().regex(".*/search/\\?l=java.*").all(); +``` + +然后,我们可以把这些URL加到抓取列表中去: + +```java +List<String> urls = page.getHtml().css("div.pagination").links().regex(".*/search/\\?l=java.*").all(); +page.addTargetRequests(urls); +``` + +是不是比较简单?除了发现链接,Selectable的链式抽取还可以完成很多工作。我们会在第9章“实例”中再讲到。 + +#### 3.2.2 获取结果的API: + +当链式调用结束时,我们一般都想要拿到一个字符串类型的结果。这时候就需要用到获取结果的API了。我们知道,一条抽取规则,无论是XPath、CSS选择器或者正则表达式,总有可能抽取到多个元素。WebMagic对这些进行了统一,你可以通过不同的API获取到一个或者多个元素。 + +| 方法 | 说明 | 示例 | +| ------------ | ------------- | ------------ | +| get() | 返回一条String类型的结果 | String link= html.links().get()| +| toString() | 功能同get(),返回一条String类型的结果 | String link= html.links().toString()| +| all() | 返回所有抽取结果 | List\<String\> links= html.links().all()| +| match() | 是否有匹配结果 | if (html.links().match()){ xxx; }| + +例如,我们知道页面只会有一条结果,那么可以使用selectable.get()或者selectable.toString()拿到这条结果。 + +这里selectable.toString()采用了toString()这个接口,是为了在输出以及和一些框架结合的时候,更加方便。因为一般情况下,我们都只需要选择一个元素! 
+ +selectable.all()则会获取到所有元素。 + +好了,到现在为止,再回过头看看3.1中的GithubRepoPageProcessor,可能就觉得更加清晰了吧?执行main方法,已经可以看到抓取结果在控制台输出了。 + +### 3.3 保存结果 + +好了,爬虫编写完成,现在我们可能还有一个问题:我如果想把抓取的结果保存下来,要怎么做呢?WebMagic用于保存结果的组件叫做`Pipeline`。例如“控制台输出结果”这件事也是通过一个内置的Pipeline完成的,它叫做`ConsolePipeline`。那么,我现在想要把结果用Json的格式保存下来,怎么做呢?我只需要将Pipeline的实现换成"JsonFilePipeline"就可以了。 + +```java + public static void main(String[] args) { + + Spider.create(new GithubRepoPageProcessor()) + //从"https://github.com/code4craft"开始抓 + .addUrl("https://github.com/code4craft") + .addPipeline(new JsonFilePipeline("D:\\webmagic\\")) + //开启5个线程抓取 + .thread(5) + //启动爬虫 + .run(); + } +``` + +这样子下载下来的文件就会保存在D盘的webmagic目录中了。 + +通过定制Pipeline,我们还可以实现保存结果到文件、数据库等一系列功能。这个会在第7章“抽取结果的处理”中介绍。 + +至此,我们已经完成了一个基本爬虫的编写,也具有了一些定制功能。 + +
+ +## 4. 抽取工具详解 + +### 4.1 XPath + +### 4.2 CSS选择器 + +### 4.3 正则表达式 + +### 4.4 JsonPath ## 5. 配置爬虫 @@ -198,25 +379,25 @@ Intellij Idea默认自带Maven支持,import项目时选择Maven项目即可。 ### 6.4 定期抓取 -## 7. 管理URL +## 7. 抽取结果的处理 -### 7.1 手动添加URL +### 7.1 输出到控制台 -### 7.2 在URL中保存信息 +### 7.2 保存到文件 -### 7.3 几种URL管理方式 +### 7.3 JSON格式输出 -### 7.4 自己管理爬虫的URL +### 7.4 自定义持久化方式(mysql/mongodb…) -## 8. 抽取结果的处理 +## 8. 管理URL -### 8.1 输出到控制台 +### 8.1 手动添加URL -### 8.2 保存到文件 +### 8.2 在URL中保存信息 -### 8.3 JSON格式输出 +### 8.3 几种URL管理方式 -### 8.4 自定义持久化方式(mysql/mongodb…) +### 8.4 自己管理爬虫的URL ## 9. 实例 From 53184f0390e5c2e8b77d3642b8475f8e95788e63 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=84=A4=E6=80=92=E7=9A=84=E7=95=AA=E8=8C=84?= Date: Sat, 12 Apr 2014 23:00:37 +0800 Subject: [PATCH 4/5] test --- .../java/us/codecraft/webmagic/samples/HuabanProcessor.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webmagic-selenium/src/test/java/us/codecraft/webmagic/samples/HuabanProcessor.java b/webmagic-selenium/src/test/java/us/codecraft/webmagic/samples/HuabanProcessor.java index fcfb068..2854a76 100644 --- a/webmagic-selenium/src/test/java/us/codecraft/webmagic/samples/HuabanProcessor.java +++ b/webmagic-selenium/src/test/java/us/codecraft/webmagic/samples/HuabanProcessor.java @@ -30,7 +30,7 @@ public class HuabanProcessor implements PageProcessor { @Override public Site getSite() { - if (site == null) { + if (null == site) { site = Site.me().setDomain("huaban.com").setSleepTime(0); } return site; From 32ba1b888941a5b1fe6e908c1b3e9c902bb80e9e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=84=A4=E6=80=92=E7=9A=84=E7=95=AA=E8=8C=84?= Date: Sun, 13 Apr 2014 12:41:15 +0800 Subject: [PATCH 5/5] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E5=87=A0=E5=A4=84?= =?UTF-8?q?=E6=B3=A8=E9=87=8A=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- webmagic-core/src/main/java/us/codecraft/webmagic/Site.java | 2 +- .../src/main/java/us/codecraft/webmagic/Spider.java | 
6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java index e83e85f..48b43f0 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java @@ -218,7 +218,7 @@ public class Site { * * @deprecated * @see Spider#addRequest(Request...) - * @param startUrl + * @param startRequest * @return this */ public Site addStartRequest(Request startRequest) { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 6fe2880..8af1338 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -142,7 +142,7 @@ public class Spider implements Runnable, Task { * Set startUrls of Spider.
* Prior to startUrls of Site. * - * @param startUrls + * @param startRequests * @return this */ public Spider startRequest(List startRequests) { @@ -218,7 +218,7 @@ public class Spider implements Runnable, Task { /** * set pipelines for Spider * - * @param pipeline + * @param pipelines * @return this * @see Pipeline * @since 0.4.1 @@ -477,7 +477,7 @@ public class Spider implements Runnable, Task { /** * Add urls with information to crawl.
* - * @param urls + * @param requests * @return */ public Spider addRequest(Request... requests) {