From 9606a173cd9f887778f0a60669912a06e2d4d87d Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Thu, 13 Mar 2014 22:55:50 +0800 Subject: [PATCH 001/130] fix ZipCodePageProcessor --- .../webmagic/selector/PlainText.java | 7 ++- .../webmagic/selector/Selectable.java | 7 +++ .../scheduler/ZipCodePageProcessor.java | 44 +++++++++---------- 3 files changed, 33 insertions(+), 25 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java index bb1b868..9d5c385 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java @@ -109,7 +109,7 @@ public class PlainText implements Selectable { } @Override - public String toString() { + public String get() { if (CollectionUtils.isNotEmpty(all())) { return all().get(0); } else { @@ -117,6 +117,11 @@ public class PlainText implements Selectable { } } + @Override + public String toString() { + return get(); + } + @Override public boolean match() { return strings != null && strings.size() > 0; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java index 6b4410e..aa1bb62 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java @@ -99,6 +99,13 @@ public interface Selectable { */ public String toString(); + /** + * single string result + * + * @return single string result + */ + public String get(); + /** * if result exist for select * diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/scheduler/ZipCodePageProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/scheduler/ZipCodePageProcessor.java index ddbaa08..3f2de70 100644 --- 
a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/scheduler/ZipCodePageProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/scheduler/ZipCodePageProcessor.java @@ -9,8 +9,9 @@ import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.scheduler.PriorityScheduler; import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; -import static us.codecraft.webmagic.selector.Selectors.regex; import static us.codecraft.webmagic.selector.Selectors.xpath; /** @@ -19,16 +20,16 @@ import static us.codecraft.webmagic.selector.Selectors.xpath; public class ZipCodePageProcessor implements PageProcessor { private Site site = Site.me().setCharset("gb2312") - .setSleepTime(100).addStartUrl("http://www.ip138.com/post/"); + .setSleepTime(100); @Override public void process(Page page) { if (page.getUrl().toString().equals("http://www.ip138.com/post/")) { processCountry(page); - } else if (page.getUrl().regex("http://www\\.ip138\\.com/post/\\w+[/]?$").toString() != null) { - processProvince(page); - } else { + } else if (page.getUrl().regex("http://www\\.ip138\\.com/\\d{6}[/]?$").toString() != null) { processDistrict(page); + } else { + processProvince(page); } } @@ -45,28 +46,26 @@ public class ZipCodePageProcessor implements PageProcessor { private void processProvince(Page page) { //这里仅靠xpath没法精准定位,所以使用正则作为筛选,不符合正则的会被过滤掉 - List districts = page.getHtml().xpath("//body/table/tbody/tr/td").regex(".*http://www\\.ip138\\.com/post/\\w+/\\w+.*").all(); + List districts = page.getHtml().xpath("//body/table/tbody/tr[@bgcolor=\"#ffffff\"]").all(); + Pattern pattern = Pattern.compile("([^<>]+).*?href=\"(.*?)\"",Pattern.DOTALL); for (String district : districts) { - String link = xpath("//@href").select(district); - String title = xpath("/text()").select(district); - Request request = new Request(link).setPriority(1).putExtra("province", page.getRequest().getExtra("province")).putExtra("district", 
title); - page.addTargetRequest(request); + Matcher matcher = pattern.matcher(district); + while (matcher.find()) { + String title = matcher.group(1); + String link = matcher.group(2); + Request request = new Request(link).setPriority(1).putExtra("province", page.getRequest().getExtra("province")).putExtra("district", title); + page.addTargetRequest(request); + } } } private void processDistrict(Page page) { String province = page.getRequest().getExtra("province").toString(); String district = page.getRequest().getExtra("district").toString(); - List counties = page.getHtml().xpath("//body/table/tbody/tr").regex(".*\\d+.*").all(); - String regex = "]*>([^<>]+)]*>([^<>]+)]*>([^<>]+)]*>([^<>]+)"; - for (String county : counties) { - String county0 = regex(regex, 1).select(county); - String county1 = regex(regex, 2).select(county); - String zipCode = regex(regex, 3).select(county); - page.putField("result", StringUtils.join(new String[]{province, district, - county0, county1, zipCode}, "\t")); - } - List links = page.getHtml().links().regex("http://www\\.ip138\\.com/post/\\w+/\\w+").all(); + String zipCode = page.getHtml().regex("

邮编:(\\d+)

").toString(); + page.putField("result", StringUtils.join(new String[]{province, district, + zipCode}, "\t")); + List links = page.getHtml().links().regex("http://www\\.ip138\\.com/\\d{6}[/]?$").all(); for (String link : links) { page.addTargetRequest(new Request(link).setPriority(2).putExtra("province", province).putExtra("district", district)); } @@ -79,11 +78,8 @@ public class ZipCodePageProcessor implements PageProcessor { } public static void main(String[] args) { - Spider.create(new ZipCodePageProcessor()).scheduler(new PriorityScheduler()).run(); + Spider spider = Spider.create(new ZipCodePageProcessor()).scheduler(new PriorityScheduler()).addUrl("http://www.ip138.com/post/"); - PriorityScheduler scheduler = new PriorityScheduler(); - Spider spider = Spider.create(new ZipCodePageProcessor()).scheduler(scheduler); - scheduler.push(new Request("http://www.baidu.com/s?wd=webmagic&f=12&rsp=0&oq=webmagix&tn=baiduhome_pg&ie=utf-8"),spider); spider.run(); } } From 6c11718566a0164245470454f331d32d4968a90f Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 14 Mar 2014 23:24:25 +0800 Subject: [PATCH 002/130] Clean project structure #70 --- pom.xml | 22 +--- webmagic-admin/README.md | 3 + {webmagic-avalon => webmagic-admin}/pom.xml | 4 +- .../avalon/web/DashBoardController.java | 0 .../webmagic/avalon/web/SpiderController.java | 0 .../src/main/resources/freemarker.properties | 0 .../src/main/resources/log/log4j.xml | 0 .../spring/applicationContext-freemarker.xml | 0 .../spring/applicationContext-myBatis.xml | 0 .../resources/spring/applicationContext.xml | 0 .../src/main/webapp/WEB-INF/jsp/404.jsp | 0 .../src/main/webapp/WEB-INF/jsp/500.jsp | 0 .../webapp/WEB-INF/pages/create_spider.ftl | 0 .../main/webapp/WEB-INF/pages/dashboard.ftl | 0 .../main/webapp/WEB-INF/pages/spider_list.ftl | 0 .../src/main/webapp/WEB-INF/web.xml | 0 .../webapp/static/css/bootstrap-cerulean.css | 0 .../webapp/static/css/bootstrap-classic.css | 0 .../static/css/bootstrap-classic.min.css | 
0 .../webapp/static/css/bootstrap-cyborg.css | 0 .../webapp/static/css/bootstrap-journal.css | 0 .../main/webapp/static/css/bootstrap-redy.css | 0 .../static/css/bootstrap-responsive.css | 0 .../static/css/bootstrap-responsive.min.css | 0 .../webapp/static/css/bootstrap-simplex.css | 0 .../webapp/static/css/bootstrap-slate.css | 0 .../webapp/static/css/bootstrap-spacelab.css | 0 .../webapp/static/css/bootstrap-united.css | 0 .../main/webapp/static/css/charisma-app.css | 0 .../src/main/webapp/static/css/chosen.css | 0 .../src/main/webapp/static/css/colorbox.css | 0 .../main/webapp/static/css/elfinder.min.css | 0 .../main/webapp/static/css/elfinder.theme.css | 0 .../main/webapp/static/css/fullcalendar.css | 0 .../webapp/static/css/fullcalendar.print.css | 0 .../static/css/jquery-ui-1.8.21.custom.css | 0 .../webapp/static/css/jquery.cleditor.css | 0 .../static/css/jquery.iphone.toggle.css | 0 .../main/webapp/static/css/jquery.noty.css | 0 .../webapp/static/css/noty_theme_default.css | 0 .../src/main/webapp/static/css/opa-icons.css | 0 .../webapp/static/css/uniform.default.css | 0 .../src/main/webapp/static/css/uploadify.css | 0 .../src/main/webapp/static/favicon.jpg | Bin .../main/webapp/static/js/bootstrap-alert.js | 0 .../main/webapp/static/js/bootstrap-button.js | 0 .../webapp/static/js/bootstrap-carousel.js | 0 .../webapp/static/js/bootstrap-collapse.js | 0 .../webapp/static/js/bootstrap-dropdown.js | 0 .../main/webapp/static/js/bootstrap-modal.js | 0 .../webapp/static/js/bootstrap-popover.js | 0 .../webapp/static/js/bootstrap-scrollspy.js | 0 .../main/webapp/static/js/bootstrap-tab.js | 0 .../main/webapp/static/js/bootstrap-toggle.js | 0 .../webapp/static/js/bootstrap-tooltip.js | 0 .../main/webapp/static/js/bootstrap-tour.js | 0 .../webapp/static/js/bootstrap-transition.js | 0 .../webapp/static/js/bootstrap-typeahead.js | 0 .../src/main/webapp/static/js/charisma.js | 0 .../src/main/webapp/static/js/excanvas.js | 0 .../main/webapp/static/js/fullcalendar.min.js | 
0 .../main/webapp/static/js/jquery-1.7.2.min.js | 0 .../static/js/jquery-ui-1.8.21.custom.min.js | 0 .../static/js/jquery.autogrow-textarea.js | 0 .../webapp/static/js/jquery.chosen.min.js | 0 .../webapp/static/js/jquery.cleditor.min.js | 0 .../webapp/static/js/jquery.colorbox.min.js | 0 .../main/webapp/static/js/jquery.cookie.js | 0 .../webapp/static/js/jquery.dataTables.min.js | 0 .../webapp/static/js/jquery.elfinder.min.js | 0 .../main/webapp/static/js/jquery.flot.min.js | 0 .../webapp/static/js/jquery.flot.pie.min.js | 0 .../static/js/jquery.flot.resize.min.js | 0 .../webapp/static/js/jquery.flot.stack.js | 0 .../main/webapp/static/js/jquery.history.js | 0 .../webapp/static/js/jquery.iphone.toggle.js | 0 .../src/main/webapp/static/js/jquery.js | 0 .../src/main/webapp/static/js/jquery.noty.js | 0 .../main/webapp/static/js/jquery.raty.min.js | 0 .../webapp/static/js/jquery.uniform.min.js | 0 .../static/js/jquery.uploadify-3.1.min.js | 0 webmagic-core/pom.xml | 7 +- .../java/us/codecraft/webmagic/Spider.java | 10 -- .../us/codecraft/webmagic/selector/Html.java | 14 +-- .../webmagic/selector/Selectors.java | 8 +- .../webmagic/selector/XpathSelector.java | 62 ++------- .../webmagic/selector/XsoupSelector.java | 32 ----- .../webmagic/utils/EnvironmentUtil.java | 28 ----- .../webmagic/selector/ExtractorsTest.java | 2 +- .../webmagic/utils/EnvironmentUtilTest.java | 18 --- webmagic-extension/pom.xml | 2 +- .../webmagic/utils/ExtractorUtils.java | 7 +- webmagic-lucene/README.md | 3 - webmagic-lucene/pom.xml | 46 ------- .../webmagic/pipeline/LucenePipeline.java | 92 -------------- .../webmagic/lucene/OschinaBlog.java | 61 --------- webmagic-panel/README.md | 20 --- webmagic-panel/pom.xml | 35 ------ webmagic-samples/pom.xml | 2 +- webmagic-saxon/pom.xml | 8 +- .../webmagic/selector/XpathSelectorTest.java | 2 +- webmagic-scripts/pom.xml | 2 +- webmagic-selenium/pom.xml | 2 +- webmagic-worker/README.md | 3 + webmagic-worker/pom.xml | 118 ++++++++++++++++++ 
webmagic-worker/src/main/resources/log4j.xml | 26 ++++ 106 files changed, 195 insertions(+), 444 deletions(-) create mode 100644 webmagic-admin/README.md rename {webmagic-avalon => webmagic-admin}/pom.xml (97%) rename {webmagic-avalon => webmagic-admin}/src/main/java/us/codecraft/webmagic/avalon/web/DashBoardController.java (100%) rename {webmagic-avalon => webmagic-admin}/src/main/java/us/codecraft/webmagic/avalon/web/SpiderController.java (100%) rename {webmagic-avalon => webmagic-admin}/src/main/resources/freemarker.properties (100%) rename {webmagic-avalon => webmagic-admin}/src/main/resources/log/log4j.xml (100%) rename {webmagic-avalon => webmagic-admin}/src/main/resources/spring/applicationContext-freemarker.xml (100%) rename {webmagic-avalon => webmagic-admin}/src/main/resources/spring/applicationContext-myBatis.xml (100%) rename {webmagic-avalon => webmagic-admin}/src/main/resources/spring/applicationContext.xml (100%) rename {webmagic-avalon => webmagic-admin}/src/main/webapp/WEB-INF/jsp/404.jsp (100%) rename {webmagic-avalon => webmagic-admin}/src/main/webapp/WEB-INF/jsp/500.jsp (100%) rename {webmagic-avalon => webmagic-admin}/src/main/webapp/WEB-INF/pages/create_spider.ftl (100%) rename {webmagic-avalon => webmagic-admin}/src/main/webapp/WEB-INF/pages/dashboard.ftl (100%) rename {webmagic-avalon => webmagic-admin}/src/main/webapp/WEB-INF/pages/spider_list.ftl (100%) rename {webmagic-avalon => webmagic-admin}/src/main/webapp/WEB-INF/web.xml (100%) rename {webmagic-avalon => webmagic-admin}/src/main/webapp/static/css/bootstrap-cerulean.css (100%) rename {webmagic-avalon => webmagic-admin}/src/main/webapp/static/css/bootstrap-classic.css (100%) rename {webmagic-avalon => webmagic-admin}/src/main/webapp/static/css/bootstrap-classic.min.css (100%) rename {webmagic-avalon => webmagic-admin}/src/main/webapp/static/css/bootstrap-cyborg.css (100%) rename {webmagic-avalon => webmagic-admin}/src/main/webapp/static/css/bootstrap-journal.css (100%) rename 
{webmagic-avalon => webmagic-admin}/src/main/webapp/static/css/bootstrap-redy.css (100%) rename {webmagic-avalon => webmagic-admin}/src/main/webapp/static/css/bootstrap-responsive.css (100%) rename {webmagic-avalon => webmagic-admin}/src/main/webapp/static/css/bootstrap-responsive.min.css (100%) rename {webmagic-avalon => webmagic-admin}/src/main/webapp/static/css/bootstrap-simplex.css (100%) rename {webmagic-avalon => webmagic-admin}/src/main/webapp/static/css/bootstrap-slate.css (100%) rename {webmagic-avalon => webmagic-admin}/src/main/webapp/static/css/bootstrap-spacelab.css (100%) rename {webmagic-avalon => webmagic-admin}/src/main/webapp/static/css/bootstrap-united.css (100%) rename {webmagic-avalon => webmagic-admin}/src/main/webapp/static/css/charisma-app.css (100%) rename {webmagic-avalon => webmagic-admin}/src/main/webapp/static/css/chosen.css (100%) rename {webmagic-avalon => webmagic-admin}/src/main/webapp/static/css/colorbox.css (100%) rename {webmagic-avalon => webmagic-admin}/src/main/webapp/static/css/elfinder.min.css (100%) rename {webmagic-avalon => webmagic-admin}/src/main/webapp/static/css/elfinder.theme.css (100%) rename {webmagic-avalon => webmagic-admin}/src/main/webapp/static/css/fullcalendar.css (100%) rename {webmagic-avalon => webmagic-admin}/src/main/webapp/static/css/fullcalendar.print.css (100%) rename {webmagic-avalon => webmagic-admin}/src/main/webapp/static/css/jquery-ui-1.8.21.custom.css (100%) rename {webmagic-avalon => webmagic-admin}/src/main/webapp/static/css/jquery.cleditor.css (100%) rename {webmagic-avalon => webmagic-admin}/src/main/webapp/static/css/jquery.iphone.toggle.css (100%) rename {webmagic-avalon => webmagic-admin}/src/main/webapp/static/css/jquery.noty.css (100%) rename {webmagic-avalon => webmagic-admin}/src/main/webapp/static/css/noty_theme_default.css (100%) rename {webmagic-avalon => webmagic-admin}/src/main/webapp/static/css/opa-icons.css (100%) rename {webmagic-avalon => 
webmagic-admin}/src/main/webapp/static/css/uniform.default.css (100%) rename {webmagic-avalon => webmagic-admin}/src/main/webapp/static/css/uploadify.css (100%) rename {webmagic-avalon => webmagic-admin}/src/main/webapp/static/favicon.jpg (100%) rename {webmagic-avalon => webmagic-admin}/src/main/webapp/static/js/bootstrap-alert.js (100%) rename {webmagic-avalon => webmagic-admin}/src/main/webapp/static/js/bootstrap-button.js (100%) rename {webmagic-avalon => webmagic-admin}/src/main/webapp/static/js/bootstrap-carousel.js (100%) rename {webmagic-avalon => webmagic-admin}/src/main/webapp/static/js/bootstrap-collapse.js (100%) rename {webmagic-avalon => webmagic-admin}/src/main/webapp/static/js/bootstrap-dropdown.js (100%) rename {webmagic-avalon => webmagic-admin}/src/main/webapp/static/js/bootstrap-modal.js (100%) rename {webmagic-avalon => webmagic-admin}/src/main/webapp/static/js/bootstrap-popover.js (100%) rename {webmagic-avalon => webmagic-admin}/src/main/webapp/static/js/bootstrap-scrollspy.js (100%) rename {webmagic-avalon => webmagic-admin}/src/main/webapp/static/js/bootstrap-tab.js (100%) rename {webmagic-avalon => webmagic-admin}/src/main/webapp/static/js/bootstrap-toggle.js (100%) rename {webmagic-avalon => webmagic-admin}/src/main/webapp/static/js/bootstrap-tooltip.js (100%) rename {webmagic-avalon => webmagic-admin}/src/main/webapp/static/js/bootstrap-tour.js (100%) rename {webmagic-avalon => webmagic-admin}/src/main/webapp/static/js/bootstrap-transition.js (100%) rename {webmagic-avalon => webmagic-admin}/src/main/webapp/static/js/bootstrap-typeahead.js (100%) rename {webmagic-avalon => webmagic-admin}/src/main/webapp/static/js/charisma.js (100%) rename {webmagic-avalon => webmagic-admin}/src/main/webapp/static/js/excanvas.js (100%) rename {webmagic-avalon => webmagic-admin}/src/main/webapp/static/js/fullcalendar.min.js (100%) rename {webmagic-avalon => webmagic-admin}/src/main/webapp/static/js/jquery-1.7.2.min.js (100%) rename {webmagic-avalon => 
webmagic-admin}/src/main/webapp/static/js/jquery-ui-1.8.21.custom.min.js (100%) rename {webmagic-avalon => webmagic-admin}/src/main/webapp/static/js/jquery.autogrow-textarea.js (100%) rename {webmagic-avalon => webmagic-admin}/src/main/webapp/static/js/jquery.chosen.min.js (100%) rename {webmagic-avalon => webmagic-admin}/src/main/webapp/static/js/jquery.cleditor.min.js (100%) rename {webmagic-avalon => webmagic-admin}/src/main/webapp/static/js/jquery.colorbox.min.js (100%) rename {webmagic-avalon => webmagic-admin}/src/main/webapp/static/js/jquery.cookie.js (100%) rename {webmagic-avalon => webmagic-admin}/src/main/webapp/static/js/jquery.dataTables.min.js (100%) rename {webmagic-avalon => webmagic-admin}/src/main/webapp/static/js/jquery.elfinder.min.js (100%) rename {webmagic-avalon => webmagic-admin}/src/main/webapp/static/js/jquery.flot.min.js (100%) rename {webmagic-avalon => webmagic-admin}/src/main/webapp/static/js/jquery.flot.pie.min.js (100%) rename {webmagic-avalon => webmagic-admin}/src/main/webapp/static/js/jquery.flot.resize.min.js (100%) rename {webmagic-avalon => webmagic-admin}/src/main/webapp/static/js/jquery.flot.stack.js (100%) rename {webmagic-avalon => webmagic-admin}/src/main/webapp/static/js/jquery.history.js (100%) rename {webmagic-avalon => webmagic-admin}/src/main/webapp/static/js/jquery.iphone.toggle.js (100%) rename {webmagic-avalon => webmagic-admin}/src/main/webapp/static/js/jquery.js (100%) rename {webmagic-avalon => webmagic-admin}/src/main/webapp/static/js/jquery.noty.js (100%) rename {webmagic-avalon => webmagic-admin}/src/main/webapp/static/js/jquery.raty.min.js (100%) rename {webmagic-avalon => webmagic-admin}/src/main/webapp/static/js/jquery.uniform.min.js (100%) rename {webmagic-avalon => webmagic-admin}/src/main/webapp/static/js/jquery.uploadify-3.1.min.js (100%) delete mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/selector/XsoupSelector.java delete mode 100644 
webmagic-core/src/main/java/us/codecraft/webmagic/utils/EnvironmentUtil.java delete mode 100644 webmagic-core/src/test/java/us/codecraft/webmagic/utils/EnvironmentUtilTest.java delete mode 100644 webmagic-lucene/README.md delete mode 100644 webmagic-lucene/pom.xml delete mode 100644 webmagic-lucene/src/main/java/us/codecraft/webmagic/pipeline/LucenePipeline.java delete mode 100644 webmagic-lucene/src/main/test/java/us/codecraft/webmagic/lucene/OschinaBlog.java delete mode 100644 webmagic-panel/README.md delete mode 100644 webmagic-panel/pom.xml create mode 100644 webmagic-worker/README.md create mode 100644 webmagic-worker/pom.xml create mode 100644 webmagic-worker/src/main/resources/log4j.xml diff --git a/pom.xml b/pom.xml index 1828c71..77bc93e 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ 7 us.codecraft - 0.4.4-SNAPSHOT + 0.5.0-SNAPSHOT 4.0.0 pom @@ -51,11 +51,11 @@ webmagic-core webmagic-extension/ webmagic-scripts/ - webmagic-avalon - webmagic-lucene - webmagic-samples - webmagic-saxon webmagic-selenium + webmagic-saxon + webmagic-samples + webmagic-admin + webmagic-worker @@ -63,7 +63,7 @@ junit junit - 4.7 + 4.11 test @@ -91,11 +91,6 @@ xsoup 0.2.0 - - net.sf.saxon - Saxon-HE - 9.5.1-1 - com.alibaba fastjson @@ -121,11 +116,6 @@ commons-collections 3.2.1 - - net.sourceforge.htmlcleaner - htmlcleaner - 2.5 - org.apache.commons commons-io diff --git a/webmagic-admin/README.md b/webmagic-admin/README.md new file mode 100644 index 0000000..6e32c06 --- /dev/null +++ b/webmagic-admin/README.md @@ -0,0 +1,3 @@ +WebMagic-Admin +===== +Admin is the control web of workers. 
\ No newline at end of file diff --git a/webmagic-avalon/pom.xml b/webmagic-admin/pom.xml similarity index 97% rename from webmagic-avalon/pom.xml rename to webmagic-admin/pom.xml index a62bbe9..58068f9 100644 --- a/webmagic-avalon/pom.xml +++ b/webmagic-admin/pom.xml @@ -3,12 +3,12 @@ webmagic-parent us.codecraft - 0.4.4-SNAPSHOT + 0.5.0-SNAPSHOT 4.0.0 us.codecraft - webmagic-avalon + webmagic-admin war diff --git a/webmagic-avalon/src/main/java/us/codecraft/webmagic/avalon/web/DashBoardController.java b/webmagic-admin/src/main/java/us/codecraft/webmagic/avalon/web/DashBoardController.java similarity index 100% rename from webmagic-avalon/src/main/java/us/codecraft/webmagic/avalon/web/DashBoardController.java rename to webmagic-admin/src/main/java/us/codecraft/webmagic/avalon/web/DashBoardController.java diff --git a/webmagic-avalon/src/main/java/us/codecraft/webmagic/avalon/web/SpiderController.java b/webmagic-admin/src/main/java/us/codecraft/webmagic/avalon/web/SpiderController.java similarity index 100% rename from webmagic-avalon/src/main/java/us/codecraft/webmagic/avalon/web/SpiderController.java rename to webmagic-admin/src/main/java/us/codecraft/webmagic/avalon/web/SpiderController.java diff --git a/webmagic-avalon/src/main/resources/freemarker.properties b/webmagic-admin/src/main/resources/freemarker.properties similarity index 100% rename from webmagic-avalon/src/main/resources/freemarker.properties rename to webmagic-admin/src/main/resources/freemarker.properties diff --git a/webmagic-avalon/src/main/resources/log/log4j.xml b/webmagic-admin/src/main/resources/log/log4j.xml similarity index 100% rename from webmagic-avalon/src/main/resources/log/log4j.xml rename to webmagic-admin/src/main/resources/log/log4j.xml diff --git a/webmagic-avalon/src/main/resources/spring/applicationContext-freemarker.xml b/webmagic-admin/src/main/resources/spring/applicationContext-freemarker.xml similarity index 100% rename from 
webmagic-avalon/src/main/resources/spring/applicationContext-freemarker.xml rename to webmagic-admin/src/main/resources/spring/applicationContext-freemarker.xml diff --git a/webmagic-avalon/src/main/resources/spring/applicationContext-myBatis.xml b/webmagic-admin/src/main/resources/spring/applicationContext-myBatis.xml similarity index 100% rename from webmagic-avalon/src/main/resources/spring/applicationContext-myBatis.xml rename to webmagic-admin/src/main/resources/spring/applicationContext-myBatis.xml diff --git a/webmagic-avalon/src/main/resources/spring/applicationContext.xml b/webmagic-admin/src/main/resources/spring/applicationContext.xml similarity index 100% rename from webmagic-avalon/src/main/resources/spring/applicationContext.xml rename to webmagic-admin/src/main/resources/spring/applicationContext.xml diff --git a/webmagic-avalon/src/main/webapp/WEB-INF/jsp/404.jsp b/webmagic-admin/src/main/webapp/WEB-INF/jsp/404.jsp similarity index 100% rename from webmagic-avalon/src/main/webapp/WEB-INF/jsp/404.jsp rename to webmagic-admin/src/main/webapp/WEB-INF/jsp/404.jsp diff --git a/webmagic-avalon/src/main/webapp/WEB-INF/jsp/500.jsp b/webmagic-admin/src/main/webapp/WEB-INF/jsp/500.jsp similarity index 100% rename from webmagic-avalon/src/main/webapp/WEB-INF/jsp/500.jsp rename to webmagic-admin/src/main/webapp/WEB-INF/jsp/500.jsp diff --git a/webmagic-avalon/src/main/webapp/WEB-INF/pages/create_spider.ftl b/webmagic-admin/src/main/webapp/WEB-INF/pages/create_spider.ftl similarity index 100% rename from webmagic-avalon/src/main/webapp/WEB-INF/pages/create_spider.ftl rename to webmagic-admin/src/main/webapp/WEB-INF/pages/create_spider.ftl diff --git a/webmagic-avalon/src/main/webapp/WEB-INF/pages/dashboard.ftl b/webmagic-admin/src/main/webapp/WEB-INF/pages/dashboard.ftl similarity index 100% rename from webmagic-avalon/src/main/webapp/WEB-INF/pages/dashboard.ftl rename to webmagic-admin/src/main/webapp/WEB-INF/pages/dashboard.ftl diff --git 
a/webmagic-avalon/src/main/webapp/WEB-INF/pages/spider_list.ftl b/webmagic-admin/src/main/webapp/WEB-INF/pages/spider_list.ftl similarity index 100% rename from webmagic-avalon/src/main/webapp/WEB-INF/pages/spider_list.ftl rename to webmagic-admin/src/main/webapp/WEB-INF/pages/spider_list.ftl diff --git a/webmagic-avalon/src/main/webapp/WEB-INF/web.xml b/webmagic-admin/src/main/webapp/WEB-INF/web.xml similarity index 100% rename from webmagic-avalon/src/main/webapp/WEB-INF/web.xml rename to webmagic-admin/src/main/webapp/WEB-INF/web.xml diff --git a/webmagic-avalon/src/main/webapp/static/css/bootstrap-cerulean.css b/webmagic-admin/src/main/webapp/static/css/bootstrap-cerulean.css similarity index 100% rename from webmagic-avalon/src/main/webapp/static/css/bootstrap-cerulean.css rename to webmagic-admin/src/main/webapp/static/css/bootstrap-cerulean.css diff --git a/webmagic-avalon/src/main/webapp/static/css/bootstrap-classic.css b/webmagic-admin/src/main/webapp/static/css/bootstrap-classic.css similarity index 100% rename from webmagic-avalon/src/main/webapp/static/css/bootstrap-classic.css rename to webmagic-admin/src/main/webapp/static/css/bootstrap-classic.css diff --git a/webmagic-avalon/src/main/webapp/static/css/bootstrap-classic.min.css b/webmagic-admin/src/main/webapp/static/css/bootstrap-classic.min.css similarity index 100% rename from webmagic-avalon/src/main/webapp/static/css/bootstrap-classic.min.css rename to webmagic-admin/src/main/webapp/static/css/bootstrap-classic.min.css diff --git a/webmagic-avalon/src/main/webapp/static/css/bootstrap-cyborg.css b/webmagic-admin/src/main/webapp/static/css/bootstrap-cyborg.css similarity index 100% rename from webmagic-avalon/src/main/webapp/static/css/bootstrap-cyborg.css rename to webmagic-admin/src/main/webapp/static/css/bootstrap-cyborg.css diff --git a/webmagic-avalon/src/main/webapp/static/css/bootstrap-journal.css b/webmagic-admin/src/main/webapp/static/css/bootstrap-journal.css similarity index 100% rename 
from webmagic-avalon/src/main/webapp/static/css/bootstrap-journal.css rename to webmagic-admin/src/main/webapp/static/css/bootstrap-journal.css diff --git a/webmagic-avalon/src/main/webapp/static/css/bootstrap-redy.css b/webmagic-admin/src/main/webapp/static/css/bootstrap-redy.css similarity index 100% rename from webmagic-avalon/src/main/webapp/static/css/bootstrap-redy.css rename to webmagic-admin/src/main/webapp/static/css/bootstrap-redy.css diff --git a/webmagic-avalon/src/main/webapp/static/css/bootstrap-responsive.css b/webmagic-admin/src/main/webapp/static/css/bootstrap-responsive.css similarity index 100% rename from webmagic-avalon/src/main/webapp/static/css/bootstrap-responsive.css rename to webmagic-admin/src/main/webapp/static/css/bootstrap-responsive.css diff --git a/webmagic-avalon/src/main/webapp/static/css/bootstrap-responsive.min.css b/webmagic-admin/src/main/webapp/static/css/bootstrap-responsive.min.css similarity index 100% rename from webmagic-avalon/src/main/webapp/static/css/bootstrap-responsive.min.css rename to webmagic-admin/src/main/webapp/static/css/bootstrap-responsive.min.css diff --git a/webmagic-avalon/src/main/webapp/static/css/bootstrap-simplex.css b/webmagic-admin/src/main/webapp/static/css/bootstrap-simplex.css similarity index 100% rename from webmagic-avalon/src/main/webapp/static/css/bootstrap-simplex.css rename to webmagic-admin/src/main/webapp/static/css/bootstrap-simplex.css diff --git a/webmagic-avalon/src/main/webapp/static/css/bootstrap-slate.css b/webmagic-admin/src/main/webapp/static/css/bootstrap-slate.css similarity index 100% rename from webmagic-avalon/src/main/webapp/static/css/bootstrap-slate.css rename to webmagic-admin/src/main/webapp/static/css/bootstrap-slate.css diff --git a/webmagic-avalon/src/main/webapp/static/css/bootstrap-spacelab.css b/webmagic-admin/src/main/webapp/static/css/bootstrap-spacelab.css similarity index 100% rename from webmagic-avalon/src/main/webapp/static/css/bootstrap-spacelab.css 
rename to webmagic-admin/src/main/webapp/static/css/bootstrap-spacelab.css diff --git a/webmagic-avalon/src/main/webapp/static/css/bootstrap-united.css b/webmagic-admin/src/main/webapp/static/css/bootstrap-united.css similarity index 100% rename from webmagic-avalon/src/main/webapp/static/css/bootstrap-united.css rename to webmagic-admin/src/main/webapp/static/css/bootstrap-united.css diff --git a/webmagic-avalon/src/main/webapp/static/css/charisma-app.css b/webmagic-admin/src/main/webapp/static/css/charisma-app.css similarity index 100% rename from webmagic-avalon/src/main/webapp/static/css/charisma-app.css rename to webmagic-admin/src/main/webapp/static/css/charisma-app.css diff --git a/webmagic-avalon/src/main/webapp/static/css/chosen.css b/webmagic-admin/src/main/webapp/static/css/chosen.css similarity index 100% rename from webmagic-avalon/src/main/webapp/static/css/chosen.css rename to webmagic-admin/src/main/webapp/static/css/chosen.css diff --git a/webmagic-avalon/src/main/webapp/static/css/colorbox.css b/webmagic-admin/src/main/webapp/static/css/colorbox.css similarity index 100% rename from webmagic-avalon/src/main/webapp/static/css/colorbox.css rename to webmagic-admin/src/main/webapp/static/css/colorbox.css diff --git a/webmagic-avalon/src/main/webapp/static/css/elfinder.min.css b/webmagic-admin/src/main/webapp/static/css/elfinder.min.css similarity index 100% rename from webmagic-avalon/src/main/webapp/static/css/elfinder.min.css rename to webmagic-admin/src/main/webapp/static/css/elfinder.min.css diff --git a/webmagic-avalon/src/main/webapp/static/css/elfinder.theme.css b/webmagic-admin/src/main/webapp/static/css/elfinder.theme.css similarity index 100% rename from webmagic-avalon/src/main/webapp/static/css/elfinder.theme.css rename to webmagic-admin/src/main/webapp/static/css/elfinder.theme.css diff --git a/webmagic-avalon/src/main/webapp/static/css/fullcalendar.css b/webmagic-admin/src/main/webapp/static/css/fullcalendar.css similarity index 100% 
rename from webmagic-avalon/src/main/webapp/static/css/fullcalendar.css rename to webmagic-admin/src/main/webapp/static/css/fullcalendar.css diff --git a/webmagic-avalon/src/main/webapp/static/css/fullcalendar.print.css b/webmagic-admin/src/main/webapp/static/css/fullcalendar.print.css similarity index 100% rename from webmagic-avalon/src/main/webapp/static/css/fullcalendar.print.css rename to webmagic-admin/src/main/webapp/static/css/fullcalendar.print.css diff --git a/webmagic-avalon/src/main/webapp/static/css/jquery-ui-1.8.21.custom.css b/webmagic-admin/src/main/webapp/static/css/jquery-ui-1.8.21.custom.css similarity index 100% rename from webmagic-avalon/src/main/webapp/static/css/jquery-ui-1.8.21.custom.css rename to webmagic-admin/src/main/webapp/static/css/jquery-ui-1.8.21.custom.css diff --git a/webmagic-avalon/src/main/webapp/static/css/jquery.cleditor.css b/webmagic-admin/src/main/webapp/static/css/jquery.cleditor.css similarity index 100% rename from webmagic-avalon/src/main/webapp/static/css/jquery.cleditor.css rename to webmagic-admin/src/main/webapp/static/css/jquery.cleditor.css diff --git a/webmagic-avalon/src/main/webapp/static/css/jquery.iphone.toggle.css b/webmagic-admin/src/main/webapp/static/css/jquery.iphone.toggle.css similarity index 100% rename from webmagic-avalon/src/main/webapp/static/css/jquery.iphone.toggle.css rename to webmagic-admin/src/main/webapp/static/css/jquery.iphone.toggle.css diff --git a/webmagic-avalon/src/main/webapp/static/css/jquery.noty.css b/webmagic-admin/src/main/webapp/static/css/jquery.noty.css similarity index 100% rename from webmagic-avalon/src/main/webapp/static/css/jquery.noty.css rename to webmagic-admin/src/main/webapp/static/css/jquery.noty.css diff --git a/webmagic-avalon/src/main/webapp/static/css/noty_theme_default.css b/webmagic-admin/src/main/webapp/static/css/noty_theme_default.css similarity index 100% rename from webmagic-avalon/src/main/webapp/static/css/noty_theme_default.css rename to 
webmagic-admin/src/main/webapp/static/css/noty_theme_default.css diff --git a/webmagic-avalon/src/main/webapp/static/css/opa-icons.css b/webmagic-admin/src/main/webapp/static/css/opa-icons.css similarity index 100% rename from webmagic-avalon/src/main/webapp/static/css/opa-icons.css rename to webmagic-admin/src/main/webapp/static/css/opa-icons.css diff --git a/webmagic-avalon/src/main/webapp/static/css/uniform.default.css b/webmagic-admin/src/main/webapp/static/css/uniform.default.css similarity index 100% rename from webmagic-avalon/src/main/webapp/static/css/uniform.default.css rename to webmagic-admin/src/main/webapp/static/css/uniform.default.css diff --git a/webmagic-avalon/src/main/webapp/static/css/uploadify.css b/webmagic-admin/src/main/webapp/static/css/uploadify.css similarity index 100% rename from webmagic-avalon/src/main/webapp/static/css/uploadify.css rename to webmagic-admin/src/main/webapp/static/css/uploadify.css diff --git a/webmagic-avalon/src/main/webapp/static/favicon.jpg b/webmagic-admin/src/main/webapp/static/favicon.jpg similarity index 100% rename from webmagic-avalon/src/main/webapp/static/favicon.jpg rename to webmagic-admin/src/main/webapp/static/favicon.jpg diff --git a/webmagic-avalon/src/main/webapp/static/js/bootstrap-alert.js b/webmagic-admin/src/main/webapp/static/js/bootstrap-alert.js similarity index 100% rename from webmagic-avalon/src/main/webapp/static/js/bootstrap-alert.js rename to webmagic-admin/src/main/webapp/static/js/bootstrap-alert.js diff --git a/webmagic-avalon/src/main/webapp/static/js/bootstrap-button.js b/webmagic-admin/src/main/webapp/static/js/bootstrap-button.js similarity index 100% rename from webmagic-avalon/src/main/webapp/static/js/bootstrap-button.js rename to webmagic-admin/src/main/webapp/static/js/bootstrap-button.js diff --git a/webmagic-avalon/src/main/webapp/static/js/bootstrap-carousel.js b/webmagic-admin/src/main/webapp/static/js/bootstrap-carousel.js similarity index 100% rename from 
webmagic-avalon/src/main/webapp/static/js/bootstrap-carousel.js rename to webmagic-admin/src/main/webapp/static/js/bootstrap-carousel.js diff --git a/webmagic-avalon/src/main/webapp/static/js/bootstrap-collapse.js b/webmagic-admin/src/main/webapp/static/js/bootstrap-collapse.js similarity index 100% rename from webmagic-avalon/src/main/webapp/static/js/bootstrap-collapse.js rename to webmagic-admin/src/main/webapp/static/js/bootstrap-collapse.js diff --git a/webmagic-avalon/src/main/webapp/static/js/bootstrap-dropdown.js b/webmagic-admin/src/main/webapp/static/js/bootstrap-dropdown.js similarity index 100% rename from webmagic-avalon/src/main/webapp/static/js/bootstrap-dropdown.js rename to webmagic-admin/src/main/webapp/static/js/bootstrap-dropdown.js diff --git a/webmagic-avalon/src/main/webapp/static/js/bootstrap-modal.js b/webmagic-admin/src/main/webapp/static/js/bootstrap-modal.js similarity index 100% rename from webmagic-avalon/src/main/webapp/static/js/bootstrap-modal.js rename to webmagic-admin/src/main/webapp/static/js/bootstrap-modal.js diff --git a/webmagic-avalon/src/main/webapp/static/js/bootstrap-popover.js b/webmagic-admin/src/main/webapp/static/js/bootstrap-popover.js similarity index 100% rename from webmagic-avalon/src/main/webapp/static/js/bootstrap-popover.js rename to webmagic-admin/src/main/webapp/static/js/bootstrap-popover.js diff --git a/webmagic-avalon/src/main/webapp/static/js/bootstrap-scrollspy.js b/webmagic-admin/src/main/webapp/static/js/bootstrap-scrollspy.js similarity index 100% rename from webmagic-avalon/src/main/webapp/static/js/bootstrap-scrollspy.js rename to webmagic-admin/src/main/webapp/static/js/bootstrap-scrollspy.js diff --git a/webmagic-avalon/src/main/webapp/static/js/bootstrap-tab.js b/webmagic-admin/src/main/webapp/static/js/bootstrap-tab.js similarity index 100% rename from webmagic-avalon/src/main/webapp/static/js/bootstrap-tab.js rename to webmagic-admin/src/main/webapp/static/js/bootstrap-tab.js diff --git 
a/webmagic-avalon/src/main/webapp/static/js/bootstrap-toggle.js b/webmagic-admin/src/main/webapp/static/js/bootstrap-toggle.js similarity index 100% rename from webmagic-avalon/src/main/webapp/static/js/bootstrap-toggle.js rename to webmagic-admin/src/main/webapp/static/js/bootstrap-toggle.js diff --git a/webmagic-avalon/src/main/webapp/static/js/bootstrap-tooltip.js b/webmagic-admin/src/main/webapp/static/js/bootstrap-tooltip.js similarity index 100% rename from webmagic-avalon/src/main/webapp/static/js/bootstrap-tooltip.js rename to webmagic-admin/src/main/webapp/static/js/bootstrap-tooltip.js diff --git a/webmagic-avalon/src/main/webapp/static/js/bootstrap-tour.js b/webmagic-admin/src/main/webapp/static/js/bootstrap-tour.js similarity index 100% rename from webmagic-avalon/src/main/webapp/static/js/bootstrap-tour.js rename to webmagic-admin/src/main/webapp/static/js/bootstrap-tour.js diff --git a/webmagic-avalon/src/main/webapp/static/js/bootstrap-transition.js b/webmagic-admin/src/main/webapp/static/js/bootstrap-transition.js similarity index 100% rename from webmagic-avalon/src/main/webapp/static/js/bootstrap-transition.js rename to webmagic-admin/src/main/webapp/static/js/bootstrap-transition.js diff --git a/webmagic-avalon/src/main/webapp/static/js/bootstrap-typeahead.js b/webmagic-admin/src/main/webapp/static/js/bootstrap-typeahead.js similarity index 100% rename from webmagic-avalon/src/main/webapp/static/js/bootstrap-typeahead.js rename to webmagic-admin/src/main/webapp/static/js/bootstrap-typeahead.js diff --git a/webmagic-avalon/src/main/webapp/static/js/charisma.js b/webmagic-admin/src/main/webapp/static/js/charisma.js similarity index 100% rename from webmagic-avalon/src/main/webapp/static/js/charisma.js rename to webmagic-admin/src/main/webapp/static/js/charisma.js diff --git a/webmagic-avalon/src/main/webapp/static/js/excanvas.js b/webmagic-admin/src/main/webapp/static/js/excanvas.js similarity index 100% rename from 
webmagic-avalon/src/main/webapp/static/js/excanvas.js rename to webmagic-admin/src/main/webapp/static/js/excanvas.js diff --git a/webmagic-avalon/src/main/webapp/static/js/fullcalendar.min.js b/webmagic-admin/src/main/webapp/static/js/fullcalendar.min.js similarity index 100% rename from webmagic-avalon/src/main/webapp/static/js/fullcalendar.min.js rename to webmagic-admin/src/main/webapp/static/js/fullcalendar.min.js diff --git a/webmagic-avalon/src/main/webapp/static/js/jquery-1.7.2.min.js b/webmagic-admin/src/main/webapp/static/js/jquery-1.7.2.min.js similarity index 100% rename from webmagic-avalon/src/main/webapp/static/js/jquery-1.7.2.min.js rename to webmagic-admin/src/main/webapp/static/js/jquery-1.7.2.min.js diff --git a/webmagic-avalon/src/main/webapp/static/js/jquery-ui-1.8.21.custom.min.js b/webmagic-admin/src/main/webapp/static/js/jquery-ui-1.8.21.custom.min.js similarity index 100% rename from webmagic-avalon/src/main/webapp/static/js/jquery-ui-1.8.21.custom.min.js rename to webmagic-admin/src/main/webapp/static/js/jquery-ui-1.8.21.custom.min.js diff --git a/webmagic-avalon/src/main/webapp/static/js/jquery.autogrow-textarea.js b/webmagic-admin/src/main/webapp/static/js/jquery.autogrow-textarea.js similarity index 100% rename from webmagic-avalon/src/main/webapp/static/js/jquery.autogrow-textarea.js rename to webmagic-admin/src/main/webapp/static/js/jquery.autogrow-textarea.js diff --git a/webmagic-avalon/src/main/webapp/static/js/jquery.chosen.min.js b/webmagic-admin/src/main/webapp/static/js/jquery.chosen.min.js similarity index 100% rename from webmagic-avalon/src/main/webapp/static/js/jquery.chosen.min.js rename to webmagic-admin/src/main/webapp/static/js/jquery.chosen.min.js diff --git a/webmagic-avalon/src/main/webapp/static/js/jquery.cleditor.min.js b/webmagic-admin/src/main/webapp/static/js/jquery.cleditor.min.js similarity index 100% rename from webmagic-avalon/src/main/webapp/static/js/jquery.cleditor.min.js rename to 
webmagic-admin/src/main/webapp/static/js/jquery.cleditor.min.js diff --git a/webmagic-avalon/src/main/webapp/static/js/jquery.colorbox.min.js b/webmagic-admin/src/main/webapp/static/js/jquery.colorbox.min.js similarity index 100% rename from webmagic-avalon/src/main/webapp/static/js/jquery.colorbox.min.js rename to webmagic-admin/src/main/webapp/static/js/jquery.colorbox.min.js diff --git a/webmagic-avalon/src/main/webapp/static/js/jquery.cookie.js b/webmagic-admin/src/main/webapp/static/js/jquery.cookie.js similarity index 100% rename from webmagic-avalon/src/main/webapp/static/js/jquery.cookie.js rename to webmagic-admin/src/main/webapp/static/js/jquery.cookie.js diff --git a/webmagic-avalon/src/main/webapp/static/js/jquery.dataTables.min.js b/webmagic-admin/src/main/webapp/static/js/jquery.dataTables.min.js similarity index 100% rename from webmagic-avalon/src/main/webapp/static/js/jquery.dataTables.min.js rename to webmagic-admin/src/main/webapp/static/js/jquery.dataTables.min.js diff --git a/webmagic-avalon/src/main/webapp/static/js/jquery.elfinder.min.js b/webmagic-admin/src/main/webapp/static/js/jquery.elfinder.min.js similarity index 100% rename from webmagic-avalon/src/main/webapp/static/js/jquery.elfinder.min.js rename to webmagic-admin/src/main/webapp/static/js/jquery.elfinder.min.js diff --git a/webmagic-avalon/src/main/webapp/static/js/jquery.flot.min.js b/webmagic-admin/src/main/webapp/static/js/jquery.flot.min.js similarity index 100% rename from webmagic-avalon/src/main/webapp/static/js/jquery.flot.min.js rename to webmagic-admin/src/main/webapp/static/js/jquery.flot.min.js diff --git a/webmagic-avalon/src/main/webapp/static/js/jquery.flot.pie.min.js b/webmagic-admin/src/main/webapp/static/js/jquery.flot.pie.min.js similarity index 100% rename from webmagic-avalon/src/main/webapp/static/js/jquery.flot.pie.min.js rename to webmagic-admin/src/main/webapp/static/js/jquery.flot.pie.min.js diff --git 
a/webmagic-avalon/src/main/webapp/static/js/jquery.flot.resize.min.js b/webmagic-admin/src/main/webapp/static/js/jquery.flot.resize.min.js similarity index 100% rename from webmagic-avalon/src/main/webapp/static/js/jquery.flot.resize.min.js rename to webmagic-admin/src/main/webapp/static/js/jquery.flot.resize.min.js diff --git a/webmagic-avalon/src/main/webapp/static/js/jquery.flot.stack.js b/webmagic-admin/src/main/webapp/static/js/jquery.flot.stack.js similarity index 100% rename from webmagic-avalon/src/main/webapp/static/js/jquery.flot.stack.js rename to webmagic-admin/src/main/webapp/static/js/jquery.flot.stack.js diff --git a/webmagic-avalon/src/main/webapp/static/js/jquery.history.js b/webmagic-admin/src/main/webapp/static/js/jquery.history.js similarity index 100% rename from webmagic-avalon/src/main/webapp/static/js/jquery.history.js rename to webmagic-admin/src/main/webapp/static/js/jquery.history.js diff --git a/webmagic-avalon/src/main/webapp/static/js/jquery.iphone.toggle.js b/webmagic-admin/src/main/webapp/static/js/jquery.iphone.toggle.js similarity index 100% rename from webmagic-avalon/src/main/webapp/static/js/jquery.iphone.toggle.js rename to webmagic-admin/src/main/webapp/static/js/jquery.iphone.toggle.js diff --git a/webmagic-avalon/src/main/webapp/static/js/jquery.js b/webmagic-admin/src/main/webapp/static/js/jquery.js similarity index 100% rename from webmagic-avalon/src/main/webapp/static/js/jquery.js rename to webmagic-admin/src/main/webapp/static/js/jquery.js diff --git a/webmagic-avalon/src/main/webapp/static/js/jquery.noty.js b/webmagic-admin/src/main/webapp/static/js/jquery.noty.js similarity index 100% rename from webmagic-avalon/src/main/webapp/static/js/jquery.noty.js rename to webmagic-admin/src/main/webapp/static/js/jquery.noty.js diff --git a/webmagic-avalon/src/main/webapp/static/js/jquery.raty.min.js b/webmagic-admin/src/main/webapp/static/js/jquery.raty.min.js similarity index 100% rename from 
webmagic-avalon/src/main/webapp/static/js/jquery.raty.min.js rename to webmagic-admin/src/main/webapp/static/js/jquery.raty.min.js diff --git a/webmagic-avalon/src/main/webapp/static/js/jquery.uniform.min.js b/webmagic-admin/src/main/webapp/static/js/jquery.uniform.min.js similarity index 100% rename from webmagic-avalon/src/main/webapp/static/js/jquery.uniform.min.js rename to webmagic-admin/src/main/webapp/static/js/jquery.uniform.min.js diff --git a/webmagic-avalon/src/main/webapp/static/js/jquery.uploadify-3.1.min.js b/webmagic-admin/src/main/webapp/static/js/jquery.uploadify-3.1.min.js similarity index 100% rename from webmagic-avalon/src/main/webapp/static/js/jquery.uploadify-3.1.min.js rename to webmagic-admin/src/main/webapp/static/js/jquery.uploadify-3.1.min.js diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 43a6743..4bea6e2 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.4.4-SNAPSHOT + 0.5.0-SNAPSHOT 4.0.0 @@ -50,11 +50,6 @@ commons-collections - - net.sourceforge.htmlcleaner - htmlcleaner - - org.assertj assertj-core diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index b6f95ac..0d52ac1 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -13,7 +13,6 @@ import us.codecraft.webmagic.pipeline.ResultItemsCollectorPipeline; import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.scheduler.QueueScheduler; import us.codecraft.webmagic.scheduler.Scheduler; -import us.codecraft.webmagic.utils.EnvironmentUtil; import us.codecraft.webmagic.utils.ThreadUtils; import us.codecraft.webmagic.utils.UrlUtils; @@ -541,15 +540,6 @@ public class Spider implements Runnable, Task { return this; } - /** - * switch off xsoup - * - * @return - */ - public static void xsoupOff() { - 
EnvironmentUtil.setUseXsoup(false); - } - public boolean isExitWhenComplete() { return exitWhenComplete; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java index 3f5df76..3db0ff1 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java @@ -4,7 +4,6 @@ import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import us.codecraft.webmagic.utils.EnvironmentUtil; import java.util.ArrayList; import java.util.List; @@ -96,16 +95,11 @@ public class Html extends PlainText { @Override public Selectable xpath(String xpath) { - if (EnvironmentUtil.useXsoup()) { - XsoupSelector xsoupSelector = new XsoupSelector(xpath); - if (document != null) { - return new Html(xsoupSelector.selectList(document)); - } - return selectList(xsoupSelector, strings); - } else { - XpathSelector xpathSelector = new XpathSelector(xpath); - return selectList(xpathSelector, strings); + XpathSelector xpathSelector = new XpathSelector(xpath); + if (document != null) { + return new Html(xpathSelector.selectList(document)); } + return selectList(xpathSelector, strings); } @Override diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java index 0c34ead..6cac964 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java @@ -32,8 +32,12 @@ public abstract class Selectors { return new XpathSelector(expr); } - public static XsoupSelector xsoup(String expr) { - return new XsoupSelector(expr); + /** + * @Deprecated + * @see #xpath(String) + */ + public static XpathSelector xsoup(String expr) { + return new XpathSelector(expr); } public static 
AndSelector and(Selector... selectors) { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java index c0e428c..d1bbcae 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java @@ -1,70 +1,32 @@ package us.codecraft.webmagic.selector; -import org.htmlcleaner.*; +import org.jsoup.nodes.Element; +import us.codecraft.xsoup.XPathEvaluator; +import us.codecraft.xsoup.Xsoup; -import java.util.ArrayList; import java.util.List; /** - * XPath selector based on HtmlCleaner.
+ * XPath selector based on Xsoup.
* * @author code4crafter@gmail.com
- * @since 0.1.0 + * @since 0.3.0 */ -public class XpathSelector implements Selector { +public class XpathSelector extends BaseElementSelector { - private String xpathStr; + private XPathEvaluator xPathEvaluator; public XpathSelector(String xpathStr) { - this.xpathStr = xpathStr; + this.xPathEvaluator = Xsoup.compile(xpathStr); } @Override - public String select(String text) { - HtmlCleaner htmlCleaner = new HtmlCleaner(); - TagNode tagNode = htmlCleaner.clean(text); - if (tagNode == null) { - return null; - } - try { - Object[] objects = tagNode.evaluateXPath(xpathStr); - if (objects != null && objects.length >= 1) { - if (objects[0] instanceof TagNode) { - TagNode tagNode1 = (TagNode) objects[0]; - return htmlCleaner.getInnerHtml(tagNode1); - } else { - return objects[0].toString(); - } - } - } catch (XPatherException e) { - e.printStackTrace(); - } - return null; + public String select(Element element) { + return xPathEvaluator.evaluate(element).get(); } @Override - public List selectList(String text) { - HtmlCleaner htmlCleaner = new HtmlCleaner(); - TagNode tagNode = htmlCleaner.clean(text); - if (tagNode == null) { - return null; - } - List results = new ArrayList(); - try { - Object[] objects = tagNode.evaluateXPath(xpathStr); - if (objects != null && objects.length >= 1) { - for (Object object : objects) { - if (object instanceof TagNode) { - TagNode tagNode1 = (TagNode) object; - results.add(htmlCleaner.getInnerHtml(tagNode1)); - } else { - results.add(object.toString()); - } - } - } - } catch (XPatherException e) { - e.printStackTrace(); - } - return results; + public List selectList(Element element) { + return xPathEvaluator.evaluate(element).list(); } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XsoupSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XsoupSelector.java deleted file mode 100644 index ea46290..0000000 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XsoupSelector.java 
+++ /dev/null @@ -1,32 +0,0 @@ -package us.codecraft.webmagic.selector; - -import org.jsoup.nodes.Element; -import us.codecraft.xsoup.XPathEvaluator; -import us.codecraft.xsoup.Xsoup; - -import java.util.List; - -/** - * XPath selector based on Xsoup.
- * - * @author code4crafter@gmail.com
- * @since 0.3.0 - */ -public class XsoupSelector extends BaseElementSelector { - - private XPathEvaluator xPathEvaluator; - - public XsoupSelector(String xpathStr) { - this.xPathEvaluator = Xsoup.compile(xpathStr); - } - - @Override - public String select(Element element) { - return xPathEvaluator.evaluate(element).get(); - } - - @Override - public List selectList(Element element) { - return xPathEvaluator.evaluate(element).list(); - } -} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/EnvironmentUtil.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/EnvironmentUtil.java deleted file mode 100644 index 7aa5c13..0000000 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/EnvironmentUtil.java +++ /dev/null @@ -1,28 +0,0 @@ -package us.codecraft.webmagic.utils; - -import org.apache.commons.lang3.BooleanUtils; - -import java.util.Properties; - -/** - * @author code4crafter@gmail.com - * @since 0.3.0 - */ -public abstract class EnvironmentUtil { - - private static final String USE_XSOUP = "xsoup"; - - public static boolean useXsoup() { - Properties properties = System.getProperties(); - Object o = properties.get(USE_XSOUP); - if (o == null) { - return true; - } - return BooleanUtils.toBoolean(((String) o).toLowerCase()); - } - - public static void setUseXsoup(boolean useXsoup) { - Properties properties = System.getProperties(); - properties.setProperty(USE_XSOUP, BooleanUtils.toString(useXsoup, "true", "false")); - } -} diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/ExtractorsTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/ExtractorsTest.java index b398007..e8da48d 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/ExtractorsTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/ExtractorsTest.java @@ -29,6 +29,6 @@ public class ExtractorsTest { Assert.assertEquals("bb", and($("title"), regex("aa(bb)cc")).select(html2)); OrSelector or = 
or($("div h1 a", "innerHtml"), xpath("//title")); Assert.assertEquals("aabbcc", or.select(html)); - Assert.assertEquals("aabbcc", or.select(html2)); + Assert.assertEquals("aabbcc", or.select(html2)); } } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/utils/EnvironmentUtilTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/EnvironmentUtilTest.java deleted file mode 100644 index cb620e7..0000000 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/utils/EnvironmentUtilTest.java +++ /dev/null @@ -1,18 +0,0 @@ -package us.codecraft.webmagic.utils; - -import org.junit.Test; - -import static junit.framework.Assert.*; - -/** - * @author code4crafter@gmail.com - */ -public class EnvironmentUtilTest { - - @Test - public void test() { - assertTrue(EnvironmentUtil.useXsoup()); - EnvironmentUtil.setUseXsoup(false); - assertFalse(EnvironmentUtil.useXsoup()); - } -} diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index ad22edd..cd8c12f 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.4.4-SNAPSHOT + 0.5.0-SNAPSHOT 4.0.0 diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/ExtractorUtils.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/ExtractorUtils.java index 0818fde..54a4439 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/ExtractorUtils.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/ExtractorUtils.java @@ -37,12 +37,7 @@ public class ExtractorUtils { } private static Selector getXpathSelector(String value) { - Selector selector; - if (EnvironmentUtil.useXsoup()) { - selector = new XsoupSelector(value); - } else { - selector = new XpathSelector(value); - } + Selector selector = new XpathSelector(value); return selector; } diff --git a/webmagic-lucene/README.md b/webmagic-lucene/README.md deleted file mode 100644 index 77050ab..0000000 --- 
a/webmagic-lucene/README.md +++ /dev/null @@ -1,3 +0,0 @@ -webmagic-lucene --------- -尝试将webmagic与lucene结合,打造一个搜索引擎。开发中,不作为webmagic主要模块。 \ No newline at end of file diff --git a/webmagic-lucene/pom.xml b/webmagic-lucene/pom.xml deleted file mode 100644 index f8de71b..0000000 --- a/webmagic-lucene/pom.xml +++ /dev/null @@ -1,46 +0,0 @@ - - - - webmagic-parent - us.codecraft - 0.4.4-SNAPSHOT - - 4.0.0 - - webmagic-lucene - - - - org.apache.lucene - lucene-analyzers-common - 4.4.0 - - - org.apache.lucene - lucene-queryparser - 4.4.0 - - - us.codecraft - webmagic-extension - ${project.version} - - - junit - junit - - - - - - - maven-deploy-plugin - - true - - - - - - - \ No newline at end of file diff --git a/webmagic-lucene/src/main/java/us/codecraft/webmagic/pipeline/LucenePipeline.java b/webmagic-lucene/src/main/java/us/codecraft/webmagic/pipeline/LucenePipeline.java deleted file mode 100644 index 6fe2702..0000000 --- a/webmagic-lucene/src/main/java/us/codecraft/webmagic/pipeline/LucenePipeline.java +++ /dev/null @@ -1,92 +0,0 @@ -package us.codecraft.webmagic.pipeline; - -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.standard.StandardAnalyzer; -import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; -import org.apache.lucene.document.TextField; -import org.apache.lucene.index.DirectoryReader; -import org.apache.lucene.index.IndexWriter; -import org.apache.lucene.index.IndexWriterConfig; -import org.apache.lucene.queryparser.classic.ParseException; -import org.apache.lucene.queryparser.classic.QueryParser; -import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.Query; -import org.apache.lucene.search.ScoreDoc; -import org.apache.lucene.store.Directory; -import org.apache.lucene.store.RAMDirectory; -import org.apache.lucene.util.Version; -import us.codecraft.webmagic.ResultItems; -import us.codecraft.webmagic.Task; - -import java.io.IOException; -import java.util.ArrayList; 
-import java.util.List; -import java.util.Map; - -/** - * @author code4crafter@gmail.com
- * Date: 13-8-5
- * Time: 下午2:11
- */ -public class LucenePipeline implements Pipeline { - - private Directory directory; - - private Analyzer analyzer; - - private IndexWriterConfig config; - - private void init() throws IOException { - analyzer = new StandardAnalyzer(Version.LUCENE_44); - directory = new RAMDirectory(); - config = new IndexWriterConfig(Version.LUCENE_44, analyzer); - } - - public LucenePipeline() { - try { - init(); - } catch (IOException e) { - e.printStackTrace(); - } - } - - public List search(String fieldName, String value) throws IOException, ParseException { - List documents = new ArrayList(); - DirectoryReader ireader = DirectoryReader.open(directory); - IndexSearcher isearcher = new IndexSearcher(ireader); - // Parse a simple query that searches for "text": - QueryParser parser = new QueryParser(Version.LUCENE_44, fieldName, analyzer); - Query query = parser.parse(value); - ScoreDoc[] hits = isearcher.search(query, null, 1000).scoreDocs; - // Iterate through the results: - for (int i = 0; i < hits.length; i++) { - Document hitDoc = isearcher.doc(hits[i].doc); - documents.add(hitDoc); - } - ireader.close(); - return documents; - } - - @Override - public void process(ResultItems resultItems, Task task) { - if (resultItems.isSkip()){ - return; - } - Document doc = new Document(); - Map all = resultItems.getAll(); - if (all==null){ - return; - } - for (Map.Entry objectEntry : all.entrySet()) { - doc.add(new Field(objectEntry.getKey(), objectEntry.getValue().toString(), TextField.TYPE_STORED)); - } - try { - IndexWriter indexWriter = new IndexWriter(directory, config); - indexWriter.addDocument(doc); - indexWriter.close(); - } catch (IOException e) { - e.printStackTrace(); - } - } -} diff --git a/webmagic-lucene/src/main/test/java/us/codecraft/webmagic/lucene/OschinaBlog.java b/webmagic-lucene/src/main/test/java/us/codecraft/webmagic/lucene/OschinaBlog.java deleted file mode 100644 index b350370..0000000 --- 
a/webmagic-lucene/src/main/test/java/us/codecraft/webmagic/lucene/OschinaBlog.java +++ /dev/null @@ -1,61 +0,0 @@ -package us.codecraft.webmagic.lucene; - -import org.apache.lucene.document.Document; -import org.apache.lucene.queryparser.classic.ParseException; -import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.model.annotation.ExtractBy; -import us.codecraft.webmagic.model.OOSpider; -import us.codecraft.webmagic.model.annotation.TargetUrl; -import us.codecraft.webmagic.pipeline.LucenePipeline; - -import java.io.IOException; -import java.util.List; - -/** - * @author code4crafter@gmail.com
- * Date: 13-8-2
- * Time: 上午7:52
- */ -@TargetUrl("http://my.oschina.net/flashsword/blog/\\d+") -public class OschinaBlog { - - @ExtractBy("//title") - private String title; - - @ExtractBy(value = "div.BlogContent", type = ExtractBy.Type.Css) - private String content; - - @Override - public String toString() { - return "OschinaBlog{" + - "title='" + title + '\'' + - ", content='" + content + '\'' + - '}'; - } - - public static void main(String[] args) { - LucenePipeline pipeline = new LucenePipeline(); - OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog"), OschinaBlog.class).pipeline(pipeline).runAsync(); - while (true) { - try { - List search = pipeline.search("title", "webmagic"); - System.out.println(search); - Thread.sleep(3000); - } catch (IOException e) { - e.printStackTrace(); - } catch (ParseException e) { - e.printStackTrace(); - } catch (InterruptedException e) { - e.printStackTrace(); - } - } - } - - public String getTitle() { - return title; - } - - public String getContent() { - return content; - } -} diff --git a/webmagic-panel/README.md b/webmagic-panel/README.md deleted file mode 100644 index 30ddd13..0000000 --- a/webmagic-panel/README.md +++ /dev/null @@ -1,20 +0,0 @@ -Worker: - -任务执行者,提供Http接口,监控运行状态,终止和开始job - -队列: - -仍然使用redis - -Panel: - -提供Web管理后台,管理 - - - -1. 新建任务 - 1. 通过脚本 - 2. 配置 - 3. 分配机器 -2. 已有任务 -3. 
任务查看 \ No newline at end of file diff --git a/webmagic-panel/pom.xml b/webmagic-panel/pom.xml deleted file mode 100644 index 288e8df..0000000 --- a/webmagic-panel/pom.xml +++ /dev/null @@ -1,35 +0,0 @@ - - - - webmagic-parent - us.codecraft - 0.4.3-SNAPSHOT - - 4.0.0 - - us.codecraft - webmagic-panel - - - - us.codecraft - webmagic-scripts - ${project.version} - - - - - - - maven-deploy-plugin - - true - - - - - - - \ No newline at end of file diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index f13b7ea..3868dda 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.4.4-SNAPSHOT + 0.5.0-SNAPSHOT 4.0.0 diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index 1632b45..f63c21f 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.4.4-SNAPSHOT + 0.5.0-SNAPSHOT 4.0.0 @@ -15,9 +15,15 @@ webmagic-core ${project.version}
+ + net.sourceforge.htmlcleaner + htmlcleaner + 2.5 + net.sf.saxon Saxon-HE + 9.5.1-1 junit diff --git a/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java b/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java index 895ec4b..728bd69 100644 --- a/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java +++ b/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java @@ -1350,7 +1350,7 @@ public class XpathSelectorTest { + "\n" + "\n" + " \n" + " \n" + " \n" + "\n"; String text2 = "
aaa
"; XpathSelector xpathSelector = new XpathSelector( - "//div[@id='main']/div[@class='blog_main']/div[1][@class='blog_title']/h3/a"); + "//div[@id='main']/div[@class='blog_main']/div[@class='blog_title']/h3/a/text()"); String select = xpathSelector.select(text); Assert.assertEquals("jsoup 解析页面商品信息", select); } diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index b9c6e54..5c21160 100644 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.4.4-SNAPSHOT + 0.5.0-SNAPSHOT 4.0.0 diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index 0c7cfc7..01577ce 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.4.4-SNAPSHOT + 0.5.0-SNAPSHOT 4.0.0 diff --git a/webmagic-worker/README.md b/webmagic-worker/README.md new file mode 100644 index 0000000..334ab0e --- /dev/null +++ b/webmagic-worker/README.md @@ -0,0 +1,3 @@ +WebMagic-Worker +===== +Worker is the spider container. 
\ No newline at end of file diff --git a/webmagic-worker/pom.xml b/webmagic-worker/pom.xml new file mode 100644 index 0000000..cf10ab5 --- /dev/null +++ b/webmagic-worker/pom.xml @@ -0,0 +1,118 @@ + + + + webmagic-parent + us.codecraft + 0.5.0-SNAPSHOT + + 4.0.0 + + webmagic-worker + war + + + + us.codecraft + webmagic-scripts + ${project.version} + + + + org.mybatis + mybatis + 3.1.1 + + + + org.mybatis + mybatis-spring + 1.1.1 + + + + org.freemarker + freemarker + 2.3.19 + + + org.springframework + spring-test + ${spring-version} + test + + + + org.springframework + spring-aop + ${spring-version} + + + + org.aspectj + aspectjrt + 1.7.2 + + + org.aspectj + aspectjweaver + 1.7.2 + + + org.springframework + spring-core + ${spring-version} + + + org.springframework + spring-webmvc + ${spring-version} + + + + javax.servlet + javax.servlet-api + 3.0.1 + + + org.springframework + spring-context + ${spring-version} + + + org.springframework + spring-context-support + ${spring-version} + + + com.alibaba + fastjson + 1.1.37 + + + + + + + maven-deploy-plugin + + true + + + + org.apache.maven.plugins + maven-jar-plugin + 2.4 + + + + true + ./lib/ + us.codecraft.webmagic.main.QuickStarter + + + + + + + + diff --git a/webmagic-worker/src/main/resources/log4j.xml b/webmagic-worker/src/main/resources/log4j.xml new file mode 100644 index 0000000..a6630f8 --- /dev/null +++ b/webmagic-worker/src/main/resources/log4j.xml @@ -0,0 +1,26 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + From 79e6eedb3115b8ec6442403025b831141345a8bd Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 14 Mar 2014 23:52:56 +0800 Subject: [PATCH 003/130] add web.xml for ci pass in webamgic-worker #70 --- .../avalon/web/DashBoardController.java | 20 +++++ .../webmagic/avalon/web/SpiderController.java | 24 ++++++ .../src/main/resources/freemarker.properties | 7 ++ .../src/main/resources/{ => log}/log4j.xml | 7 +- .../spring/applicationContext-freemarker.xml | 34 +++++++++ 
.../spring/applicationContext-myBatis.xml | 21 ++++++ .../resources/spring/applicationContext.xml | 45 +++++++++++ .../src/main/webapp/WEB-INF/jsp/404.jsp | 74 +++++++++++++++++++ .../src/main/webapp/WEB-INF/jsp/500.jsp | 18 +++++ .../src/main/webapp/WEB-INF/web.xml | 53 +++++++++++++ 10 files changed, 297 insertions(+), 6 deletions(-) create mode 100644 webmagic-worker/src/main/java/us/codecraft/webmagic/avalon/web/DashBoardController.java create mode 100644 webmagic-worker/src/main/java/us/codecraft/webmagic/avalon/web/SpiderController.java create mode 100644 webmagic-worker/src/main/resources/freemarker.properties rename webmagic-worker/src/main/resources/{ => log}/log4j.xml (76%) create mode 100644 webmagic-worker/src/main/resources/spring/applicationContext-freemarker.xml create mode 100644 webmagic-worker/src/main/resources/spring/applicationContext-myBatis.xml create mode 100644 webmagic-worker/src/main/resources/spring/applicationContext.xml create mode 100644 webmagic-worker/src/main/webapp/WEB-INF/jsp/404.jsp create mode 100644 webmagic-worker/src/main/webapp/WEB-INF/jsp/500.jsp create mode 100644 webmagic-worker/src/main/webapp/WEB-INF/web.xml diff --git a/webmagic-worker/src/main/java/us/codecraft/webmagic/avalon/web/DashBoardController.java b/webmagic-worker/src/main/java/us/codecraft/webmagic/avalon/web/DashBoardController.java new file mode 100644 index 0000000..3ef2a86 --- /dev/null +++ b/webmagic-worker/src/main/java/us/codecraft/webmagic/avalon/web/DashBoardController.java @@ -0,0 +1,20 @@ +package us.codecraft.webmagic.avalon.web; + +import org.springframework.stereotype.Controller; +import org.springframework.web.bind.annotation.RequestMapping; +import org.springframework.web.servlet.ModelAndView; + +/** + * @author code4crafter@gmail.com + */ +@Controller("dashboard") +@RequestMapping("/") +public class DashBoardController { + + @RequestMapping + public ModelAndView index() { + ModelAndView map = new ModelAndView("dashboard"); + return map; + } 
+ +} diff --git a/webmagic-worker/src/main/java/us/codecraft/webmagic/avalon/web/SpiderController.java b/webmagic-worker/src/main/java/us/codecraft/webmagic/avalon/web/SpiderController.java new file mode 100644 index 0000000..2f18569 --- /dev/null +++ b/webmagic-worker/src/main/java/us/codecraft/webmagic/avalon/web/SpiderController.java @@ -0,0 +1,24 @@ +package us.codecraft.webmagic.avalon.web; + +import org.springframework.stereotype.Controller; +import org.springframework.web.bind.annotation.RequestMapping; +import org.springframework.web.bind.annotation.ResponseBody; + +import java.util.HashMap; +import java.util.Map; + +/** + * @author code4crafter@gmail.com + */ +@Controller("spider") +@RequestMapping("spider") +public class SpiderController { + + @RequestMapping("create") + @ResponseBody + public Map create() { + HashMap map = new HashMap(); + map.put("code", 200); + return map; + } +} diff --git a/webmagic-worker/src/main/resources/freemarker.properties b/webmagic-worker/src/main/resources/freemarker.properties new file mode 100644 index 0000000..dbed67f --- /dev/null +++ b/webmagic-worker/src/main/resources/freemarker.properties @@ -0,0 +1,7 @@ +number_format=# +classic_compatible=true + +default_encoding=UTF-8 +template_update_delay=0 +######################### +template_exception_handler=rethrow diff --git a/webmagic-worker/src/main/resources/log4j.xml b/webmagic-worker/src/main/resources/log/log4j.xml similarity index 76% rename from webmagic-worker/src/main/resources/log4j.xml rename to webmagic-worker/src/main/resources/log/log4j.xml index a6630f8..c2b5a2f 100644 --- a/webmagic-worker/src/main/resources/log4j.xml +++ b/webmagic-worker/src/main/resources/log/log4j.xml @@ -8,12 +8,7 @@ - - - - - - + diff --git a/webmagic-worker/src/main/resources/spring/applicationContext-freemarker.xml b/webmagic-worker/src/main/resources/spring/applicationContext-freemarker.xml new file mode 100644 index 0000000..e7b98aa --- /dev/null +++ 
b/webmagic-worker/src/main/resources/spring/applicationContext-freemarker.xml @@ -0,0 +1,34 @@ + + + + + + + + + 0 + zh_CN + yyyy-MM-dd HH:mm:ss + yyyy-MM-dd + #.## + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/webmagic-worker/src/main/resources/spring/applicationContext-myBatis.xml b/webmagic-worker/src/main/resources/spring/applicationContext-myBatis.xml new file mode 100644 index 0000000..222df02 --- /dev/null +++ b/webmagic-worker/src/main/resources/spring/applicationContext-myBatis.xml @@ -0,0 +1,21 @@ + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/webmagic-worker/src/main/resources/spring/applicationContext.xml b/webmagic-worker/src/main/resources/spring/applicationContext.xml new file mode 100644 index 0000000..1a2ac66 --- /dev/null +++ b/webmagic-worker/src/main/resources/spring/applicationContext.xml @@ -0,0 +1,45 @@ + + + + + + + + web_messages + + + + + + + + + + + + + text/html;charset=UTF-8 + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/webmagic-worker/src/main/webapp/WEB-INF/jsp/404.jsp b/webmagic-worker/src/main/webapp/WEB-INF/jsp/404.jsp new file mode 100644 index 0000000..9a3348f --- /dev/null +++ b/webmagic-worker/src/main/webapp/WEB-INF/jsp/404.jsp @@ -0,0 +1,74 @@ +<%@ page language="java" contentType="text/html; charset=utf8" + pageEncoding="utf8"%> + + + + + + + Page not found · GitLab Pages + + + + +
+ +

404

+

There isn't a Gitlab Page here.

+ +

Forgive my poor design.

+

You can edit 404.jsp to customize your 404 page.

+ + +
+ + diff --git a/webmagic-worker/src/main/webapp/WEB-INF/jsp/500.jsp b/webmagic-worker/src/main/webapp/WEB-INF/jsp/500.jsp new file mode 100644 index 0000000..150df3a --- /dev/null +++ b/webmagic-worker/src/main/webapp/WEB-INF/jsp/500.jsp @@ -0,0 +1,18 @@ +<%@ page language="java" contentType="text/html; charset=utf8" + pageEncoding="utf8" isErrorPage="true" import="java.io.*"%> + + + + + 500 + + +页面出错啦! +<% + + StringWriter stringWriter = new StringWriter(); + exception.printStackTrace(new PrintWriter(stringWriter)); + out.println(stringWriter.toString()); +%> + + \ No newline at end of file diff --git a/webmagic-worker/src/main/webapp/WEB-INF/web.xml b/webmagic-worker/src/main/webapp/WEB-INF/web.xml new file mode 100644 index 0000000..eb253f3 --- /dev/null +++ b/webmagic-worker/src/main/webapp/WEB-INF/web.xml @@ -0,0 +1,53 @@ + + + Archetype Created Web Application + + + contextConfigLocation + + classpath*:spring/applicationContext*.xml, + + + + + contextClass + org.springframework.web.context.support.XmlWebApplicationContext + + + + + log4jConfigLocation + classpath:log/log4j.xml + + + + log4jRefreshInterval + 60000 + + + + + spring + org.springframework.web.servlet.DispatcherServlet + + contextConfigLocation + classpath:/spring/applicationContext*.xml + + 1 + + + spring + / + + + 404 + /WEB-INF/jsp/404.jsp + + + 500 + /WEB-INF/jsp/500.jsp + + + From c2d27056ea6220e6d96134fb42688b09059ee6a6 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 15 Mar 2014 19:06:05 +0800 Subject: [PATCH 004/130] move worker and admin into avalon #70 --- .gitmodules | 3 + pom.xml | 3 +- webmagic-avalon/forger | 1 + webmagic-avalon/pom.xml | 120 ++++++++++++++++++ .../webmagic-admin}/README.md | 0 .../webmagic-admin}/pom.xml | 2 +- .../avalon/web/DashBoardController.java | 0 .../webmagic/avalon/web/SpiderController.java | 0 .../src/main/resources/freemarker.properties | 0 .../src/main/resources/log/log4j.xml | 0 .../spring/applicationContext-freemarker.xml | 0 
.../spring/applicationContext-myBatis.xml | 0 .../resources/spring/applicationContext.xml | 0 .../src/main/webapp/WEB-INF/jsp/404.jsp | 0 .../src/main/webapp/WEB-INF/jsp/500.jsp | 0 .../webapp/WEB-INF/pages/create_spider.ftl | 0 .../main/webapp/WEB-INF/pages/dashboard.ftl | 0 .../main/webapp/WEB-INF/pages/spider_list.ftl | 0 .../src/main/webapp/WEB-INF/web.xml | 0 .../webapp/static/css/bootstrap-cerulean.css | 0 .../webapp/static/css/bootstrap-classic.css | 0 .../static/css/bootstrap-classic.min.css | 0 .../webapp/static/css/bootstrap-cyborg.css | 0 .../webapp/static/css/bootstrap-journal.css | 0 .../main/webapp/static/css/bootstrap-redy.css | 0 .../static/css/bootstrap-responsive.css | 0 .../static/css/bootstrap-responsive.min.css | 0 .../webapp/static/css/bootstrap-simplex.css | 0 .../webapp/static/css/bootstrap-slate.css | 0 .../webapp/static/css/bootstrap-spacelab.css | 0 .../webapp/static/css/bootstrap-united.css | 0 .../main/webapp/static/css/charisma-app.css | 0 .../src/main/webapp/static/css/chosen.css | 0 .../src/main/webapp/static/css/colorbox.css | 0 .../main/webapp/static/css/elfinder.min.css | 0 .../main/webapp/static/css/elfinder.theme.css | 0 .../main/webapp/static/css/fullcalendar.css | 0 .../webapp/static/css/fullcalendar.print.css | 0 .../static/css/jquery-ui-1.8.21.custom.css | 0 .../webapp/static/css/jquery.cleditor.css | 0 .../static/css/jquery.iphone.toggle.css | 0 .../main/webapp/static/css/jquery.noty.css | 0 .../webapp/static/css/noty_theme_default.css | 0 .../src/main/webapp/static/css/opa-icons.css | 0 .../webapp/static/css/uniform.default.css | 0 .../src/main/webapp/static/css/uploadify.css | 0 .../src/main/webapp/static/favicon.jpg | Bin .../main/webapp/static/js/bootstrap-alert.js | 0 .../main/webapp/static/js/bootstrap-button.js | 0 .../webapp/static/js/bootstrap-carousel.js | 0 .../webapp/static/js/bootstrap-collapse.js | 0 .../webapp/static/js/bootstrap-dropdown.js | 0 .../main/webapp/static/js/bootstrap-modal.js | 0 
.../webapp/static/js/bootstrap-popover.js | 0 .../webapp/static/js/bootstrap-scrollspy.js | 0 .../main/webapp/static/js/bootstrap-tab.js | 0 .../main/webapp/static/js/bootstrap-toggle.js | 0 .../webapp/static/js/bootstrap-tooltip.js | 0 .../main/webapp/static/js/bootstrap-tour.js | 0 .../webapp/static/js/bootstrap-transition.js | 0 .../webapp/static/js/bootstrap-typeahead.js | 0 .../src/main/webapp/static/js/charisma.js | 0 .../src/main/webapp/static/js/excanvas.js | 0 .../main/webapp/static/js/fullcalendar.min.js | 0 .../main/webapp/static/js/jquery-1.7.2.min.js | 0 .../static/js/jquery-ui-1.8.21.custom.min.js | 0 .../static/js/jquery.autogrow-textarea.js | 0 .../webapp/static/js/jquery.chosen.min.js | 0 .../webapp/static/js/jquery.cleditor.min.js | 0 .../webapp/static/js/jquery.colorbox.min.js | 0 .../main/webapp/static/js/jquery.cookie.js | 0 .../webapp/static/js/jquery.dataTables.min.js | 0 .../webapp/static/js/jquery.elfinder.min.js | 0 .../main/webapp/static/js/jquery.flot.min.js | 0 .../webapp/static/js/jquery.flot.pie.min.js | 0 .../static/js/jquery.flot.resize.min.js | 0 .../webapp/static/js/jquery.flot.stack.js | 0 .../main/webapp/static/js/jquery.history.js | 0 .../webapp/static/js/jquery.iphone.toggle.js | 0 .../src/main/webapp/static/js/jquery.js | 0 .../src/main/webapp/static/js/jquery.noty.js | 0 .../main/webapp/static/js/jquery.raty.min.js | 0 .../webapp/static/js/jquery.uniform.min.js | 0 .../static/js/jquery.uploadify-3.1.min.js | 0 .../webmagic-worker}/README.md | 0 .../webmagic-worker}/pom.xml | 2 +- .../avalon/web/DashBoardController.java | 0 .../webmagic/avalon/web/SpiderController.java | 0 .../src/main/resources/freemarker.properties | 0 .../src/main/resources/log/log4j.xml | 0 .../spring/applicationContext-freemarker.xml | 0 .../spring/applicationContext-myBatis.xml | 0 .../resources/spring/applicationContext.xml | 0 .../src/main/webapp/WEB-INF/jsp/404.jsp | 0 .../src/main/webapp/WEB-INF/jsp/500.jsp | 0 .../src/main/webapp/WEB-INF/web.xml | 
0 96 files changed, 127 insertions(+), 4 deletions(-) create mode 100644 .gitmodules create mode 160000 webmagic-avalon/forger create mode 100644 webmagic-avalon/pom.xml rename {webmagic-admin => webmagic-avalon/webmagic-admin}/README.md (100%) rename {webmagic-admin => webmagic-avalon/webmagic-admin}/pom.xml (98%) rename {webmagic-admin => webmagic-avalon/webmagic-admin}/src/main/java/us/codecraft/webmagic/avalon/web/DashBoardController.java (100%) rename {webmagic-admin => webmagic-avalon/webmagic-admin}/src/main/java/us/codecraft/webmagic/avalon/web/SpiderController.java (100%) rename {webmagic-admin => webmagic-avalon/webmagic-admin}/src/main/resources/freemarker.properties (100%) rename {webmagic-admin => webmagic-avalon/webmagic-admin}/src/main/resources/log/log4j.xml (100%) rename {webmagic-admin => webmagic-avalon/webmagic-admin}/src/main/resources/spring/applicationContext-freemarker.xml (100%) rename {webmagic-admin => webmagic-avalon/webmagic-admin}/src/main/resources/spring/applicationContext-myBatis.xml (100%) rename {webmagic-admin => webmagic-avalon/webmagic-admin}/src/main/resources/spring/applicationContext.xml (100%) rename {webmagic-admin => webmagic-avalon/webmagic-admin}/src/main/webapp/WEB-INF/jsp/404.jsp (100%) rename {webmagic-admin => webmagic-avalon/webmagic-admin}/src/main/webapp/WEB-INF/jsp/500.jsp (100%) rename {webmagic-admin => webmagic-avalon/webmagic-admin}/src/main/webapp/WEB-INF/pages/create_spider.ftl (100%) rename {webmagic-admin => webmagic-avalon/webmagic-admin}/src/main/webapp/WEB-INF/pages/dashboard.ftl (100%) rename {webmagic-admin => webmagic-avalon/webmagic-admin}/src/main/webapp/WEB-INF/pages/spider_list.ftl (100%) rename {webmagic-admin => webmagic-avalon/webmagic-admin}/src/main/webapp/WEB-INF/web.xml (100%) rename {webmagic-admin => webmagic-avalon/webmagic-admin}/src/main/webapp/static/css/bootstrap-cerulean.css (100%) rename {webmagic-admin => 
webmagic-avalon/webmagic-admin}/src/main/webapp/static/css/bootstrap-classic.css (100%) rename {webmagic-admin => webmagic-avalon/webmagic-admin}/src/main/webapp/static/css/bootstrap-classic.min.css (100%) rename {webmagic-admin => webmagic-avalon/webmagic-admin}/src/main/webapp/static/css/bootstrap-cyborg.css (100%) rename {webmagic-admin => webmagic-avalon/webmagic-admin}/src/main/webapp/static/css/bootstrap-journal.css (100%) rename {webmagic-admin => webmagic-avalon/webmagic-admin}/src/main/webapp/static/css/bootstrap-redy.css (100%) rename {webmagic-admin => webmagic-avalon/webmagic-admin}/src/main/webapp/static/css/bootstrap-responsive.css (100%) rename {webmagic-admin => webmagic-avalon/webmagic-admin}/src/main/webapp/static/css/bootstrap-responsive.min.css (100%) rename {webmagic-admin => webmagic-avalon/webmagic-admin}/src/main/webapp/static/css/bootstrap-simplex.css (100%) rename {webmagic-admin => webmagic-avalon/webmagic-admin}/src/main/webapp/static/css/bootstrap-slate.css (100%) rename {webmagic-admin => webmagic-avalon/webmagic-admin}/src/main/webapp/static/css/bootstrap-spacelab.css (100%) rename {webmagic-admin => webmagic-avalon/webmagic-admin}/src/main/webapp/static/css/bootstrap-united.css (100%) rename {webmagic-admin => webmagic-avalon/webmagic-admin}/src/main/webapp/static/css/charisma-app.css (100%) rename {webmagic-admin => webmagic-avalon/webmagic-admin}/src/main/webapp/static/css/chosen.css (100%) rename {webmagic-admin => webmagic-avalon/webmagic-admin}/src/main/webapp/static/css/colorbox.css (100%) rename {webmagic-admin => webmagic-avalon/webmagic-admin}/src/main/webapp/static/css/elfinder.min.css (100%) rename {webmagic-admin => webmagic-avalon/webmagic-admin}/src/main/webapp/static/css/elfinder.theme.css (100%) rename {webmagic-admin => webmagic-avalon/webmagic-admin}/src/main/webapp/static/css/fullcalendar.css (100%) rename {webmagic-admin => webmagic-avalon/webmagic-admin}/src/main/webapp/static/css/fullcalendar.print.css (100%) 
rename {webmagic-admin => webmagic-avalon/webmagic-admin}/src/main/webapp/static/css/jquery-ui-1.8.21.custom.css (100%) rename {webmagic-admin => webmagic-avalon/webmagic-admin}/src/main/webapp/static/css/jquery.cleditor.css (100%) rename {webmagic-admin => webmagic-avalon/webmagic-admin}/src/main/webapp/static/css/jquery.iphone.toggle.css (100%) rename {webmagic-admin => webmagic-avalon/webmagic-admin}/src/main/webapp/static/css/jquery.noty.css (100%) rename {webmagic-admin => webmagic-avalon/webmagic-admin}/src/main/webapp/static/css/noty_theme_default.css (100%) rename {webmagic-admin => webmagic-avalon/webmagic-admin}/src/main/webapp/static/css/opa-icons.css (100%) rename {webmagic-admin => webmagic-avalon/webmagic-admin}/src/main/webapp/static/css/uniform.default.css (100%) rename {webmagic-admin => webmagic-avalon/webmagic-admin}/src/main/webapp/static/css/uploadify.css (100%) rename {webmagic-admin => webmagic-avalon/webmagic-admin}/src/main/webapp/static/favicon.jpg (100%) rename {webmagic-admin => webmagic-avalon/webmagic-admin}/src/main/webapp/static/js/bootstrap-alert.js (100%) rename {webmagic-admin => webmagic-avalon/webmagic-admin}/src/main/webapp/static/js/bootstrap-button.js (100%) rename {webmagic-admin => webmagic-avalon/webmagic-admin}/src/main/webapp/static/js/bootstrap-carousel.js (100%) rename {webmagic-admin => webmagic-avalon/webmagic-admin}/src/main/webapp/static/js/bootstrap-collapse.js (100%) rename {webmagic-admin => webmagic-avalon/webmagic-admin}/src/main/webapp/static/js/bootstrap-dropdown.js (100%) rename {webmagic-admin => webmagic-avalon/webmagic-admin}/src/main/webapp/static/js/bootstrap-modal.js (100%) rename {webmagic-admin => webmagic-avalon/webmagic-admin}/src/main/webapp/static/js/bootstrap-popover.js (100%) rename {webmagic-admin => webmagic-avalon/webmagic-admin}/src/main/webapp/static/js/bootstrap-scrollspy.js (100%) rename {webmagic-admin => webmagic-avalon/webmagic-admin}/src/main/webapp/static/js/bootstrap-tab.js (100%) 
rename {webmagic-admin => webmagic-avalon/webmagic-admin}/src/main/webapp/static/js/bootstrap-toggle.js (100%) rename {webmagic-admin => webmagic-avalon/webmagic-admin}/src/main/webapp/static/js/bootstrap-tooltip.js (100%) rename {webmagic-admin => webmagic-avalon/webmagic-admin}/src/main/webapp/static/js/bootstrap-tour.js (100%) rename {webmagic-admin => webmagic-avalon/webmagic-admin}/src/main/webapp/static/js/bootstrap-transition.js (100%) rename {webmagic-admin => webmagic-avalon/webmagic-admin}/src/main/webapp/static/js/bootstrap-typeahead.js (100%) rename {webmagic-admin => webmagic-avalon/webmagic-admin}/src/main/webapp/static/js/charisma.js (100%) rename {webmagic-admin => webmagic-avalon/webmagic-admin}/src/main/webapp/static/js/excanvas.js (100%) rename {webmagic-admin => webmagic-avalon/webmagic-admin}/src/main/webapp/static/js/fullcalendar.min.js (100%) rename {webmagic-admin => webmagic-avalon/webmagic-admin}/src/main/webapp/static/js/jquery-1.7.2.min.js (100%) rename {webmagic-admin => webmagic-avalon/webmagic-admin}/src/main/webapp/static/js/jquery-ui-1.8.21.custom.min.js (100%) rename {webmagic-admin => webmagic-avalon/webmagic-admin}/src/main/webapp/static/js/jquery.autogrow-textarea.js (100%) rename {webmagic-admin => webmagic-avalon/webmagic-admin}/src/main/webapp/static/js/jquery.chosen.min.js (100%) rename {webmagic-admin => webmagic-avalon/webmagic-admin}/src/main/webapp/static/js/jquery.cleditor.min.js (100%) rename {webmagic-admin => webmagic-avalon/webmagic-admin}/src/main/webapp/static/js/jquery.colorbox.min.js (100%) rename {webmagic-admin => webmagic-avalon/webmagic-admin}/src/main/webapp/static/js/jquery.cookie.js (100%) rename {webmagic-admin => webmagic-avalon/webmagic-admin}/src/main/webapp/static/js/jquery.dataTables.min.js (100%) rename {webmagic-admin => webmagic-avalon/webmagic-admin}/src/main/webapp/static/js/jquery.elfinder.min.js (100%) rename {webmagic-admin => 
webmagic-avalon/webmagic-admin}/src/main/webapp/static/js/jquery.flot.min.js (100%) rename {webmagic-admin => webmagic-avalon/webmagic-admin}/src/main/webapp/static/js/jquery.flot.pie.min.js (100%) rename {webmagic-admin => webmagic-avalon/webmagic-admin}/src/main/webapp/static/js/jquery.flot.resize.min.js (100%) rename {webmagic-admin => webmagic-avalon/webmagic-admin}/src/main/webapp/static/js/jquery.flot.stack.js (100%) rename {webmagic-admin => webmagic-avalon/webmagic-admin}/src/main/webapp/static/js/jquery.history.js (100%) rename {webmagic-admin => webmagic-avalon/webmagic-admin}/src/main/webapp/static/js/jquery.iphone.toggle.js (100%) rename {webmagic-admin => webmagic-avalon/webmagic-admin}/src/main/webapp/static/js/jquery.js (100%) rename {webmagic-admin => webmagic-avalon/webmagic-admin}/src/main/webapp/static/js/jquery.noty.js (100%) rename {webmagic-admin => webmagic-avalon/webmagic-admin}/src/main/webapp/static/js/jquery.raty.min.js (100%) rename {webmagic-admin => webmagic-avalon/webmagic-admin}/src/main/webapp/static/js/jquery.uniform.min.js (100%) rename {webmagic-admin => webmagic-avalon/webmagic-admin}/src/main/webapp/static/js/jquery.uploadify-3.1.min.js (100%) rename {webmagic-worker => webmagic-avalon/webmagic-worker}/README.md (100%) rename {webmagic-worker => webmagic-avalon/webmagic-worker}/pom.xml (98%) rename {webmagic-worker => webmagic-avalon/webmagic-worker}/src/main/java/us/codecraft/webmagic/avalon/web/DashBoardController.java (100%) rename {webmagic-worker => webmagic-avalon/webmagic-worker}/src/main/java/us/codecraft/webmagic/avalon/web/SpiderController.java (100%) rename {webmagic-worker => webmagic-avalon/webmagic-worker}/src/main/resources/freemarker.properties (100%) rename {webmagic-worker => webmagic-avalon/webmagic-worker}/src/main/resources/log/log4j.xml (100%) rename {webmagic-worker => webmagic-avalon/webmagic-worker}/src/main/resources/spring/applicationContext-freemarker.xml (100%) rename {webmagic-worker => 
webmagic-avalon/webmagic-worker}/src/main/resources/spring/applicationContext-myBatis.xml (100%) rename {webmagic-worker => webmagic-avalon/webmagic-worker}/src/main/resources/spring/applicationContext.xml (100%) rename {webmagic-worker => webmagic-avalon/webmagic-worker}/src/main/webapp/WEB-INF/jsp/404.jsp (100%) rename {webmagic-worker => webmagic-avalon/webmagic-worker}/src/main/webapp/WEB-INF/jsp/500.jsp (100%) rename {webmagic-worker => webmagic-avalon/webmagic-worker}/src/main/webapp/WEB-INF/web.xml (100%) diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..19050d9 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "webmagic-avalon/forger"] + path = webmagic-avalon/forger + url = git@github.com:code4craft/forger.git diff --git a/pom.xml b/pom.xml index 77bc93e..86f4885 100644 --- a/pom.xml +++ b/pom.xml @@ -54,8 +54,7 @@ webmagic-selenium webmagic-saxon webmagic-samples - webmagic-admin - webmagic-worker + webmagic-avalon diff --git a/webmagic-avalon/forger b/webmagic-avalon/forger new file mode 160000 index 0000000..9f08a0f --- /dev/null +++ b/webmagic-avalon/forger @@ -0,0 +1 @@ +Subproject commit 9f08a0ffd09f5d59ae38091bca250d51aa54bfde diff --git a/webmagic-avalon/pom.xml b/webmagic-avalon/pom.xml new file mode 100644 index 0000000..21a4ca6 --- /dev/null +++ b/webmagic-avalon/pom.xml @@ -0,0 +1,120 @@ + + + + webmagic-parent + us.codecraft + 0.5.0-SNAPSHOT + + 4.0.0 + + us.codecraft + webmagic-avalon + pom + + + forger + webmagic-admin + webmagic-worker + + + + + us.codecraft + webmagic-scripts + ${project.version} + + + + org.mybatis + mybatis + 3.1.1 + + + + org.mybatis + mybatis-spring + 1.1.1 + + + + us.codecraft + forger + 0.1.0-SNAPSHOT + + + + org.freemarker + freemarker + 2.3.19 + + + + org.springframework + spring-test + ${spring-version} + test + + + + org.springframework + spring-aop + ${spring-version} + + + + org.aspectj + aspectjrt + 1.7.2 + + + org.aspectj + aspectjweaver + 1.7.2 + + + org.springframework + 
spring-core + ${spring-version} + + + org.springframework + spring-webmvc + ${spring-version} + + + + javax.servlet + javax.servlet-api + 3.0.1 + + + org.springframework + spring-context + ${spring-version} + + + org.springframework + spring-context-support + ${spring-version} + + + com.alibaba + fastjson + 1.1.37 + + + + + + + + maven-deploy-plugin + + true + + + + + + + \ No newline at end of file diff --git a/webmagic-admin/README.md b/webmagic-avalon/webmagic-admin/README.md similarity index 100% rename from webmagic-admin/README.md rename to webmagic-avalon/webmagic-admin/README.md diff --git a/webmagic-admin/pom.xml b/webmagic-avalon/webmagic-admin/pom.xml similarity index 98% rename from webmagic-admin/pom.xml rename to webmagic-avalon/webmagic-admin/pom.xml index 58068f9..e989782 100644 --- a/webmagic-admin/pom.xml +++ b/webmagic-avalon/webmagic-admin/pom.xml @@ -1,7 +1,7 @@ - webmagic-parent + webmagic-avalon us.codecraft 0.5.0-SNAPSHOT diff --git a/webmagic-admin/src/main/java/us/codecraft/webmagic/avalon/web/DashBoardController.java b/webmagic-avalon/webmagic-admin/src/main/java/us/codecraft/webmagic/avalon/web/DashBoardController.java similarity index 100% rename from webmagic-admin/src/main/java/us/codecraft/webmagic/avalon/web/DashBoardController.java rename to webmagic-avalon/webmagic-admin/src/main/java/us/codecraft/webmagic/avalon/web/DashBoardController.java diff --git a/webmagic-admin/src/main/java/us/codecraft/webmagic/avalon/web/SpiderController.java b/webmagic-avalon/webmagic-admin/src/main/java/us/codecraft/webmagic/avalon/web/SpiderController.java similarity index 100% rename from webmagic-admin/src/main/java/us/codecraft/webmagic/avalon/web/SpiderController.java rename to webmagic-avalon/webmagic-admin/src/main/java/us/codecraft/webmagic/avalon/web/SpiderController.java diff --git a/webmagic-admin/src/main/resources/freemarker.properties b/webmagic-avalon/webmagic-admin/src/main/resources/freemarker.properties similarity index 100% rename 
from webmagic-admin/src/main/resources/freemarker.properties rename to webmagic-avalon/webmagic-admin/src/main/resources/freemarker.properties diff --git a/webmagic-admin/src/main/resources/log/log4j.xml b/webmagic-avalon/webmagic-admin/src/main/resources/log/log4j.xml similarity index 100% rename from webmagic-admin/src/main/resources/log/log4j.xml rename to webmagic-avalon/webmagic-admin/src/main/resources/log/log4j.xml diff --git a/webmagic-admin/src/main/resources/spring/applicationContext-freemarker.xml b/webmagic-avalon/webmagic-admin/src/main/resources/spring/applicationContext-freemarker.xml similarity index 100% rename from webmagic-admin/src/main/resources/spring/applicationContext-freemarker.xml rename to webmagic-avalon/webmagic-admin/src/main/resources/spring/applicationContext-freemarker.xml diff --git a/webmagic-admin/src/main/resources/spring/applicationContext-myBatis.xml b/webmagic-avalon/webmagic-admin/src/main/resources/spring/applicationContext-myBatis.xml similarity index 100% rename from webmagic-admin/src/main/resources/spring/applicationContext-myBatis.xml rename to webmagic-avalon/webmagic-admin/src/main/resources/spring/applicationContext-myBatis.xml diff --git a/webmagic-admin/src/main/resources/spring/applicationContext.xml b/webmagic-avalon/webmagic-admin/src/main/resources/spring/applicationContext.xml similarity index 100% rename from webmagic-admin/src/main/resources/spring/applicationContext.xml rename to webmagic-avalon/webmagic-admin/src/main/resources/spring/applicationContext.xml diff --git a/webmagic-admin/src/main/webapp/WEB-INF/jsp/404.jsp b/webmagic-avalon/webmagic-admin/src/main/webapp/WEB-INF/jsp/404.jsp similarity index 100% rename from webmagic-admin/src/main/webapp/WEB-INF/jsp/404.jsp rename to webmagic-avalon/webmagic-admin/src/main/webapp/WEB-INF/jsp/404.jsp diff --git a/webmagic-admin/src/main/webapp/WEB-INF/jsp/500.jsp b/webmagic-avalon/webmagic-admin/src/main/webapp/WEB-INF/jsp/500.jsp similarity index 100% rename 
from webmagic-admin/src/main/webapp/WEB-INF/jsp/500.jsp rename to webmagic-avalon/webmagic-admin/src/main/webapp/WEB-INF/jsp/500.jsp diff --git a/webmagic-admin/src/main/webapp/WEB-INF/pages/create_spider.ftl b/webmagic-avalon/webmagic-admin/src/main/webapp/WEB-INF/pages/create_spider.ftl similarity index 100% rename from webmagic-admin/src/main/webapp/WEB-INF/pages/create_spider.ftl rename to webmagic-avalon/webmagic-admin/src/main/webapp/WEB-INF/pages/create_spider.ftl diff --git a/webmagic-admin/src/main/webapp/WEB-INF/pages/dashboard.ftl b/webmagic-avalon/webmagic-admin/src/main/webapp/WEB-INF/pages/dashboard.ftl similarity index 100% rename from webmagic-admin/src/main/webapp/WEB-INF/pages/dashboard.ftl rename to webmagic-avalon/webmagic-admin/src/main/webapp/WEB-INF/pages/dashboard.ftl diff --git a/webmagic-admin/src/main/webapp/WEB-INF/pages/spider_list.ftl b/webmagic-avalon/webmagic-admin/src/main/webapp/WEB-INF/pages/spider_list.ftl similarity index 100% rename from webmagic-admin/src/main/webapp/WEB-INF/pages/spider_list.ftl rename to webmagic-avalon/webmagic-admin/src/main/webapp/WEB-INF/pages/spider_list.ftl diff --git a/webmagic-admin/src/main/webapp/WEB-INF/web.xml b/webmagic-avalon/webmagic-admin/src/main/webapp/WEB-INF/web.xml similarity index 100% rename from webmagic-admin/src/main/webapp/WEB-INF/web.xml rename to webmagic-avalon/webmagic-admin/src/main/webapp/WEB-INF/web.xml diff --git a/webmagic-admin/src/main/webapp/static/css/bootstrap-cerulean.css b/webmagic-avalon/webmagic-admin/src/main/webapp/static/css/bootstrap-cerulean.css similarity index 100% rename from webmagic-admin/src/main/webapp/static/css/bootstrap-cerulean.css rename to webmagic-avalon/webmagic-admin/src/main/webapp/static/css/bootstrap-cerulean.css diff --git a/webmagic-admin/src/main/webapp/static/css/bootstrap-classic.css b/webmagic-avalon/webmagic-admin/src/main/webapp/static/css/bootstrap-classic.css similarity index 100% rename from 
webmagic-admin/src/main/webapp/static/css/bootstrap-classic.css rename to webmagic-avalon/webmagic-admin/src/main/webapp/static/css/bootstrap-classic.css diff --git a/webmagic-admin/src/main/webapp/static/css/bootstrap-classic.min.css b/webmagic-avalon/webmagic-admin/src/main/webapp/static/css/bootstrap-classic.min.css similarity index 100% rename from webmagic-admin/src/main/webapp/static/css/bootstrap-classic.min.css rename to webmagic-avalon/webmagic-admin/src/main/webapp/static/css/bootstrap-classic.min.css diff --git a/webmagic-admin/src/main/webapp/static/css/bootstrap-cyborg.css b/webmagic-avalon/webmagic-admin/src/main/webapp/static/css/bootstrap-cyborg.css similarity index 100% rename from webmagic-admin/src/main/webapp/static/css/bootstrap-cyborg.css rename to webmagic-avalon/webmagic-admin/src/main/webapp/static/css/bootstrap-cyborg.css diff --git a/webmagic-admin/src/main/webapp/static/css/bootstrap-journal.css b/webmagic-avalon/webmagic-admin/src/main/webapp/static/css/bootstrap-journal.css similarity index 100% rename from webmagic-admin/src/main/webapp/static/css/bootstrap-journal.css rename to webmagic-avalon/webmagic-admin/src/main/webapp/static/css/bootstrap-journal.css diff --git a/webmagic-admin/src/main/webapp/static/css/bootstrap-redy.css b/webmagic-avalon/webmagic-admin/src/main/webapp/static/css/bootstrap-redy.css similarity index 100% rename from webmagic-admin/src/main/webapp/static/css/bootstrap-redy.css rename to webmagic-avalon/webmagic-admin/src/main/webapp/static/css/bootstrap-redy.css diff --git a/webmagic-admin/src/main/webapp/static/css/bootstrap-responsive.css b/webmagic-avalon/webmagic-admin/src/main/webapp/static/css/bootstrap-responsive.css similarity index 100% rename from webmagic-admin/src/main/webapp/static/css/bootstrap-responsive.css rename to webmagic-avalon/webmagic-admin/src/main/webapp/static/css/bootstrap-responsive.css diff --git a/webmagic-admin/src/main/webapp/static/css/bootstrap-responsive.min.css 
b/webmagic-avalon/webmagic-admin/src/main/webapp/static/css/bootstrap-responsive.min.css similarity index 100% rename from webmagic-admin/src/main/webapp/static/css/bootstrap-responsive.min.css rename to webmagic-avalon/webmagic-admin/src/main/webapp/static/css/bootstrap-responsive.min.css diff --git a/webmagic-admin/src/main/webapp/static/css/bootstrap-simplex.css b/webmagic-avalon/webmagic-admin/src/main/webapp/static/css/bootstrap-simplex.css similarity index 100% rename from webmagic-admin/src/main/webapp/static/css/bootstrap-simplex.css rename to webmagic-avalon/webmagic-admin/src/main/webapp/static/css/bootstrap-simplex.css diff --git a/webmagic-admin/src/main/webapp/static/css/bootstrap-slate.css b/webmagic-avalon/webmagic-admin/src/main/webapp/static/css/bootstrap-slate.css similarity index 100% rename from webmagic-admin/src/main/webapp/static/css/bootstrap-slate.css rename to webmagic-avalon/webmagic-admin/src/main/webapp/static/css/bootstrap-slate.css diff --git a/webmagic-admin/src/main/webapp/static/css/bootstrap-spacelab.css b/webmagic-avalon/webmagic-admin/src/main/webapp/static/css/bootstrap-spacelab.css similarity index 100% rename from webmagic-admin/src/main/webapp/static/css/bootstrap-spacelab.css rename to webmagic-avalon/webmagic-admin/src/main/webapp/static/css/bootstrap-spacelab.css diff --git a/webmagic-admin/src/main/webapp/static/css/bootstrap-united.css b/webmagic-avalon/webmagic-admin/src/main/webapp/static/css/bootstrap-united.css similarity index 100% rename from webmagic-admin/src/main/webapp/static/css/bootstrap-united.css rename to webmagic-avalon/webmagic-admin/src/main/webapp/static/css/bootstrap-united.css diff --git a/webmagic-admin/src/main/webapp/static/css/charisma-app.css b/webmagic-avalon/webmagic-admin/src/main/webapp/static/css/charisma-app.css similarity index 100% rename from webmagic-admin/src/main/webapp/static/css/charisma-app.css rename to webmagic-avalon/webmagic-admin/src/main/webapp/static/css/charisma-app.css 
diff --git a/webmagic-admin/src/main/webapp/static/css/chosen.css b/webmagic-avalon/webmagic-admin/src/main/webapp/static/css/chosen.css similarity index 100% rename from webmagic-admin/src/main/webapp/static/css/chosen.css rename to webmagic-avalon/webmagic-admin/src/main/webapp/static/css/chosen.css diff --git a/webmagic-admin/src/main/webapp/static/css/colorbox.css b/webmagic-avalon/webmagic-admin/src/main/webapp/static/css/colorbox.css similarity index 100% rename from webmagic-admin/src/main/webapp/static/css/colorbox.css rename to webmagic-avalon/webmagic-admin/src/main/webapp/static/css/colorbox.css diff --git a/webmagic-admin/src/main/webapp/static/css/elfinder.min.css b/webmagic-avalon/webmagic-admin/src/main/webapp/static/css/elfinder.min.css similarity index 100% rename from webmagic-admin/src/main/webapp/static/css/elfinder.min.css rename to webmagic-avalon/webmagic-admin/src/main/webapp/static/css/elfinder.min.css diff --git a/webmagic-admin/src/main/webapp/static/css/elfinder.theme.css b/webmagic-avalon/webmagic-admin/src/main/webapp/static/css/elfinder.theme.css similarity index 100% rename from webmagic-admin/src/main/webapp/static/css/elfinder.theme.css rename to webmagic-avalon/webmagic-admin/src/main/webapp/static/css/elfinder.theme.css diff --git a/webmagic-admin/src/main/webapp/static/css/fullcalendar.css b/webmagic-avalon/webmagic-admin/src/main/webapp/static/css/fullcalendar.css similarity index 100% rename from webmagic-admin/src/main/webapp/static/css/fullcalendar.css rename to webmagic-avalon/webmagic-admin/src/main/webapp/static/css/fullcalendar.css diff --git a/webmagic-admin/src/main/webapp/static/css/fullcalendar.print.css b/webmagic-avalon/webmagic-admin/src/main/webapp/static/css/fullcalendar.print.css similarity index 100% rename from webmagic-admin/src/main/webapp/static/css/fullcalendar.print.css rename to webmagic-avalon/webmagic-admin/src/main/webapp/static/css/fullcalendar.print.css diff --git 
a/webmagic-admin/src/main/webapp/static/css/jquery-ui-1.8.21.custom.css b/webmagic-avalon/webmagic-admin/src/main/webapp/static/css/jquery-ui-1.8.21.custom.css similarity index 100% rename from webmagic-admin/src/main/webapp/static/css/jquery-ui-1.8.21.custom.css rename to webmagic-avalon/webmagic-admin/src/main/webapp/static/css/jquery-ui-1.8.21.custom.css diff --git a/webmagic-admin/src/main/webapp/static/css/jquery.cleditor.css b/webmagic-avalon/webmagic-admin/src/main/webapp/static/css/jquery.cleditor.css similarity index 100% rename from webmagic-admin/src/main/webapp/static/css/jquery.cleditor.css rename to webmagic-avalon/webmagic-admin/src/main/webapp/static/css/jquery.cleditor.css diff --git a/webmagic-admin/src/main/webapp/static/css/jquery.iphone.toggle.css b/webmagic-avalon/webmagic-admin/src/main/webapp/static/css/jquery.iphone.toggle.css similarity index 100% rename from webmagic-admin/src/main/webapp/static/css/jquery.iphone.toggle.css rename to webmagic-avalon/webmagic-admin/src/main/webapp/static/css/jquery.iphone.toggle.css diff --git a/webmagic-admin/src/main/webapp/static/css/jquery.noty.css b/webmagic-avalon/webmagic-admin/src/main/webapp/static/css/jquery.noty.css similarity index 100% rename from webmagic-admin/src/main/webapp/static/css/jquery.noty.css rename to webmagic-avalon/webmagic-admin/src/main/webapp/static/css/jquery.noty.css diff --git a/webmagic-admin/src/main/webapp/static/css/noty_theme_default.css b/webmagic-avalon/webmagic-admin/src/main/webapp/static/css/noty_theme_default.css similarity index 100% rename from webmagic-admin/src/main/webapp/static/css/noty_theme_default.css rename to webmagic-avalon/webmagic-admin/src/main/webapp/static/css/noty_theme_default.css diff --git a/webmagic-admin/src/main/webapp/static/css/opa-icons.css b/webmagic-avalon/webmagic-admin/src/main/webapp/static/css/opa-icons.css similarity index 100% rename from webmagic-admin/src/main/webapp/static/css/opa-icons.css rename to 
webmagic-avalon/webmagic-admin/src/main/webapp/static/css/opa-icons.css diff --git a/webmagic-admin/src/main/webapp/static/css/uniform.default.css b/webmagic-avalon/webmagic-admin/src/main/webapp/static/css/uniform.default.css similarity index 100% rename from webmagic-admin/src/main/webapp/static/css/uniform.default.css rename to webmagic-avalon/webmagic-admin/src/main/webapp/static/css/uniform.default.css diff --git a/webmagic-admin/src/main/webapp/static/css/uploadify.css b/webmagic-avalon/webmagic-admin/src/main/webapp/static/css/uploadify.css similarity index 100% rename from webmagic-admin/src/main/webapp/static/css/uploadify.css rename to webmagic-avalon/webmagic-admin/src/main/webapp/static/css/uploadify.css diff --git a/webmagic-admin/src/main/webapp/static/favicon.jpg b/webmagic-avalon/webmagic-admin/src/main/webapp/static/favicon.jpg similarity index 100% rename from webmagic-admin/src/main/webapp/static/favicon.jpg rename to webmagic-avalon/webmagic-admin/src/main/webapp/static/favicon.jpg diff --git a/webmagic-admin/src/main/webapp/static/js/bootstrap-alert.js b/webmagic-avalon/webmagic-admin/src/main/webapp/static/js/bootstrap-alert.js similarity index 100% rename from webmagic-admin/src/main/webapp/static/js/bootstrap-alert.js rename to webmagic-avalon/webmagic-admin/src/main/webapp/static/js/bootstrap-alert.js diff --git a/webmagic-admin/src/main/webapp/static/js/bootstrap-button.js b/webmagic-avalon/webmagic-admin/src/main/webapp/static/js/bootstrap-button.js similarity index 100% rename from webmagic-admin/src/main/webapp/static/js/bootstrap-button.js rename to webmagic-avalon/webmagic-admin/src/main/webapp/static/js/bootstrap-button.js diff --git a/webmagic-admin/src/main/webapp/static/js/bootstrap-carousel.js b/webmagic-avalon/webmagic-admin/src/main/webapp/static/js/bootstrap-carousel.js similarity index 100% rename from webmagic-admin/src/main/webapp/static/js/bootstrap-carousel.js rename to 
webmagic-avalon/webmagic-admin/src/main/webapp/static/js/bootstrap-carousel.js diff --git a/webmagic-admin/src/main/webapp/static/js/bootstrap-collapse.js b/webmagic-avalon/webmagic-admin/src/main/webapp/static/js/bootstrap-collapse.js similarity index 100% rename from webmagic-admin/src/main/webapp/static/js/bootstrap-collapse.js rename to webmagic-avalon/webmagic-admin/src/main/webapp/static/js/bootstrap-collapse.js diff --git a/webmagic-admin/src/main/webapp/static/js/bootstrap-dropdown.js b/webmagic-avalon/webmagic-admin/src/main/webapp/static/js/bootstrap-dropdown.js similarity index 100% rename from webmagic-admin/src/main/webapp/static/js/bootstrap-dropdown.js rename to webmagic-avalon/webmagic-admin/src/main/webapp/static/js/bootstrap-dropdown.js diff --git a/webmagic-admin/src/main/webapp/static/js/bootstrap-modal.js b/webmagic-avalon/webmagic-admin/src/main/webapp/static/js/bootstrap-modal.js similarity index 100% rename from webmagic-admin/src/main/webapp/static/js/bootstrap-modal.js rename to webmagic-avalon/webmagic-admin/src/main/webapp/static/js/bootstrap-modal.js diff --git a/webmagic-admin/src/main/webapp/static/js/bootstrap-popover.js b/webmagic-avalon/webmagic-admin/src/main/webapp/static/js/bootstrap-popover.js similarity index 100% rename from webmagic-admin/src/main/webapp/static/js/bootstrap-popover.js rename to webmagic-avalon/webmagic-admin/src/main/webapp/static/js/bootstrap-popover.js diff --git a/webmagic-admin/src/main/webapp/static/js/bootstrap-scrollspy.js b/webmagic-avalon/webmagic-admin/src/main/webapp/static/js/bootstrap-scrollspy.js similarity index 100% rename from webmagic-admin/src/main/webapp/static/js/bootstrap-scrollspy.js rename to webmagic-avalon/webmagic-admin/src/main/webapp/static/js/bootstrap-scrollspy.js diff --git a/webmagic-admin/src/main/webapp/static/js/bootstrap-tab.js b/webmagic-avalon/webmagic-admin/src/main/webapp/static/js/bootstrap-tab.js similarity index 100% rename from 
webmagic-admin/src/main/webapp/static/js/bootstrap-tab.js rename to webmagic-avalon/webmagic-admin/src/main/webapp/static/js/bootstrap-tab.js diff --git a/webmagic-admin/src/main/webapp/static/js/bootstrap-toggle.js b/webmagic-avalon/webmagic-admin/src/main/webapp/static/js/bootstrap-toggle.js similarity index 100% rename from webmagic-admin/src/main/webapp/static/js/bootstrap-toggle.js rename to webmagic-avalon/webmagic-admin/src/main/webapp/static/js/bootstrap-toggle.js diff --git a/webmagic-admin/src/main/webapp/static/js/bootstrap-tooltip.js b/webmagic-avalon/webmagic-admin/src/main/webapp/static/js/bootstrap-tooltip.js similarity index 100% rename from webmagic-admin/src/main/webapp/static/js/bootstrap-tooltip.js rename to webmagic-avalon/webmagic-admin/src/main/webapp/static/js/bootstrap-tooltip.js diff --git a/webmagic-admin/src/main/webapp/static/js/bootstrap-tour.js b/webmagic-avalon/webmagic-admin/src/main/webapp/static/js/bootstrap-tour.js similarity index 100% rename from webmagic-admin/src/main/webapp/static/js/bootstrap-tour.js rename to webmagic-avalon/webmagic-admin/src/main/webapp/static/js/bootstrap-tour.js diff --git a/webmagic-admin/src/main/webapp/static/js/bootstrap-transition.js b/webmagic-avalon/webmagic-admin/src/main/webapp/static/js/bootstrap-transition.js similarity index 100% rename from webmagic-admin/src/main/webapp/static/js/bootstrap-transition.js rename to webmagic-avalon/webmagic-admin/src/main/webapp/static/js/bootstrap-transition.js diff --git a/webmagic-admin/src/main/webapp/static/js/bootstrap-typeahead.js b/webmagic-avalon/webmagic-admin/src/main/webapp/static/js/bootstrap-typeahead.js similarity index 100% rename from webmagic-admin/src/main/webapp/static/js/bootstrap-typeahead.js rename to webmagic-avalon/webmagic-admin/src/main/webapp/static/js/bootstrap-typeahead.js diff --git a/webmagic-admin/src/main/webapp/static/js/charisma.js b/webmagic-avalon/webmagic-admin/src/main/webapp/static/js/charisma.js similarity index 100% 
rename from webmagic-admin/src/main/webapp/static/js/charisma.js rename to webmagic-avalon/webmagic-admin/src/main/webapp/static/js/charisma.js diff --git a/webmagic-admin/src/main/webapp/static/js/excanvas.js b/webmagic-avalon/webmagic-admin/src/main/webapp/static/js/excanvas.js similarity index 100% rename from webmagic-admin/src/main/webapp/static/js/excanvas.js rename to webmagic-avalon/webmagic-admin/src/main/webapp/static/js/excanvas.js diff --git a/webmagic-admin/src/main/webapp/static/js/fullcalendar.min.js b/webmagic-avalon/webmagic-admin/src/main/webapp/static/js/fullcalendar.min.js similarity index 100% rename from webmagic-admin/src/main/webapp/static/js/fullcalendar.min.js rename to webmagic-avalon/webmagic-admin/src/main/webapp/static/js/fullcalendar.min.js diff --git a/webmagic-admin/src/main/webapp/static/js/jquery-1.7.2.min.js b/webmagic-avalon/webmagic-admin/src/main/webapp/static/js/jquery-1.7.2.min.js similarity index 100% rename from webmagic-admin/src/main/webapp/static/js/jquery-1.7.2.min.js rename to webmagic-avalon/webmagic-admin/src/main/webapp/static/js/jquery-1.7.2.min.js diff --git a/webmagic-admin/src/main/webapp/static/js/jquery-ui-1.8.21.custom.min.js b/webmagic-avalon/webmagic-admin/src/main/webapp/static/js/jquery-ui-1.8.21.custom.min.js similarity index 100% rename from webmagic-admin/src/main/webapp/static/js/jquery-ui-1.8.21.custom.min.js rename to webmagic-avalon/webmagic-admin/src/main/webapp/static/js/jquery-ui-1.8.21.custom.min.js diff --git a/webmagic-admin/src/main/webapp/static/js/jquery.autogrow-textarea.js b/webmagic-avalon/webmagic-admin/src/main/webapp/static/js/jquery.autogrow-textarea.js similarity index 100% rename from webmagic-admin/src/main/webapp/static/js/jquery.autogrow-textarea.js rename to webmagic-avalon/webmagic-admin/src/main/webapp/static/js/jquery.autogrow-textarea.js diff --git a/webmagic-admin/src/main/webapp/static/js/jquery.chosen.min.js 
b/webmagic-avalon/webmagic-admin/src/main/webapp/static/js/jquery.chosen.min.js similarity index 100% rename from webmagic-admin/src/main/webapp/static/js/jquery.chosen.min.js rename to webmagic-avalon/webmagic-admin/src/main/webapp/static/js/jquery.chosen.min.js diff --git a/webmagic-admin/src/main/webapp/static/js/jquery.cleditor.min.js b/webmagic-avalon/webmagic-admin/src/main/webapp/static/js/jquery.cleditor.min.js similarity index 100% rename from webmagic-admin/src/main/webapp/static/js/jquery.cleditor.min.js rename to webmagic-avalon/webmagic-admin/src/main/webapp/static/js/jquery.cleditor.min.js diff --git a/webmagic-admin/src/main/webapp/static/js/jquery.colorbox.min.js b/webmagic-avalon/webmagic-admin/src/main/webapp/static/js/jquery.colorbox.min.js similarity index 100% rename from webmagic-admin/src/main/webapp/static/js/jquery.colorbox.min.js rename to webmagic-avalon/webmagic-admin/src/main/webapp/static/js/jquery.colorbox.min.js diff --git a/webmagic-admin/src/main/webapp/static/js/jquery.cookie.js b/webmagic-avalon/webmagic-admin/src/main/webapp/static/js/jquery.cookie.js similarity index 100% rename from webmagic-admin/src/main/webapp/static/js/jquery.cookie.js rename to webmagic-avalon/webmagic-admin/src/main/webapp/static/js/jquery.cookie.js diff --git a/webmagic-admin/src/main/webapp/static/js/jquery.dataTables.min.js b/webmagic-avalon/webmagic-admin/src/main/webapp/static/js/jquery.dataTables.min.js similarity index 100% rename from webmagic-admin/src/main/webapp/static/js/jquery.dataTables.min.js rename to webmagic-avalon/webmagic-admin/src/main/webapp/static/js/jquery.dataTables.min.js diff --git a/webmagic-admin/src/main/webapp/static/js/jquery.elfinder.min.js b/webmagic-avalon/webmagic-admin/src/main/webapp/static/js/jquery.elfinder.min.js similarity index 100% rename from webmagic-admin/src/main/webapp/static/js/jquery.elfinder.min.js rename to webmagic-avalon/webmagic-admin/src/main/webapp/static/js/jquery.elfinder.min.js diff --git 
a/webmagic-admin/src/main/webapp/static/js/jquery.flot.min.js b/webmagic-avalon/webmagic-admin/src/main/webapp/static/js/jquery.flot.min.js similarity index 100% rename from webmagic-admin/src/main/webapp/static/js/jquery.flot.min.js rename to webmagic-avalon/webmagic-admin/src/main/webapp/static/js/jquery.flot.min.js diff --git a/webmagic-admin/src/main/webapp/static/js/jquery.flot.pie.min.js b/webmagic-avalon/webmagic-admin/src/main/webapp/static/js/jquery.flot.pie.min.js similarity index 100% rename from webmagic-admin/src/main/webapp/static/js/jquery.flot.pie.min.js rename to webmagic-avalon/webmagic-admin/src/main/webapp/static/js/jquery.flot.pie.min.js diff --git a/webmagic-admin/src/main/webapp/static/js/jquery.flot.resize.min.js b/webmagic-avalon/webmagic-admin/src/main/webapp/static/js/jquery.flot.resize.min.js similarity index 100% rename from webmagic-admin/src/main/webapp/static/js/jquery.flot.resize.min.js rename to webmagic-avalon/webmagic-admin/src/main/webapp/static/js/jquery.flot.resize.min.js diff --git a/webmagic-admin/src/main/webapp/static/js/jquery.flot.stack.js b/webmagic-avalon/webmagic-admin/src/main/webapp/static/js/jquery.flot.stack.js similarity index 100% rename from webmagic-admin/src/main/webapp/static/js/jquery.flot.stack.js rename to webmagic-avalon/webmagic-admin/src/main/webapp/static/js/jquery.flot.stack.js diff --git a/webmagic-admin/src/main/webapp/static/js/jquery.history.js b/webmagic-avalon/webmagic-admin/src/main/webapp/static/js/jquery.history.js similarity index 100% rename from webmagic-admin/src/main/webapp/static/js/jquery.history.js rename to webmagic-avalon/webmagic-admin/src/main/webapp/static/js/jquery.history.js diff --git a/webmagic-admin/src/main/webapp/static/js/jquery.iphone.toggle.js b/webmagic-avalon/webmagic-admin/src/main/webapp/static/js/jquery.iphone.toggle.js similarity index 100% rename from webmagic-admin/src/main/webapp/static/js/jquery.iphone.toggle.js rename to 
webmagic-avalon/webmagic-admin/src/main/webapp/static/js/jquery.iphone.toggle.js diff --git a/webmagic-admin/src/main/webapp/static/js/jquery.js b/webmagic-avalon/webmagic-admin/src/main/webapp/static/js/jquery.js similarity index 100% rename from webmagic-admin/src/main/webapp/static/js/jquery.js rename to webmagic-avalon/webmagic-admin/src/main/webapp/static/js/jquery.js diff --git a/webmagic-admin/src/main/webapp/static/js/jquery.noty.js b/webmagic-avalon/webmagic-admin/src/main/webapp/static/js/jquery.noty.js similarity index 100% rename from webmagic-admin/src/main/webapp/static/js/jquery.noty.js rename to webmagic-avalon/webmagic-admin/src/main/webapp/static/js/jquery.noty.js diff --git a/webmagic-admin/src/main/webapp/static/js/jquery.raty.min.js b/webmagic-avalon/webmagic-admin/src/main/webapp/static/js/jquery.raty.min.js similarity index 100% rename from webmagic-admin/src/main/webapp/static/js/jquery.raty.min.js rename to webmagic-avalon/webmagic-admin/src/main/webapp/static/js/jquery.raty.min.js diff --git a/webmagic-admin/src/main/webapp/static/js/jquery.uniform.min.js b/webmagic-avalon/webmagic-admin/src/main/webapp/static/js/jquery.uniform.min.js similarity index 100% rename from webmagic-admin/src/main/webapp/static/js/jquery.uniform.min.js rename to webmagic-avalon/webmagic-admin/src/main/webapp/static/js/jquery.uniform.min.js diff --git a/webmagic-admin/src/main/webapp/static/js/jquery.uploadify-3.1.min.js b/webmagic-avalon/webmagic-admin/src/main/webapp/static/js/jquery.uploadify-3.1.min.js similarity index 100% rename from webmagic-admin/src/main/webapp/static/js/jquery.uploadify-3.1.min.js rename to webmagic-avalon/webmagic-admin/src/main/webapp/static/js/jquery.uploadify-3.1.min.js diff --git a/webmagic-worker/README.md b/webmagic-avalon/webmagic-worker/README.md similarity index 100% rename from webmagic-worker/README.md rename to webmagic-avalon/webmagic-worker/README.md diff --git a/webmagic-worker/pom.xml 
b/webmagic-avalon/webmagic-worker/pom.xml similarity index 98% rename from webmagic-worker/pom.xml rename to webmagic-avalon/webmagic-worker/pom.xml index cf10ab5..fd8edd5 100644 --- a/webmagic-worker/pom.xml +++ b/webmagic-avalon/webmagic-worker/pom.xml @@ -1,7 +1,7 @@ - webmagic-parent + webmagic-avalon us.codecraft 0.5.0-SNAPSHOT diff --git a/webmagic-worker/src/main/java/us/codecraft/webmagic/avalon/web/DashBoardController.java b/webmagic-avalon/webmagic-worker/src/main/java/us/codecraft/webmagic/avalon/web/DashBoardController.java similarity index 100% rename from webmagic-worker/src/main/java/us/codecraft/webmagic/avalon/web/DashBoardController.java rename to webmagic-avalon/webmagic-worker/src/main/java/us/codecraft/webmagic/avalon/web/DashBoardController.java diff --git a/webmagic-worker/src/main/java/us/codecraft/webmagic/avalon/web/SpiderController.java b/webmagic-avalon/webmagic-worker/src/main/java/us/codecraft/webmagic/avalon/web/SpiderController.java similarity index 100% rename from webmagic-worker/src/main/java/us/codecraft/webmagic/avalon/web/SpiderController.java rename to webmagic-avalon/webmagic-worker/src/main/java/us/codecraft/webmagic/avalon/web/SpiderController.java diff --git a/webmagic-worker/src/main/resources/freemarker.properties b/webmagic-avalon/webmagic-worker/src/main/resources/freemarker.properties similarity index 100% rename from webmagic-worker/src/main/resources/freemarker.properties rename to webmagic-avalon/webmagic-worker/src/main/resources/freemarker.properties diff --git a/webmagic-worker/src/main/resources/log/log4j.xml b/webmagic-avalon/webmagic-worker/src/main/resources/log/log4j.xml similarity index 100% rename from webmagic-worker/src/main/resources/log/log4j.xml rename to webmagic-avalon/webmagic-worker/src/main/resources/log/log4j.xml diff --git a/webmagic-worker/src/main/resources/spring/applicationContext-freemarker.xml b/webmagic-avalon/webmagic-worker/src/main/resources/spring/applicationContext-freemarker.xml 
similarity index 100% rename from webmagic-worker/src/main/resources/spring/applicationContext-freemarker.xml rename to webmagic-avalon/webmagic-worker/src/main/resources/spring/applicationContext-freemarker.xml diff --git a/webmagic-worker/src/main/resources/spring/applicationContext-myBatis.xml b/webmagic-avalon/webmagic-worker/src/main/resources/spring/applicationContext-myBatis.xml similarity index 100% rename from webmagic-worker/src/main/resources/spring/applicationContext-myBatis.xml rename to webmagic-avalon/webmagic-worker/src/main/resources/spring/applicationContext-myBatis.xml diff --git a/webmagic-worker/src/main/resources/spring/applicationContext.xml b/webmagic-avalon/webmagic-worker/src/main/resources/spring/applicationContext.xml similarity index 100% rename from webmagic-worker/src/main/resources/spring/applicationContext.xml rename to webmagic-avalon/webmagic-worker/src/main/resources/spring/applicationContext.xml diff --git a/webmagic-worker/src/main/webapp/WEB-INF/jsp/404.jsp b/webmagic-avalon/webmagic-worker/src/main/webapp/WEB-INF/jsp/404.jsp similarity index 100% rename from webmagic-worker/src/main/webapp/WEB-INF/jsp/404.jsp rename to webmagic-avalon/webmagic-worker/src/main/webapp/WEB-INF/jsp/404.jsp diff --git a/webmagic-worker/src/main/webapp/WEB-INF/jsp/500.jsp b/webmagic-avalon/webmagic-worker/src/main/webapp/WEB-INF/jsp/500.jsp similarity index 100% rename from webmagic-worker/src/main/webapp/WEB-INF/jsp/500.jsp rename to webmagic-avalon/webmagic-worker/src/main/webapp/WEB-INF/jsp/500.jsp diff --git a/webmagic-worker/src/main/webapp/WEB-INF/web.xml b/webmagic-avalon/webmagic-worker/src/main/webapp/WEB-INF/web.xml similarity index 100% rename from webmagic-worker/src/main/webapp/WEB-INF/web.xml rename to webmagic-avalon/webmagic-worker/src/main/webapp/WEB-INF/web.xml From 8f84fb2ab5c154f8ed65d98448b897c104d08950 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 15 Mar 2014 19:14:56 +0800 Subject: [PATCH 005/130] fix submodule url 
for travis-ci --- .gitmodules | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitmodules b/.gitmodules index 19050d9..2d72a69 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,3 @@ [submodule "webmagic-avalon/forger"] path = webmagic-avalon/forger - url = git@github.com:code4craft/forger.git + url = https://github.com/code4craft/forger.git From 8ea0a0624befa510e3f1294e02a5c4ad639f7a7c Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 15 Mar 2014 19:16:46 +0800 Subject: [PATCH 006/130] fix submodule url for travis-ci --- .gitmodules | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitmodules b/.gitmodules index 2d72a69..3c25d06 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,3 @@ [submodule "webmagic-avalon/forger"] path = webmagic-avalon/forger - url = https://github.com/code4craft/forger.git + url = git://github.com/someuser/somelibrary.git From 994c1d53411e644f0141429428cc81a87d225258 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 15 Mar 2014 19:25:33 +0800 Subject: [PATCH 007/130] avalon readme --- webmagic-avalon/README.md | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 webmagic-avalon/README.md diff --git a/webmagic-avalon/README.md b/webmagic-avalon/README.md new file mode 100644 index 0000000..4b15ed3 --- /dev/null +++ b/webmagic-avalon/README.md @@ -0,0 +1,5 @@ +WebMagic-Avalon +======== +> Spiders Manage Web + +see [#issue43](https://github.com/code4craft/webmagic/issues/43) \ No newline at end of file From 6e882b78b5a083659b0e5dd50c0482f8839213b7 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 15 Mar 2014 20:19:39 +0800 Subject: [PATCH 008/130] add DynamicClassDao and UT --- webmagic-avalon/pom.xml | 182 ++++++++++-------- webmagic-avalon/sql/create-tables.sql | 9 + webmagic-avalon/webmagic-admin/pom.xml | 74 +------ .../webmagic-avalon-common/pom.xml | 128 ++++++++++++ .../webmagic/dao/DynamicClassDao.java | 13 ++ .../webmagic/model/DynamicClass.java | 49 +++++ 
.../src/main/resources/freemarker.properties | 7 + .../src/main/resources/log/log4j.xml | 21 ++ .../spring/applicationContext-freemarker.xml | 34 ++++ .../spring/applicationContext-myBatis.xml | 23 +++ .../resources/spring/applicationContext.xml | 44 +++++ .../us/codecraft/dao/DynamicClassDaoTest.java | 33 ++++ .../spring/applicationContext-freemarker.xml | 34 ++++ .../spring/applicationContext-myBatis.xml | 23 +++ .../spring/applicationContext-tx.xml | 18 ++ .../resources/spring/applicationContext.xml | 44 +++++ webmagic-avalon/webmagic-worker/pom.xml | 74 +------ 17 files changed, 585 insertions(+), 225 deletions(-) create mode 100644 webmagic-avalon/sql/create-tables.sql create mode 100644 webmagic-avalon/webmagic-avalon-common/pom.xml create mode 100644 webmagic-avalon/webmagic-avalon-common/src/main/java/us/codecraft/webmagic/dao/DynamicClassDao.java create mode 100644 webmagic-avalon/webmagic-avalon-common/src/main/java/us/codecraft/webmagic/model/DynamicClass.java create mode 100644 webmagic-avalon/webmagic-avalon-common/src/main/resources/freemarker.properties create mode 100644 webmagic-avalon/webmagic-avalon-common/src/main/resources/log/log4j.xml create mode 100644 webmagic-avalon/webmagic-avalon-common/src/main/resources/spring/applicationContext-freemarker.xml create mode 100644 webmagic-avalon/webmagic-avalon-common/src/main/resources/spring/applicationContext-myBatis.xml create mode 100644 webmagic-avalon/webmagic-avalon-common/src/main/resources/spring/applicationContext.xml create mode 100644 webmagic-avalon/webmagic-avalon-common/src/test/java/us/codecraft/dao/DynamicClassDaoTest.java create mode 100644 webmagic-avalon/webmagic-avalon-common/src/test/resources/spring/applicationContext-freemarker.xml create mode 100644 webmagic-avalon/webmagic-avalon-common/src/test/resources/spring/applicationContext-myBatis.xml create mode 100644 webmagic-avalon/webmagic-avalon-common/src/test/resources/spring/applicationContext-tx.xml create mode 100644 
webmagic-avalon/webmagic-avalon-common/src/test/resources/spring/applicationContext.xml diff --git a/webmagic-avalon/pom.xml b/webmagic-avalon/pom.xml index 21a4ca6..e74af11 100644 --- a/webmagic-avalon/pom.xml +++ b/webmagic-avalon/pom.xml @@ -1,5 +1,6 @@ - + webmagic-parent us.codecraft @@ -15,95 +16,118 @@ forger webmagic-admin webmagic-worker + webmagic-avalon-common - - - us.codecraft - webmagic-scripts - ${project.version} - + + - - org.mybatis - mybatis - 3.1.1 - + + us.codecraft + webmagic-scripts + ${project.version} + - - org.mybatis - mybatis-spring - 1.1.1 - + + org.mybatis + mybatis + 3.1.1 + - - us.codecraft - forger - 0.1.0-SNAPSHOT - + + org.mybatis + mybatis-spring + 1.1.1 + - - org.freemarker - freemarker - 2.3.19 - + + us.codecraft + forger + 0.1.0-SNAPSHOT + - - org.springframework - spring-test - ${spring-version} - test - + + org.freemarker + freemarker + 2.3.19 + - - org.springframework - spring-aop - ${spring-version} - + + org.springframework + spring-test + ${spring-version} + test + - - org.aspectj - aspectjrt - 1.7.2 - - - org.aspectj - aspectjweaver - 1.7.2 - - - org.springframework - spring-core - ${spring-version} - - - org.springframework - spring-webmvc - ${spring-version} - + + org.assertj + assertj-core + 1.5.0 + test + - - javax.servlet - javax.servlet-api - 3.0.1 - - - org.springframework - spring-context - ${spring-version} - - - org.springframework - spring-context-support - ${spring-version} - - - com.alibaba - fastjson - 1.1.37 - + + mysql + mysql-connector-java + 5.1.18 + - + + commons-dbcp + commons-dbcp + 1.3 + + + + org.springframework + spring-aop + ${spring-version} + + + + org.aspectj + aspectjrt + 1.7.2 + + + org.aspectj + aspectjweaver + 1.7.2 + + + org.springframework + spring-core + ${spring-version} + + + org.springframework + spring-webmvc + ${spring-version} + + + + javax.servlet + javax.servlet-api + 3.0.1 + + + org.springframework + spring-context + ${spring-version} + + + org.springframework + 
spring-context-support + ${spring-version} + + + com.alibaba + fastjson + 1.1.37 + + + + diff --git a/webmagic-avalon/sql/create-tables.sql b/webmagic-avalon/sql/create-tables.sql new file mode 100644 index 0000000..9312e01 --- /dev/null +++ b/webmagic-avalon/sql/create-tables.sql @@ -0,0 +1,9 @@ +CREATE TABLE `DynamicClass` ( + `Id` int(11) unsigned NOT NULL AUTO_INCREMENT, + `ClassName` varchar(20) NOT NULL, + `SourceCode` text NOT NULL, + `AddTime` datetime NOT NULL, + `UpdateTime` datetime NOT NULL, + PRIMARY KEY (`Id`), + UNIQUE KEY `un_class_name` (`ClassName`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8; \ No newline at end of file diff --git a/webmagic-avalon/webmagic-admin/pom.xml b/webmagic-avalon/webmagic-admin/pom.xml index e989782..b3a05b8 100644 --- a/webmagic-avalon/webmagic-admin/pom.xml +++ b/webmagic-avalon/webmagic-admin/pom.xml @@ -14,82 +14,10 @@ us.codecraft - webmagic-scripts + webmagic-avalon-common ${project.version} - - org.mybatis - mybatis - 3.1.1 - - - - org.mybatis - mybatis-spring - 1.1.1 - - - - org.freemarker - freemarker - 2.3.19 - - - org.springframework - spring-test - ${spring-version} - test - - - - org.springframework - spring-aop - ${spring-version} - - - - org.aspectj - aspectjrt - 1.7.2 - - - org.aspectj - aspectjweaver - 1.7.2 - - - org.springframework - spring-core - ${spring-version} - - - org.springframework - spring-webmvc - ${spring-version} - - - - javax.servlet - javax.servlet-api - 3.0.1 - - - org.springframework - spring-context - ${spring-version} - - - org.springframework - spring-context-support - ${spring-version} - - - com.alibaba - fastjson - 1.1.37 - - diff --git a/webmagic-avalon/webmagic-avalon-common/pom.xml b/webmagic-avalon/webmagic-avalon-common/pom.xml new file mode 100644 index 0000000..e7c6380 --- /dev/null +++ b/webmagic-avalon/webmagic-avalon-common/pom.xml @@ -0,0 +1,128 @@ + + + + webmagic-avalon + us.codecraft + 0.5.0-SNAPSHOT + + 4.0.0 + + webmagic-avalon-common + jar + + + + us.codecraft + 
webmagic-extension + ${project.version} + + + + org.mybatis + mybatis + + + + org.mybatis + mybatis-spring + + + + org.freemarker + freemarker + + + + org.springframework + spring-test + test + + + + org.assertj + assertj-core + test + + + + junit + junit + + + + mysql + mysql-connector-java + + + + commons-dbcp + commons-dbcp + + + + org.springframework + spring-aop + ${spring-version} + + + + org.aspectj + aspectjrt + + + org.aspectj + aspectjweaver + + + org.springframework + spring-core + + + org.springframework + spring-webmvc + + + + javax.servlet + javax.servlet-api + + + org.springframework + spring-context + + + org.springframework + spring-context-support + + + com.alibaba + fastjson + + + + + + + maven-deploy-plugin + + true + + + + org.apache.maven.plugins + maven-jar-plugin + 2.4 + + + + true + ./lib/ + us.codecraft.webmagic.main.QuickStarter + + + + + + + + diff --git a/webmagic-avalon/webmagic-avalon-common/src/main/java/us/codecraft/webmagic/dao/DynamicClassDao.java b/webmagic-avalon/webmagic-avalon-common/src/main/java/us/codecraft/webmagic/dao/DynamicClassDao.java new file mode 100644 index 0000000..1e7a6e3 --- /dev/null +++ b/webmagic-avalon/webmagic-avalon-common/src/main/java/us/codecraft/webmagic/dao/DynamicClassDao.java @@ -0,0 +1,13 @@ +package us.codecraft.webmagic.dao; + +import org.apache.ibatis.annotations.Insert; +import us.codecraft.webmagic.model.DynamicClass; + +/** + * @author code4crafter@gmail.com + */ +public interface DynamicClassDao { + + @Insert("insert into DynamicClass (`ClassName`,`SourceCode`,`AddTime`,`UpdateTime`) values (#{className},#{sourceCode},now(),now())") + public int add(DynamicClass dynamicClass); +} diff --git a/webmagic-avalon/webmagic-avalon-common/src/main/java/us/codecraft/webmagic/model/DynamicClass.java b/webmagic-avalon/webmagic-avalon-common/src/main/java/us/codecraft/webmagic/model/DynamicClass.java new file mode 100644 index 0000000..4809128 --- /dev/null +++ 
b/webmagic-avalon/webmagic-avalon-common/src/main/java/us/codecraft/webmagic/model/DynamicClass.java @@ -0,0 +1,49 @@ +package us.codecraft.webmagic.model; + +import java.util.Date; + +/** + * @author code4crafter@gmail.com + */ +public class DynamicClass { + + private String className; + + private String sourceCode; + + private Date addTime; + + private Date updateTime; + + public String getClassName() { + return className; + } + + public void setClassName(String className) { + this.className = className; + } + + public String getSourceCode() { + return sourceCode; + } + + public void setSourceCode(String sourceCode) { + this.sourceCode = sourceCode; + } + + public Date getAddTime() { + return addTime; + } + + public void setAddTime(Date addTime) { + this.addTime = addTime; + } + + public Date getUpdateTime() { + return updateTime; + } + + public void setUpdateTime(Date updateTime) { + this.updateTime = updateTime; + } +} diff --git a/webmagic-avalon/webmagic-avalon-common/src/main/resources/freemarker.properties b/webmagic-avalon/webmagic-avalon-common/src/main/resources/freemarker.properties new file mode 100644 index 0000000..dbed67f --- /dev/null +++ b/webmagic-avalon/webmagic-avalon-common/src/main/resources/freemarker.properties @@ -0,0 +1,7 @@ +number_format=# +classic_compatible=true + +default_encoding=UTF-8 +template_update_delay=0 +######################### +template_exception_handler=rethrow diff --git a/webmagic-avalon/webmagic-avalon-common/src/main/resources/log/log4j.xml b/webmagic-avalon/webmagic-avalon-common/src/main/resources/log/log4j.xml new file mode 100644 index 0000000..c2b5a2f --- /dev/null +++ b/webmagic-avalon/webmagic-avalon-common/src/main/resources/log/log4j.xml @@ -0,0 +1,21 @@ + + + + + + + + + + + + + + + + + + + + + diff --git a/webmagic-avalon/webmagic-avalon-common/src/main/resources/spring/applicationContext-freemarker.xml b/webmagic-avalon/webmagic-avalon-common/src/main/resources/spring/applicationContext-freemarker.xml new 
file mode 100644 index 0000000..e7b98aa --- /dev/null +++ b/webmagic-avalon/webmagic-avalon-common/src/main/resources/spring/applicationContext-freemarker.xml @@ -0,0 +1,34 @@ + + + + + + + + + 0 + zh_CN + yyyy-MM-dd HH:mm:ss + yyyy-MM-dd + #.## + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/webmagic-avalon/webmagic-avalon-common/src/main/resources/spring/applicationContext-myBatis.xml b/webmagic-avalon/webmagic-avalon-common/src/main/resources/spring/applicationContext-myBatis.xml new file mode 100644 index 0000000..bf54793 --- /dev/null +++ b/webmagic-avalon/webmagic-avalon-common/src/main/resources/spring/applicationContext-myBatis.xml @@ -0,0 +1,23 @@ + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/webmagic-avalon/webmagic-avalon-common/src/main/resources/spring/applicationContext.xml b/webmagic-avalon/webmagic-avalon-common/src/main/resources/spring/applicationContext.xml new file mode 100644 index 0000000..3296bdd --- /dev/null +++ b/webmagic-avalon/webmagic-avalon-common/src/main/resources/spring/applicationContext.xml @@ -0,0 +1,44 @@ + + + + + + + + web_messages + + + + + + + + + + + + + text/html;charset=UTF-8 + + + + + + + + + + + + + \ No newline at end of file diff --git a/webmagic-avalon/webmagic-avalon-common/src/test/java/us/codecraft/dao/DynamicClassDaoTest.java b/webmagic-avalon/webmagic-avalon-common/src/test/java/us/codecraft/dao/DynamicClassDaoTest.java new file mode 100644 index 0000000..b5008bd --- /dev/null +++ b/webmagic-avalon/webmagic-avalon-common/src/test/java/us/codecraft/dao/DynamicClassDaoTest.java @@ -0,0 +1,33 @@ +package us.codecraft.dao; + +import org.junit.Test; +import org.junit.runner.RunWith; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.test.annotation.Rollback; +import org.springframework.test.context.ContextConfiguration; +import org.springframework.test.context.junit4.SpringJUnit4ClassRunner; +import 
org.springframework.transaction.annotation.Transactional; +import us.codecraft.webmagic.dao.DynamicClassDao; +import us.codecraft.webmagic.model.DynamicClass; + +/** + * @author code4crafter@gmail.com + */ +@RunWith(SpringJUnit4ClassRunner.class) +@ContextConfiguration(locations = {"classpath:/spring/applicationContext-*.xml"}) +@Transactional +public class DynamicClassDaoTest { + + @Autowired + private DynamicClassDao dynamicClassDao; + + @Test + @Transactional + @Rollback(true) + public void testAdd() throws Exception { + DynamicClass dynamicClass = new DynamicClass(); + dynamicClass.setClassName("test"); + dynamicClass.setSourceCode("testSource"); + dynamicClassDao.add(dynamicClass); + } +} diff --git a/webmagic-avalon/webmagic-avalon-common/src/test/resources/spring/applicationContext-freemarker.xml b/webmagic-avalon/webmagic-avalon-common/src/test/resources/spring/applicationContext-freemarker.xml new file mode 100644 index 0000000..e7b98aa --- /dev/null +++ b/webmagic-avalon/webmagic-avalon-common/src/test/resources/spring/applicationContext-freemarker.xml @@ -0,0 +1,34 @@ + + + + + + + + + 0 + zh_CN + yyyy-MM-dd HH:mm:ss + yyyy-MM-dd + #.## + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/webmagic-avalon/webmagic-avalon-common/src/test/resources/spring/applicationContext-myBatis.xml b/webmagic-avalon/webmagic-avalon-common/src/test/resources/spring/applicationContext-myBatis.xml new file mode 100644 index 0000000..bf54793 --- /dev/null +++ b/webmagic-avalon/webmagic-avalon-common/src/test/resources/spring/applicationContext-myBatis.xml @@ -0,0 +1,23 @@ + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/webmagic-avalon/webmagic-avalon-common/src/test/resources/spring/applicationContext-tx.xml b/webmagic-avalon/webmagic-avalon-common/src/test/resources/spring/applicationContext-tx.xml new file mode 100644 index 0000000..79421a2 --- /dev/null +++ 
b/webmagic-avalon/webmagic-avalon-common/src/test/resources/spring/applicationContext-tx.xml @@ -0,0 +1,18 @@ + + + + + + + + + + + \ No newline at end of file diff --git a/webmagic-avalon/webmagic-avalon-common/src/test/resources/spring/applicationContext.xml b/webmagic-avalon/webmagic-avalon-common/src/test/resources/spring/applicationContext.xml new file mode 100644 index 0000000..3296bdd --- /dev/null +++ b/webmagic-avalon/webmagic-avalon-common/src/test/resources/spring/applicationContext.xml @@ -0,0 +1,44 @@ + + + + + + + + web_messages + + + + + + + + + + + + + text/html;charset=UTF-8 + + + + + + + + + + + + + \ No newline at end of file diff --git a/webmagic-avalon/webmagic-worker/pom.xml b/webmagic-avalon/webmagic-worker/pom.xml index fd8edd5..f7833a0 100644 --- a/webmagic-avalon/webmagic-worker/pom.xml +++ b/webmagic-avalon/webmagic-worker/pom.xml @@ -13,81 +13,9 @@ us.codecraft - webmagic-scripts + webmagic-avalon-common ${project.version} - - - org.mybatis - mybatis - 3.1.1 - - - - org.mybatis - mybatis-spring - 1.1.1 - - - - org.freemarker - freemarker - 2.3.19 - - - org.springframework - spring-test - ${spring-version} - test - - - - org.springframework - spring-aop - ${spring-version} - - - - org.aspectj - aspectjrt - 1.7.2 - - - org.aspectj - aspectjweaver - 1.7.2 - - - org.springframework - spring-core - ${spring-version} - - - org.springframework - spring-webmvc - ${spring-version} - - - - javax.servlet - javax.servlet-api - 3.0.1 - - - org.springframework - spring-context - ${spring-version} - - - org.springframework - spring-context-support - ${spring-version} - - - com.alibaba - fastjson - 1.1.37 - From 3eaa5c4d2331ee8ecce4f779908e6e7e39f751f7 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 15 Mar 2014 21:05:13 +0800 Subject: [PATCH 009/130] ignore dao test after test pass --- .../src/test/java/us/codecraft/dao/DynamicClassDaoTest.java | 2 ++ 1 file changed, 2 insertions(+) diff --git 
a/webmagic-avalon/webmagic-avalon-common/src/test/java/us/codecraft/dao/DynamicClassDaoTest.java b/webmagic-avalon/webmagic-avalon-common/src/test/java/us/codecraft/dao/DynamicClassDaoTest.java index b5008bd..73df642 100644 --- a/webmagic-avalon/webmagic-avalon-common/src/test/java/us/codecraft/dao/DynamicClassDaoTest.java +++ b/webmagic-avalon/webmagic-avalon-common/src/test/java/us/codecraft/dao/DynamicClassDaoTest.java @@ -1,5 +1,6 @@ package us.codecraft.dao; +import org.junit.Ignore; import org.junit.Test; import org.junit.runner.RunWith; import org.springframework.beans.factory.annotation.Autowired; @@ -13,6 +14,7 @@ import us.codecraft.webmagic.model.DynamicClass; /** * @author code4crafter@gmail.com */ +@Ignore @RunWith(SpringJUnit4ClassRunner.class) @ContextConfiguration(locations = {"classpath:/spring/applicationContext-*.xml"}) @Transactional From f763764a21ba84ebad4cccdc830a57ed9162790c Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 15 Mar 2014 21:09:44 +0800 Subject: [PATCH 010/130] add travis ci support for submodule --- .travis.yml | 2 ++ .../us/codecraft/{ => webmagic}/dao/DynamicClassDaoTest.java | 3 +-- 2 files changed, 3 insertions(+), 2 deletions(-) rename webmagic-avalon/webmagic-avalon-common/src/test/java/us/codecraft/{ => webmagic}/dao/DynamicClassDaoTest.java (93%) diff --git a/.travis.yml b/.travis.yml index c7c99f4..c18b7b6 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,3 +2,5 @@ language: java jdk: - oraclejdk7 - openjdk6 +before_install: + - git submodule update --init --recursive diff --git a/webmagic-avalon/webmagic-avalon-common/src/test/java/us/codecraft/dao/DynamicClassDaoTest.java b/webmagic-avalon/webmagic-avalon-common/src/test/java/us/codecraft/webmagic/dao/DynamicClassDaoTest.java similarity index 93% rename from webmagic-avalon/webmagic-avalon-common/src/test/java/us/codecraft/dao/DynamicClassDaoTest.java rename to 
webmagic-avalon/webmagic-avalon-common/src/test/java/us/codecraft/webmagic/dao/DynamicClassDaoTest.java index 73df642..8f2a942 100644 --- a/webmagic-avalon/webmagic-avalon-common/src/test/java/us/codecraft/dao/DynamicClassDaoTest.java +++ b/webmagic-avalon/webmagic-avalon-common/src/test/java/us/codecraft/webmagic/dao/DynamicClassDaoTest.java @@ -1,4 +1,4 @@ -package us.codecraft.dao; +package us.codecraft.webmagic.dao; import org.junit.Ignore; import org.junit.Test; @@ -8,7 +8,6 @@ import org.springframework.test.annotation.Rollback; import org.springframework.test.context.ContextConfiguration; import org.springframework.test.context.junit4.SpringJUnit4ClassRunner; import org.springframework.transaction.annotation.Transactional; -import us.codecraft.webmagic.dao.DynamicClassDao; import us.codecraft.webmagic.model.DynamicClass; /** From 7036950f281a7e027af87fb786bb7dcda6708388 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 15 Mar 2014 21:11:39 +0800 Subject: [PATCH 011/130] update submodule url --- .gitmodules | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitmodules b/.gitmodules index 3c25d06..67d52cc 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,3 @@ [submodule "webmagic-avalon/forger"] path = webmagic-avalon/forger - url = git://github.com/someuser/somelibrary.git + url = git://github.com/code4craft/forger.git From 09c43efc969ccf044281c24c75059c369a677dcd Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 15 Mar 2014 21:56:12 +0800 Subject: [PATCH 012/130] service and mock --- pom.xml | 6 +++ webmagic-avalon/sql/create-tables.sql | 2 +- .../webmagic-avalon-common/pom.xml | 11 +++++ .../webmagic/service/DynamicClassService.java | 10 ++++ .../service/impl/DynamicClassServiceImpl.java | 34 ++++++++++++++ .../spring/applicationContext-service.xml | 22 +++++++++ .../test/java/us/codecraft/webmagic/Foo.java | 45 ++++++++++++++++++ .../service/DynamicClassServiceImplTest.java | 47 +++++++++++++++++++ 
.../resources/spring/applicationContext.xml | 1 + 9 files changed, 177 insertions(+), 1 deletion(-) create mode 100644 webmagic-avalon/webmagic-avalon-common/src/main/java/us/codecraft/webmagic/service/DynamicClassService.java create mode 100644 webmagic-avalon/webmagic-avalon-common/src/main/java/us/codecraft/webmagic/service/impl/DynamicClassServiceImpl.java create mode 100644 webmagic-avalon/webmagic-avalon-common/src/main/resources/spring/applicationContext-service.xml create mode 100644 webmagic-avalon/webmagic-avalon-common/src/test/java/us/codecraft/webmagic/Foo.java create mode 100644 webmagic-avalon/webmagic-avalon-common/src/test/java/us/codecraft/webmagic/service/DynamicClassServiceImplTest.java diff --git a/pom.xml b/pom.xml index 86f4885..bcc219c 100644 --- a/pom.xml +++ b/pom.xml @@ -125,6 +125,12 @@ jsoup 1.7.2
+ + org.mockito + mockito-all + 1.9.5 + test +
diff --git a/webmagic-avalon/sql/create-tables.sql b/webmagic-avalon/sql/create-tables.sql index 9312e01..6c361b1 100644 --- a/webmagic-avalon/sql/create-tables.sql +++ b/webmagic-avalon/sql/create-tables.sql @@ -1,6 +1,6 @@ CREATE TABLE `DynamicClass` ( `Id` int(11) unsigned NOT NULL AUTO_INCREMENT, - `ClassName` varchar(20) NOT NULL, + `ClassName` varchar(200) NOT NULL, `SourceCode` text NOT NULL, `AddTime` datetime NOT NULL, `UpdateTime` datetime NOT NULL, diff --git a/webmagic-avalon/webmagic-avalon-common/pom.xml b/webmagic-avalon/webmagic-avalon-common/pom.xml index e7c6380..388c4f1 100644 --- a/webmagic-avalon/webmagic-avalon-common/pom.xml +++ b/webmagic-avalon/webmagic-avalon-common/pom.xml @@ -22,6 +22,12 @@ mybatis + + us.codecraft + forger + 0.1.0-SNAPSHOT + + org.mybatis mybatis-spring @@ -82,6 +88,11 @@ spring-webmvc + + org.mockito + mockito-all + + javax.servlet javax.servlet-api diff --git a/webmagic-avalon/webmagic-avalon-common/src/main/java/us/codecraft/webmagic/service/DynamicClassService.java b/webmagic-avalon/webmagic-avalon-common/src/main/java/us/codecraft/webmagic/service/DynamicClassService.java new file mode 100644 index 0000000..389ab01 --- /dev/null +++ b/webmagic-avalon/webmagic-avalon-common/src/main/java/us/codecraft/webmagic/service/DynamicClassService.java @@ -0,0 +1,10 @@ +package us.codecraft.webmagic.service; + +/** + * @author code4crafter@gmail.com + */ +public interface DynamicClassService { + + public String compileAndSave(String sourceCode); + +} diff --git a/webmagic-avalon/webmagic-avalon-common/src/main/java/us/codecraft/webmagic/service/impl/DynamicClassServiceImpl.java b/webmagic-avalon/webmagic-avalon-common/src/main/java/us/codecraft/webmagic/service/impl/DynamicClassServiceImpl.java new file mode 100644 index 0000000..c063a50 --- /dev/null +++ b/webmagic-avalon/webmagic-avalon-common/src/main/java/us/codecraft/webmagic/service/impl/DynamicClassServiceImpl.java @@ -0,0 +1,34 @@ +package 
us.codecraft.webmagic.service.impl; + +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.stereotype.Service; +import us.codecraft.forger.Forger; +import us.codecraft.forger.ForgerFactory; +import us.codecraft.webmagic.dao.DynamicClassDao; +import us.codecraft.webmagic.model.DynamicClass; +import us.codecraft.webmagic.service.DynamicClassService; + +/** + * @author code4crafter@gmail.com + */ +@Service +public class DynamicClassServiceImpl implements DynamicClassService { + + @Autowired + private DynamicClassDao dynamicClassDao; + + @Autowired + private ForgerFactory forgerFactory; + + @Override + public String compileAndSave(String sourceCode) { + Forger forger = forgerFactory.compile(sourceCode); + String className = forger.getClazz().getCanonicalName(); + DynamicClass dynamicClass = new DynamicClass(); + dynamicClass.setClassName(className); + dynamicClass.setSourceCode(sourceCode); + dynamicClassDao.add(dynamicClass); + return className; + } + +} diff --git a/webmagic-avalon/webmagic-avalon-common/src/main/resources/spring/applicationContext-service.xml b/webmagic-avalon/webmagic-avalon-common/src/main/resources/spring/applicationContext-service.xml new file mode 100644 index 0000000..f854d3d --- /dev/null +++ b/webmagic-avalon/webmagic-avalon-common/src/main/resources/spring/applicationContext-service.xml @@ -0,0 +1,22 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/webmagic-avalon/webmagic-avalon-common/src/test/java/us/codecraft/webmagic/Foo.java b/webmagic-avalon/webmagic-avalon-common/src/test/java/us/codecraft/webmagic/Foo.java new file mode 100644 index 0000000..9078eb4 --- /dev/null +++ b/webmagic-avalon/webmagic-avalon-common/src/test/java/us/codecraft/webmagic/Foo.java @@ -0,0 +1,45 @@ +package us.codecraft.webmagic; + +import us.codecraft.forger.property.Inject; +import us.codecraft.forger.property.format.Formatter; + +/** + * @author code4crafter@gmail.com + */ +public class Foo { + + 
@Formatter("") + @Inject("fooa") + private String foo; + + public static final String SOURCE_CODE="package us.codecraft.webmagic;\n" + + "\n" + + "import us.codecraft.forger.property.Inject;\n" + + "import us.codecraft.forger.property.format.Formatter;\n" + + "\n" + + "/**\n" + + " * @author code4crafter@gmail.com\n" + + " */\n" + + "public class Foo {\n" + + "\n" + + " @Formatter(\"\")\n" + + " @Inject(\"fooa\")\n" + + " private String foo;\n" + + "\n" + + " public String getFoo() {\n" + + " return foo;\n" + + " }\n" + + "\n" + + " public String foo() {\n" + + " return foo;\n" + + " }\n" + + "}"; + + public String getFoo() { + return foo; + } + + public String foo() { + return foo; + } +} diff --git a/webmagic-avalon/webmagic-avalon-common/src/test/java/us/codecraft/webmagic/service/DynamicClassServiceImplTest.java b/webmagic-avalon/webmagic-avalon-common/src/test/java/us/codecraft/webmagic/service/DynamicClassServiceImplTest.java new file mode 100644 index 0000000..108de62 --- /dev/null +++ b/webmagic-avalon/webmagic-avalon-common/src/test/java/us/codecraft/webmagic/service/DynamicClassServiceImplTest.java @@ -0,0 +1,47 @@ +package us.codecraft.webmagic.service; + +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.mockito.InjectMocks; +import org.mockito.Mock; +import org.mockito.MockitoAnnotations; +import org.mockito.Spy; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.test.context.ContextConfiguration; +import org.springframework.test.context.junit4.SpringJUnit4ClassRunner; +import us.codecraft.forger.ForgerFactory; +import us.codecraft.webmagic.Foo; +import us.codecraft.webmagic.dao.DynamicClassDao; +import us.codecraft.webmagic.service.impl.DynamicClassServiceImpl; + +import static org.assertj.core.api.Assertions.assertThat; + +/** + * @author code4crafter@gmail.com + */ +@RunWith(SpringJUnit4ClassRunner.class) +@ContextConfiguration(locations = 
{"classpath*:/spring/applicationContext*.xml"}) +public class DynamicClassServiceImplTest { + + @Before + public void setUp(){ + MockitoAnnotations.initMocks(this); + } + + @Spy + @Autowired + private ForgerFactory forgerFactory; + + @InjectMocks + private DynamicClassService dynamicClassService = new DynamicClassServiceImpl(); + + @Mock + private DynamicClassDao dynamicClassDao; + + @Test + public void testCompileAndSave() throws Exception { + String className = dynamicClassService.compileAndSave(Foo.SOURCE_CODE); + assertThat(className).isEqualTo("us.codecraft.webmagic.Foo"); + } +} diff --git a/webmagic-avalon/webmagic-avalon-common/src/test/resources/spring/applicationContext.xml b/webmagic-avalon/webmagic-avalon-common/src/test/resources/spring/applicationContext.xml index 3296bdd..550d810 100644 --- a/webmagic-avalon/webmagic-avalon-common/src/test/resources/spring/applicationContext.xml +++ b/webmagic-avalon/webmagic-avalon-common/src/test/resources/spring/applicationContext.xml @@ -9,6 +9,7 @@ http://www.springframework.org/schema/beans/spring-beans-4.0.xsd http://www.springframework.org/schema/context http://www.springframework.org/schema/context/spring-context-4.0.xsd"> + From 9a4eab44bef700077d3affbbb7758adf74dba2b3 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sun, 16 Mar 2014 08:48:04 +0800 Subject: [PATCH 013/130] add DynamicClassException to DynamicClassService.compileAndSave --- .../exception/DynamicClassCompileException.java | 15 +++++++++++++++ .../webmagic/service/DynamicClassService.java | 4 +++- .../service/impl/DynamicClassServiceImpl.java | 13 ++++++++++--- .../service/DynamicClassServiceImplTest.java | 17 ++++++++++++++--- 4 files changed, 42 insertions(+), 7 deletions(-) create mode 100644 webmagic-avalon/webmagic-avalon-common/src/main/java/us/codecraft/webmagic/exception/DynamicClassCompileException.java diff --git a/webmagic-avalon/webmagic-avalon-common/src/main/java/us/codecraft/webmagic/exception/DynamicClassCompileException.java 
b/webmagic-avalon/webmagic-avalon-common/src/main/java/us/codecraft/webmagic/exception/DynamicClassCompileException.java new file mode 100644 index 0000000..8512ae5 --- /dev/null +++ b/webmagic-avalon/webmagic-avalon-common/src/main/java/us/codecraft/webmagic/exception/DynamicClassCompileException.java @@ -0,0 +1,15 @@ +package us.codecraft.webmagic.exception; + +/** + * @author code4crafter@gmail.com + */ +public class DynamicClassCompileException extends Exception{ + + public DynamicClassCompileException(String message) { + super(message); + } + + public DynamicClassCompileException(String message, Throwable cause) { + super(message, cause); + } +} diff --git a/webmagic-avalon/webmagic-avalon-common/src/main/java/us/codecraft/webmagic/service/DynamicClassService.java b/webmagic-avalon/webmagic-avalon-common/src/main/java/us/codecraft/webmagic/service/DynamicClassService.java index 389ab01..1cd719c 100644 --- a/webmagic-avalon/webmagic-avalon-common/src/main/java/us/codecraft/webmagic/service/DynamicClassService.java +++ b/webmagic-avalon/webmagic-avalon-common/src/main/java/us/codecraft/webmagic/service/DynamicClassService.java @@ -1,10 +1,12 @@ package us.codecraft.webmagic.service; +import us.codecraft.webmagic.exception.DynamicClassCompileException; + /** * @author code4crafter@gmail.com */ public interface DynamicClassService { - public String compileAndSave(String sourceCode); + public Class compileAndSave(String sourceCode) throws DynamicClassCompileException; } diff --git a/webmagic-avalon/webmagic-avalon-common/src/main/java/us/codecraft/webmagic/service/impl/DynamicClassServiceImpl.java b/webmagic-avalon/webmagic-avalon-common/src/main/java/us/codecraft/webmagic/service/impl/DynamicClassServiceImpl.java index c063a50..ec83efd 100644 --- a/webmagic-avalon/webmagic-avalon-common/src/main/java/us/codecraft/webmagic/service/impl/DynamicClassServiceImpl.java +++ 
b/webmagic-avalon/webmagic-avalon-common/src/main/java/us/codecraft/webmagic/service/impl/DynamicClassServiceImpl.java @@ -1,10 +1,12 @@ package us.codecraft.webmagic.service.impl; +import org.codehaus.groovy.control.CompilationFailedException; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Service; import us.codecraft.forger.Forger; import us.codecraft.forger.ForgerFactory; import us.codecraft.webmagic.dao.DynamicClassDao; +import us.codecraft.webmagic.exception.DynamicClassCompileException; import us.codecraft.webmagic.model.DynamicClass; import us.codecraft.webmagic.service.DynamicClassService; @@ -21,14 +23,19 @@ public class DynamicClassServiceImpl implements DynamicClassService { private ForgerFactory forgerFactory; @Override - public String compileAndSave(String sourceCode) { - Forger forger = forgerFactory.compile(sourceCode); + public Class compileAndSave(String sourceCode) throws DynamicClassCompileException { + Forger forger; + try { + forger = forgerFactory.compile(sourceCode); + } catch (CompilationFailedException e) { + throw new DynamicClassCompileException(e.getMessage(),e); + } String className = forger.getClazz().getCanonicalName(); DynamicClass dynamicClass = new DynamicClass(); dynamicClass.setClassName(className); dynamicClass.setSourceCode(sourceCode); dynamicClassDao.add(dynamicClass); - return className; + return forger.getClazz(); } } diff --git a/webmagic-avalon/webmagic-avalon-common/src/test/java/us/codecraft/webmagic/service/DynamicClassServiceImplTest.java b/webmagic-avalon/webmagic-avalon-common/src/test/java/us/codecraft/webmagic/service/DynamicClassServiceImplTest.java index 108de62..185252b 100644 --- a/webmagic-avalon/webmagic-avalon-common/src/test/java/us/codecraft/webmagic/service/DynamicClassServiceImplTest.java +++ b/webmagic-avalon/webmagic-avalon-common/src/test/java/us/codecraft/webmagic/service/DynamicClassServiceImplTest.java @@ -13,9 +13,11 @@ import 
org.springframework.test.context.junit4.SpringJUnit4ClassRunner; import us.codecraft.forger.ForgerFactory; import us.codecraft.webmagic.Foo; import us.codecraft.webmagic.dao.DynamicClassDao; +import us.codecraft.webmagic.exception.DynamicClassCompileException; import us.codecraft.webmagic.service.impl.DynamicClassServiceImpl; import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.failBecauseExceptionWasNotThrown; /** * @author code4crafter@gmail.com @@ -25,7 +27,7 @@ import static org.assertj.core.api.Assertions.assertThat; public class DynamicClassServiceImplTest { @Before - public void setUp(){ + public void setUp() { MockitoAnnotations.initMocks(this); } @@ -41,7 +43,16 @@ public class DynamicClassServiceImplTest { @Test public void testCompileAndSave() throws Exception { - String className = dynamicClassService.compileAndSave(Foo.SOURCE_CODE); - assertThat(className).isEqualTo("us.codecraft.webmagic.Foo"); + Class aClass = dynamicClassService.compileAndSave(Foo.SOURCE_CODE); + assertThat(aClass.getCanonicalName()).isEqualTo("us.codecraft.webmagic.Foo"); + } + + @Test + public void testCompileFail() { + try { + dynamicClassService.compileAndSave("class s(("); + failBecauseExceptionWasNotThrown(DynamicClassCompileException.class); + } catch (DynamicClassCompileException e) { + } } } From 316980973c8e52fe9e5a12d7f6d3d909e8d84d8d Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sun, 16 Mar 2014 18:10:05 +0800 Subject: [PATCH 014/130] invite h2 as test and standalone db, add spring profile for switch --- .../webmagic-avalon-common/pom.xml | 14 ++++++ .../webmagic/dao/DynamicClassDao.java | 2 - .../main/resources/mapper/DynamicClass.xml | 32 +++++++++++++ .../spring/applicationContext-datasource.xml | 41 +++++++++++++++++ .../spring/applicationContext-myBatis.xml | 30 ++++++++----- .../spring/applicationContext-tx.xml | 0 .../resources/spring/applicationContext.xml | 4 ++ .../src/main/resources/sql/h2/schema.sql | 8 
++++ .../src/main/resources/sql/mysql/schema.sql} | 0 .../us/codecraft/webmagic/AbstractTest.java | 17 +++++++ .../webmagic/dao/DynamicClassDaoTest.java | 11 +---- .../service/DynamicClassServiceImplTest.java | 8 +--- .../spring/applicationContext-freemarker.xml | 34 -------------- .../spring/applicationContext-myBatis.xml | 23 ---------- .../resources/spring/applicationContext.xml | 45 ------------------- 15 files changed, 140 insertions(+), 129 deletions(-) create mode 100644 webmagic-avalon/webmagic-avalon-common/src/main/resources/mapper/DynamicClass.xml create mode 100644 webmagic-avalon/webmagic-avalon-common/src/main/resources/spring/applicationContext-datasource.xml rename webmagic-avalon/webmagic-avalon-common/src/{test => main}/resources/spring/applicationContext-tx.xml (100%) create mode 100644 webmagic-avalon/webmagic-avalon-common/src/main/resources/sql/h2/schema.sql rename webmagic-avalon/{sql/create-tables.sql => webmagic-avalon-common/src/main/resources/sql/mysql/schema.sql} (100%) create mode 100644 webmagic-avalon/webmagic-avalon-common/src/test/java/us/codecraft/webmagic/AbstractTest.java delete mode 100644 webmagic-avalon/webmagic-avalon-common/src/test/resources/spring/applicationContext-freemarker.xml delete mode 100644 webmagic-avalon/webmagic-avalon-common/src/test/resources/spring/applicationContext-myBatis.xml delete mode 100644 webmagic-avalon/webmagic-avalon-common/src/test/resources/spring/applicationContext.xml diff --git a/webmagic-avalon/webmagic-avalon-common/pom.xml b/webmagic-avalon/webmagic-avalon-common/pom.xml index 388c4f1..e15d715 100644 --- a/webmagic-avalon/webmagic-avalon-common/pom.xml +++ b/webmagic-avalon/webmagic-avalon-common/pom.xml @@ -11,6 +11,7 @@ jar + us.codecraft webmagic-extension @@ -75,19 +76,28 @@ org.aspectj aspectjrt + org.aspectj aspectjweaver + org.springframework spring-core + org.springframework spring-webmvc + + com.h2database + h2 + 1.3.175 + + org.mockito mockito-all @@ -97,18 +107,22 @@ 
javax.servlet javax.servlet-api + org.springframework spring-context + org.springframework spring-context-support + com.alibaba fastjson + diff --git a/webmagic-avalon/webmagic-avalon-common/src/main/java/us/codecraft/webmagic/dao/DynamicClassDao.java b/webmagic-avalon/webmagic-avalon-common/src/main/java/us/codecraft/webmagic/dao/DynamicClassDao.java index 1e7a6e3..b3d93ad 100644 --- a/webmagic-avalon/webmagic-avalon-common/src/main/java/us/codecraft/webmagic/dao/DynamicClassDao.java +++ b/webmagic-avalon/webmagic-avalon-common/src/main/java/us/codecraft/webmagic/dao/DynamicClassDao.java @@ -1,6 +1,5 @@ package us.codecraft.webmagic.dao; -import org.apache.ibatis.annotations.Insert; import us.codecraft.webmagic.model.DynamicClass; /** @@ -8,6 +7,5 @@ import us.codecraft.webmagic.model.DynamicClass; */ public interface DynamicClassDao { - @Insert("insert into DynamicClass (`ClassName`,`SourceCode`,`AddTime`,`UpdateTime`) values (#{className},#{sourceCode},now(),now())") public int add(DynamicClass dynamicClass); } diff --git a/webmagic-avalon/webmagic-avalon-common/src/main/resources/mapper/DynamicClass.xml b/webmagic-avalon/webmagic-avalon-common/src/main/resources/mapper/DynamicClass.xml new file mode 100644 index 0000000..1e09b7f --- /dev/null +++ b/webmagic-avalon/webmagic-avalon-common/src/main/resources/mapper/DynamicClass.xml @@ -0,0 +1,32 @@ + + + + + + + + insert into DynamicClass (`ClassName`,`SourceCode`,`AddTime`,`UpdateTime`) + values (#{className},#{sourceCode},now(),now()) + + + + insert into DynamicClass (`ClassName`,`SourceCode`,`AddTime`,`UpdateTime`) + values (#{className},#{sourceCode},now(),now()) + + + \ No newline at end of file diff --git a/webmagic-avalon/webmagic-avalon-common/src/main/resources/spring/applicationContext-datasource.xml b/webmagic-avalon/webmagic-avalon-common/src/main/resources/spring/applicationContext-datasource.xml new file mode 100644 index 0000000..7d468af --- /dev/null +++ 
b/webmagic-avalon/webmagic-avalon-common/src/main/resources/spring/applicationContext-datasource.xml @@ -0,0 +1,41 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/webmagic-avalon/webmagic-avalon-common/src/main/resources/spring/applicationContext-myBatis.xml b/webmagic-avalon/webmagic-avalon-common/src/main/resources/spring/applicationContext-myBatis.xml index bf54793..e8a8629 100644 --- a/webmagic-avalon/webmagic-avalon-common/src/main/resources/spring/applicationContext-myBatis.xml +++ b/webmagic-avalon/webmagic-avalon-common/src/main/resources/spring/applicationContext-myBatis.xml @@ -4,20 +4,30 @@ xsi:schemaLocation="http://www.springframework.org/schema/beans http://www.springframework.org/schema/beans/spring-beans-3.0.xsd"> - - - - - - - - - + + + + sqlserver + db2 + oracle + mysql + h2 + + + + + + + + + + + + \ No newline at end of file diff --git a/webmagic-avalon/webmagic-avalon-common/src/test/resources/spring/applicationContext-tx.xml b/webmagic-avalon/webmagic-avalon-common/src/main/resources/spring/applicationContext-tx.xml similarity index 100% rename from webmagic-avalon/webmagic-avalon-common/src/test/resources/spring/applicationContext-tx.xml rename to webmagic-avalon/webmagic-avalon-common/src/main/resources/spring/applicationContext-tx.xml diff --git a/webmagic-avalon/webmagic-avalon-common/src/main/resources/spring/applicationContext.xml b/webmagic-avalon/webmagic-avalon-common/src/main/resources/spring/applicationContext.xml index 3296bdd..0e0e44b 100644 --- a/webmagic-avalon/webmagic-avalon-common/src/main/resources/spring/applicationContext.xml +++ b/webmagic-avalon/webmagic-avalon-common/src/main/resources/spring/applicationContext.xml @@ -11,6 +11,10 @@ http://www.springframework.org/schema/context/spring-context-4.0.xsd"> + + diff --git a/webmagic-avalon/webmagic-avalon-common/src/main/resources/sql/h2/schema.sql 
b/webmagic-avalon/webmagic-avalon-common/src/main/resources/sql/h2/schema.sql new file mode 100644 index 0000000..37c3758 --- /dev/null +++ b/webmagic-avalon/webmagic-avalon-common/src/main/resources/sql/h2/schema.sql @@ -0,0 +1,8 @@ +CREATE TABLE DynamicClass( + Id int(11) NOT NULL AUTO_INCREMENT PRIMARY KEY, + `ClassName` varchar(200) NOT NULL, + `SourceCode` text NOT NULL, + `AddTime` datetime NOT NULL, + `UpdateTime` datetime NOT NULL, + UNIQUE INDEX `un_class_name` (`ClassName`) +); \ No newline at end of file diff --git a/webmagic-avalon/sql/create-tables.sql b/webmagic-avalon/webmagic-avalon-common/src/main/resources/sql/mysql/schema.sql similarity index 100% rename from webmagic-avalon/sql/create-tables.sql rename to webmagic-avalon/webmagic-avalon-common/src/main/resources/sql/mysql/schema.sql diff --git a/webmagic-avalon/webmagic-avalon-common/src/test/java/us/codecraft/webmagic/AbstractTest.java b/webmagic-avalon/webmagic-avalon-common/src/test/java/us/codecraft/webmagic/AbstractTest.java new file mode 100644 index 0000000..dab122c --- /dev/null +++ b/webmagic-avalon/webmagic-avalon-common/src/test/java/us/codecraft/webmagic/AbstractTest.java @@ -0,0 +1,17 @@ +package us.codecraft.webmagic; + +import org.junit.runner.RunWith; +import org.springframework.test.context.ActiveProfiles; +import org.springframework.test.context.ContextConfiguration; +import org.springframework.test.context.junit4.SpringJUnit4ClassRunner; +import org.springframework.transaction.annotation.Transactional; + +/** + * @author code4crafter@gmail.com + */ +@RunWith(SpringJUnit4ClassRunner.class) +@ContextConfiguration(locations = {"classpath*:/spring/applicationContext*.xml"}) +@ActiveProfiles("test") +@Transactional +public abstract class AbstractTest { +} diff --git a/webmagic-avalon/webmagic-avalon-common/src/test/java/us/codecraft/webmagic/dao/DynamicClassDaoTest.java b/webmagic-avalon/webmagic-avalon-common/src/test/java/us/codecraft/webmagic/dao/DynamicClassDaoTest.java index 
8f2a942..86a9a15 100644 --- a/webmagic-avalon/webmagic-avalon-common/src/test/java/us/codecraft/webmagic/dao/DynamicClassDaoTest.java +++ b/webmagic-avalon/webmagic-avalon-common/src/test/java/us/codecraft/webmagic/dao/DynamicClassDaoTest.java @@ -1,23 +1,16 @@ package us.codecraft.webmagic.dao; -import org.junit.Ignore; import org.junit.Test; -import org.junit.runner.RunWith; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.test.annotation.Rollback; -import org.springframework.test.context.ContextConfiguration; -import org.springframework.test.context.junit4.SpringJUnit4ClassRunner; import org.springframework.transaction.annotation.Transactional; +import us.codecraft.webmagic.AbstractTest; import us.codecraft.webmagic.model.DynamicClass; /** * @author code4crafter@gmail.com */ -@Ignore -@RunWith(SpringJUnit4ClassRunner.class) -@ContextConfiguration(locations = {"classpath:/spring/applicationContext-*.xml"}) -@Transactional -public class DynamicClassDaoTest { +public class DynamicClassDaoTest extends AbstractTest { @Autowired private DynamicClassDao dynamicClassDao; diff --git a/webmagic-avalon/webmagic-avalon-common/src/test/java/us/codecraft/webmagic/service/DynamicClassServiceImplTest.java b/webmagic-avalon/webmagic-avalon-common/src/test/java/us/codecraft/webmagic/service/DynamicClassServiceImplTest.java index 185252b..92e213a 100644 --- a/webmagic-avalon/webmagic-avalon-common/src/test/java/us/codecraft/webmagic/service/DynamicClassServiceImplTest.java +++ b/webmagic-avalon/webmagic-avalon-common/src/test/java/us/codecraft/webmagic/service/DynamicClassServiceImplTest.java @@ -2,15 +2,13 @@ package us.codecraft.webmagic.service; import org.junit.Before; import org.junit.Test; -import org.junit.runner.RunWith; import org.mockito.InjectMocks; import org.mockito.Mock; import org.mockito.MockitoAnnotations; import org.mockito.Spy; import org.springframework.beans.factory.annotation.Autowired; -import 
org.springframework.test.context.ContextConfiguration; -import org.springframework.test.context.junit4.SpringJUnit4ClassRunner; import us.codecraft.forger.ForgerFactory; +import us.codecraft.webmagic.AbstractTest; import us.codecraft.webmagic.Foo; import us.codecraft.webmagic.dao.DynamicClassDao; import us.codecraft.webmagic.exception.DynamicClassCompileException; @@ -22,9 +20,7 @@ import static org.assertj.core.api.Assertions.failBecauseExceptionWasNotThrown; /** * @author code4crafter@gmail.com */ -@RunWith(SpringJUnit4ClassRunner.class) -@ContextConfiguration(locations = {"classpath*:/spring/applicationContext*.xml"}) -public class DynamicClassServiceImplTest { +public class DynamicClassServiceImplTest extends AbstractTest { @Before public void setUp() { diff --git a/webmagic-avalon/webmagic-avalon-common/src/test/resources/spring/applicationContext-freemarker.xml b/webmagic-avalon/webmagic-avalon-common/src/test/resources/spring/applicationContext-freemarker.xml deleted file mode 100644 index e7b98aa..0000000 --- a/webmagic-avalon/webmagic-avalon-common/src/test/resources/spring/applicationContext-freemarker.xml +++ /dev/null @@ -1,34 +0,0 @@ - - - - - - - - - 0 - zh_CN - yyyy-MM-dd HH:mm:ss - yyyy-MM-dd - #.## - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/webmagic-avalon/webmagic-avalon-common/src/test/resources/spring/applicationContext-myBatis.xml b/webmagic-avalon/webmagic-avalon-common/src/test/resources/spring/applicationContext-myBatis.xml deleted file mode 100644 index bf54793..0000000 --- a/webmagic-avalon/webmagic-avalon-common/src/test/resources/spring/applicationContext-myBatis.xml +++ /dev/null @@ -1,23 +0,0 @@ - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/webmagic-avalon/webmagic-avalon-common/src/test/resources/spring/applicationContext.xml b/webmagic-avalon/webmagic-avalon-common/src/test/resources/spring/applicationContext.xml deleted file mode 100644 index 550d810..0000000 --- 
a/webmagic-avalon/webmagic-avalon-common/src/test/resources/spring/applicationContext.xml +++ /dev/null @@ -1,45 +0,0 @@ - - - - - - - - - web_messages - - - - - - - - - - - - - text/html;charset=UTF-8 - - - - - - - - - - - - - \ No newline at end of file From 23d1151bc03562402a8d4e6450f959c6f0e10d52 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sun, 16 Mar 2014 18:18:17 +0800 Subject: [PATCH 015/130] seperate spring config file --- .../spring/applicationContext-component.xml | 24 +++++++++++++++++++ ...text.xml => applicationContext-webmvc.xml} | 19 +-------------- 2 files changed, 25 insertions(+), 18 deletions(-) create mode 100644 webmagic-avalon/webmagic-avalon-common/src/main/resources/spring/applicationContext-component.xml rename webmagic-avalon/webmagic-avalon-common/src/main/resources/spring/{applicationContext.xml => applicationContext-webmvc.xml} (67%) diff --git a/webmagic-avalon/webmagic-avalon-common/src/main/resources/spring/applicationContext-component.xml b/webmagic-avalon/webmagic-avalon-common/src/main/resources/spring/applicationContext-component.xml new file mode 100644 index 0000000..faba6ca --- /dev/null +++ b/webmagic-avalon/webmagic-avalon-common/src/main/resources/spring/applicationContext-component.xml @@ -0,0 +1,24 @@ + + + + + + + + web_messages + + + + + + + \ No newline at end of file diff --git a/webmagic-avalon/webmagic-avalon-common/src/main/resources/spring/applicationContext.xml b/webmagic-avalon/webmagic-avalon-common/src/main/resources/spring/applicationContext-webmvc.xml similarity index 67% rename from webmagic-avalon/webmagic-avalon-common/src/main/resources/spring/applicationContext.xml rename to webmagic-avalon/webmagic-avalon-common/src/main/resources/spring/applicationContext-webmvc.xml index 0e0e44b..340cfb2 100644 --- a/webmagic-avalon/webmagic-avalon-common/src/main/resources/spring/applicationContext.xml +++ b/webmagic-avalon/webmagic-avalon-common/src/main/resources/spring/applicationContext-webmvc.xml @@ 
-9,21 +9,6 @@ http://www.springframework.org/schema/beans/spring-beans-4.0.xsd http://www.springframework.org/schema/context http://www.springframework.org/schema/context/spring-context-4.0.xsd"> - - - - - - - - web_messages - - - - - @@ -41,8 +26,6 @@ - - - + \ No newline at end of file From c7169328c79c6eaf9ae38eec4fda3e0b22ff6880 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sun, 16 Mar 2014 19:19:11 +0800 Subject: [PATCH 016/130] refactor config resource --- .../spring/applicationContext-myBatis.xml | 21 --------- .../resources/spring/applicationContext.xml | 47 ------------------- .../src/main/webapp/WEB-INF/web.xml | 2 +- .../{ => config}/freemarker.properties | 0 .../main/resources/{ => config}/log/log4j.xml | 0 .../{ => config}/mapper/DynamicClass.xml | 0 .../spring/applicationContext-component.xml | 0 .../spring/applicationContext-datasource.xml | 0 .../spring/applicationContext-freemarker.xml | 0 .../spring/applicationContext-myBatis.xml | 2 +- .../spring/applicationContext-service.xml | 0 .../spring/applicationContext-tx.xml | 0 .../spring/applicationContext-webmvc.xml | 0 .../spring/applicationContext-freemarker.xml | 34 -------------- .../us/codecraft/webmagic/AbstractTest.java | 2 +- .../spring/applicationContext-freemarker.xml | 34 -------------- .../spring/applicationContext-myBatis.xml | 21 --------- .../resources/spring/applicationContext.xml | 45 ------------------ .../src/main/webapp/WEB-INF/web.xml | 2 +- 19 files changed, 4 insertions(+), 206 deletions(-) delete mode 100644 webmagic-avalon/webmagic-admin/src/main/resources/spring/applicationContext-myBatis.xml delete mode 100644 webmagic-avalon/webmagic-admin/src/main/resources/spring/applicationContext.xml rename webmagic-avalon/webmagic-avalon-common/src/main/resources/{ => config}/freemarker.properties (100%) rename webmagic-avalon/webmagic-avalon-common/src/main/resources/{ => config}/log/log4j.xml (100%) rename webmagic-avalon/webmagic-avalon-common/src/main/resources/{ => 
config}/mapper/DynamicClass.xml (100%) rename webmagic-avalon/webmagic-avalon-common/src/main/resources/{ => config}/spring/applicationContext-component.xml (100%) rename webmagic-avalon/webmagic-avalon-common/src/main/resources/{ => config}/spring/applicationContext-datasource.xml (100%) rename webmagic-avalon/{webmagic-admin/src/main/resources => webmagic-avalon-common/src/main/resources/config}/spring/applicationContext-freemarker.xml (100%) rename webmagic-avalon/webmagic-avalon-common/src/main/resources/{ => config}/spring/applicationContext-myBatis.xml (93%) rename webmagic-avalon/webmagic-avalon-common/src/main/resources/{ => config}/spring/applicationContext-service.xml (100%) rename webmagic-avalon/webmagic-avalon-common/src/main/resources/{ => config}/spring/applicationContext-tx.xml (100%) rename webmagic-avalon/webmagic-avalon-common/src/main/resources/{ => config}/spring/applicationContext-webmvc.xml (100%) delete mode 100644 webmagic-avalon/webmagic-avalon-common/src/main/resources/spring/applicationContext-freemarker.xml delete mode 100644 webmagic-avalon/webmagic-worker/src/main/resources/spring/applicationContext-freemarker.xml delete mode 100644 webmagic-avalon/webmagic-worker/src/main/resources/spring/applicationContext-myBatis.xml delete mode 100644 webmagic-avalon/webmagic-worker/src/main/resources/spring/applicationContext.xml diff --git a/webmagic-avalon/webmagic-admin/src/main/resources/spring/applicationContext-myBatis.xml b/webmagic-avalon/webmagic-admin/src/main/resources/spring/applicationContext-myBatis.xml deleted file mode 100644 index 222df02..0000000 --- a/webmagic-avalon/webmagic-admin/src/main/resources/spring/applicationContext-myBatis.xml +++ /dev/null @@ -1,21 +0,0 @@ - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/webmagic-avalon/webmagic-admin/src/main/resources/spring/applicationContext.xml b/webmagic-avalon/webmagic-admin/src/main/resources/spring/applicationContext.xml deleted file mode 100644 
index 7c19641..0000000 --- a/webmagic-avalon/webmagic-admin/src/main/resources/spring/applicationContext.xml +++ /dev/null @@ -1,47 +0,0 @@ - - - - - - - - web_messages - - - - - - - - - - - - - text/html;charset=UTF-8 - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/webmagic-avalon/webmagic-admin/src/main/webapp/WEB-INF/web.xml b/webmagic-avalon/webmagic-admin/src/main/webapp/WEB-INF/web.xml index eb253f3..4c255cd 100644 --- a/webmagic-avalon/webmagic-admin/src/main/webapp/WEB-INF/web.xml +++ b/webmagic-avalon/webmagic-admin/src/main/webapp/WEB-INF/web.xml @@ -7,7 +7,7 @@ contextConfigLocation - classpath*:spring/applicationContext*.xml, + classpath*:/config/spring/applicationContext*.xml, diff --git a/webmagic-avalon/webmagic-avalon-common/src/main/resources/freemarker.properties b/webmagic-avalon/webmagic-avalon-common/src/main/resources/config/freemarker.properties similarity index 100% rename from webmagic-avalon/webmagic-avalon-common/src/main/resources/freemarker.properties rename to webmagic-avalon/webmagic-avalon-common/src/main/resources/config/freemarker.properties diff --git a/webmagic-avalon/webmagic-avalon-common/src/main/resources/log/log4j.xml b/webmagic-avalon/webmagic-avalon-common/src/main/resources/config/log/log4j.xml similarity index 100% rename from webmagic-avalon/webmagic-avalon-common/src/main/resources/log/log4j.xml rename to webmagic-avalon/webmagic-avalon-common/src/main/resources/config/log/log4j.xml diff --git a/webmagic-avalon/webmagic-avalon-common/src/main/resources/mapper/DynamicClass.xml b/webmagic-avalon/webmagic-avalon-common/src/main/resources/config/mapper/DynamicClass.xml similarity index 100% rename from webmagic-avalon/webmagic-avalon-common/src/main/resources/mapper/DynamicClass.xml rename to webmagic-avalon/webmagic-avalon-common/src/main/resources/config/mapper/DynamicClass.xml diff --git a/webmagic-avalon/webmagic-avalon-common/src/main/resources/spring/applicationContext-component.xml 
b/webmagic-avalon/webmagic-avalon-common/src/main/resources/config/spring/applicationContext-component.xml similarity index 100% rename from webmagic-avalon/webmagic-avalon-common/src/main/resources/spring/applicationContext-component.xml rename to webmagic-avalon/webmagic-avalon-common/src/main/resources/config/spring/applicationContext-component.xml diff --git a/webmagic-avalon/webmagic-avalon-common/src/main/resources/spring/applicationContext-datasource.xml b/webmagic-avalon/webmagic-avalon-common/src/main/resources/config/spring/applicationContext-datasource.xml similarity index 100% rename from webmagic-avalon/webmagic-avalon-common/src/main/resources/spring/applicationContext-datasource.xml rename to webmagic-avalon/webmagic-avalon-common/src/main/resources/config/spring/applicationContext-datasource.xml diff --git a/webmagic-avalon/webmagic-admin/src/main/resources/spring/applicationContext-freemarker.xml b/webmagic-avalon/webmagic-avalon-common/src/main/resources/config/spring/applicationContext-freemarker.xml similarity index 100% rename from webmagic-avalon/webmagic-admin/src/main/resources/spring/applicationContext-freemarker.xml rename to webmagic-avalon/webmagic-avalon-common/src/main/resources/config/spring/applicationContext-freemarker.xml diff --git a/webmagic-avalon/webmagic-avalon-common/src/main/resources/spring/applicationContext-myBatis.xml b/webmagic-avalon/webmagic-avalon-common/src/main/resources/config/spring/applicationContext-myBatis.xml similarity index 93% rename from webmagic-avalon/webmagic-avalon-common/src/main/resources/spring/applicationContext-myBatis.xml rename to webmagic-avalon/webmagic-avalon-common/src/main/resources/config/spring/applicationContext-myBatis.xml index e8a8629..8601852 100644 --- a/webmagic-avalon/webmagic-avalon-common/src/main/resources/spring/applicationContext-myBatis.xml +++ b/webmagic-avalon/webmagic-avalon-common/src/main/resources/config/spring/applicationContext-myBatis.xml @@ -27,7 +27,7 @@ - + \ No 
newline at end of file diff --git a/webmagic-avalon/webmagic-avalon-common/src/main/resources/spring/applicationContext-service.xml b/webmagic-avalon/webmagic-avalon-common/src/main/resources/config/spring/applicationContext-service.xml similarity index 100% rename from webmagic-avalon/webmagic-avalon-common/src/main/resources/spring/applicationContext-service.xml rename to webmagic-avalon/webmagic-avalon-common/src/main/resources/config/spring/applicationContext-service.xml diff --git a/webmagic-avalon/webmagic-avalon-common/src/main/resources/spring/applicationContext-tx.xml b/webmagic-avalon/webmagic-avalon-common/src/main/resources/config/spring/applicationContext-tx.xml similarity index 100% rename from webmagic-avalon/webmagic-avalon-common/src/main/resources/spring/applicationContext-tx.xml rename to webmagic-avalon/webmagic-avalon-common/src/main/resources/config/spring/applicationContext-tx.xml diff --git a/webmagic-avalon/webmagic-avalon-common/src/main/resources/spring/applicationContext-webmvc.xml b/webmagic-avalon/webmagic-avalon-common/src/main/resources/config/spring/applicationContext-webmvc.xml similarity index 100% rename from webmagic-avalon/webmagic-avalon-common/src/main/resources/spring/applicationContext-webmvc.xml rename to webmagic-avalon/webmagic-avalon-common/src/main/resources/config/spring/applicationContext-webmvc.xml diff --git a/webmagic-avalon/webmagic-avalon-common/src/main/resources/spring/applicationContext-freemarker.xml b/webmagic-avalon/webmagic-avalon-common/src/main/resources/spring/applicationContext-freemarker.xml deleted file mode 100644 index e7b98aa..0000000 --- a/webmagic-avalon/webmagic-avalon-common/src/main/resources/spring/applicationContext-freemarker.xml +++ /dev/null @@ -1,34 +0,0 @@ - - - - - - - - - 0 - zh_CN - yyyy-MM-dd HH:mm:ss - yyyy-MM-dd - #.## - - - - - - - - - - - - - - - \ No newline at end of file diff --git 
a/webmagic-avalon/webmagic-avalon-common/src/test/java/us/codecraft/webmagic/AbstractTest.java b/webmagic-avalon/webmagic-avalon-common/src/test/java/us/codecraft/webmagic/AbstractTest.java index dab122c..b259a6d 100644 --- a/webmagic-avalon/webmagic-avalon-common/src/test/java/us/codecraft/webmagic/AbstractTest.java +++ b/webmagic-avalon/webmagic-avalon-common/src/test/java/us/codecraft/webmagic/AbstractTest.java @@ -10,7 +10,7 @@ import org.springframework.transaction.annotation.Transactional; * @author code4crafter@gmail.com */ @RunWith(SpringJUnit4ClassRunner.class) -@ContextConfiguration(locations = {"classpath*:/spring/applicationContext*.xml"}) +@ContextConfiguration(locations = {"classpath*:/config/spring/applicationContext*.xml"}) @ActiveProfiles("test") @Transactional public abstract class AbstractTest { diff --git a/webmagic-avalon/webmagic-worker/src/main/resources/spring/applicationContext-freemarker.xml b/webmagic-avalon/webmagic-worker/src/main/resources/spring/applicationContext-freemarker.xml deleted file mode 100644 index e7b98aa..0000000 --- a/webmagic-avalon/webmagic-worker/src/main/resources/spring/applicationContext-freemarker.xml +++ /dev/null @@ -1,34 +0,0 @@ - - - - - - - - - 0 - zh_CN - yyyy-MM-dd HH:mm:ss - yyyy-MM-dd - #.## - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/webmagic-avalon/webmagic-worker/src/main/resources/spring/applicationContext-myBatis.xml b/webmagic-avalon/webmagic-worker/src/main/resources/spring/applicationContext-myBatis.xml deleted file mode 100644 index 222df02..0000000 --- a/webmagic-avalon/webmagic-worker/src/main/resources/spring/applicationContext-myBatis.xml +++ /dev/null @@ -1,21 +0,0 @@ - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/webmagic-avalon/webmagic-worker/src/main/resources/spring/applicationContext.xml b/webmagic-avalon/webmagic-worker/src/main/resources/spring/applicationContext.xml deleted file mode 100644 index 1a2ac66..0000000 --- 
a/webmagic-avalon/webmagic-worker/src/main/resources/spring/applicationContext.xml +++ /dev/null @@ -1,45 +0,0 @@ - - - - - - - - web_messages - - - - - - - - - - - - - text/html;charset=UTF-8 - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/webmagic-avalon/webmagic-worker/src/main/webapp/WEB-INF/web.xml b/webmagic-avalon/webmagic-worker/src/main/webapp/WEB-INF/web.xml index eb253f3..4c255cd 100644 --- a/webmagic-avalon/webmagic-worker/src/main/webapp/WEB-INF/web.xml +++ b/webmagic-avalon/webmagic-worker/src/main/webapp/WEB-INF/web.xml @@ -7,7 +7,7 @@ contextConfigLocation - classpath*:spring/applicationContext*.xml, + classpath*:/config/spring/applicationContext*.xml, From 6577a75892f30fff67c7f0bc46914eb93af64ebd Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Mon, 17 Mar 2014 11:08:12 +0800 Subject: [PATCH 017/130] fix spring config path in web.xml of webmagic-worker --- webmagic-avalon/webmagic-worker/src/main/webapp/WEB-INF/web.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webmagic-avalon/webmagic-worker/src/main/webapp/WEB-INF/web.xml b/webmagic-avalon/webmagic-worker/src/main/webapp/WEB-INF/web.xml index 4c255cd..b521ed6 100644 --- a/webmagic-avalon/webmagic-worker/src/main/webapp/WEB-INF/web.xml +++ b/webmagic-avalon/webmagic-worker/src/main/webapp/WEB-INF/web.xml @@ -33,7 +33,7 @@ org.springframework.web.servlet.DispatcherServlet contextConfigLocation - classpath:/spring/applicationContext*.xml + classpath:/config/spring/applicationContext*.xml 1 From 6201fd6966ec2496d31192478a2ebe4a631bacd9 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Mon, 17 Mar 2014 23:01:58 +0800 Subject: [PATCH 018/130] add worker as container --- webmagic-avalon/webmagic-worker/pom.xml | 12 +++++ .../us/codecraft/webmagic/worker/Worker.java | 46 +++++++++++++++++++ .../codecraft/webmagic/worker/WorkerTest.java | 27 +++++++++++ .../java/us/codecraft/webmagic/Spider.java | 5 ++ 4 files changed, 90 insertions(+) create mode 100644 
webmagic-avalon/webmagic-worker/src/main/java/us/codecraft/webmagic/worker/Worker.java create mode 100644 webmagic-avalon/webmagic-worker/src/test/java/us/codecraft/webmagic/worker/WorkerTest.java diff --git a/webmagic-avalon/webmagic-worker/pom.xml b/webmagic-avalon/webmagic-worker/pom.xml index f7833a0..84e1d73 100644 --- a/webmagic-avalon/webmagic-worker/pom.xml +++ b/webmagic-avalon/webmagic-worker/pom.xml @@ -16,6 +16,18 @@ webmagic-avalon-common ${project.version} + + junit + junit + + + org.mockito + mockito-all + + + org.aspectj + aspectjrt + diff --git a/webmagic-avalon/webmagic-worker/src/main/java/us/codecraft/webmagic/worker/Worker.java b/webmagic-avalon/webmagic-worker/src/main/java/us/codecraft/webmagic/worker/Worker.java new file mode 100644 index 0000000..91867cc --- /dev/null +++ b/webmagic-avalon/webmagic-worker/src/main/java/us/codecraft/webmagic/worker/Worker.java @@ -0,0 +1,46 @@ +package us.codecraft.webmagic.worker; + +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.utils.ThreadUtils; + +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ExecutorService; + +/** + * @author code4crafter@gmail.com + */ +public class Worker { + + public static final int DEFAULT_POOL_SIZE = 10; + + private int poolSize; + + private ExecutorService executorService; + + private Map spiderMap; + + public Worker(int poolSize) { + this.poolSize = poolSize; + this.executorService = initExecutorService(); + this.spiderMap = new ConcurrentHashMap(); + } + + public Worker() { + this(DEFAULT_POOL_SIZE); + } + + protected ExecutorService initExecutorService() { + return ThreadUtils.newFixedThreadPool(poolSize); + } + + public void addSpider(Spider spider) { + spider.setExecutorService(executorService); + spiderMap.put(spider.getUUID(), spider); + } + + public Spider getSpider(String uuid){ + return spiderMap.get(uuid); + } + +} diff --git 
a/webmagic-avalon/webmagic-worker/src/test/java/us/codecraft/webmagic/worker/WorkerTest.java b/webmagic-avalon/webmagic-worker/src/test/java/us/codecraft/webmagic/worker/WorkerTest.java new file mode 100644 index 0000000..24bca19 --- /dev/null +++ b/webmagic-avalon/webmagic-worker/src/test/java/us/codecraft/webmagic/worker/WorkerTest.java @@ -0,0 +1,27 @@ +package us.codecraft.webmagic.worker; + +import org.junit.Test; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.processor.PageProcessor; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.mockito.Mockito.*; + +/** + * @author code4crafter@gmail.com + */ +public class WorkerTest { + + @Test + public void testWorkerAsSpiderContains() throws Exception { + PageProcessor pageProcessor = mock(PageProcessor.class); + Site site = mock(Site.class); + when(pageProcessor.getSite()).thenReturn(site); + when(site.getDomain()).thenReturn("codecraft.us"); + Worker worker = new Worker(); + Spider spider = Spider.create(pageProcessor); + worker.addSpider(spider); + assertThat(worker.getSpider("codecraft.us")).isEqualTo(spider); + } +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 0d52ac1..6fe2880 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -643,6 +643,11 @@ public class Spider implements Runnable, Task { return uuid; } + public Spider setExecutorService(ExecutorService executorService) { + this.executorService = executorService; + return this; + } + @Override public Site getSite() { return site; From 8b35d79569fae48d58087b4f8183b5324babc4e1 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 19 Mar 2014 22:19:06 +0800 Subject: [PATCH 019/130] Do not cache document in Selectable for selected Html element #73 --- 
.../src/main/resources/sql/mysql/schema.sql | 22 +++++++++++++++ .../avalon/web/DashBoardController.java | 20 ------------- .../us/codecraft/webmagic/worker/Worker.java | 2 ++ .../controller}/SpiderController.java | 13 +++++++-- .../us/codecraft/webmagic/selector/Html.java | 28 +++++++++++++------ .../webmagic/selector/SelectorTest.java | 26 +++++++++++++++++ 6 files changed, 79 insertions(+), 32 deletions(-) delete mode 100644 webmagic-avalon/webmagic-worker/src/main/java/us/codecraft/webmagic/avalon/web/DashBoardController.java rename webmagic-avalon/webmagic-worker/src/main/java/us/codecraft/webmagic/{avalon/web => worker/controller}/SpiderController.java (59%) create mode 100644 webmagic-core/src/test/java/us/codecraft/webmagic/selector/SelectorTest.java diff --git a/webmagic-avalon/webmagic-avalon-common/src/main/resources/sql/mysql/schema.sql b/webmagic-avalon/webmagic-avalon-common/src/main/resources/sql/mysql/schema.sql index 6c361b1..c75a884 100644 --- a/webmagic-avalon/webmagic-avalon-common/src/main/resources/sql/mysql/schema.sql +++ b/webmagic-avalon/webmagic-avalon-common/src/main/resources/sql/mysql/schema.sql @@ -6,4 +6,26 @@ CREATE TABLE `DynamicClass` ( `UpdateTime` datetime NOT NULL, PRIMARY KEY (`Id`), UNIQUE KEY `un_class_name` (`ClassName`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8; + +CREATE TABLE `Spider` ( + `Id` int(11) unsigned NOT NULL AUTO_INCREMENT, + `PageProcessorId` int(11) unsigned NOT NULL AUTO_INCREMENT, + `PipelineId` int(11) unsigned NOT NULL AUTO_INCREMENT, + `SchedulerId` int(11) unsigned NOT NULL AUTO_INCREMENT, + `Config` text NOT NULL, + `AddTime` datetime NOT NULL, + `UpdateTime` datetime NOT NULL, + PRIMARY KEY (`Id`), + UNIQUE KEY `un_class_name` (`ClassName`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8; + +CREATE TABLE `PageProcessor` ( + `Id` int(11) unsigned NOT NULL AUTO_INCREMENT, + `ClassName` varchar(200) NOT NULL, + `Params` text NOT NULL, + `AddTime` datetime NOT NULL, + `UpdateTime` datetime NOT NULL, + PRIMARY KEY 
(`Id`), + UNIQUE KEY `un_class_name` (`ClassName`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8; \ No newline at end of file diff --git a/webmagic-avalon/webmagic-worker/src/main/java/us/codecraft/webmagic/avalon/web/DashBoardController.java b/webmagic-avalon/webmagic-worker/src/main/java/us/codecraft/webmagic/avalon/web/DashBoardController.java deleted file mode 100644 index 3ef2a86..0000000 --- a/webmagic-avalon/webmagic-worker/src/main/java/us/codecraft/webmagic/avalon/web/DashBoardController.java +++ /dev/null @@ -1,20 +0,0 @@ -package us.codecraft.webmagic.avalon.web; - -import org.springframework.stereotype.Controller; -import org.springframework.web.bind.annotation.RequestMapping; -import org.springframework.web.servlet.ModelAndView; - -/** - * @author code4crafter@gmail.com - */ -@Controller("dashboard") -@RequestMapping("/") -public class DashBoardController { - - @RequestMapping - public ModelAndView index() { - ModelAndView map = new ModelAndView("dashboard"); - return map; - } - -} diff --git a/webmagic-avalon/webmagic-worker/src/main/java/us/codecraft/webmagic/worker/Worker.java b/webmagic-avalon/webmagic-worker/src/main/java/us/codecraft/webmagic/worker/Worker.java index 91867cc..a65c94b 100644 --- a/webmagic-avalon/webmagic-worker/src/main/java/us/codecraft/webmagic/worker/Worker.java +++ b/webmagic-avalon/webmagic-worker/src/main/java/us/codecraft/webmagic/worker/Worker.java @@ -8,6 +8,8 @@ import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ExecutorService; /** + * Container of Spiders. 
+ * * @author code4crafter@gmail.com */ public class Worker { diff --git a/webmagic-avalon/webmagic-worker/src/main/java/us/codecraft/webmagic/avalon/web/SpiderController.java b/webmagic-avalon/webmagic-worker/src/main/java/us/codecraft/webmagic/worker/controller/SpiderController.java similarity index 59% rename from webmagic-avalon/webmagic-worker/src/main/java/us/codecraft/webmagic/avalon/web/SpiderController.java rename to webmagic-avalon/webmagic-worker/src/main/java/us/codecraft/webmagic/worker/controller/SpiderController.java index 2f18569..d33b0da 100644 --- a/webmagic-avalon/webmagic-worker/src/main/java/us/codecraft/webmagic/avalon/web/SpiderController.java +++ b/webmagic-avalon/webmagic-worker/src/main/java/us/codecraft/webmagic/worker/controller/SpiderController.java @@ -1,8 +1,11 @@ -package us.codecraft.webmagic.avalon.web; +package us.codecraft.webmagic.worker.controller; +import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Controller; import org.springframework.web.bind.annotation.RequestMapping; +import org.springframework.web.bind.annotation.RequestParam; import org.springframework.web.bind.annotation.ResponseBody; +import us.codecraft.webmagic.worker.Worker; import java.util.HashMap; import java.util.Map; @@ -10,15 +13,19 @@ import java.util.Map; /** * @author code4crafter@gmail.com */ -@Controller("spider") +@Controller @RequestMapping("spider") public class SpiderController { + @Autowired + private Worker worker; + @RequestMapping("create") @ResponseBody - public Map create() { + public Map create(@RequestParam("id") String id) { HashMap map = new HashMap(); map.put("code", 200); return map; } + } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java index 3db0ff1..614b111 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java +++ 
b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java @@ -23,7 +23,7 @@ public class Html extends PlainText { */ private Document document; - private boolean init = false; + private boolean needInitCache = true; public Html(List strings) { super(strings); @@ -33,12 +33,22 @@ public class Html extends PlainText { super(text); } + public Html(List strings, boolean needInitCache) { + super(strings); + this.needInitCache = needInitCache; + } + + public Html(String text, boolean needInitCache) { + super(text); + this.needInitCache = needInitCache; + } + /** * lazy init */ private void initDocument() { - if (this.document == null && !init) { - init = true; + if (this.document == null && needInitCache) { + needInitCache = false; //just init once whether the parsing succeeds or not try { this.document = Jsoup.parse(getText()); @@ -67,7 +77,7 @@ public class Html extends PlainText { results.add(result); } } - return new Html(results); + return new Html(results, false); } @Override @@ -78,7 +88,7 @@ public class Html extends PlainText { List result = selector.selectList(string); results.addAll(result); } - return new Html(results); + return new Html(results, false); } @Override @@ -95,9 +105,9 @@ public class Html extends PlainText { @Override public Selectable xpath(String xpath) { - XpathSelector xpathSelector = new XpathSelector(xpath); + XpathSelector xpathSelector = Selectors.xpath(xpath); if (document != null) { - return new Html(xpathSelector.selectList(document)); + return new Html(xpathSelector.selectList(document), false); } return selectList(xpathSelector, strings); } @@ -106,7 +116,7 @@ public class Html extends PlainText { public Selectable $(String selector) { CssSelector cssSelector = Selectors.$(selector); if (document != null) { - return new Html(cssSelector.selectList(document)); + return new Html(cssSelector.selectList(document), false); } return selectList(cssSelector, strings); } @@ -115,7 +125,7 @@ public class Html extends PlainText { 
public Selectable $(String selector, String attrName) { CssSelector cssSelector = Selectors.$(selector, attrName); if (document != null) { - return new Html(cssSelector.selectList(document)); + return new Html(cssSelector.selectList(document), false); } return selectList(cssSelector, strings); } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/SelectorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/SelectorTest.java new file mode 100644 index 0000000..249a837 --- /dev/null +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/SelectorTest.java @@ -0,0 +1,26 @@ +package us.codecraft.webmagic.selector; + +import org.junit.Test; + +import java.util.List; + +import static org.assertj.core.api.Assertions.assertThat; + +/** + * @author code4crafter@gmail.com + */ +public class SelectorTest { + + private String html = "
"; + + @Test + public void testChain() throws Exception { + Html selectable = new Html(html); + List linksWithoutChain = selectable.links().all(); + Selectable xpath = selectable.xpath("//div"); + List linksWithChainFirstCall = xpath.links().all(); + List linksWithChainSecondCall = xpath.links().all(); + assertThat(linksWithoutChain).hasSameSizeAs(linksWithChainFirstCall); + assertThat(linksWithChainFirstCall).hasSameSizeAs(linksWithChainSecondCall); + } +} From 90447bff027e1d03794d4d794a7e40e3a85596dd Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 22 Mar 2014 20:23:16 +0800 Subject: [PATCH 020/130] update xsoup to 0.2.1 #75 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index bcc219c..da4424f 100644 --- a/pom.xml +++ b/pom.xml @@ -88,7 +88,7 @@ us.codecraft xsoup - 0.2.0 + 0.2.1 com.alibaba From 8958d774f20a4780bca22acdc2e95eed4afacb16 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Mon, 24 Mar 2014 13:52:17 +0800 Subject: [PATCH 021/130] add default values for @Formatter --- .../webmagic/example/OschinaBlog.java | 2 +- .../webmagic/model/annotation/Formatter.java | 2 +- .../model/formatter/DateFormatter.java | 7 +++++-- .../src/main/resouces/log4j.xml | 21 +++++++++++++++++++ .../src/test/resouces/log4j.xml | 12 +---------- 5 files changed, 29 insertions(+), 15 deletions(-) create mode 100644 webmagic-extension/src/main/resouces/log4j.xml diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/OschinaBlog.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/OschinaBlog.java index e8ac20c..b527ea7 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/OschinaBlog.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/OschinaBlog.java @@ -26,11 +26,11 @@ public class OschinaBlog { @ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true) private List tags; - @Formatter("yyyy-MM-dd HH:mm") 
@ExtractBy("//div[@class='BlogStat']/regex('\\d+-\\d+-\\d+\\s+\\d+:\\d+')") private Date date; public static void main(String[] args) { + //results will be saved to "/data/webmagic/" in json format OOSpider.create(Site.me(), new JsonFilePageModelPipeline("/data/webmagic/"), OschinaBlog.class) .addUrl("http://my.oschina.net/flashsword/blog").run(); } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/Formatter.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/Formatter.java index e603c59..a3a56f8 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/Formatter.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/Formatter.java @@ -21,7 +21,7 @@ public @interface Formatter { * * @return formatter params */ - String[] value(); + String[] value() default ""; /** * Specific the class of field of class of elements in collection for field.
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/DateFormatter.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/DateFormatter.java index b0f6e77..6305d7b 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/DateFormatter.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/DateFormatter.java @@ -10,7 +10,8 @@ import java.util.Date; */ public class DateFormatter implements ObjectFormatter { - private String[] datePatterns = new String[]{"yyyy-MM-dd HH:mm"}; + public static final String[] DEFAULT_PATTERN = new String[]{"yyyy-MM-dd HH:mm"}; + private String[] datePatterns = DEFAULT_PATTERN; @Override public Date format(String raw) throws Exception { @@ -24,6 +25,8 @@ public class DateFormatter implements ObjectFormatter { @Override public void initParam(String[] extra) { - datePatterns = extra; + if (extra != null && !(extra.length == 1 && extra[0].length() == 0)) { + datePatterns = extra; + } } } diff --git a/webmagic-extension/src/main/resouces/log4j.xml b/webmagic-extension/src/main/resouces/log4j.xml new file mode 100644 index 0000000..c2b5a2f --- /dev/null +++ b/webmagic-extension/src/main/resouces/log4j.xml @@ -0,0 +1,21 @@ + + + + + + + + + + + + + + + + + + + + + diff --git a/webmagic-extension/src/test/resouces/log4j.xml b/webmagic-extension/src/test/resouces/log4j.xml index a58e889..c2b5a2f 100644 --- a/webmagic-extension/src/test/resouces/log4j.xml +++ b/webmagic-extension/src/test/resouces/log4j.xml @@ -8,23 +8,13 @@ - - - - - - - - - - - + From e008b7b85191900984e7199aa4b9da25c4e98d8f Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Mon, 24 Mar 2014 17:22:06 +0800 Subject: [PATCH 022/130] update zh_docs --- zh_docs/README.md | 68 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 52 insertions(+), 16 deletions(-) diff --git a/zh_docs/README.md b/zh_docs/README.md index c58469a..a0c75b2 100644 --- a/zh_docs/README.md 
+++ b/zh_docs/README.md @@ -1,9 +1,13 @@ -webmagic ---------- +![logo](https://raw.github.com/code4craft/webmagic/master/asserts/logo.jpg) + + [![Build Status](https://travis-ci.org/code4craft/webmagic.png?branch=master)](https://travis-ci.org/code4craft/webmagic) + [Readme in English](https://github.com/code4craft/webmagic/tree/master/en_docs) +[用户手册](https://github.com/code4craft/webmagic/blob/master/user-manual.md) + >webmagic是一个开源的Java垂直爬虫框架,目标是简化爬虫的开发流程,让开发者专注于逻辑功能的开发。webmagic的核心非常简单,但是覆盖爬虫的整个流程,也是很好的学习爬虫开发的材料。作者曾经在前公司进行过一年的垂直爬虫的开发,webmagic就是为了解决爬虫开发的一些重复劳动而产生的框架。 >web爬虫是一种技术,webmagic致力于将这种技术的实现成本降低,但是出于对资源提供者的尊重,webmagic不会做反封锁的事情,包括:验证码破解、代理切换、自动登录等。 @@ -25,6 +29,8 @@ python爬虫 **scrapy** [https://github.com/scrapy/scrapy](https://github.com/sc Java爬虫 **Spiderman** [https://gitcafe.com/laiweiwei/Spiderman](https://gitcafe.com/laiweiwei/Spiderman) +webmagic的github地址:[https://github.com/code4craft/webmagic](https://github.com/code4craft/webmagic)。 + ## 快速开始 ### 使用maven @@ -34,12 +40,12 @@ webmagic使用maven管理依赖,在项目中添加对应的依赖即可使用w us.codecraft webmagic-core - 0.4.2 + 0.4.3 us.codecraft webmagic-extension - 0.4.2 + 0.4.3 #### 项目结构 @@ -68,11 +74,7 @@ webmagic还包含两个可用的扩展包,因为这两个包都依赖了比较 ### 不使用maven -不使用maven的用户,可以下载这个二进制打包版本(感谢[oschina](http://www.oschina.net/)): - - git clone http://git.oschina.net/flashsword20/webmagic-bin.git - -在**bin/lib**目录下,有项目依赖的所有jar包,直接在IDE里import即可。 +在项目的**lib**目录下,有依赖的所有jar包,直接在IDE里import即可。 ### 第一个爬虫 @@ -80,10 +82,10 @@ webmagic还包含两个可用的扩展包,因为这两个包都依赖了比较 PageProcessor是webmagic-core的一部分,定制一个PageProcessor即可实现自己的爬虫逻辑。以下是抓取osc博客的一段代码: +```java public class OschinaBlogPageProcesser implements PageProcessor { - private Site site = Site.me().setDomain("my.oschina.net") - .addStartUrl("http://my.oschina.net/flashsword/blog"); + private Site site = Site.me().setDomain("my.oschina.net"); @Override public void process(Page page) { @@ -101,10 +103,12 @@ PageProcessor是webmagic-core的一部分,定制一个PageProcessor即可实 } public static void main(String[] args) { - 
Spider.create(new OschinaBlogPageProcesser()) - .pipeline(new ConsolePipeline()).run(); + Spider.create(new OschinaBlogPageProcesser()).addUrl("http://my.oschina.net/flashsword/blog") + .addPipeline(new ConsolePipeline()).run(); } } +``` + 这里通过page.addTargetRequests()方法来增加要抓取的URL,并通过page.putField()来保存抽取结果。page.getHtml().xpath()则是按照某个规则对结果进行抽取,这里抽取支持链式调用。调用结束后,toString()表示转化为单个String,all()则转化为一个String列表。 @@ -116,6 +120,7 @@ Spider是爬虫的入口类。Pipeline是结果输出和持久化的接口,这 webmagic-extension包括了注解方式编写爬虫的方法,只需基于一个POJO增加注解即可完成一个爬虫。以下仍然是抓取oschina博客的一段代码,功能与OschinaBlogPageProcesser完全相同: +```java @TargetUrl("http://my.oschina.net/flashsword/blog/\\d+") public class OschinaBlog { @@ -130,10 +135,11 @@ webmagic-extension包括了注解方式编写爬虫的方法,只需基于一 public static void main(String[] args) { OOSpider.create( - Site.me().addStartUrl("http://my.oschina.net/flashsword/blog"), - new ConsolePageModelPipeline(), OschinaBlog.class).run(); + Site.me(), + new ConsolePageModelPipeline(), OschinaBlog.class).addUrl("http://my.oschina.net/flashsword/blog").run(); } } +``` 这个例子定义了一个Model类,Model类的字段'title'、'content'、'tags'均为要抽取的属性。这个类在Pipeline里是可以复用的。 @@ -145,10 +151,40 @@ webmagic-extension包括了注解方式编写爬虫的方法,只需基于一 webmagic-samples目录里有一些定制PageProcessor以抽取不同站点的例子。 -作者还有一个使用webmagic进行抽取并持久化到数据库的项目[JobHunter](http://git.oschina.net/flashsword20/jobhunter)。这个项目整合了Spring,自定义了Pipeline,使用mybatis进行数据持久化。 +webmagic的使用可以参考:[oschina openapi 应用:博客搬家](http://my.oschina.net/oscfox/blog/194507) + ### 协议 webmagic遵循[Apache 2.0协议](http://opensource.org/licenses/Apache-2.0) +### 贡献者: +以下是为WebMagic提交过代码或者issue的朋友: + +* [yuany](https://github.com/yuany) +* [yxssfxwzy](https://github.com/yxssfxwzy) +* [linkerlin](https://github.com/linkerlin) +* [d0ngw](https://github.com/d0ngw) +* [xuchaoo](https://github.com/xuchaoo) +* [supermicah](https://github.com/supermicah) +* [SimpleExpress](https://github.com/SimpleExpress) +* [aruanruan](https://github.com/aruanruan) +* [l1z2g9](https://github.com/l1z2g9) +* 
[zhegexiaohuozi](https://github.com/zhegexiaohuozi) +* [ywooer](https://github.com/ywooer) +* [yyw258520](https://github.com/yyw258520) +* [perfecking](https://github.com/perfecking) +* [lidongyang](http://my.oschina.net/lidongyang) + +### 邮件组: + +Gmail: +[https://groups.google.com/forum/#!forum/webmagic-java](https://groups.google.com/forum/#!forum/webmagic-java) + +QQ: +[http://list.qq.com/cgi-bin/qf_invite?id=023a01f505246785f77c5a5a9aff4e57ab20fcdde871e988](http://list.qq.com/cgi-bin/qf_invite?id=023a01f505246785f77c5a5a9aff4e57ab20fcdde871e988) + +### QQ群: + +330192938 From c72483a2209468e9b0b7389a1ceb6feb90798932 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Mon, 24 Mar 2014 17:22:37 +0800 Subject: [PATCH 023/130] update en_docs --- en_docs/README.md | 67 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 46 insertions(+), 21 deletions(-) diff --git a/en_docs/README.md b/en_docs/README.md index 684da90..cccbf3f 100644 --- a/en_docs/README.md +++ b/en_docs/README.md @@ -1,10 +1,13 @@ -webmagic ---- +![logo](https://raw.github.com/code4craft/webmagic/master/asserts/logo.jpg) + [Readme in Chinese](https://github.com/code4craft/webmagic/tree/master/zh_docs) +[User Manual (Chinese)](https://github.com/code4craft/webmagic/blob/master/user-manual.md) + + [![Build Status](https://travis-ci.org/code4craft/webmagic.png?branch=master)](https://travis-ci.org/code4craft/webmagic) ->A scalable crawler framework. It covers the whole lifecycle of crawler: downloading, url management, content extraction and persistent. It can simply the development of a specific crawler. +>A scalable crawler framework. It covers the whole lifecycle of crawler: downloading, url management, content extraction and persistent. It can simplify the development of a specific crawler. ## Features: @@ -14,26 +17,19 @@ webmagic * Multi-thread and Distribution support. * Easy to be integrated. 
- ## Install: - -Clone the repo and build: - - git clone https://github.com/code4craft/webmagic.git - cd webmagic - mvn clean install - -Add dependencies to your project: + +Add dependencies to your pom.xml: us.codecraft webmagic-core - 0.4.2 + 0.4.3 us.codecraft webmagic-extension - 0.4.2 + 0.4.3 ## Get Started: @@ -42,10 +38,10 @@ Add dependencies to your project: Write a class implements PageProcessor: +```java public class OschinaBlogPageProcesser implements PageProcessor { - private Site site = Site.me().setDomain("my.oschina.net") - .addStartUrl("http://my.oschina.net/flashsword/blog"); + private Site site = Site.me().setDomain("my.oschina.net"); @Override public void process(Page page) { @@ -63,10 +59,11 @@ Write a class implements PageProcessor: } public static void main(String[] args) { - Spider.create(new OschinaBlogPageProcesser()) - .pipeline(new ConsolePipeline()).run(); + Spider.create(new OschinaBlogPageProcesser()).addUrl("http://my.oschina.net/flashsword/blog") + .addPipeline(new ConsolePipeline()).run(); } } +``` * `page.addTargetRequests(links)` @@ -74,6 +71,7 @@ Write a class implements PageProcessor: You can also use annotation way: +```java @TargetUrl("http://my.oschina.net/flashsword/blog/\\d+") public class OschinaBlog { @@ -88,10 +86,11 @@ You can also use annotation way: public static void main(String[] args) { OOSpider.create( - Site.me().addStartUrl("http://my.oschina.net/flashsword/blog"), - new ConsolePageModelPipeline(), OschinaBlog.class).run(); + Site.me(), + new ConsolePageModelPipeline(), OschinaBlog.class).addUrl("http://my.oschina.net/flashsword/blog").run(); } } +``` ### Docs and samples: @@ -103,11 +102,30 @@ Javadocs: [http://code4craft.github.io/webmagic/docs/en/](http://code4craft.gith There are some samples in `webmagic-samples` package. 
- ### Lisence: Lisenced under [Apache 2.0 lisence](http://opensource.org/licenses/Apache-2.0) +### Contributors: + +Thanks these people for commiting source code, reporting bugs or suggesting for new feature: + +* [yuany](https://github.com/yuany) +* [yxssfxwzy](https://github.com/yxssfxwzy) +* [linkerlin](https://github.com/linkerlin) +* [d0ngw](https://github.com/d0ngw) +* [xuchaoo](https://github.com/xuchaoo) +* [supermicah](https://github.com/supermicah) +* [SimpleExpress](https://github.com/SimpleExpress) +* [aruanruan](https://github.com/aruanruan) +* [l1z2g9](https://github.com/l1z2g9) +* [zhegexiaohuozi](https://github.com/zhegexiaohuozi) +* [ywooer](https://github.com/ywooer) +* [yyw258520](https://github.com/yyw258520) +* [perfecking](https://github.com/perfecking) +* [lidongyang](http://my.oschina.net/lidongyang) + + ### Thanks: To write webmagic, I refered to the projects below : @@ -124,3 +142,10 @@ To write webmagic, I refered to the projects below : [https://gitcafe.com/laiweiwei/Spiderman](https://gitcafe.com/laiweiwei/Spiderman) +### Mail-list: + +[https://groups.google.com/forum/#!forum/webmagic-java](https://groups.google.com/forum/#!forum/webmagic-java) + + +[![Bitdeli Badge](https://d2weczhvl823v0.cloudfront.net/code4craft/webmagic/trend.png)](https://bitdeli.com/free "Bitdeli Badge") + From 8d8194bee468bf259e28ee391358f9a56fbc6d45 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Tue, 25 Mar 2014 08:23:20 +0800 Subject: [PATCH 024/130] Change HashMap to LinkedHashMap in ResultItems for same order of input and output #76 --- .../us/codecraft/webmagic/ResultItems.java | 3 ++- .../codecraft/webmagic/ResultItemsTest.java | 22 +++++++++++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) create mode 100644 webmagic-core/src/test/java/us/codecraft/webmagic/ResultItemsTest.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java b/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java index 
4791e77..7b54361 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java @@ -1,6 +1,7 @@ package us.codecraft.webmagic; import java.util.HashMap; +import java.util.LinkedHashMap; import java.util.Map; /** @@ -14,7 +15,7 @@ import java.util.Map; */ public class ResultItems { - private Map fields = new HashMap(); + private Map fields = new LinkedHashMap(); private Request request; diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/ResultItemsTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/ResultItemsTest.java new file mode 100644 index 0000000..0aa9e94 --- /dev/null +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/ResultItemsTest.java @@ -0,0 +1,22 @@ +package us.codecraft.webmagic; + +import org.junit.Test; + + +import static org.assertj.core.api.Assertions.assertThat; + +/** + * @author code4crafter@gmail.com + */ +public class ResultItemsTest { + + @Test + public void testOrderOfEntries() throws Exception { + ResultItems resultItems = new ResultItems(); + resultItems.put("a", "a"); + resultItems.put("b", "b"); + resultItems.put("c", "c"); + assertThat(resultItems.getAll().keySet()).containsExactly("a","b","c"); + + } +} From 4e15a68ff4c26d812c074729c45d3578e8433529 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sun, 30 Mar 2014 09:43:35 +0800 Subject: [PATCH 025/130] update xsoup version to 0.2.2 Bugfix: Parsing error when separate chars in quotes cause in xsoup #77 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index da4424f..085e94e 100644 --- a/pom.xml +++ b/pom.xml @@ -88,7 +88,7 @@ us.codecraft xsoup - 0.2.1 + 0.2.2 com.alibaba From f3c2503a296c65067aee494aa45035df1840f55d Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Tue, 1 Apr 2014 07:42:23 +0800 Subject: [PATCH 026/130] add warning of slf4j #78 --- README.md | 14 ++++++++++++++ webmagic-extension/pom.xml | 6 ++++++ 2 
files changed, 20 insertions(+) diff --git a/README.md b/README.md index cccbf3f..a9f856e 100644 --- a/README.md +++ b/README.md @@ -21,6 +21,7 @@ Add dependencies to your pom.xml: +```xml us.codecraft webmagic-core @@ -31,6 +32,19 @@ Add dependencies to your pom.xml: webmagic-extension 0.4.3 +``` + +WebMagic use slf4j with slf4j-log4j12 implementation. If you customized your slf4j implementation, please exclude slf4j-log4j12. + +```xml + + + org.slf4j + slf4j-log4j12 + + +``` + ## Get Started: diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index cd8c12f..78f2757 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -23,6 +23,12 @@ us.codecraft webmagic-core ${project.version} + + + org.slf4j + slf4j-log4j12 + + junit From 6252042ed2ea857a574a42cf0b2741685b8d8e35 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Tue, 1 Apr 2014 08:02:22 +0800 Subject: [PATCH 027/130] add warning of slf4j #78 --- zh_docs/README.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/zh_docs/README.md b/zh_docs/README.md index a0c75b2..30c6056 100644 --- a/zh_docs/README.md +++ b/zh_docs/README.md @@ -37,6 +37,7 @@ webmagic的github地址:[https://github.com/code4craft/webmagic](https://githu webmagic使用maven管理依赖,在项目中添加对应的依赖即可使用webmagic: +```xml us.codecraft webmagic-core @@ -47,6 +48,18 @@ webmagic使用maven管理依赖,在项目中添加对应的依赖即可使用w webmagic-extension 0.4.3 +``` + +WebMagic 使用slf4j-log4j12作为slf4j的实现.如果你自己定制了slf4j的实现,请在项目中去掉此依赖。 + +```xml + + + org.slf4j + slf4j-log4j12 + + +``` #### 项目结构 From 7038c00a9a88821847e8aad2becb319f1ad39725 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Tue, 1 Apr 2014 08:03:47 +0800 Subject: [PATCH 028/130] reformat --- README.md | 102 +++++++++++++++++++++++++++--------------------------- 1 file changed, 51 insertions(+), 51 deletions(-) diff --git a/README.md b/README.md index a9f856e..1600a78 100644 --- a/README.md +++ b/README.md @@ -22,27 +22,27 @@ Add dependencies to your pom.xml: ```xml - - us.codecraft 
- webmagic-core - 0.4.3 - - - us.codecraft - webmagic-extension - 0.4.3 - + + us.codecraft + webmagic-core + 0.4.3 + + + us.codecraft + webmagic-extension + 0.4.3 + ``` WebMagic use slf4j with slf4j-log4j12 implementation. If you customized your slf4j implementation, please exclude slf4j-log4j12. ```xml - - - org.slf4j - slf4j-log4j12 - - + + + org.slf4j + slf4j-log4j12 + + ``` @@ -53,30 +53,30 @@ WebMagic use slf4j with slf4j-log4j12 implementation. If you customized your slf Write a class implements PageProcessor: ```java - public class OschinaBlogPageProcesser implements PageProcessor { +public class OschinaBlogPageProcesser implements PageProcessor { - private Site site = Site.me().setDomain("my.oschina.net"); + private Site site = Site.me().setDomain("my.oschina.net"); - @Override - public void process(Page page) { - List links = page.getHtml().links().regex("http://my\\.oschina\\.net/flashsword/blog/\\d+").all(); - page.addTargetRequests(links); - page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1").toString()); - page.putField("content", page.getHtml().$("div.content").toString()); - page.putField("tags",page.getHtml().xpath("//div[@class='BlogTags']/a/text()").all()); - } - - @Override - public Site getSite() { - return site; - - } - - public static void main(String[] args) { - Spider.create(new OschinaBlogPageProcesser()).addUrl("http://my.oschina.net/flashsword/blog") - .addPipeline(new ConsolePipeline()).run(); - } + @Override + public void process(Page page) { + List links = page.getHtml().links().regex("http://my\\.oschina\\.net/flashsword/blog/\\d+").all(); + page.addTargetRequests(links); + page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1").toString()); + page.putField("content", page.getHtml().$("div.content").toString()); + page.putField("tags",page.getHtml().xpath("//div[@class='BlogTags']/a/text()").all()); } + + @Override + public Site getSite() { + 
return site; + + } + + public static void main(String[] args) { + Spider.create(new OschinaBlogPageProcesser()).addUrl("http://my.oschina.net/flashsword/blog") + .addPipeline(new ConsolePipeline()).run(); + } +} ``` * `page.addTargetRequests(links)` @@ -86,24 +86,24 @@ Write a class implements PageProcessor: You can also use annotation way: ```java - @TargetUrl("http://my.oschina.net/flashsword/blog/\\d+") - public class OschinaBlog { +@TargetUrl("http://my.oschina.net/flashsword/blog/\\d+") +public class OschinaBlog { - @ExtractBy("//title") - private String title; + @ExtractBy("//title") + private String title; - @ExtractBy(value = "div.BlogContent",type = ExtractBy.Type.Css) - private String content; + @ExtractBy(value = "div.BlogContent",type = ExtractBy.Type.Css) + private String content; - @ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true) - private List tags; + @ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true) + private List tags; - public static void main(String[] args) { - OOSpider.create( - Site.me(), - new ConsolePageModelPipeline(), OschinaBlog.class).addUrl("http://my.oschina.net/flashsword/blog").run(); - } - } + public static void main(String[] args) { + OOSpider.create( + Site.me(), + new ConsolePageModelPipeline(), OschinaBlog.class).addUrl("http://my.oschina.net/flashsword/blog").run(); + } +} ``` ### Docs and samples: From b13f1da03905bc7c05295b7ec9afc7613f1fb13c Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Tue, 1 Apr 2014 08:04:43 +0800 Subject: [PATCH 029/130] reformat --- zh_docs/README.md | 102 +++++++++++++++++++++++----------------------- 1 file changed, 51 insertions(+), 51 deletions(-) diff --git a/zh_docs/README.md b/zh_docs/README.md index 30c6056..6d9ac99 100644 --- a/zh_docs/README.md +++ b/zh_docs/README.md @@ -38,27 +38,27 @@ webmagic的github地址:[https://github.com/code4craft/webmagic](https://githu webmagic使用maven管理依赖,在项目中添加对应的依赖即可使用webmagic: ```xml - - us.codecraft - webmagic-core - 0.4.3 
- - - us.codecraft - webmagic-extension - 0.4.3 - + + us.codecraft + webmagic-core + 0.4.3 + + + us.codecraft + webmagic-extension + 0.4.3 + ``` WebMagic 使用slf4j-log4j12作为slf4j的实现.如果你自己定制了slf4j的实现,请在项目中去掉此依赖。 ```xml - - - org.slf4j - slf4j-log4j12 - - + + + org.slf4j + slf4j-log4j12 + + ``` #### 项目结构 @@ -96,30 +96,30 @@ webmagic还包含两个可用的扩展包,因为这两个包都依赖了比较 PageProcessor是webmagic-core的一部分,定制一个PageProcessor即可实现自己的爬虫逻辑。以下是抓取osc博客的一段代码: ```java - public class OschinaBlogPageProcesser implements PageProcessor { +public class OschinaBlogPageProcesser implements PageProcessor { - private Site site = Site.me().setDomain("my.oschina.net"); + private Site site = Site.me().setDomain("my.oschina.net"); - @Override - public void process(Page page) { - List links = page.getHtml().links().regex("http://my\\.oschina\\.net/flashsword/blog/\\d+").all(); - page.addTargetRequests(links); - page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1").toString()); - page.putField("content", page.getHtml().$("div.content").toString()); - page.putField("tags",page.getHtml().xpath("//div[@class='BlogTags']/a/text()").all()); - } - - @Override - public Site getSite() { - return site; - - } - - public static void main(String[] args) { - Spider.create(new OschinaBlogPageProcesser()).addUrl("http://my.oschina.net/flashsword/blog") - .addPipeline(new ConsolePipeline()).run(); - } + @Override + public void process(Page page) { + List links = page.getHtml().links().regex("http://my\\.oschina\\.net/flashsword/blog/\\d+").all(); + page.addTargetRequests(links); + page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1").toString()); + page.putField("content", page.getHtml().$("div.content").toString()); + page.putField("tags",page.getHtml().xpath("//div[@class='BlogTags']/a/text()").all()); } + + @Override + public Site getSite() { + return site; + + } + + public static void main(String[] args) { + Spider.create(new 
OschinaBlogPageProcesser()).addUrl("http://my.oschina.net/flashsword/blog") + .addPipeline(new ConsolePipeline()).run(); + } +} ``` @@ -134,24 +134,24 @@ Spider是爬虫的入口类。Pipeline是结果输出和持久化的接口,这 webmagic-extension包括了注解方式编写爬虫的方法,只需基于一个POJO增加注解即可完成一个爬虫。以下仍然是抓取oschina博客的一段代码,功能与OschinaBlogPageProcesser完全相同: ```java - @TargetUrl("http://my.oschina.net/flashsword/blog/\\d+") - public class OschinaBlog { +@TargetUrl("http://my.oschina.net/flashsword/blog/\\d+") +public class OschinaBlog { - @ExtractBy("//title") - private String title; + @ExtractBy("//title") + private String title; - @ExtractBy(value = "div.BlogContent",type = ExtractBy.Type.Css) - private String content; + @ExtractBy(value = "div.BlogContent",type = ExtractBy.Type.Css) + private String content; - @ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true) - private List tags; + @ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true) + private List tags; - public static void main(String[] args) { - OOSpider.create( - Site.me(), - new ConsolePageModelPipeline(), OschinaBlog.class).addUrl("http://my.oschina.net/flashsword/blog").run(); - } - } + public static void main(String[] args) { + OOSpider.create( + Site.me(), + new ConsolePageModelPipeline(), OschinaBlog.class).addUrl("http://my.oschina.net/flashsword/blog").run(); + } +} ``` 这个例子定义了一个Model类,Model类的字段'title'、'content'、'tags'均为要抽取的属性。这个类在Pipeline里是可以复用的。 From d1563da33b59493d8632827131cab9bf9a71f99b Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Tue, 1 Apr 2014 08:07:25 +0800 Subject: [PATCH 030/130] add contributor --- README.md | 1 + zh_docs/README.md | 1 + 2 files changed, 2 insertions(+) diff --git a/README.md b/README.md index 1600a78..7eee9e3 100644 --- a/README.md +++ b/README.md @@ -137,6 +137,7 @@ Thanks these people for commiting source code, reporting bugs or suggesting for * [ywooer](https://github.com/ywooer) * [yyw258520](https://github.com/yyw258520) * [perfecking](https://github.com/perfecking) +* 
[ccliangbo](https://github.com/ccliangbo) * [lidongyang](http://my.oschina.net/lidongyang) diff --git a/zh_docs/README.md b/zh_docs/README.md index 6d9ac99..8d40752 100644 --- a/zh_docs/README.md +++ b/zh_docs/README.md @@ -188,6 +188,7 @@ webmagic遵循[Apache 2.0协议](http://opensource.org/licenses/Apache-2.0) * [ywooer](https://github.com/ywooer) * [yyw258520](https://github.com/yyw258520) * [perfecking](https://github.com/perfecking) +* [ccliangbo](https://github.com/ccliangbo) * [lidongyang](http://my.oschina.net/lidongyang) ### 邮件组: From 97b6f4628095fd6c46ddf087f9d32324c382809a Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Tue, 1 Apr 2014 20:12:25 +0800 Subject: [PATCH 031/130] Bugfix: break loop in addTargetRequests #81 --- webmagic-core/src/main/java/us/codecraft/webmagic/Page.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java index a22fbdc..e2d923e 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java @@ -94,7 +94,7 @@ public class Page { synchronized (targetRequests) { for (String s : requests) { if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) { - break; + continue; } s = UrlUtils.canonicalizeUrl(s, url.toString()); targetRequests.add(new Request(s)); From 2780423e60c1f17e08594519f32a29115b044631 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Tue, 1 Apr 2014 20:35:11 +0800 Subject: [PATCH 032/130] enable blank space in quotes in UrlUtils.fixAllRelativeHrefs #80 --- .../us/codecraft/webmagic/utils/UrlUtils.java | 23 +++++++++- .../webmagic/utils/UrlUtilsTest.java | 44 ++++++++++++++----- 2 files changed, 54 insertions(+), 13 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java index 
456b3cc..99b71e0 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java @@ -73,18 +73,37 @@ public class UrlUtils { return domain; } - private static Pattern patternForHref = Pattern.compile("(]*href=)[\"']{0,1}([^\"'<>\\s]*)[\"']{0,1}", Pattern.CASE_INSENSITIVE); + /** + * allow blank space in quote + */ + private static Pattern patternForHrefWithQuote = Pattern.compile("(]*href=)[\"']([^\"'<>]*)[\"']", Pattern.CASE_INSENSITIVE); + + /** + * disallow blank space without quote + */ + private static Pattern patternForHrefWithoutQuote = Pattern.compile("(]*href=)([^\"'<>\\s]+)", Pattern.CASE_INSENSITIVE); public static String fixAllRelativeHrefs(String html, String url) { + html = replaceByPattern(html, url, patternForHrefWithQuote); + html = replaceByPattern(html, url, patternForHrefWithoutQuote); + return html; + } + + public static String replaceByPattern(String html, String url, Pattern pattern) { StringBuilder stringBuilder = new StringBuilder(); - Matcher matcher = patternForHref.matcher(html); + Matcher matcher = pattern.matcher(html); int lastEnd = 0; + boolean modified = false; while (matcher.find()) { + modified = true; stringBuilder.append(StringUtils.substring(html, lastEnd, matcher.start())); stringBuilder.append(matcher.group(1)); stringBuilder.append("\"").append(canonicalizeUrl(matcher.group(2), url)).append("\""); lastEnd = matcher.end(); } + if (!modified) { + return html; + } stringBuilder.append(StringUtils.substring(html, lastEnd)); return stringBuilder.toString(); } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java index abe6adc..1e403c4 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java @@ -3,6 +3,8 @@ package 
us.codecraft.webmagic.utils; import org.junit.Assert; import org.junit.Test; +import static org.assertj.core.api.Assertions.assertThat; + /** * @author code4crafter@gmail.com
* Date: 13-4-21 @@ -12,19 +14,39 @@ public class UrlUtilsTest { @Test public void testFixRelativeUrl() { - String fixrelativeurl = UrlUtils.canonicalizeUrl("aa", "http://www.dianping.com/sh/ss/com"); - System.out.println("fix: " + fixrelativeurl); - Assert.assertEquals("http://www.dianping.com/sh/ss/aa", fixrelativeurl); + String absoluteUrl = UrlUtils.canonicalizeUrl("aa", "http://www.dianping.com/sh/ss/com"); + assertThat(absoluteUrl).isEqualTo("http://www.dianping.com/sh/ss/aa"); - fixrelativeurl = UrlUtils.canonicalizeUrl("../aa", "http://www.dianping.com/sh/ss/com"); - Assert.assertEquals("http://www.dianping.com/sh/aa", fixrelativeurl); + absoluteUrl = UrlUtils.canonicalizeUrl("../aa", "http://www.dianping.com/sh/ss/com"); + assertThat(absoluteUrl).isEqualTo("http://www.dianping.com/sh/aa"); - fixrelativeurl = UrlUtils.canonicalizeUrl("..aa", "http://www.dianping.com/sh/ss/com"); - Assert.assertEquals("http://www.dianping.com/sh/ss/..aa", fixrelativeurl); - fixrelativeurl = UrlUtils.canonicalizeUrl("../../aa", "http://www.dianping.com/sh/ss/com/"); - Assert.assertEquals("http://www.dianping.com/sh/aa", fixrelativeurl); - fixrelativeurl = UrlUtils.canonicalizeUrl("../../aa", "http://www.dianping.com/sh/ss/com"); - Assert.assertEquals("http://www.dianping.com/aa", fixrelativeurl); + absoluteUrl = UrlUtils.canonicalizeUrl("..aa", "http://www.dianping.com/sh/ss/com"); + assertThat(absoluteUrl).isEqualTo("http://www.dianping.com/sh/ss/..aa"); + + absoluteUrl = UrlUtils.canonicalizeUrl("../../aa", "http://www.dianping.com/sh/ss/com/"); + assertThat(absoluteUrl).isEqualTo("http://www.dianping.com/sh/aa"); + + absoluteUrl = UrlUtils.canonicalizeUrl("../../aa", "http://www.dianping.com/sh/ss/com"); + assertThat(absoluteUrl).isEqualTo("http://www.dianping.com/aa"); + } + + @Test + public void testFixAllRelativeHrefs() { + String originHtml = ""; + String replacedHtml = UrlUtils.fixAllRelativeHrefs(originHtml, "http://www.dianping.com/"); + 
assertThat(replacedHtml).isEqualTo(""); + + originHtml = ""; + replacedHtml = UrlUtils.fixAllRelativeHrefs(originHtml, "http://www.dianping.com/"); + assertThat(replacedHtml).isEqualTo(""); + + originHtml = ""; + replacedHtml = UrlUtils.fixAllRelativeHrefs(originHtml, "http://www.dianping.com/"); + assertThat(replacedHtml).isEqualTo(""); + + originHtml = ""; + replacedHtml = UrlUtils.fixAllRelativeHrefs(originHtml, "http://www.dianping.com/"); + assertThat(replacedHtml).isEqualTo(""); } @Test From 01848301d4098c8d884e1ccd667a774196f49f29 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Tue, 1 Apr 2014 22:14:30 +0800 Subject: [PATCH 033/130] encode illegal charactors in url #80 --- .../us/codecraft/webmagic/utils/UrlUtils.java | 18 ++++++++++++++---- .../downloader/HttpClientDownloaderTest.java | 8 ++++++++ .../codecraft/webmagic/utils/UrlUtilsTest.java | 9 +++++++-- 3 files changed, 29 insertions(+), 6 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java index 99b71e0..60eacee 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java @@ -43,12 +43,22 @@ public class UrlUtils { if (url.startsWith("?")) url = base.getPath() + url; URL abs = new URL(base, url); - return abs.toExternalForm(); + return encodeIllegalCharacterInUrl(abs.toExternalForm()); } catch (MalformedURLException e) { return ""; } } + /** + * + * @param url + * @return + */ + public static String encodeIllegalCharacterInUrl(String url) { + //TODO more charator support + return url.replace(" ", "%20"); + } + public static String getHost(String url) { String host = url; int i = StringUtils.ordinalIndexOf(url, "/", 3); @@ -101,9 +111,9 @@ public class UrlUtils { stringBuilder.append("\"").append(canonicalizeUrl(matcher.group(2), url)).append("\""); lastEnd = matcher.end(); } - if 
(!modified) { - return html; - } + if (!modified) { + return html; + } stringBuilder.append(StringUtils.substring(html, lastEnd)); return stringBuilder.toString(); } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java index ac01926..e6fe5ae 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java @@ -8,6 +8,8 @@ import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.selector.Html; +import java.io.UnsupportedEncodingException; + import static org.assertj.core.api.Assertions.assertThat; import static org.junit.Assert.assertTrue; @@ -32,6 +34,12 @@ public class HttpClientDownloaderTest { assertTrue(!html.getText().isEmpty()); } + @Test(expected = IllegalArgumentException.class) + public void testDownloaderInIllegalUrl() throws UnsupportedEncodingException { + HttpClientDownloader httpClientDownloader = new HttpClientDownloader(); + httpClientDownloader.download("http://www.oschina.net/>"); + } + @Test public void testCycleTriedTimes() { HttpClientDownloader httpClientDownloader = new HttpClientDownloader(); diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java index 1e403c4..565fde4 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java @@ -38,17 +38,22 @@ public class UrlUtilsTest { originHtml = ""; replacedHtml = UrlUtils.fixAllRelativeHrefs(originHtml, "http://www.dianping.com/"); - assertThat(replacedHtml).isEqualTo(""); + assertThat(replacedHtml).isEqualTo(""); originHtml = ""; replacedHtml = 
UrlUtils.fixAllRelativeHrefs(originHtml, "http://www.dianping.com/"); - assertThat(replacedHtml).isEqualTo(""); + assertThat(replacedHtml).isEqualTo(""); originHtml = ""; replacedHtml = UrlUtils.fixAllRelativeHrefs(originHtml, "http://www.dianping.com/"); assertThat(replacedHtml).isEqualTo(""); } + @Test + public void test(){ + UrlUtils.canonicalizeUrl("start tag", "http://www.dianping.com/"); + } + @Test public void testGetDomain(){ String url = "http://www.dianping.com/aa/"; From 05abd566a4bbb9f90037c9f9c7d6a2eb22f34b99 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Tue, 1 Apr 2014 23:08:28 +0800 Subject: [PATCH 034/130] remove submodule --- .gitmodules | 3 --- .travis.yml | 2 -- webmagic-avalon/webmagic-avalon-common/pom.xml | 14 ++++++++++++++ 3 files changed, 14 insertions(+), 5 deletions(-) delete mode 100644 .gitmodules diff --git a/.gitmodules b/.gitmodules deleted file mode 100644 index 67d52cc..0000000 --- a/.gitmodules +++ /dev/null @@ -1,3 +0,0 @@ -[submodule "webmagic-avalon/forger"] - path = webmagic-avalon/forger - url = git://github.com/code4craft/forger.git diff --git a/.travis.yml b/.travis.yml index c18b7b6..c7c99f4 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,5 +2,3 @@ language: java jdk: - oraclejdk7 - openjdk6 -before_install: - - git submodule update --init --recursive diff --git a/webmagic-avalon/webmagic-avalon-common/pom.xml b/webmagic-avalon/webmagic-avalon-common/pom.xml index e15d715..ed0bc23 100644 --- a/webmagic-avalon/webmagic-avalon-common/pom.xml +++ b/webmagic-avalon/webmagic-avalon-common/pom.xml @@ -150,4 +150,18 @@
+ + + sonatype-nexus-snapshots + Sonatype Nexus Snapshots + https://oss.sonatype.org/content/repositories/snapshots + + false + + + true + + + + From 22e86976718f0a4315212c0cde2ebae598f44029 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Tue, 1 Apr 2014 23:16:03 +0800 Subject: [PATCH 035/130] add forger to folder --- webmagic-avalon/forger | 1 - webmagic-avalon/forger/LICENSE | 202 ++++++++++++++++++ webmagic-avalon/forger/README.md | 27 +++ webmagic-avalon/forger/pom.xml | 193 +++++++++++++++++ .../main/java/us/codecraft/forger/Forger.java | 36 ++++ .../us/codecraft/forger/ForgerFactory.java | 28 +++ .../forger/compiler/ForgerCompiler.java | 9 + .../forger/compiler/GroovyForgerCompiler.java | 16 ++ .../property/AbstractPropertyLoader.java | 112 ++++++++++ .../property/AnnotationPropertyLoader.java | 32 +++ .../us/codecraft/forger/property/Inject.java | 16 ++ .../codecraft/forger/property/Property.java | 60 ++++++ .../forger/property/PropertyLoader.java | 15 ++ .../forger/property/PropertyType.java | 23 ++ .../property/SimpleFieldPropertyLoader.java | 28 +++ .../property/format/BasicTypeFormatter.java | 168 +++++++++++++++ .../forger/property/format/DateFormatter.java | 35 +++ .../forger/property/format/Formatter.java | 39 ++++ .../property/format/ObjectFormatter.java | 9 + .../format/ObjectFormatterWithParams.java | 34 +++ .../forger/property/format/TypeFormatter.java | 12 ++ .../property/format/TypeFormatterFactory.java | 53 +++++ .../forger/src/main/resources/log4j.xml | 21 ++ .../test/java/us/codecraft/forger/Bar.java | 47 ++++ .../test/java/us/codecraft/forger/Foo.java | 47 ++++ .../java/us/codecraft/forger/Fooable.java | 9 + .../codecraft/forger/ForgerFactoryTest.java | 66 ++++++ .../compiler/GroovyForgerCompilerTest.java | 19 ++ .../forger/src/test/resources/log4j.xml | 31 +++ 29 files changed, 1387 insertions(+), 1 deletion(-) delete mode 160000 webmagic-avalon/forger create mode 100644 webmagic-avalon/forger/LICENSE create mode 100644 
webmagic-avalon/forger/README.md create mode 100644 webmagic-avalon/forger/pom.xml create mode 100644 webmagic-avalon/forger/src/main/java/us/codecraft/forger/Forger.java create mode 100644 webmagic-avalon/forger/src/main/java/us/codecraft/forger/ForgerFactory.java create mode 100644 webmagic-avalon/forger/src/main/java/us/codecraft/forger/compiler/ForgerCompiler.java create mode 100644 webmagic-avalon/forger/src/main/java/us/codecraft/forger/compiler/GroovyForgerCompiler.java create mode 100644 webmagic-avalon/forger/src/main/java/us/codecraft/forger/property/AbstractPropertyLoader.java create mode 100644 webmagic-avalon/forger/src/main/java/us/codecraft/forger/property/AnnotationPropertyLoader.java create mode 100644 webmagic-avalon/forger/src/main/java/us/codecraft/forger/property/Inject.java create mode 100644 webmagic-avalon/forger/src/main/java/us/codecraft/forger/property/Property.java create mode 100644 webmagic-avalon/forger/src/main/java/us/codecraft/forger/property/PropertyLoader.java create mode 100644 webmagic-avalon/forger/src/main/java/us/codecraft/forger/property/PropertyType.java create mode 100644 webmagic-avalon/forger/src/main/java/us/codecraft/forger/property/SimpleFieldPropertyLoader.java create mode 100644 webmagic-avalon/forger/src/main/java/us/codecraft/forger/property/format/BasicTypeFormatter.java create mode 100644 webmagic-avalon/forger/src/main/java/us/codecraft/forger/property/format/DateFormatter.java create mode 100644 webmagic-avalon/forger/src/main/java/us/codecraft/forger/property/format/Formatter.java create mode 100644 webmagic-avalon/forger/src/main/java/us/codecraft/forger/property/format/ObjectFormatter.java create mode 100644 webmagic-avalon/forger/src/main/java/us/codecraft/forger/property/format/ObjectFormatterWithParams.java create mode 100644 webmagic-avalon/forger/src/main/java/us/codecraft/forger/property/format/TypeFormatter.java create mode 100644 
webmagic-avalon/forger/src/main/java/us/codecraft/forger/property/format/TypeFormatterFactory.java create mode 100644 webmagic-avalon/forger/src/main/resources/log4j.xml create mode 100644 webmagic-avalon/forger/src/test/java/us/codecraft/forger/Bar.java create mode 100644 webmagic-avalon/forger/src/test/java/us/codecraft/forger/Foo.java create mode 100644 webmagic-avalon/forger/src/test/java/us/codecraft/forger/Fooable.java create mode 100644 webmagic-avalon/forger/src/test/java/us/codecraft/forger/ForgerFactoryTest.java create mode 100644 webmagic-avalon/forger/src/test/java/us/codecraft/forger/compiler/GroovyForgerCompilerTest.java create mode 100644 webmagic-avalon/forger/src/test/resources/log4j.xml diff --git a/webmagic-avalon/forger b/webmagic-avalon/forger deleted file mode 160000 index 9f08a0f..0000000 --- a/webmagic-avalon/forger +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 9f08a0ffd09f5d59ae38091bca250d51aa54bfde diff --git a/webmagic-avalon/forger/LICENSE b/webmagic-avalon/forger/LICENSE new file mode 100644 index 0000000..e06d208 --- /dev/null +++ b/webmagic-avalon/forger/LICENSE @@ -0,0 +1,202 @@ +Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "{}" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright {yyyy} {name of copyright owner} + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + diff --git a/webmagic-avalon/forger/README.md b/webmagic-avalon/forger/README.md new file mode 100644 index 0000000..1e4d7f5 --- /dev/null +++ b/webmagic-avalon/forger/README.md @@ -0,0 +1,27 @@ +forger +====== + +Dynamic Java object generator with template class and configuration. + +## Compiler + +Use groovy compiler. Compile source code to Java class. + +## PropertyLoader + +Load properties of object from user input. + +## API + +```java + @Test + public void testForgerCreateByClassAnnotationCompile() throws Exception { + ForgerFactory forgerFactory = new ForgerFactory(new AnnotationPropertyLoader(), new GroovyForgerCompiler()); + Forger forger = forgerFactory.compile(Foo.SOURCE_CODE); + Fooable foo = forger.forge(ImmutableMap.of("fooa", "test")); + Field field = forger.getClazz().getDeclaredField("foo"); + field.setAccessible(true); + assertThat(field.get(foo)).isEqualTo("test"); + assertThat(foo.foo()).isEqualTo("test"); + } +``` \ No newline at end of file diff --git a/webmagic-avalon/forger/pom.xml b/webmagic-avalon/forger/pom.xml new file mode 100644 index 0000000..9738c10 --- /dev/null +++ b/webmagic-avalon/forger/pom.xml @@ -0,0 +1,193 @@ + + + + org.sonatype.oss + oss-parent + 7 + + us.codecraft + forger + 0.1.0-SNAPSHOT + 4.0.0 + jar + + UTF-8 + UTF-8 + + forger + + Dynamic Java object generator with template class and configuration. 
+ + https://github.com/code4craft/forger/ + + + code4craft + Yihua huang + code4crafer@gmail.com + + + + scm:git:git@github.com:code4craft/forger.git + scm:git:git@github.com:code4craft/forger.git + git@github.com:code4craft/forger.git + HEAD + + + + Apache License,Version 2 + http://www.apache.org/licenses/LICENSE-2.0 + repo + + + + + + junit + junit + 4.11 + test + + + org.assertj + assertj-core + 1.5.0 + + + org.codehaus.groovy + groovy + 2.2.2 + + + org.slf4j + slf4j-api + 1.7.6 + + + + org.slf4j + slf4j-log4j12 + 1.7.6 + + + + org.apache.commons + commons-lang3 + 3.1 + + + + com.google.guava + guava + 15.0 + + + + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.1 + + 1.6 + 1.6 + UTF-8 + + + + org.apache.maven.plugins + maven-dependency-plugin + 2.8 + + + copy-dependencies + package + + copy-dependencies + + + ${project.build.directory}/lib + false + false + true + + + + + + org.apache.maven.plugins + maven-resources-plugin + 2.6 + + UTF-8 + + + + org.apache.maven.plugins + maven-source-plugin + 2.2.1 + + + attach-sources + + jar + + + + + + org.apache.maven.plugins + maven-javadoc-plugin + 2.9.1 + + UTF-8 + + + + attach-javadocs + + jar + + + + + + org.apache.maven.plugins + maven-release-plugin + 2.4.1 + + + + + + + release-sign-artifacts + + + performRelease + true + + + + + + org.apache.maven.plugins + maven-gpg-plugin + 1.1 + + + sign-artifacts + verify + + sign + + + + + + + + + + + diff --git a/webmagic-avalon/forger/src/main/java/us/codecraft/forger/Forger.java b/webmagic-avalon/forger/src/main/java/us/codecraft/forger/Forger.java new file mode 100644 index 0000000..57ec2ab --- /dev/null +++ b/webmagic-avalon/forger/src/main/java/us/codecraft/forger/Forger.java @@ -0,0 +1,36 @@ +package us.codecraft.forger; + +import us.codecraft.forger.property.Property; +import us.codecraft.forger.property.PropertyLoader; + +import java.util.List; +import java.util.Map; + +/** + * @author code4crafter@gmail.com + */ +public class Forger { + + private final 
Class clazz; + + private final PropertyLoader propertyLoader; + + public Forger(Class clazz,PropertyLoader propertyLoader) { + this.clazz = clazz; + this.propertyLoader = propertyLoader; + } + + public T forge(Map properties) throws IllegalAccessException, InstantiationException { + T t = clazz.newInstance(); + propertyLoader.load(t, properties); + return t; + } + + public List getPropertyNames() { + return propertyLoader.getProperties(clazz); + } + + public Class getClazz() { + return clazz; + } +} diff --git a/webmagic-avalon/forger/src/main/java/us/codecraft/forger/ForgerFactory.java b/webmagic-avalon/forger/src/main/java/us/codecraft/forger/ForgerFactory.java new file mode 100644 index 0000000..84b507b --- /dev/null +++ b/webmagic-avalon/forger/src/main/java/us/codecraft/forger/ForgerFactory.java @@ -0,0 +1,28 @@ +package us.codecraft.forger; + +import us.codecraft.forger.compiler.ForgerCompiler; +import us.codecraft.forger.property.PropertyLoader; + +/** + * @author code4crafter@gmail.com + */ +public class ForgerFactory { + + private final PropertyLoader propertyLoader; + + private final ForgerCompiler forgerCompiler; + + public ForgerFactory(PropertyLoader propertyLoader, ForgerCompiler forgerCompiler) { + this.propertyLoader = propertyLoader; + this.forgerCompiler = forgerCompiler; + } + + public Forger compile(String sourceCode) { + Class clazz = forgerCompiler.compile(sourceCode); + return new Forger(clazz, propertyLoader); + } + + public Forger create(Class clazz) { + return new Forger(clazz, propertyLoader); + } +} diff --git a/webmagic-avalon/forger/src/main/java/us/codecraft/forger/compiler/ForgerCompiler.java b/webmagic-avalon/forger/src/main/java/us/codecraft/forger/compiler/ForgerCompiler.java new file mode 100644 index 0000000..5e9e378 --- /dev/null +++ b/webmagic-avalon/forger/src/main/java/us/codecraft/forger/compiler/ForgerCompiler.java @@ -0,0 +1,9 @@ +package us.codecraft.forger.compiler; + +/** + * @author code4crafter@gmail.com + */ +public 
interface ForgerCompiler { + + public Class compile(String sourceCode); +} diff --git a/webmagic-avalon/forger/src/main/java/us/codecraft/forger/compiler/GroovyForgerCompiler.java b/webmagic-avalon/forger/src/main/java/us/codecraft/forger/compiler/GroovyForgerCompiler.java new file mode 100644 index 0000000..26a137e --- /dev/null +++ b/webmagic-avalon/forger/src/main/java/us/codecraft/forger/compiler/GroovyForgerCompiler.java @@ -0,0 +1,16 @@ +package us.codecraft.forger.compiler; + +import groovy.lang.GroovyClassLoader; + +/** + * @author code4crafter@gmail.com + */ +public class GroovyForgerCompiler implements ForgerCompiler{ + + private GroovyClassLoader groovyClassLoader = new GroovyClassLoader(); + + @Override + public Class compile(String sourceCode) { + return groovyClassLoader.parseClass(sourceCode); + } +} diff --git a/webmagic-avalon/forger/src/main/java/us/codecraft/forger/property/AbstractPropertyLoader.java b/webmagic-avalon/forger/src/main/java/us/codecraft/forger/property/AbstractPropertyLoader.java new file mode 100644 index 0000000..f0b638d --- /dev/null +++ b/webmagic-avalon/forger/src/main/java/us/codecraft/forger/property/AbstractPropertyLoader.java @@ -0,0 +1,112 @@ +package us.codecraft.forger.property; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import us.codecraft.forger.property.format.*; + +import java.lang.reflect.Field; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +/** + * @author code4crafter@gmail.com + */ +public abstract class AbstractPropertyLoader implements PropertyLoader { + + private TypeFormatterFactory typeFormatterFactory = new TypeFormatterFactory(); + + protected Logger logger = LoggerFactory.getLogger(getClass()); + + protected TypeFormatterFactory getTypeFormatterFactory() { + return typeFormatterFactory; + } + + @Override + public T load(T object, Map propertyConfigs) { + List properties = getProperties(object.getClass()); + for (Property 
property : properties) { + Object value = propertyConfigs.get(property.getName()); + if (value == null) { + throw new IllegalArgumentException("Config for property " + property.getName() + " is missing!"); + } + ObjectFormatter objectFormatter = property.getObjectFormatter(); + switch (property.getType()) { + case PropertyString: + Object fieldValue = objectFormatter.format(String.valueOf(value)); + try { + property.getField().set(object, fieldValue); + } catch (IllegalAccessException e) { + logger.warn("Set field " + property.getField() + " error!", e); + } + break; + case PropertyList: + if (!List.class.isAssignableFrom(value.getClass())) { + throw new IllegalArgumentException("Config for property " + property.getName() + " should be subclass of List!"); + } + List listField = new ArrayList(); + List listConfigs = (List) value; + for (String listConfig : listConfigs) { + listField.add(objectFormatter.format(listConfig)); + } + try { + property.getField().set(object, listField); + } catch (IllegalAccessException e) { + logger.warn("Set field " + property.getField() + " error!", e); + } + break; + case PropertyMap: + if (!Map.class.isAssignableFrom(value.getClass())) { + throw new IllegalArgumentException("Config for property " + property.getName() + " should be subclass of List!"); + } + Map mapField = new HashMap(); + Map mapConfigs = (Map) value; + for (Map.Entry entry : mapConfigs.entrySet()) { + mapField.put(entry.getKey(), objectFormatter.format(entry.getValue())); + } + try { + property.getField().set(object, mapField); + } catch (IllegalAccessException e) { + logger.warn("Set field " + property.getField() + " error!", e); + } + break; + } + } + return object; + } + + protected ObjectFormatter prepareTypeFormatterParam(TypeFormatter objectFormatter, String[] params) { + if (params == null) { + return objectFormatter; + } + return new ObjectFormatterWithParams().setTypeFormatter(objectFormatter).setParams(params); + } + + protected ObjectFormatter 
getObjectFormatter(Field field) { + Class type = field.getType(); + if (List.class.isAssignableFrom(type) || Map.class.isAssignableFrom(type)) { + type = String.class; + } + if (field.isAnnotationPresent(Formatter.class)) { + Formatter formatter = field.getAnnotation(Formatter.class); + if (!formatter.formatter().equals(TypeFormatter.class)) { + TypeFormatter typeFormatter = typeFormatterFactory.getByFormatterClass(formatter.formatter()); + if (typeFormatter != null) { + return prepareTypeFormatterParam(typeFormatter,formatter.value()); + } + typeFormatterFactory.put(formatter.formatter()); + return prepareTypeFormatterParam(typeFormatterFactory.getByFormatterClass(formatter.formatter()), formatter.value()); + } else if (!formatter.subClazz().equals(String.class)) { + type = formatter.subClazz(); + TypeFormatter typeFormatter = typeFormatterFactory.get(type); + if (typeFormatter == null) { + throw new IllegalArgumentException("No typeFormatter for class " + type); + } + return prepareTypeFormatterParam(typeFormatter, formatter.value()); + } + } + return getTypeFormatterFactory().get(BasicTypeFormatter.detectBasicClass(type)); + } + +} diff --git a/webmagic-avalon/forger/src/main/java/us/codecraft/forger/property/AnnotationPropertyLoader.java b/webmagic-avalon/forger/src/main/java/us/codecraft/forger/property/AnnotationPropertyLoader.java new file mode 100644 index 0000000..ea630b9 --- /dev/null +++ b/webmagic-avalon/forger/src/main/java/us/codecraft/forger/property/AnnotationPropertyLoader.java @@ -0,0 +1,32 @@ +package us.codecraft.forger.property; + +import java.lang.reflect.Field; +import java.util.ArrayList; +import java.util.List; + +/** + * @author code4crafter@gmail.com + */ +public class AnnotationPropertyLoader extends AbstractPropertyLoader { + + @Override + public List getProperties(Class clazz) { + Field[] fields = clazz.getDeclaredFields(); + List properties = new ArrayList(fields.length); + for (Field field : fields) { + Inject inject = 
field.getAnnotation(Inject.class); + if (inject != null) { + if (!field.isAccessible()) { + field.setAccessible(true); + } + Property property = Property.fromField(field); + if (inject.value().length() > 0) { + property.setName(inject.value()); + } + property.setObjectFormatter(getObjectFormatter(field)); + properties.add(property); + } + } + return properties; + } +} diff --git a/webmagic-avalon/forger/src/main/java/us/codecraft/forger/property/Inject.java b/webmagic-avalon/forger/src/main/java/us/codecraft/forger/property/Inject.java new file mode 100644 index 0000000..262e45a --- /dev/null +++ b/webmagic-avalon/forger/src/main/java/us/codecraft/forger/property/Inject.java @@ -0,0 +1,16 @@ +package us.codecraft.forger.property; + +import java.lang.annotation.ElementType; +import java.lang.annotation.Retention; +import java.lang.annotation.Target; + +/** + * @author code4crafter@gmail.com + */ +@Retention(java.lang.annotation.RetentionPolicy.RUNTIME) +@Target({ElementType.FIELD}) +public @interface Inject { + + String value() default ""; + +} diff --git a/webmagic-avalon/forger/src/main/java/us/codecraft/forger/property/Property.java b/webmagic-avalon/forger/src/main/java/us/codecraft/forger/property/Property.java new file mode 100644 index 0000000..66b196c --- /dev/null +++ b/webmagic-avalon/forger/src/main/java/us/codecraft/forger/property/Property.java @@ -0,0 +1,60 @@ +package us.codecraft.forger.property; + +import us.codecraft.forger.property.format.ObjectFormatter; + +import java.lang.reflect.Field; + +/** + * @author code4crafter@gmail.com + */ +public class Property { + + private String name; + + private PropertyType type; + + private Field field; + + private ObjectFormatter objectFormatter; + + public ObjectFormatter getObjectFormatter() { + return objectFormatter; + } + + public Property setObjectFormatter(ObjectFormatter objectFormatter) { + this.objectFormatter = objectFormatter; + return this; + } + + public String getName() { + return name; + } + + 
public Property setName(String name) { + this.name = name; + return this; + } + + public PropertyType getType() { + return type; + } + + public Property setType(PropertyType type) { + this.type = type; + return this; + } + + public Field getField() { + return field; + } + + public Property setField(Field field) { + this.field = field; + return this; + } + + public static Property fromField(Field field) { + return new Property().setName(field.getName()).setType(PropertyType.from(field.getType())).setField(field); + } + +} diff --git a/webmagic-avalon/forger/src/main/java/us/codecraft/forger/property/PropertyLoader.java b/webmagic-avalon/forger/src/main/java/us/codecraft/forger/property/PropertyLoader.java new file mode 100644 index 0000000..226407a --- /dev/null +++ b/webmagic-avalon/forger/src/main/java/us/codecraft/forger/property/PropertyLoader.java @@ -0,0 +1,15 @@ +package us.codecraft.forger.property; + +import java.util.List; +import java.util.Map; + +/** + * @author code4crafter@gmail.com + */ +public interface PropertyLoader { + + public T load(T object, Map propertyConfigs); + + public List getProperties(Class clazz); + +} diff --git a/webmagic-avalon/forger/src/main/java/us/codecraft/forger/property/PropertyType.java b/webmagic-avalon/forger/src/main/java/us/codecraft/forger/property/PropertyType.java new file mode 100644 index 0000000..aa0df51 --- /dev/null +++ b/webmagic-avalon/forger/src/main/java/us/codecraft/forger/property/PropertyType.java @@ -0,0 +1,23 @@ +package us.codecraft.forger.property; + +import java.util.List; +import java.util.Map; + +/** + * @author code4crafter@gmail.com + */ +public enum PropertyType { + + PropertyString,PropertyMap,PropertyList; + + public static PropertyType from(Class clazz){ + if (Map.class.isAssignableFrom(clazz)){ + return PropertyMap; + } + if (List.class.isAssignableFrom(clazz)){ + return PropertyList; + } + return PropertyString; + } + +} diff --git 
a/webmagic-avalon/forger/src/main/java/us/codecraft/forger/property/SimpleFieldPropertyLoader.java b/webmagic-avalon/forger/src/main/java/us/codecraft/forger/property/SimpleFieldPropertyLoader.java new file mode 100644 index 0000000..13ff68a --- /dev/null +++ b/webmagic-avalon/forger/src/main/java/us/codecraft/forger/property/SimpleFieldPropertyLoader.java @@ -0,0 +1,28 @@ +package us.codecraft.forger.property; + +import java.lang.reflect.Field; +import java.lang.reflect.Modifier; +import java.util.ArrayList; +import java.util.List; + +/** + * @author code4crafter@gmail.com + */ +public class SimpleFieldPropertyLoader extends AbstractPropertyLoader { + + @Override + public List getProperties(Class clazz) { + Field[] fields = clazz.getDeclaredFields(); + List properties = new ArrayList(fields.length); + for (Field field : fields) { + if (Modifier.isStatic(field.getModifiers())){ + continue; + } + if (!field.isAccessible()){ + field.setAccessible(true); + } + properties.add(Property.fromField(field).setObjectFormatter(getObjectFormatter(field))); + } + return properties; + } +} diff --git a/webmagic-avalon/forger/src/main/java/us/codecraft/forger/property/format/BasicTypeFormatter.java b/webmagic-avalon/forger/src/main/java/us/codecraft/forger/property/format/BasicTypeFormatter.java new file mode 100644 index 0000000..a6d0e5f --- /dev/null +++ b/webmagic-avalon/forger/src/main/java/us/codecraft/forger/property/format/BasicTypeFormatter.java @@ -0,0 +1,168 @@ +package us.codecraft.forger.property.format; + +import java.util.Arrays; +import java.util.List; + +/** + * @author code4crafter@gmail.com + * @since 0.3.2 + */ +public abstract class BasicTypeFormatter implements TypeFormatter { + + @Override + public T format(String text) { + if (text == null) { + return null; + } + text = text.trim(); + return formatTrimmed(text); + } + + @Override + public T format(String text, String[] params) { + return format(text); + } + + protected abstract T formatTrimmed(String raw); 
+ + public static final List> basicTypeFormatters = Arrays.>asList(IntegerFormatter.class, + LongFormatter.class, DoubleFormatter.class, FloatFormatter.class, ShortFormatter.class, + CharactorFormatter.class, ByteFormatter.class, BooleanFormatter.class, DateFormatter.class, StringFormatter.class); + + public static Class detectBasicClass(Class type) { + if (type.equals(Integer.TYPE) || type.equals(Integer.class)) { + return Integer.class; + } else if (type.equals(Long.TYPE) || type.equals(Long.class)) { + return Long.class; + } else if (type.equals(Double.TYPE) || type.equals(Double.class)) { + return Double.class; + } else if (type.equals(Float.TYPE) || type.equals(Float.class)) { + return Float.class; + } else if (type.equals(Short.TYPE) || type.equals(Short.class)) { + return Short.class; + } else if (type.equals(Character.TYPE) || type.equals(Character.class)) { + return Character.class; + } else if (type.equals(Byte.TYPE) || type.equals(Byte.class)) { + return Byte.class; + } else if (type.equals(Boolean.TYPE) || type.equals(Boolean.class)) { + return Boolean.class; + } + return type; + } + + public static class IntegerFormatter extends BasicTypeFormatter { + @Override + public Integer formatTrimmed(String raw) { + return Integer.parseInt(raw); + } + + @Override + public Class clazz() { + return Integer.class; + } + } + + public static class LongFormatter extends BasicTypeFormatter { + @Override + public Long formatTrimmed(String raw) { + return Long.parseLong(raw); + } + + @Override + public Class clazz() { + return Long.class; + } + } + + public static class DoubleFormatter extends BasicTypeFormatter { + @Override + public Double formatTrimmed(String raw) { + return Double.parseDouble(raw); + } + + @Override + public Class clazz() { + return Double.class; + } + } + + public static class FloatFormatter extends BasicTypeFormatter { + @Override + public Float formatTrimmed(String raw) { + return Float.parseFloat(raw); + } + + @Override + public Class clazz() { 
+ return Float.class; + } + } + + public static class ShortFormatter extends BasicTypeFormatter { + @Override + public Short formatTrimmed(String raw) { + return Short.parseShort(raw); + } + + @Override + public Class clazz() { + return Short.class; + } + } + + public static class CharactorFormatter extends BasicTypeFormatter { + @Override + public Character formatTrimmed(String raw) { + return raw.charAt(0); + } + + @Override + public Class clazz() { + return Character.class; + } + } + + public static class ByteFormatter extends BasicTypeFormatter { + @Override + public Byte formatTrimmed(String raw) { + return Byte.parseByte(raw, 10); + } + + @Override + public Class clazz() { + return Byte.class; + } + } + + public static class BooleanFormatter extends BasicTypeFormatter { + @Override + public Boolean formatTrimmed(String raw) { + return Boolean.parseBoolean(raw); + } + + @Override + public Class clazz() { + return Boolean.class; + } + } + + public static class StringFormatter implements TypeFormatter { + + @Override + public String format(String text) { + return text; + } + + @Override + public String format(String text, String[] params) { + return format(text); + } + + @Override + public Class clazz() { + return String.class; + } + } + + +} diff --git a/webmagic-avalon/forger/src/main/java/us/codecraft/forger/property/format/DateFormatter.java b/webmagic-avalon/forger/src/main/java/us/codecraft/forger/property/format/DateFormatter.java new file mode 100644 index 0000000..f9bdd9f --- /dev/null +++ b/webmagic-avalon/forger/src/main/java/us/codecraft/forger/property/format/DateFormatter.java @@ -0,0 +1,35 @@ +package us.codecraft.forger.property.format; + +import org.apache.commons.lang3.time.DateUtils; + +import java.text.ParseException; +import java.util.Date; + +/** + * @author code4crafter@gmail.com + * @since 0.3.2 + */ +public class DateFormatter implements TypeFormatter { + + public static final String[] DEFAULT_PATTERN = new String[]{"yyyy-MM-dd HH:mm"}; 
+ + @Override + public Date format(String text) { + return format(text,DEFAULT_PATTERN); + } + + @Override + public Date format(String text, String[] params) { + try { + return DateUtils.parseDate(text, params); + } catch (ParseException e) { + throw new IllegalArgumentException(e); + } + } + + @Override + public Class clazz() { + return Date.class; + } + +} diff --git a/webmagic-avalon/forger/src/main/java/us/codecraft/forger/property/format/Formatter.java b/webmagic-avalon/forger/src/main/java/us/codecraft/forger/property/format/Formatter.java new file mode 100644 index 0000000..45b84b1 --- /dev/null +++ b/webmagic-avalon/forger/src/main/java/us/codecraft/forger/property/format/Formatter.java @@ -0,0 +1,39 @@ +package us.codecraft.forger.property.format; + +import java.lang.annotation.ElementType; +import java.lang.annotation.Retention; +import java.lang.annotation.Target; + +/** + * Define how the result string is convert to an object for field. + * + * @author code4crafter@gmail.com
+ * @since 0.3.2 + */ +@Retention(java.lang.annotation.RetentionPolicy.RUNTIME) +@Target({ElementType.FIELD}) +public @interface Formatter { + + /** + * Set formatter params. + * + * @return formatter params + */ + String[] value(); + + /** + * Specify the class of the field, or the class of the elements in the collection for the field.
+ * It is not necessary to be set because we can detect the class by class of field, + * unless you use a collection as a field.
+ * + * @return the class of field + */ + Class subClazz() default String.class; + + /** + * If there are more than one formatter for a class, just specify the implement. + * @return implement + */ + Class formatter() default TypeFormatter.class; + +} diff --git a/webmagic-avalon/forger/src/main/java/us/codecraft/forger/property/format/ObjectFormatter.java b/webmagic-avalon/forger/src/main/java/us/codecraft/forger/property/format/ObjectFormatter.java new file mode 100644 index 0000000..a5a8134 --- /dev/null +++ b/webmagic-avalon/forger/src/main/java/us/codecraft/forger/property/format/ObjectFormatter.java @@ -0,0 +1,9 @@ +package us.codecraft.forger.property.format; + +/** + * @author code4crafter@gmail.com + */ +public interface ObjectFormatter { + + T format(String text); +} diff --git a/webmagic-avalon/forger/src/main/java/us/codecraft/forger/property/format/ObjectFormatterWithParams.java b/webmagic-avalon/forger/src/main/java/us/codecraft/forger/property/format/ObjectFormatterWithParams.java new file mode 100644 index 0000000..051cc5d --- /dev/null +++ b/webmagic-avalon/forger/src/main/java/us/codecraft/forger/property/format/ObjectFormatterWithParams.java @@ -0,0 +1,34 @@ +package us.codecraft.forger.property.format; + +/** + * @author code4crafter@gmail.com + */ +public class ObjectFormatterWithParams implements ObjectFormatter { + + private TypeFormatter typeFormatter; + + private String[] params; + + public TypeFormatter getTypeFormatter() { + return typeFormatter; + } + + public ObjectFormatterWithParams setTypeFormatter(TypeFormatter typeFormatter) { + this.typeFormatter = typeFormatter; + return this; + } + + public String[] getParams() { + return params; + } + + public ObjectFormatterWithParams setParams(String[] params) { + this.params = params; + return this; + } + + @Override + public T format(String text) { + return typeFormatter.format(text, params); + } +} diff --git 
a/webmagic-avalon/forger/src/main/java/us/codecraft/forger/property/format/TypeFormatter.java b/webmagic-avalon/forger/src/main/java/us/codecraft/forger/property/format/TypeFormatter.java new file mode 100644 index 0000000..e6e436d --- /dev/null +++ b/webmagic-avalon/forger/src/main/java/us/codecraft/forger/property/format/TypeFormatter.java @@ -0,0 +1,12 @@ +package us.codecraft.forger.property.format; + +/** + * @author code4crafter@gmail.com + */ +public interface TypeFormatter extends ObjectFormatter { + + T format(String text, String[] params); + + Class clazz(); + +} diff --git a/webmagic-avalon/forger/src/main/java/us/codecraft/forger/property/format/TypeFormatterFactory.java b/webmagic-avalon/forger/src/main/java/us/codecraft/forger/property/format/TypeFormatterFactory.java new file mode 100644 index 0000000..027d8fe --- /dev/null +++ b/webmagic-avalon/forger/src/main/java/us/codecraft/forger/property/format/TypeFormatterFactory.java @@ -0,0 +1,53 @@ +package us.codecraft.forger.property.format; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; + +/** + * @author code4crafter@gmail.com + * @since 0.3.2 + */ +public class TypeFormatterFactory { + + private Logger logger = LoggerFactory.getLogger(getClass()); + + private Map objectFormatterMapWithPropertyAsKey = new ConcurrentHashMap(); + + private Map objectFormatterMapWithClassAsKey = new ConcurrentHashMap(); + + public TypeFormatterFactory() { + initFormatterMap(); + } + + private void initFormatterMap() { + for (Class basicTypeFormatter : BasicTypeFormatter.basicTypeFormatters) { + put(basicTypeFormatter); + } + put(DateFormatter.class); + } + + public synchronized void put(Class objectFormatterClazz) { + try { + TypeFormatter typeFormatter = objectFormatterClazz.newInstance(); + if (typeFormatter.clazz() != null) { + objectFormatterMapWithPropertyAsKey.put(typeFormatter.clazz(), typeFormatter); + } + 
objectFormatterMapWithClassAsKey.put(objectFormatterClazz, typeFormatter); + } catch (InstantiationException e) { + logger.error("Init objectFormatter error", e); + } catch (IllegalAccessException e) { + logger.error("Init objectFormatter error", e); + } + } + + public TypeFormatter get(Class clazz) { + return objectFormatterMapWithPropertyAsKey.get(clazz); + } + + public TypeFormatter getByFormatterClass(Class clazz) { + return objectFormatterMapWithClassAsKey.get(clazz); + } +} diff --git a/webmagic-avalon/forger/src/main/resources/log4j.xml b/webmagic-avalon/forger/src/main/resources/log4j.xml new file mode 100644 index 0000000..c2b5a2f --- /dev/null +++ b/webmagic-avalon/forger/src/main/resources/log4j.xml @@ -0,0 +1,21 @@ + + + + + + + + + + + + + + + + + + + + + diff --git a/webmagic-avalon/forger/src/test/java/us/codecraft/forger/Bar.java b/webmagic-avalon/forger/src/test/java/us/codecraft/forger/Bar.java new file mode 100644 index 0000000..3b51a5c --- /dev/null +++ b/webmagic-avalon/forger/src/test/java/us/codecraft/forger/Bar.java @@ -0,0 +1,47 @@ +package us.codecraft.forger; + +import us.codecraft.forger.property.Inject; +import us.codecraft.forger.property.format.Formatter; + +import java.util.List; +import java.util.Map; + +/** + * @author code4crafter@gmail.com + */ +public class Bar { + + @Inject("bar") + private String bar; + + @Inject + private List values; + + @Formatter(value = "", subClazz = Integer.class) + @Inject + private Map idMap; + + public String getBar() { + return bar; + } + + public void setBar(String bar) { + this.bar = bar; + } + + public List getValues() { + return values; + } + + public void setValues(List values) { + this.values = values; + } + + public Map getIdMap() { + return idMap; + } + + public void setIdMap(Map idMap) { + this.idMap = idMap; + } +} diff --git a/webmagic-avalon/forger/src/test/java/us/codecraft/forger/Foo.java b/webmagic-avalon/forger/src/test/java/us/codecraft/forger/Foo.java new file mode 100644 index 
0000000..daa2e15 --- /dev/null +++ b/webmagic-avalon/forger/src/test/java/us/codecraft/forger/Foo.java @@ -0,0 +1,47 @@ +package us.codecraft.forger; + +import us.codecraft.forger.property.Inject; +import us.codecraft.forger.property.format.Formatter; + +/** + * @author code4crafter@gmail.com + */ +public class Foo implements Fooable{ + + @Formatter("") + @Inject("fooa") + private String foo; + + public static final String SOURCE_CODE="import us.codecraft.forger.*;\n" + + "import us.codecraft.forger.property.Inject;\n" + + "import us.codecraft.forger.property.Inject;\n" + + "import us.codecraft.forger.property.format.Formatter;\n" + + "\n" + + "/**\n" + + " * @author code4crafter@gmail.com\n" + + " */\n" + + "public class Foo implements Fooable{\n" + + "\n" + + " @Formatter(\"\")\n" + + " @Inject(\"fooa\")\n" + + " private String foo;\n" + + "\n" + + " public String getFoo() {\n" + + " return foo;\n" + + " }\n" + + "\n" + + " @Override\n" + + " public String foo() {\n" + + " return foo;\n" + + " }\n" + + "}"; + + public String getFoo() { + return foo; + } + + @Override + public String foo() { + return foo; + } +} diff --git a/webmagic-avalon/forger/src/test/java/us/codecraft/forger/Fooable.java b/webmagic-avalon/forger/src/test/java/us/codecraft/forger/Fooable.java new file mode 100644 index 0000000..86c1d02 --- /dev/null +++ b/webmagic-avalon/forger/src/test/java/us/codecraft/forger/Fooable.java @@ -0,0 +1,9 @@ +package us.codecraft.forger; + +/** + * @author code4crafter@gmail.com + */ +public interface Fooable { + + public String foo(); +} diff --git a/webmagic-avalon/forger/src/test/java/us/codecraft/forger/ForgerFactoryTest.java b/webmagic-avalon/forger/src/test/java/us/codecraft/forger/ForgerFactoryTest.java new file mode 100644 index 0000000..50f248a --- /dev/null +++ b/webmagic-avalon/forger/src/test/java/us/codecraft/forger/ForgerFactoryTest.java @@ -0,0 +1,66 @@ +package us.codecraft.forger; + +import com.google.common.collect.ImmutableMap; +import 
org.junit.Test; +import us.codecraft.forger.compiler.GroovyForgerCompiler; +import us.codecraft.forger.property.AnnotationPropertyLoader; +import us.codecraft.forger.property.SimpleFieldPropertyLoader; + +import java.lang.reflect.Field; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import static org.assertj.core.api.Assertions.*; + +/** + * @author code4crafter@gmail.com + */ +public class ForgerFactoryTest { + + @Test + public void testForgerCreateByClassProperty() throws Exception { + ForgerFactory forgerFactory = new ForgerFactory(new SimpleFieldPropertyLoader(), null); + Forger forger = forgerFactory.create(Foo.class); + Foo foo = forger.forge(ImmutableMap.of("foo", "test")); + assertThat(foo.getFoo()).isEqualTo("test"); + } + + @Test + public void testForgerCreateByClassAnnotation() throws Exception { + ForgerFactory forgerFactory = new ForgerFactory(new AnnotationPropertyLoader(), null); + Forger forger = forgerFactory.create(Foo.class); + Foo foo = forger.forge(ImmutableMap.of("fooa", "test")); + assertThat(foo.getFoo()).isEqualTo("test"); + } + + @Test + public void testForgerCreateByClassAnnotationCompile() throws Exception { + ForgerFactory forgerFactory = new ForgerFactory(new AnnotationPropertyLoader(), new GroovyForgerCompiler()); + Forger forger = forgerFactory.compile(Foo.SOURCE_CODE); + Fooable foo = forger.forge(ImmutableMap.of("fooa", "test")); + Field field = forger.getClazz().getDeclaredField("foo"); + field.setAccessible(true); + assertThat(field.get(foo)).isEqualTo("test"); + assertThat(foo.foo()).isEqualTo("test"); + } + + @Test + public void testForgerCreateByClassAnnotationWithCollections() throws Exception { + ForgerFactory forgerFactory = new ForgerFactory(new AnnotationPropertyLoader(), null); + Forger forger = forgerFactory.create(Bar.class); + Map map = new HashMap(); + map.put("bar", "bar"); + Map submap = new HashMap(); + submap.put("1", "1"); + submap.put("2", "2"); + 
map.put("idMap", submap); + List sublist = new ArrayList(); + sublist.add("test"); + map.put("values", sublist); + Bar forge = forger.forge(map); + assertThat(forge.getValues().size() > 0); + assertThat(forge.getIdMap().get("1")).isEqualTo(1); + } +} diff --git a/webmagic-avalon/forger/src/test/java/us/codecraft/forger/compiler/GroovyForgerCompilerTest.java b/webmagic-avalon/forger/src/test/java/us/codecraft/forger/compiler/GroovyForgerCompilerTest.java new file mode 100644 index 0000000..244c25f --- /dev/null +++ b/webmagic-avalon/forger/src/test/java/us/codecraft/forger/compiler/GroovyForgerCompilerTest.java @@ -0,0 +1,19 @@ +package us.codecraft.forger.compiler; + +import org.junit.Test; +import us.codecraft.forger.Foo; + +import static org.assertj.core.api.Assertions.assertThat; + +/** + * @author code4crafter@gmail.com + */ +public class GroovyForgerCompilerTest { + + @Test + public void testGroovyClassLoader() throws Exception { + GroovyForgerCompiler groovyForgerCompiler = new GroovyForgerCompiler(); + Class compiledClass = groovyForgerCompiler.compile(Foo.SOURCE_CODE); + assertThat(compiledClass.getName()).isEqualTo("Foo"); + } +} diff --git a/webmagic-avalon/forger/src/test/resources/log4j.xml b/webmagic-avalon/forger/src/test/resources/log4j.xml new file mode 100644 index 0000000..9084694 --- /dev/null +++ b/webmagic-avalon/forger/src/test/resources/log4j.xml @@ -0,0 +1,31 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + From 94f97da4dc96f19ad3b263e964f2961c59903b02 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 2 Apr 2014 07:36:31 +0800 Subject: [PATCH 036/130] [Avalon] fix spring config for static and ignore google fonts for better loading speed --- webmagic-avalon/webmagic-admin/src/main/webapp/WEB-INF/web.xml | 2 +- .../src/main/webapp/static/css/bootstrap-cerulean.css | 2 +- .../src/main/webapp/static/css/bootstrap-cyborg.css | 2 +- .../src/main/webapp/static/css/bootstrap-journal.css | 2 +- 
.../src/main/webapp/static/css/bootstrap-redy.css | 2 +- .../src/main/webapp/static/css/bootstrap-united.css | 2 +- .../webmagic-admin/src/main/webapp/static/css/charisma-app.css | 2 +- .../main/resources/config/spring/applicationContext-webmvc.xml | 2 ++ webmagic-avalon/webmagic-worker/src/main/webapp/WEB-INF/web.xml | 2 +- 9 files changed, 10 insertions(+), 8 deletions(-) diff --git a/webmagic-avalon/webmagic-admin/src/main/webapp/WEB-INF/web.xml b/webmagic-avalon/webmagic-admin/src/main/webapp/WEB-INF/web.xml index 4c255cd..cd7ee5b 100644 --- a/webmagic-avalon/webmagic-admin/src/main/webapp/WEB-INF/web.xml +++ b/webmagic-avalon/webmagic-admin/src/main/webapp/WEB-INF/web.xml @@ -33,7 +33,7 @@ org.springframework.web.servlet.DispatcherServlet contextConfigLocation - classpath:/spring/applicationContext*.xml + classpath*:config/spring/applicationContext*.xml 1 diff --git a/webmagic-avalon/webmagic-admin/src/main/webapp/static/css/bootstrap-cerulean.css b/webmagic-avalon/webmagic-admin/src/main/webapp/static/css/bootstrap-cerulean.css index 82037d4..3d95708 100755 --- a/webmagic-avalon/webmagic-admin/src/main/webapp/static/css/bootstrap-cerulean.css +++ b/webmagic-avalon/webmagic-admin/src/main/webapp/static/css/bootstrap-cerulean.css @@ -1,4 +1,4 @@ -@import url(https://fonts.googleapis.com/css?family=Karla|Ubuntu); +/*@import url(https://fonts.googleapis.com/css?family=Karla|Ubuntu);*/ /*! * Bootstrap v2.0.4 * diff --git a/webmagic-avalon/webmagic-admin/src/main/webapp/static/css/bootstrap-cyborg.css b/webmagic-avalon/webmagic-admin/src/main/webapp/static/css/bootstrap-cyborg.css index 6f4b9c4..39ec617 100755 --- a/webmagic-avalon/webmagic-admin/src/main/webapp/static/css/bootstrap-cyborg.css +++ b/webmagic-avalon/webmagic-admin/src/main/webapp/static/css/bootstrap-cyborg.css @@ -1,4 +1,4 @@ -@import url('https://fonts.googleapis.com/css?family=Droid+Sans:400,700'); +/*@import url('https://fonts.googleapis.com/css?family=Droid+Sans:400,700');*/ /*! 
* Bootstrap v2.0.4 * diff --git a/webmagic-avalon/webmagic-admin/src/main/webapp/static/css/bootstrap-journal.css b/webmagic-avalon/webmagic-admin/src/main/webapp/static/css/bootstrap-journal.css index 9c18433..e335d98 100755 --- a/webmagic-avalon/webmagic-admin/src/main/webapp/static/css/bootstrap-journal.css +++ b/webmagic-avalon/webmagic-admin/src/main/webapp/static/css/bootstrap-journal.css @@ -1,4 +1,4 @@ -@import url('https://fonts.googleapis.com/css?family=Open+Sans:400,700'); +/*@import url('https://fonts.googleapis.com/css?family=Open+Sans:400,700');*/ /*! * Bootstrap v2.0.4 * diff --git a/webmagic-avalon/webmagic-admin/src/main/webapp/static/css/bootstrap-redy.css b/webmagic-avalon/webmagic-admin/src/main/webapp/static/css/bootstrap-redy.css index f498982..2e208b2 100644 --- a/webmagic-avalon/webmagic-admin/src/main/webapp/static/css/bootstrap-redy.css +++ b/webmagic-avalon/webmagic-admin/src/main/webapp/static/css/bootstrap-redy.css @@ -1,4 +1,4 @@ -@import url(https://fonts.googleapis.com/css?family=Karla|Ubuntu); +/*@import url(https://fonts.googleapis.com/css?family=Karla|Ubuntu);*/ /*! * Bootstrap v2.0.4 * diff --git a/webmagic-avalon/webmagic-admin/src/main/webapp/static/css/bootstrap-united.css b/webmagic-avalon/webmagic-admin/src/main/webapp/static/css/bootstrap-united.css index b05b04e..94e4c79 100755 --- a/webmagic-avalon/webmagic-admin/src/main/webapp/static/css/bootstrap-united.css +++ b/webmagic-avalon/webmagic-admin/src/main/webapp/static/css/bootstrap-united.css @@ -1,4 +1,4 @@ -@import url(https://fonts.googleapis.com/css?family=Ubuntu); +/*@import url(https://fonts.googleapis.com/css?family=Ubuntu);*/ /*! 
* Bootstrap v2.0.4 * diff --git a/webmagic-avalon/webmagic-admin/src/main/webapp/static/css/charisma-app.css b/webmagic-avalon/webmagic-admin/src/main/webapp/static/css/charisma-app.css index f795fb4..5b46b39 100755 --- a/webmagic-avalon/webmagic-admin/src/main/webapp/static/css/charisma-app.css +++ b/webmagic-avalon/webmagic-admin/src/main/webapp/static/css/charisma-app.css @@ -1,4 +1,4 @@ -@import url(https://fonts.googleapis.com/css?family=Shojumaru); +/*@import url(https://fonts.googleapis.com/css?family=Shojumaru);*/ select{ background-color:#fff; diff --git a/webmagic-avalon/webmagic-avalon-common/src/main/resources/config/spring/applicationContext-webmvc.xml b/webmagic-avalon/webmagic-avalon-common/src/main/resources/config/spring/applicationContext-webmvc.xml index 340cfb2..476cd39 100644 --- a/webmagic-avalon/webmagic-avalon-common/src/main/resources/config/spring/applicationContext-webmvc.xml +++ b/webmagic-avalon/webmagic-avalon-common/src/main/resources/config/spring/applicationContext-webmvc.xml @@ -26,6 +26,8 @@
+ + \ No newline at end of file diff --git a/webmagic-avalon/webmagic-worker/src/main/webapp/WEB-INF/web.xml b/webmagic-avalon/webmagic-worker/src/main/webapp/WEB-INF/web.xml index b521ed6..533832e 100644 --- a/webmagic-avalon/webmagic-worker/src/main/webapp/WEB-INF/web.xml +++ b/webmagic-avalon/webmagic-worker/src/main/webapp/WEB-INF/web.xml @@ -33,7 +33,7 @@ org.springframework.web.servlet.DispatcherServlet contextConfigLocation - classpath:/config/spring/applicationContext*.xml + classpath*:/config/spring/applicationContext*.xml 1 From 7e0e5b0969e6fe13e64fd9d4778b4444a425417f Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 2 Apr 2014 11:47:44 +0800 Subject: [PATCH 037/130] clean ui --- .../main/webapp/WEB-INF/pages/dashboard.ftl | 43 +++---------------- 1 file changed, 5 insertions(+), 38 deletions(-) diff --git a/webmagic-avalon/webmagic-admin/src/main/webapp/WEB-INF/pages/dashboard.ftl b/webmagic-avalon/webmagic-admin/src/main/webapp/WEB-INF/pages/dashboard.ftl index 591d180..5ed6fb6 100644 --- a/webmagic-avalon/webmagic-admin/src/main/webapp/WEB-INF/pages/dashboard.ftl +++ b/webmagic-avalon/webmagic-admin/src/main/webapp/WEB-INF/pages/dashboard.ftl @@ -15,8 +15,8 @@ WebMaigc Avalon - - + + @@ -123,23 +123,10 @@ @@ -173,26 +160,6 @@ 6 - - -
Pro Members
-
228
- 4 -
- - - -
Sales
-
$13320
- $34 -
- - - -
Messages
-
25
- 12 -
From 9ec0ca02c629aac8cdbec362a71cc67447dc6ce4 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Thu, 3 Apr 2014 08:18:59 +0800 Subject: [PATCH 038/130] doc2.0 ch1 --- zh_docs/user-manual-new.md | 120 +++++++++++++++++++++++++++++++++++++ 1 file changed, 120 insertions(+) create mode 100644 zh_docs/user-manual-new.md diff --git a/zh_docs/user-manual-new.md b/zh_docs/user-manual-new.md new file mode 100644 index 0000000..cd4d0f5 --- /dev/null +++ b/zh_docs/user-manual-new.md @@ -0,0 +1,120 @@ +WebMagic文档2.0版 +======== + +WebMagic是一个简单灵活、便于二次开发的爬虫框架。除了可以便捷的实现一个爬虫,WebMagic还提供多线程功能,以及基本的分布式功能。 + +你可以直接使用WebMagic进行爬虫开发,也可以定制WebMagic以适应复杂项目的需要。 + +## 1. 安装和使用 + +WebMagic包含两个主要的jar包:`webmagic-core-{version}.jar`和`webmagic-extension-{version}.jar`。在项目中添加这两个包的依赖,即可使用WebMagic。 + +### 1.1 使用Maven + +WebMagic基于Maven进行构建,推荐使用Maven来安装WebMagic。在项目中添加以下坐标即可: + +```xml + + us.codecraft + webmagic-extension + 0.4.3 + +``` + +WebMagic 使用slf4j-log4j12作为slf4j的实现.如果你自己定制了slf4j的实现,请在项目中去掉此依赖。 + +```xml + + us.codecraft + webmagic-extension + 0.4.3 + + + org.slf4j + slf4j-log4j12 + + + +``` + +### 1.2 不使用Maven + +不使用maven的用户,可以下载附带二进制jar包的版本(感谢[oschina](http://www.oschina.net/)): + + git clone http://git.oschina.net/flashsword20/webmagic.git + +在**lib**目录下,有项目依赖的所有jar包,直接在IDE里import即可。 + +### 1.3 从源码安装 + +## 2. 基本的爬虫 + +### 2.1 抽取内容(xpath, regex, css selector, jsonpath) + +### 2.2 发现链接 + +### 2.3 处理多个页面 + +## 3. 使用注解 + +### 3.1 抽取内容(xpath, regex, css selector, jsonpath) + +### 3.2 发现链接 + +### 3.3 处理多个页面 + +### 3.4 在POJO中实现复杂逻辑 + +## 4. 配置爬虫 + +### 4.1 抓取频率 + +### 4.2 编码 + +### 4.3 代理 + +### 4.4 设置cookie/UA等http头信息 + +### 4.5 重试机制 + +### 4.6 多线程 + +## 5. 爬虫的启动和终止 + +### 5.1 启动爬虫 + +### 5.2 终止爬虫 + +### 5.3 设置执行时间 + +### 5.4 定期抓取 + +## 6. 管理URL + +### 6.1 手动添加URL + +### 6.2 在URL中保存信息 + +### 6.3 几种URL管理方式 + +### 6.4 自己管理爬虫的URL + +## 7. 抽取结果的处理 + +### 7.1 输出到控制台 + +### 7.2 保存到文件 + +### 7.3 JSON格式输出 + +### 7.4 自定义持久化方式(mysql/mongodb…) + +## 8. 
实例 + +### 8.1 基本的列表+详情页的抓取 + +### 8.2 抓取动态页面 + +### 8.3 分页抓取 + +### 8.4 定期抓取 \ No newline at end of file From 50cee4c7bb97f157798476c5c0ffa13ea2257890 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Thu, 3 Apr 2014 11:06:03 +0800 Subject: [PATCH 039/130] [doc] complete docs2.0 ch1 --- user-manual.md | 2 +- zh_docs/user-manual-new.md | 183 +++++++++++++++++++++++++++++-------- 2 files changed, 146 insertions(+), 39 deletions(-) diff --git a/user-manual.md b/user-manual.md index f225c8a..d191965 100644 --- a/user-manual.md +++ b/user-manual.md @@ -65,7 +65,7 @@ webmagic还包含两个可用的扩展包,因为这两个包都依赖了比较 git clone http://git.oschina.net/flashsword20/webmagic.git -在**bin/lib**目录下,有项目依赖的所有jar包,直接在IDE里import即可。 +在**lib**目录下,有项目依赖的所有jar包,直接在IDE里import即可。 -------- diff --git a/zh_docs/user-manual-new.md b/zh_docs/user-manual-new.md index cd4d0f5..537f8fc 100644 --- a/zh_docs/user-manual-new.md +++ b/zh_docs/user-manual-new.md @@ -5,9 +5,9 @@ WebMagic是一个简单灵活、便于二次开发的爬虫框架。除了可以 你可以直接使用WebMagic进行爬虫开发,也可以定制WebMagic以适应复杂项目的需要。 -## 1. 安装和使用 +## 1. 使用WebMagic -WebMagic包含两个主要的jar包:`webmagic-core-{version}.jar`和`webmagic-extension-{version}.jar`。在项目中添加这两个包的依赖,即可使用WebMagic。 +WebMagic主要包含两个jar包:`webmagic-core-{version}.jar`和`webmagic-extension-{version}.jar`。在项目中添加这两个包的依赖,即可使用WebMagic。 ### 1.1 使用Maven @@ -21,7 +21,7 @@ WebMagic基于Maven进行构建,推荐使用Maven来安装WebMagic。在项目 ``` -WebMagic 使用slf4j-log4j12作为slf4j的实现.如果你自己定制了slf4j的实现,请在项目中去掉此依赖。 +WebMagic使用slf4j-log4j12作为slf4j的实现.如果你自己定制了slf4j的实现,请在项目中去掉此依赖。 ```xml @@ -43,19 +43,118 @@ WebMagic 使用slf4j-log4j12作为slf4j的实现.如果你自己定制了slf4j git clone http://git.oschina.net/flashsword20/webmagic.git -在**lib**目录下,有项目依赖的所有jar包,直接在IDE里import即可。 +在**lib**目录下,有项目依赖的所有jar包,直接在IDE里,将这些jar添加到Libraries即可。 -### 1.3 从源码安装 +![import jars](http://static.oschina.net/uploads/space/2014/0403/102848_ETcU_190591.png) -## 2. 
基本的爬虫 +### 1.3 第一个项目 -### 2.1 抽取内容(xpath, regex, css selector, jsonpath) +在你的项目中添加了WebMagic的依赖之后,即可开始第一个爬虫的开发了!我们这里拿一个抓取Github信息的例子: -### 2.2 发现链接 +```java +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.processor.PageProcessor; -### 2.3 处理多个页面 +public class GithubRepoPageProcessor implements PageProcessor { -## 3. 使用注解 + private Site site = Site.me().setRetryTimes(3).setSleepTime(100); + + @Override + public void process(Page page) { + page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all()); + page.putField("author", page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString()); + page.putField("name", page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString()); + if (page.getResultItems().get("name")==null){ + //skip this page + page.setSkip(true); + } + page.putField("readme", page.getHtml().xpath("//div[@id='readme']/tidyText()")); + } + + @Override + public Site getSite() { + return site; + } + + public static void main(String[] args) { + Spider.create(new GithubRepoPageProcessor()).addUrl("https://github.com/code4craft").thread(5).run(); + } +} +``` + +点击main方法,选择“运行”,你会发现爬虫已经可以正常工作了! + +![runlog](http://static.oschina.net/uploads/space/2014/0403/103741_3Gf5_190591.png) + +## 2.下载和编译源码 + +WebMagic是一个纯Java项目,如果你熟悉Maven,那么下载并编译源码是非常简单的。如果不熟悉Maven也没关系,这部分会介绍如何在Eclipse里导入这个项目。 + +### 2.1 下载源码 + +WebMagic目前有两个仓库: + +* [https://github.com/code4craft/webmagic](https://github.com/code4craft/webmagic) + +github上的仓库保存最新版本,所有issue、pull request都在这里。大家觉得项目不错的话别忘了去给个star哦! 
+ +* [http://git.oschina.net/flashsword20/webmagic](http://git.oschina.net/flashsword20/webmagic) + +此仓库包含所有编译好的依赖包,只保存项目的稳定版本,最新版本仍在github上更新。oschina在国内比较稳定,主要作为镜像。 + +无论在哪个仓库,使用 + + git clone https://github.com/code4craft/webmagic.git + +或者 + + git clone http://git.oschina.net/flashsword20/webmagic.git + +即可下载最新代码。 + +如果你对git本身使用也不熟悉,建议看看@黄勇的 [从 Git@OSC 下载 Smart 源码](http://my.oschina.net/huangyong/blog/200075) + +### 2.2 导入项目 + +Intellij Idea默认自带Maven支持,import项目时选择Maven项目即可。 + +#### 2.2.1 使用m2e插件 + +使用Eclipse的用户,推荐安装m2e插件,安装地址:https://www.eclipse.org/m2e/download/[](https://www.eclipse.org/m2e/download/) + +安装后,在File->Import中选择Maven->Existing Maven Projects即可导入项目。 + +![m2e-import](http://static.oschina.net/uploads/space/2014/0403/104427_eNuc_190591.png) + +导入后看到项目选择界面,点击finish即可。 + +![m2e-import2](http://static.oschina.net/uploads/space/2014/0403/104735_6vwG_190591.png) + +#### 2.2.2 使用Maven Eclipse插件 + +如果没有安装m2e插件,只要你安装了Maven,也是比较好办的。在项目根目录下使用命令: + + mvn eclipse:eclipse + +生成maven项目结构的eclipse配置文件,然后在File->Import中选择General->Existing Projects into Workspace即可导入项目。 + +![eclipse-import-1](http://static.oschina.net/uploads/space/2014/0403/100025_DAcy_190591.png) + +导入后看到项目选择界面,点击finish即可。 + +![eclipse-import-2](http://static.oschina.net/uploads/space/2014/0403/100227_73DJ_190591.png) + +### 2.3 编译和执行源码 + +导入成功之后,应该就没有编译错误了!此时你可以运行一下webmagic-core项目中自带的exmaple:"us.codecraft.webmagic.processor.example.GithubRepoPageProcessor"。 + +同样,看到控制台输出如下,则表示源码编译和执行成功了! + +![runlog](http://static.oschina.net/uploads/space/2014/0403/103741_3Gf5_190591.png) + +## 3. 基本的爬虫 ### 3.1 抽取内容(xpath, regex, css selector, jsonpath) @@ -63,58 +162,66 @@ WebMagic 使用slf4j-log4j12作为slf4j的实现.如果你自己定制了slf4j ### 3.3 处理多个页面 -### 3.4 在POJO中实现复杂逻辑 +## 4. 使用注解 -## 4. 配置爬虫 +### 4.1 抽取内容(xpath, regex, css selector, jsonpath) -### 4.1 抓取频率 +### 4.2 发现链接 -### 4.2 编码 +### 4.3 处理多个页面 -### 4.3 代理 +### 4.4 在POJO中实现复杂逻辑 -### 4.4 设置cookie/UA等http头信息 +## 5. 
配置爬虫 -### 4.5 重试机制 +### 5.1 抓取频率 -### 4.6 多线程 +### 5.2 编码 -## 5. 爬虫的启动和终止 +### 5.3 代理 -### 5.1 启动爬虫 +### 5.4 设置cookie/UA等http头信息 -### 5.2 终止爬虫 +### 5.5 重试机制 -### 5.3 设置执行时间 +### 5.6 多线程 -### 5.4 定期抓取 +## 6. 爬虫的启动和终止 -## 6. 管理URL +### 6.1 启动爬虫 -### 6.1 手动添加URL +### 6.2 终止爬虫 -### 6.2 在URL中保存信息 +### 6.3 设置执行时间 -### 6.3 几种URL管理方式 +### 6.4 定期抓取 -### 6.4 自己管理爬虫的URL +## 7. 管理URL -## 7. 抽取结果的处理 +### 7.1 手动添加URL -### 7.1 输出到控制台 +### 7.2 在URL中保存信息 -### 7.2 保存到文件 +### 7.3 几种URL管理方式 -### 7.3 JSON格式输出 +### 7.4 自己管理爬虫的URL -### 7.4 自定义持久化方式(mysql/mongodb…) +## 8. 抽取结果的处理 -## 8. 实例 +### 8.1 输出到控制台 -### 8.1 基本的列表+详情页的抓取 +### 8.2 保存到文件 -### 8.2 抓取动态页面 +### 8.3 JSON格式输出 -### 8.3 分页抓取 +### 8.4 自定义持久化方式(mysql/mongodb…) -### 8.4 定期抓取 \ No newline at end of file +## 9. 实例 + +### 9.1 基本的列表+详情页的抓取 + +### 9.2 抓取动态页面 + +### 9.3 分页抓取 + +### 9.4 定期抓取 \ No newline at end of file From a34e92d11a241690862d41a04b97c7a9a8993991 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Thu, 3 Apr 2014 22:33:10 +0800 Subject: [PATCH 040/130] fix huabanprocessor --- .../us/codecraft/webmagic/samples/HuabanProcessor.java | 9 +++++---- zh_docs/user-manual-new.md | 4 +++- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/webmagic-selenium/src/test/java/us/codecraft/webmagic/samples/HuabanProcessor.java b/webmagic-selenium/src/test/java/us/codecraft/webmagic/samples/HuabanProcessor.java index 1696a3f..fcfb068 100644 --- a/webmagic-selenium/src/test/java/us/codecraft/webmagic/samples/HuabanProcessor.java +++ b/webmagic-selenium/src/test/java/us/codecraft/webmagic/samples/HuabanProcessor.java @@ -22,7 +22,7 @@ public class HuabanProcessor implements PageProcessor { public void process(Page page) { page.addTargetRequests(page.getHtml().links().regex("http://huaban\\.com/.*").all()); if (page.getUrl().toString().contains("pins")) { - page.putField("img", page.getHtml().xpath("//div[@id='pin_img']/img/@src").toString()); + page.putField("img", 
page.getHtml().xpath("//div[@id='pin_img']/a/img/@src").toString()); } else { page.getResultItems().setSkip(true); } @@ -31,15 +31,16 @@ public class HuabanProcessor implements PageProcessor { @Override public Site getSite() { if (site == null) { - site = Site.me().setDomain("huaban.com").addStartUrl("http://huaban.com/").setSleepTime(0); + site = Site.me().setDomain("huaban.com").setSleepTime(0); } return site; } public static void main(String[] args) { Spider.create(new HuabanProcessor()).thread(5) - .pipeline(new FilePipeline("/data/webmagic/test/")) - .downloader(new SeleniumDownloader("/Users/yihua/Downloads/chromedriver")) + .addPipeline(new FilePipeline("/data/webmagic/test/")) + .setDownloader(new SeleniumDownloader("/Users/yihua/Downloads/chromedriver")) + .addUrl("http://huaban.com/") .runAsync(); } } diff --git a/zh_docs/user-manual-new.md b/zh_docs/user-manual-new.md index 537f8fc..229c9a6 100644 --- a/zh_docs/user-manual-new.md +++ b/zh_docs/user-manual-new.md @@ -45,7 +45,7 @@ WebMagic使用slf4j-log4j12作为slf4j的实现.如果你自己定制了slf4j的 在**lib**目录下,有项目依赖的所有jar包,直接在IDE里,将这些jar添加到Libraries即可。 -![import jars](http://static.oschina.net/uploads/space/2014/0403/102848_ETcU_190591.png) +![import jars](http://static.oschina.net/uploads/space/2014/0403/143318_gBQE_190591.jpeg) ### 1.3 第一个项目 @@ -154,6 +154,8 @@ Intellij Idea默认自带Maven支持,import项目时选择Maven项目即可。 ![runlog](http://static.oschina.net/uploads/space/2014/0403/103741_3Gf5_190591.png) +
+ ## 3. 基本的爬虫 ### 3.1 抽取内容(xpath, regex, css selector, jsonpath) From a1c7e826f7a0a675428ed22a44f1f2bc3dd5d707 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Thu, 3 Apr 2014 23:04:31 +0800 Subject: [PATCH 041/130] fix dep of slf4j-log4j12 --- README.md | 47 ++++++++++--------- .../example/GithubRepoPageProcessor.java | 2 +- webmagic-extension/pom.xml | 6 --- 3 files changed, 26 insertions(+), 29 deletions(-) diff --git a/README.md b/README.md index 7eee9e3..1f4bc13 100644 --- a/README.md +++ b/README.md @@ -50,33 +50,35 @@ WebMagic use slf4j with slf4j-log4j12 implementation. If you customized your slf ### First crawler: -Write a class implements PageProcessor: +Write a class that implements PageProcessor. For example, I wrote a crawler of github repository information. ```java -public class OschinaBlogPageProcesser implements PageProcessor { +public class GithubRepoPageProcessor implements PageProcessor { - private Site site = Site.me().setDomain("my.oschina.net"); + private Site site = Site.me().setRetryTimes(3).setSleepTime(1000); @Override public void process(Page page) { - List links = page.getHtml().links().regex("http://my\\.oschina\\.net/flashsword/blog/\\d+").all(); - page.addTargetRequests(links); - page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1").toString()); - page.putField("content", page.getHtml().$("div.content").toString()); - page.putField("tags",page.getHtml().xpath("//div[@class='BlogTags']/a/text()").all()); + page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all()); + page.putField("author", page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString()); + page.putField("name", page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString()); + if (page.getResultItems().get("name")==null){ + //skip this page + page.setSkip(true); + } + page.putField("readme", page.getHtml().xpath("//div[@id='readme']/tidyText()")); } @Override public
Site getSite() { return site; - } public static void main(String[] args) { - Spider.create(new OschinaBlogPageProcesser()).addUrl("http://my.oschina.net/flashsword/blog") - .addPipeline(new ConsolePipeline()).run(); + Spider.create(new GithubRepoPageProcessor()).addUrl("https://github.com/code4craft").thread(5).run(); } } + ``` * `page.addTargetRequests(links)` @@ -86,22 +88,23 @@ public class OschinaBlogPageProcesser implements PageProcessor { You can also use annotation way: ```java -@TargetUrl("http://my.oschina.net/flashsword/blog/\\d+") -public class OschinaBlog { +@TargetUrl("https://github.com/\\w+/\\w+") +@HelpUrl("https://github.com/\\w+") +public class GithubRepo { - @ExtractBy("//title") - private String title; + @ExtractBy(value = "//h1[@class='entry-title public']/strong/a/text()", notNull = true) + private String name; - @ExtractBy(value = "div.BlogContent",type = ExtractBy.Type.Css) - private String content; + @ExtractByUrl("https://github\\.com/(\\w+)/.*") + private String author; - @ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true) - private List tags; + @ExtractBy("//div[@id='readme']/tidyText()") + private String readme; public static void main(String[] args) { - OOSpider.create( - Site.me(), - new ConsolePageModelPipeline(), OschinaBlog.class).addUrl("http://my.oschina.net/flashsword/blog").run(); + OOSpider.create(Site.me().setSleepTime(1000) + , new ConsolePageModelPipeline(), GithubRepo.class) + .addUrl("https://github.com/code4craft").thread(5).run(); } } ``` diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/GithubRepoPageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/GithubRepoPageProcessor.java index 179bad4..c512265 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/GithubRepoPageProcessor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/GithubRepoPageProcessor.java @@ -11,7 +11,7 @@ import 
us.codecraft.webmagic.processor.PageProcessor; */ public class GithubRepoPageProcessor implements PageProcessor { - private Site site = Site.me().setRetryTimes(3).setSleepTime(100); + private Site site = Site.me().setRetryTimes(3).setSleepTime(1000); @Override public void process(Page page) { diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index 78f2757..cd8c12f 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -23,12 +23,6 @@ us.codecraft webmagic-core ${project.version} - - - org.slf4j - slf4j-log4j12 - -
junit From 7ca644cdd927c459561469924cab380ad1757ca1 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 4 Apr 2014 06:47:28 +0800 Subject: [PATCH 042/130] format readme --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index 1f4bc13..bb5921a 100644 --- a/README.md +++ b/README.md @@ -78,7 +78,6 @@ public class GithubRepoPageProcessor implements PageProcessor { Spider.create(new GithubRepoPageProcessor()).addUrl("https://github.com/code4craft").thread(5).run(); } } - ``` * `page.addTargetRequests(links)` From 9a0a4051ed51eb73391d4c6a6894714eedec1960 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 4 Apr 2014 08:05:34 +0800 Subject: [PATCH 043/130] [doc] ch3 part1 --- zh_docs/user-manual-new.md | 57 +++++++++++++++++++++++++++++++++++--- 1 file changed, 53 insertions(+), 4 deletions(-) diff --git a/zh_docs/user-manual-new.md b/zh_docs/user-manual-new.md index 229c9a6..e8526fd 100644 --- a/zh_docs/user-manual-new.md +++ b/zh_docs/user-manual-new.md @@ -5,7 +5,7 @@ WebMagic是一个简单灵活、便于二次开发的爬虫框架。除了可以 你可以直接使用WebMagic进行爬虫开发,也可以定制WebMagic以适应复杂项目的需要。 -## 1. 使用WebMagic +## 1. 在项目中使用WebMagic WebMagic主要包含两个jar包:`webmagic-core-{version}.jar`和`webmagic-extension-{version}.jar`。在项目中添加这两个包的依赖,即可使用WebMagic。 @@ -88,6 +88,8 @@ public class GithubRepoPageProcessor implements PageProcessor { ![runlog](http://static.oschina.net/uploads/space/2014/0403/103741_3Gf5_190591.png) +
+ ## 2.下载和编译源码 WebMagic是一个纯Java项目,如果你熟悉Maven,那么下载并编译源码是非常简单的。如果不熟悉Maven也没关系,这部分会介绍如何在Eclipse里导入这个项目。 @@ -158,11 +160,58 @@ Intellij Idea默认自带Maven支持,import项目时选择Maven项目即可。 ## 3. 基本的爬虫 -### 3.1 抽取内容(xpath, regex, css selector, jsonpath) +## 3. 基本的爬虫 -### 3.2 发现链接 +### 3.1 实现PageProcessor -### 3.3 处理多个页面 +在WebMagic里,实现一个基本的爬虫只需要编写一个类,实现`PageProcessor`接口。这个类包含了抓取一个网站所需要的所有定制化信息。以之前的`GithubRepoPageProcessor`为例: + +```java +public class GithubRepoPageProcessor implements PageProcessor { + + // 抓取网站的相关配置,包括编码、抓取间隔、重试次数等 + private Site site = Site.me().setRetryTimes(3).setSleepTime(1000); + + @Override + // process是定制爬虫逻辑的核心接口,在这里编写抽取逻辑 + public void process(Page page) { + // 以下部分定义了如何抽取页面信息,并保存下来 + page.putField("author", page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString()); + page.putField("name", page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString()); + if (page.getResultItems().get("name") == null) { + //skip this page + page.setSkip(true); + } + page.putField("readme", page.getHtml().xpath("//div[@id='readme']/tidyText()")); + + // 这一步从页面发现后续的url地址来抓取 + page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all()); + } + + @Override + public Site getSite() { + return site; + } + + public static void main(String[] args) { + + Spider.create(new GithubRepoPageProcessor()) + //从"https://github.com/code4craft"开始抓 + .addUrl("https://github.com/code4craft") + //开启5个线程抓取 + .thread(5) + //启动爬虫 + .run(); + } +} +``` + + +### 3.2 抽取内容(xpath, regex, css selector, jsonpath) + +### 3.3 发现链接 + +### 3.4 处理多个页面 ## 4. 
使用注解 From 44293cd8948fdb004115d16ee486d3dab2b2b8aa Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 4 Apr 2014 10:07:48 +0800 Subject: [PATCH 044/130] [doc]add qq group in readme --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index bb5921a..2056fba 100644 --- a/README.md +++ b/README.md @@ -163,6 +163,10 @@ To write webmagic, I refered to the projects below : [https://groups.google.com/forum/#!forum/webmagic-java](https://groups.google.com/forum/#!forum/webmagic-java) +[http://list.qq.com/cgi-bin/qf_invite?id=023a01f505246785f77c5a5a9aff4e57ab20fcdde871e988](http://list.qq.com/cgi-bin/qf_invite?id=023a01f505246785f77c5a5a9aff4e57ab20fcdde871e988) + +QQ Group: 330192938 + [![Bitdeli Badge](https://d2weczhvl823v0.cloudfront.net/code4craft/webmagic/trend.png)](https://bitdeli.com/free "Bitdeli Badge") From 762a3973fd912b6812d0bc059930ed4567a2677a Mon Sep 17 00:00:00 2001 From: Bo LIANG Date: Fri, 4 Apr 2014 15:53:46 +0800 Subject: [PATCH 045/130] Modify the log levels of LocalDuplicatedRemovedScheduler.java The old version will print a debug level log each time the push method is called. So sometimes, when a html page have multiple links for the same page, the debug log will appears more than once. Also, when we meet a duplicate URL, it will also print a log, which will be confusing. I change the level of it to trace. And each time a URL is really push into queue, print a debug level log. 
--- .../webmagic/scheduler/LocalDuplicatedRemovedScheduler.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/LocalDuplicatedRemovedScheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/LocalDuplicatedRemovedScheduler.java index c4b08f3..da32c66 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/LocalDuplicatedRemovedScheduler.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/LocalDuplicatedRemovedScheduler.java @@ -23,8 +23,9 @@ public abstract class LocalDuplicatedRemovedScheduler implements Scheduler { @Override public void push(Request request, Task task) { - logger.debug("push to queue " + request.getUrl()); + logger.trace("get a candidate url " + request.getUrl()); if (request.getExtra(Request.CYCLE_TRIED_TIMES) != null || urls.add(request.getUrl())) { + logger.debug("push to queue " + request.getUrl()); pushWhenNoDuplicate(request, task); } } From 22c394e6290edc2b0e4435a2d5a3647ed16d5646 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 4 Apr 2014 20:00:58 +0800 Subject: [PATCH 046/130] [doc] --- webmagic-core/src/test/resources/log4j.xml | 10 -- zh_docs/user-manual-new.md | 182 ++++++++++++++++++--- 2 files changed, 157 insertions(+), 35 deletions(-) diff --git a/webmagic-core/src/test/resources/log4j.xml b/webmagic-core/src/test/resources/log4j.xml index 9084694..c2b5a2f 100644 --- a/webmagic-core/src/test/resources/log4j.xml +++ b/webmagic-core/src/test/resources/log4j.xml @@ -8,21 +8,11 @@ - - - - - - - - - - diff --git a/zh_docs/user-manual-new.md b/zh_docs/user-manual-new.md index e8526fd..a8ae5c2 100644 --- a/zh_docs/user-manual-new.md +++ b/zh_docs/user-manual-new.md @@ -1,4 +1,4 @@ -WebMagic文档2.0版 +WebMagic in Action ======== WebMagic是一个简单灵活、便于二次开发的爬虫框架。除了可以便捷的实现一个爬虫,WebMagic还提供多线程功能,以及基本的分布式功能。 @@ -160,22 +160,22 @@ Intellij Idea默认自带Maven支持,import项目时选择Maven项目即可。 ## 3. 基本的爬虫 -## 3. 
基本的爬虫 - ### 3.1 实现PageProcessor -在WebMagic里,实现一个基本的爬虫只需要编写一个类,实现`PageProcessor`接口。这个类包含了抓取一个网站所需要的所有定制化信息。以之前的`GithubRepoPageProcessor`为例: +在WebMagic里,实现一个基本的爬虫只需要编写一个类,实现`PageProcessor`接口即可。这个类基本上包含了抓取一个网站,你需要写的所有代码。 + +以之前的`GithubRepoPageProcessor`为例,我将PageProcessor的定制分为三个部分,分别是爬虫的配置、页面元素的抽取和链接的发现。 ```java public class GithubRepoPageProcessor implements PageProcessor { - // 抓取网站的相关配置,包括编码、抓取间隔、重试次数等 + // 部分一:抓取网站的相关配置,包括编码、抓取间隔、重试次数等 private Site site = Site.me().setRetryTimes(3).setSleepTime(1000); @Override // process是定制爬虫逻辑的核心接口,在这里编写抽取逻辑 public void process(Page page) { - // 以下部分定义了如何抽取页面信息,并保存下来 + // 部分二:定义如何抽取页面信息,并保存下来 page.putField("author", page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString()); page.putField("name", page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString()); if (page.getResultItems().get("name") == null) { @@ -184,7 +184,7 @@ public class GithubRepoPageProcessor implements PageProcessor { } page.putField("readme", page.getHtml().xpath("//div[@id='readme']/tidyText()")); - // 这一步从页面发现后续的url地址来抓取 + // 部分三:从页面发现后续的url地址来抓取 page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all()); } @@ -206,22 +206,154 @@ public class GithubRepoPageProcessor implements PageProcessor { } ``` +#### 3.1.1 爬虫的配置 -### 3.2 抽取内容(xpath, regex, css selector, jsonpath) +第一部分关于爬虫的配置,包括编码、抓取间隔、超时时间、重试次数等,也包括一些模拟的参数,例如User Agent、cookie,以及代理的设置,我们会在第5章-“爬虫的配置”里进行介绍。在这里我们先简单设置一下:重试次数为3次,抓取间隔为一秒。 -### 3.3 发现链接 +#### 3.1.2 页面元素的抽取 -### 3.4 处理多个页面 +第二部分是爬虫的核心部分:对于下载到的Html页面,你如何从中抽取到你想要的信息?WebMagic里主要使用了三种抽取技术:XPath、正则表达式和CSS选择器。 -## 4. 使用注解 +1. 
XPath -### 4.1 抽取内容(xpath, regex, css selector, jsonpath) + XPath本来是用于XML中获取元素的一种查询语言,但是用于Html也是比较方便的。例如: -### 4.2 发现链接 + ```java + page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()") + ``` + 这段代码使用了XPath,它的意思是“查找所有class属性为'entry-title public'的h1元素,并找到他的strong子节点的a子节点,并提取a节点的文本信息”。 +对应的Html是这样子的: -### 4.3 处理多个页面 + ![xpath-html](http://static.oschina.net/uploads/space/2014/0404/104607_Aqq8_190591.png) -### 4.4 在POJO中实现复杂逻辑 +2. CSS选择器 + + CSS选择器是与XPath类似的语言。如果大家做过前端开发,肯定知道$('h1.entry-title')这种写法的含义。客观的说,它比XPath写起来要简单一些,但是如果写复杂一点的抽取规则,就相对要麻烦一点。 + +3. 正则表达式 + + 正则表达式则是一种通用的文本抽取语言。 + + ```java + page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all()); + ``` + + 这段代码就用到了正则表达式,它表示匹配所有"https://github.com/code4craft/webmagic"这样的链接。 + +XPath、CSS选择器和正则表达式的具体用法会在第4章“抽取工具详解”中讲到。 + +#### 3.1.3 链接的发现 + +有了处理页面的逻辑,我们的爬虫就接近完工了! + +但是现在还有一个问题:一个站点的页面是很多的,一开始我们不可能全部列举出来,于是如何发现后续的链接,是一个爬虫不可缺少的一部分。 + +```java +page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all()); +``` + +这段代码的分为两部分,`page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all()`用于获取所有满足"(https://github\\.com/\\w+/\\w+)"这个正则表达式的链接,`page.addTargetRequests()`则将这些链接加入到待抓取的队列中去。 + +### 3.2 使用Selectable的链式API + +`Selectable`相关的链式API是WebMagic的一个核心功能。使用Selectable接口,你可以直接完成页面元素的链式抽取,也无需去关心抽取的细节。 + +在刚才的例子中可以看到,page.getHtml()返回的是一个`Html`对象,它实现了`Selectable`接口。这个接口包含一些重要的方法,我将它分为两类:抽取部分和获取结果部分。 + +#### 3.2.1 抽取部分API: + +| 方法 | 说明 | 示例 | +| ------------ | ------------- | ------------ | +| xpath(String xpath) | 使用XPath选择 | html.xpath("//div[@class='title']") | +| \$(String selector) | 使用Css选择器选择 | html.\$("div.title") | +| \$(String selector,String attr) | 使用Css选择器选择 | html.\$("div.title","text") | +| css(String selector) | 功能同$(),使用Css选择器选择 | html.css("div.title") | +| links() | 选择所有链接 | html.links() | +| regex(String regex) | 使用正则表达式抽取 | html.regex("\(.\*?)\
") | +| regex(String regex,int group) | 使用正则表达式抽取,并指定捕获组 | html.regex("\(.\*?)\",1) | +| replace(String regex, String replacement) | 替换内容| html.replace("\","")| + +这部分抽取API返回的都是一个`Selectable`接口,意思是说,抽取是支持链式调用的。下面我用一个实例来讲解链式API的使用。 + +例如,我现在要抓取github上所有的Java项目,这些项目可以在[https://github.com/search?l=Java&p=1&q=stars%3A%3E1&s=stars&type=Repositories](https://github.com/search?l=Java&p=1&q=stars%3A%3E1&s=stars&type=Repositories)搜索结果中看到。 + +为了避免抓取范围太宽,我指定只从分页部分抓取链接。这个抓取规则是比较复杂的,我会要怎么写呢? + +![selectable-chain-ui](http://static.oschina.net/uploads/space/2014/0404/151454_2T01_190591.png) + +首先看到页面的html结构是这个样子的: + +![selectable-chain](http://static.oschina.net/uploads/space/2014/0404/151632_88Oq_190591.png) + +那么我可以先用CSS选择器提取出这个div,然后在取到所有的链接。为了保险起见,我再使用正则表达式限定一下提取出的URL的格式,那么最终的写法是这样子的: + +```java +List urls = page.getHtml().css("div.pagination").links().regex(".*/search/\?l=java.*").all(); +``` + +然后,我们可以把这些URL加到抓取列表中去: + +```java +List urls = page.getHtml().css("div.pagination").links().regex(".*/search/\?l=java.*").all(); +page.addTargetRequests(urls); +``` + +是不是比较简单?除了发现链接,Selectable的链式抽取还可以完成很多工作。我们会在第9章示例中再讲到。 + +#### 3.2.2 获取结果的API: + +当链式调用结束时,我们一般都想要拿到一个字符串类型的结果。这时候就需要用到获取结果的API了。我们知道,一条抽取规则,无论是XPath、CSS选择器或者正则表达式,总有可能抽取到多条元素。WebMagic对这些进行了统一,你可以通过不同的API获取到一个或者多个元素。 + +| 方法 | 说明 | 示例 | +| ------------ | ------------- | ------------ | +| get() | 返回一条String类型的结果 | String link= html.links().get()| +| toString() | 功能同get(),返回一条String类型的结果 | String link= html.links().toString()| +| all() | 返回所有抽取结果 | List links= html.links().all()| +| match() | 是否有匹配结果 | if (html.links().match()){ xxx; }| + +例如,我们知道页面只会有一条结果,那么可以使用selectable.get()或者selectable.toString()拿到这条结果。 + +这里selectable.toString()采用了toString()这个接口,是为了在输出以及和一些框架结合的时候,更加方便。因为一般情况下,我们都只需要选择一个元素! 
+ +selectable.all()则会获取到所有元素。 + +好了,到现在为止,再回过头看看3.1中的GithubRepoPageProcessor,可能就觉得更加清晰了吧?指定main方法,已经可以看到抓取结果在控制台输出了。 + +### 3.3 保存结果 + +好了,爬虫编写完成,现在我们可能还有一个问题:我如果想把抓取的结果保存下来,要怎么做呢?WebMagic用于保存结果的组件叫做`Pipeline`。例如我们通过“控制台输出结果”这件事也是通过一个内置的Pipeline完成的,它叫做`ConsolePipeline`。那么,我现在想要把结果用Json的格式保存下来,怎么做呢?我只需要将Pipeline的实现换成"JsonFilePipeline"就可以了。 + +```java + public static void main(String[] args) { + + Spider.create(new GithubRepoPageProcessor()) + //从"https://github.com/code4craft"开始抓 + .addUrl("https://github.com/code4craft") + .addPipeline(new JsonFilePipeline("D:\\webmagic\\")) + //开启5个线程抓取 + .thread(5) + //启动爬虫 + .run(); + } +``` + +这样子下载下来的文件就会保存在D盘的webmagic目录中了。 + +通过定制Pipeline,我们还可以实现保存结果到文件、数据库等一系列功能。这个会在第7章“抽取结果的处理”中介绍。 + +至此为止,我们已经完成了一个基本爬虫的编写,也具有了一些定制功能。 + +
+ +## 4. 抽取工具详解 + +### 4.1 XPath + +### 4.2 CSS选择器 + +### 4.3 正则表达式 + +### 4.4 JsonPath ## 5. 配置爬虫 @@ -247,25 +379,25 @@ public class GithubRepoPageProcessor implements PageProcessor { ### 6.4 定期抓取 -## 7. 管理URL +## 7. 抽取结果的处理 -### 7.1 手动添加URL +### 7.1 输出到控制台 -### 7.2 在URL中保存信息 +### 7.2 保存到文件 -### 7.3 几种URL管理方式 +### 7.3 JSON格式输出 -### 7.4 自己管理爬虫的URL +### 7.4 自定义持久化方式(mysql/mongodb…) -## 8. 抽取结果的处理 +## 8. 管理URL -### 8.1 输出到控制台 +### 8.1 手动添加URL -### 8.2 保存到文件 +### 8.2 在URL中保存信息 -### 8.3 JSON格式输出 +### 8.3 几种URL管理方式 -### 8.4 自定义持久化方式(mysql/mongodb…) +### 8.4 自己管理爬虫的URL ## 9. 实例 From 7aaf837e154e89da4616caab289ffcdb622006c5 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 4 Apr 2014 20:10:00 +0800 Subject: [PATCH 047/130] change logger to slf4j style for performance #84 --- .../webmagic/scheduler/LocalDuplicatedRemovedScheduler.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/LocalDuplicatedRemovedScheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/LocalDuplicatedRemovedScheduler.java index da32c66..c9b016c 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/LocalDuplicatedRemovedScheduler.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/LocalDuplicatedRemovedScheduler.java @@ -23,9 +23,9 @@ public abstract class LocalDuplicatedRemovedScheduler implements Scheduler { @Override public void push(Request request, Task task) { - logger.trace("get a candidate url " + request.getUrl()); + logger.trace("get a candidate url ", request.getUrl()); if (request.getExtra(Request.CYCLE_TRIED_TIMES) != null || urls.add(request.getUrl())) { - logger.debug("push to queue " + request.getUrl()); + logger.debug("push to queue ", request.getUrl()); pushWhenNoDuplicate(request, task); } } From dafd0b5875cd174db545e3cdcbd9b1c9908e4a76 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 4 Apr 2014 20:36:31 +0800 Subject: [PATCH 
048/130] [BugFix]multi model in one pageprocessor will be skipped #85 --- .../webmagic/model/ModelPageProcessor.java | 5 ++- .../model/ModelPageProcessorTest.java | 45 +++++++++++++++++++ 2 files changed, 49 insertions(+), 1 deletion(-) create mode 100644 webmagic-extension/src/test/java/us/codecraft/webmagic/model/ModelPageProcessorTest.java diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPageProcessor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPageProcessor.java index 8a40dae..bc2afcf 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPageProcessor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPageProcessor.java @@ -55,11 +55,14 @@ class ModelPageProcessor implements PageProcessor { extractLinks(page, pageModelExtractor.getTargetUrlRegionSelector(), pageModelExtractor.getTargetUrlPatterns()); Object process = pageModelExtractor.process(page); if (process == null || (process instanceof List && ((List) process).size() == 0)) { - page.getResultItems().setSkip(true); + continue; } postProcessPageModel(pageModelExtractor.getClazz(), process); page.putField(pageModelExtractor.getClazz().getCanonicalName(), process); } + if (page.getResultItems().getAll().size() == 0) { + page.getResultItems().setSkip(true); + } } private void extractLinks(Page page, Selector urlRegionSelector, List urlPatterns) { diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/ModelPageProcessorTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/ModelPageProcessorTest.java new file mode 100644 index 0000000..74f3f6a --- /dev/null +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/ModelPageProcessorTest.java @@ -0,0 +1,45 @@ +package us.codecraft.webmagic.model; + +import org.junit.Test; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.model.annotation.ExtractBy; 
+import us.codecraft.webmagic.model.annotation.TargetUrl; +import us.codecraft.webmagic.selector.PlainText; + +import static org.assertj.core.api.Assertions.assertThat; + +/** + * @author code4crafter@gmail.com + * @date 14-4-4 + */ +public class ModelPageProcessorTest { + + @TargetUrl("http://codecraft.us/foo") + public static class ModelFoo { + + @ExtractBy(value = "//div/@foo", notNull = true) + private String foo; + + } + + @TargetUrl("http://codecraft.us/bar") + public static class ModelBar { + + @ExtractBy(value = "//div/@bar", notNull = true) + private String bar; + + } + + @Test + public void testMultiModel_should_not_skip_when_match() throws Exception { + Page page = new Page(); + page.setRawText("
"); + page.setRequest(new Request("http://codecraft.us/foo")); + page.setUrl(PlainText.create("http://codecraft.us/foo")); + ModelPageProcessor modelPageProcessor = ModelPageProcessor.create(null, ModelFoo.class, ModelBar.class); + modelPageProcessor.process(page); + assertThat(page.getResultItems().isSkip()).isFalse(); + + } +} From 38a12f864113b383824a1903a1b780487810102d Mon Sep 17 00:00:00 2001 From: Tian Date: Fri, 4 Apr 2014 22:02:52 +0800 Subject: [PATCH 049/130] new feature: PatternProcessor --- .../example/PatternProcessorDemo.java | 53 ++++++++ .../webmagic/handler/PatternHandler.java | 113 ++++++++++++++++++ .../webmagic/pipeline/PatternPipeline.java | 44 +++++++ .../processor/PatternPageProcessor.java | 78 ++++++++++++ 4 files changed, 288 insertions(+) create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/example/PatternProcessorDemo.java create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/handler/PatternHandler.java create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/PatternPipeline.java create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/processor/PatternPageProcessor.java diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/PatternProcessorDemo.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/PatternProcessorDemo.java new file mode 100644 index 0000000..51a9484 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/PatternProcessorDemo.java @@ -0,0 +1,53 @@ +package us.codecraft.webmagic.example; + +import org.apache.log4j.Logger; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.ResultItems; +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.handler.PatternHandler; +import us.codecraft.webmagic.pipeline.PatternPipeline; +import us.codecraft.webmagic.processor.PatternPageProcessor; + +/** + * Created with IntelliJ 
IDEA. + * User: Sebastian MA + * Date: April 04, 2014 + * Time: 21:23 + */ +public class PatternProcessorDemo { + + private static Logger log = Logger.getLogger(PatternProcessorDemo.class); + + public static void main(String... args) { + + PatternPageProcessor processor + = new PatternPageProcessor("http://item.jd.com/981821.html", + PatternPageProcessor.TARGET_PATTERN_ALL + ); + + PatternPipeline pipeline = new PatternPipeline(); + + // define a handler which handles only "http://item.jd.com/.*" + PatternHandler handler = new PatternHandler("http://item.jd.com/.*") { + + @Override + public void onExtract(Page page) { + + log.info("Extracting from " + page.getUrl()); + page.putField("test", "hello world:)"); + } + + @Override + public void onHandle(ResultItems result, Task task) { + + log.info("Handling " + result.getRequest().getUrl()); + log.info("Retrieved test=" + result.get("test")); + } + }; + + handler.register(processor, pipeline); + + Spider.create(processor).thread(5).addPipeline(pipeline).runAsync(); + } +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/PatternHandler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/PatternHandler.java new file mode 100644 index 0000000..51e44e0 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/PatternHandler.java @@ -0,0 +1,113 @@ +package us.codecraft.webmagic.handler; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.ResultItems; +import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.pipeline.PatternPipeline; +import us.codecraft.webmagic.processor.PatternPageProcessor; + +import java.util.UUID; + +/** + * Created with IntelliJ IDEA. + * User: Sebastian MA + * Date: April 03, 2014 + * Time: 10:00 + *

+ * A PatternHandler is in charge of both page extraction and data processing by implementing + * its two abstract methods. + */ +public abstract class PatternHandler { + + /** + * identity of the handler. + */ + protected String id; + + /** + * match pattern. only matched page should be handled. + */ + protected String pattern; + + /** + * @param pattern + * url pattern to handle + */ + protected PatternHandler(String pattern) { + + this.pattern = pattern; + this.id = UUID.randomUUID().toString(); + } + + /** + * determine if the page should be handled. + */ + public boolean match(String url) { + + return url.matches(pattern); + } + + /** + * registers to both the page processor and the pipeline so the handler could take charge of + * both end of procedure. + * + * @param processor + * the processor to handle + * @param pipeline + * the pipeline to handle + */ + public void register(PatternPageProcessor processor, PatternPipeline pipeline) { + + processor.addHandler(this); + pipeline.addHandler(this); + } + + public void unregister(PatternPageProcessor processor, PatternPipeline pipeline) { + + processor.removeHandler(this); + pipeline.removeHandler(this); + } + + public boolean process(Page page) { + + if(match(page.getUrl().toString())) { + page.putField(id, true); + onExtract(page); + return true; + } else { + return false; + } + } + + public boolean process(ResultItems resultItems, Task task) { + + if(resultItems.isSkip()) { + return false; + } + + if(match(resultItems.getRequest().getUrl()) && resultItems.get(id) != null) { + onHandle(resultItems, task); + return true; + } else { + return false; + } + } + + /** + * implements this method to extract from page. + * + * @param page + * the page to extract + */ + public abstract void onExtract(Page page); + + /** + * implements this method to handle the extraction result. 
+ * + * @param result + * extraction result + * @param task + */ + public abstract void onHandle(ResultItems result, Task task); + +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/PatternPipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/PatternPipeline.java new file mode 100644 index 0000000..582b162 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/PatternPipeline.java @@ -0,0 +1,44 @@ +package us.codecraft.webmagic.pipeline; + +import us.codecraft.webmagic.ResultItems; +import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.handler.PatternHandler; + +import java.util.ArrayList; + +/** + * Created with IntelliJ IDEA. + * User: Sebastian MA + * Date: April 04, 2014 + * Time: 20:44 + */ +public class PatternPipeline implements Pipeline { + + protected ArrayList handlers = new ArrayList(); + + /** + * A handler works only if it is added to BOTH the page processor and the pipeline. + * Uses PatternHandler's register instead. 
+ * + * @param handler the pattern handler + * + * @see PatternHandler#register + */ + public void addHandler(PatternHandler handler) { + + handlers.add(handler); + } + + public void removeHandler(PatternHandler handler) { + + handlers.remove(handler); + } + + @Override + public void process(ResultItems resultItems, Task task) { + + for(PatternHandler handler : handlers) { + handler.process(resultItems, task); + } + } +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/processor/PatternPageProcessor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/processor/PatternPageProcessor.java new file mode 100644 index 0000000..d7d909c --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/processor/PatternPageProcessor.java @@ -0,0 +1,78 @@ +package us.codecraft.webmagic.processor; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.handler.PatternHandler; +import us.codecraft.webmagic.utils.UrlUtils; + +import java.util.ArrayList; +import java.util.List; + +/** + * Created with IntelliJ IDEA. + * User: Sebastian MA + * Date: April 04, 2014 + * Time: 15:36 + *

+ * A PatternPageProcessor uses PatternHandler to setup extraction rules for specific url pattern. + * + * @see us.codecraft.webmagic.handler.PatternHandler + */ +public class PatternPageProcessor implements PageProcessor { + + public static final String TARGET_PATTERN_ALL = "http://*"; + + protected Site site; + + protected String targetPattern; + + protected ArrayList handlers = new ArrayList(); + + public PatternPageProcessor(String startUrl, String targetPattern) { + + this.targetPattern = targetPattern; + + this.site = Site.me().addStartUrl(startUrl).setDomain(UrlUtils.getDomain(startUrl)); + this.targetPattern = "(" + targetPattern.replace(".", "\\.").replace("*", + "[^\"'#]*") + ")"; + + site.setUserAgent("Chrome/5.0.354.0"); + } + + @Override + public void process(Page page) { + + + List requests = page.getHtml().links().regex(targetPattern).all(); + page.addTargetRequests(requests); + for(PatternHandler handler : handlers) { + if(handler.match(page.getUrl().toString())) { + handler.process(page); + } + } + } + + /** + * A handler works only if it is added to BOTH the page processor and the pipeline. + * Uses PatternHandler's register instead. 
+ * + * @param handler the pattern handler + * + * @see PatternHandler#register + */ + public void addHandler(PatternHandler handler) { + + handlers.add(handler); + } + + public void removeHandler(PatternHandler handler) { + + handlers.remove(handler); + } + + @Override + public Site getSite() { + + return site; + } +} From 8fe967ba8d0d33804079c461ba9a0efb20fcac8d Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 4 Apr 2014 23:39:32 +0800 Subject: [PATCH 050/130] [BugFix]exclude log4j.xml from maven jar plugin #82 --- pom.xml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pom.xml b/pom.xml index 085e94e..b277b38 100644 --- a/pom.xml +++ b/pom.xml @@ -182,6 +182,15 @@ UTF-8 + + org.apache.maven.plugins + maven-jar-plugin + + + log4j.xml + + + org.apache.maven.plugins maven-source-plugin From b043ac76d656467f342249f82a7a4c8f300030f6 Mon Sep 17 00:00:00 2001 From: Bo LIANG Date: Sat, 5 Apr 2014 11:31:56 +0800 Subject: [PATCH 051/130] change the formatter of log. To use slf4j, we should insert {} into the formatter string. 
--- .../webmagic/scheduler/LocalDuplicatedRemovedScheduler.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/LocalDuplicatedRemovedScheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/LocalDuplicatedRemovedScheduler.java index c9b016c..397199c 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/LocalDuplicatedRemovedScheduler.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/LocalDuplicatedRemovedScheduler.java @@ -23,9 +23,9 @@ public abstract class LocalDuplicatedRemovedScheduler implements Scheduler { @Override public void push(Request request, Task task) { - logger.trace("get a candidate url ", request.getUrl()); + logger.trace("get a candidate url {}", request.getUrl()); if (request.getExtra(Request.CYCLE_TRIED_TIMES) != null || urls.add(request.getUrl())) { - logger.debug("push to queue ", request.getUrl()); + logger.debug("push to queue {}", request.getUrl()); pushWhenNoDuplicate(request, task); } } From c143fc662cb0e21ced7ed084aff63d25b09b5b3b Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 5 Apr 2014 18:17:48 +0800 Subject: [PATCH 052/130] add SubPageProcessor #86 --- .../handler/CompositePageProcessor.java | 49 +++++++++++++++++++ .../webmagic/handler/SubPageProcessor.java | 33 +++++++++++++ 2 files changed, 82 insertions(+) create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/handler/CompositePageProcessor.java create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/handler/SubPageProcessor.java diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/CompositePageProcessor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/CompositePageProcessor.java new file mode 100644 index 0000000..ecf4aa1 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/CompositePageProcessor.java @@ -0,0 +1,49 @@ 
+package us.codecraft.webmagic.handler; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.processor.PageProcessor; + +import java.util.ArrayList; +import java.util.List; + +/** + * @author code4crafter@gmail.com + * @date 14-4-5 + */ +public class CompositePageProcessor implements PageProcessor { + + private Site site; + + private List subPageProcessors; + + @Override + public void process(Page page) { + for (SubPageProcessor subPageProcessor : subPageProcessors) { + if (subPageProcessor.match(page)) { + SubPageProcessor.MatchOtherProcessor matchOtherProcessorProcessor = subPageProcessor.process(page); + if (matchOtherProcessorProcessor == null || matchOtherProcessorProcessor != SubPageProcessor.MatchOtherProcessor.YES) { + return; + } + } + } + } + + public CompositePageProcessor setSite(Site site) { + this.site = site; + return this; + } + + public CompositePageProcessor setSubPageProcessors(SubPageProcessor... subPageProcessors) { + this.subPageProcessors = new ArrayList(); + for (SubPageProcessor subPageProcessor : subPageProcessors) { + this.subPageProcessors.add(subPageProcessor); + } + return this; + } + + @Override + public Site getSite() { + return site; + } +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/SubPageProcessor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/SubPageProcessor.java new file mode 100644 index 0000000..c880500 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/SubPageProcessor.java @@ -0,0 +1,33 @@ +package us.codecraft.webmagic.handler; + +import us.codecraft.webmagic.Page; + +/** + * @author code4crafter@gmail.com + * @date 14-4-5 + */ +public interface SubPageProcessor { + + /** + * Check whether the SubPageProcessor can process the page.

+ * Please DO NOT change page status in this method. + * + * @param page + * @return + */ + public boolean match(Page page); + + /** + * + * process the page, extract urls to fetch, extract the data and store + * + * @param page + * @return whether continue to match + */ + public MatchOtherProcessor process(Page page); + + public enum MatchOtherProcessor { + YES, NO; + } + +} From 159eeea2f56504fb1019667723ea16432e30e0ec Mon Sep 17 00:00:00 2001 From: Bo LIANG Date: Sat, 5 Apr 2014 18:32:12 +0800 Subject: [PATCH 053/130] Remove unused variable to make the project cleaner. --- .../java/us/codecraft/webmagic/model/ModelPageProcessor.java | 4 ---- 1 file changed, 4 deletions(-) diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPageProcessor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPageProcessor.java index bc2afcf..3a97e1d 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPageProcessor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPageProcessor.java @@ -25,8 +25,6 @@ class ModelPageProcessor implements PageProcessor { private Site site; - private Set targetUrlPatterns = new HashSet(); - public static ModelPageProcessor create(Site site, Class... 
clazzs) { ModelPageProcessor modelPageProcessor = new ModelPageProcessor(site); for (Class clazz : clazzs) { @@ -38,8 +36,6 @@ class ModelPageProcessor implements PageProcessor { public ModelPageProcessor addPageModel(Class clazz) { PageModelExtractor pageModelExtractor = PageModelExtractor.create(clazz); - targetUrlPatterns.addAll(pageModelExtractor.getTargetUrlPatterns()); - targetUrlPatterns.addAll(pageModelExtractor.getHelpUrlPatterns()); pageModelExtractorList.add(pageModelExtractor); return this; } From 9b2cb43f47a367623279fb20e8ff0de93e7cc56b Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 5 Apr 2014 23:40:10 +0800 Subject: [PATCH 054/130] ConfigurablePageProcessor #91 --- .../us/codecraft/webmagic/selector/Html.java | 1 + .../ConfigurablePageProcessor.java | 49 ++++++++ .../webmagic/configurable/ExpressionType.java | 11 ++ .../webmagic/configurable/ExtractRule.java | 113 ++++++++++++++++++ .../webmagic/configurable/Inject.java | 15 --- .../webmagic/configurable/PropertyLoader.java | 18 --- .../ConfigurableBlogPageProcessor.java | 51 -------- .../ConfigurablePageProcessorTest.java | 39 ++++++ 8 files changed, 213 insertions(+), 84 deletions(-) create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ConfigurablePageProcessor.java create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExpressionType.java create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExtractRule.java delete mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/Inject.java delete mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/PropertyLoader.java delete mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/example/ConfigurableBlogPageProcessor.java create mode 100644 webmagic-extension/src/test/java/us/codecraft/webmagic/configurable/ConfigurablePageProcessorTest.java diff --git 
a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java index 614b111..34386b5 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java @@ -131,6 +131,7 @@ public class Html extends PlainText { } public Document getDocument() { + initDocument(); return document; } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ConfigurablePageProcessor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ConfigurablePageProcessor.java new file mode 100644 index 0000000..36615d8 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ConfigurablePageProcessor.java @@ -0,0 +1,49 @@ +package us.codecraft.webmagic.configurable; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.processor.PageProcessor; + +import java.util.List; + +/** + * @author code4crafter@gmail.com
+ */ +public class ConfigurablePageProcessor implements PageProcessor { + + private Site site; + + private List extractRules; + + public ConfigurablePageProcessor(Site site, List extractRules) { + this.site = site; + this.extractRules = extractRules; + } + + @Override + public void process(Page page) { + for (ExtractRule extractRule : extractRules) { + if (extractRule.isMulti()) { + List results = page.getHtml().selectDocumentForList(extractRule.getSelector()); + if (extractRule.isNotNull() && results.size() == 0) { + page.setSkip(true); + } else { + page.getResultItems().put(extractRule.getFieldName(), results); + } + } else { + String result = page.getHtml().selectDocument(extractRule.getSelector()); + if (extractRule.isNotNull() && result == null) { + page.setSkip(true); + } else { + page.getResultItems().put(extractRule.getFieldName(), result); + } + } + } + } + + @Override + public Site getSite() { + return site; + } + +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExpressionType.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExpressionType.java new file mode 100644 index 0000000..bd84be3 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExpressionType.java @@ -0,0 +1,11 @@ +package us.codecraft.webmagic.configurable; + +/** + * @author code4crafter@gmail.com + * @date 14-4-5 + */ +public enum ExpressionType { + + XPath, Regex, Css, JsonPath; + +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExtractRule.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExtractRule.java new file mode 100644 index 0000000..82337c4 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExtractRule.java @@ -0,0 +1,113 @@ +package us.codecraft.webmagic.configurable; + +import us.codecraft.webmagic.selector.JsonPathSelector; +import us.codecraft.webmagic.selector.Selector; + +import static 
us.codecraft.webmagic.selector.Selectors.*; + +/** + * @author code4crafter@gmail.com + * @date 14-4-5 + */ +public class ExtractRule { + + private String fieldName; + + private ExpressionType expressionType; + + private String expressionValue; + + private String[] expressionParams; + + private boolean multi = false; + + private volatile Selector selector; + + private boolean notNull = false; + + public String getFieldName() { + return fieldName; + } + + public void setFieldName(String fieldName) { + this.fieldName = fieldName; + } + + public ExpressionType getExpressionType() { + return expressionType; + } + + public void setExpressionType(ExpressionType expressionType) { + this.expressionType = expressionType; + } + + public String getExpressionValue() { + return expressionValue; + } + + public void setExpressionValue(String expressionValue) { + this.expressionValue = expressionValue; + } + + public String[] getExpressionParams() { + return expressionParams; + } + + public void setExpressionParams(String[] expressionParams) { + this.expressionParams = expressionParams; + } + + public boolean isMulti() { + return multi; + } + + public void setMulti(boolean multi) { + this.multi = multi; + } + + public Selector getSelector() { + if (selector == null) { + synchronized (this) { + if (selector == null) { + selector = compileSelector(); + } + } + } + return selector; + } + + private Selector compileSelector() { + switch (expressionType) { + case Css: + if (expressionParams.length >= 1) { + return $(expressionValue, expressionParams[0]); + } else { + return $(expressionValue); + } + case XPath: + return xpath(expressionValue); + case Regex: + if (expressionParams.length >= 1) { + return regex(expressionValue, Integer.parseInt(expressionParams[0])); + } else { + return regex(expressionValue); + } + case JsonPath: + return new JsonPathSelector(expressionValue); + default: + return xpath(expressionValue); + } + } + + public void setSelector(Selector selector) { + 
this.selector = selector; + } + + public boolean isNotNull() { + return notNull; + } + + public void setNotNull(boolean notNull) { + this.notNull = notNull; + } +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/Inject.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/Inject.java deleted file mode 100644 index c6608ae..0000000 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/Inject.java +++ /dev/null @@ -1,15 +0,0 @@ -package us.codecraft.webmagic.configurable; - -import java.lang.annotation.ElementType; -import java.lang.annotation.Retention; -import java.lang.annotation.Target; - -/** - * @author yihua.huang@dianping.com - */ -@Retention(java.lang.annotation.RetentionPolicy.RUNTIME) -@Target({ElementType.FIELD}) -public @interface Inject { - - String value() default ""; -} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/PropertyLoader.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/PropertyLoader.java deleted file mode 100644 index bffbcf2..0000000 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/PropertyLoader.java +++ /dev/null @@ -1,18 +0,0 @@ -package us.codecraft.webmagic.configurable; - -import us.codecraft.webmagic.processor.PageProcessor; - -import java.util.Map; - -/** - * Inject property to object by {@link Inject} annotation. 
- * - * @author yihua.huang@dianping.com - */ -public class PropertyLoader { - - public T load(T object, Map properties) { - return object; - } - -} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/ConfigurableBlogPageProcessor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/ConfigurableBlogPageProcessor.java deleted file mode 100644 index 28d3ab0..0000000 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/ConfigurableBlogPageProcessor.java +++ /dev/null @@ -1,51 +0,0 @@ -package us.codecraft.webmagic.example; - -import java.util.List; -import us.codecraft.webmagic.Page; -import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.Spider; -import us.codecraft.webmagic.configurable.Inject; -import us.codecraft.webmagic.processor.PageProcessor; - -/** - * @author code4crafter@gmail.com
- */ -public class ConfigurableBlogPageProcessor implements PageProcessor { - - private Site site = Site.me().setDomain("my.oschina.net"); - - @Inject("linkRegex") - private String linkRegex; - - @Inject("titleXpath") - private String titleXpath; - - @Inject("contentXpath") - private String contentXpath; - - @Inject("tagsXpath") - private String tagsXpath; - - @Override - public void process(Page page) { - List links = page.getHtml().links().regex(linkRegex).all(); - page.addTargetRequests(links); - page.putField("title", page.getHtml().xpath(titleXpath).toString()); - if (page.getResultItems().get("title") == null) { - //skip this page - page.setSkip(true); - } - page.putField("content", page.getHtml().smartContent().toString()); - page.putField("tags", page.getHtml().xpath(tagsXpath).all()); - } - - @Override - public Site getSite() { - return site; - - } - - public static void main(String[] args) { - Spider.create(new ConfigurableBlogPageProcessor()).addUrl("http://my.oschina.net/flashsword/blog").thread(2).run(); - } -} diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/configurable/ConfigurablePageProcessorTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/configurable/ConfigurablePageProcessorTest.java new file mode 100644 index 0000000..a35fffa --- /dev/null +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/configurable/ConfigurablePageProcessorTest.java @@ -0,0 +1,39 @@ +package us.codecraft.webmagic.configurable; + +import org.junit.Test; +import us.codecraft.webmagic.ResultItems; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.downloader.MockGithubDownloader; + +import java.util.ArrayList; +import java.util.List; + +import static org.assertj.core.api.Assertions.assertThat; + +/** + * @author code4crafter@gmail.com + * @date 14-4-5 + */ +public class ConfigurablePageProcessorTest { + + @Test + public void test() throws Exception { + List extractRules = new 
ArrayList(); + ExtractRule extractRule = new ExtractRule(); + extractRule.setExpressionType(ExpressionType.XPath); + extractRule.setExpressionValue("//title"); + extractRule.setFieldName("title"); + extractRules.add(extractRule); + extractRule = new ExtractRule(); + extractRule.setExpressionType(ExpressionType.XPath); + extractRule.setExpressionValue("//ul[@class='pagehead-actions']/li[1]//a[@class='social-count js-social-count']/text()"); + extractRule.setFieldName("star"); + extractRules.add(extractRule); + ResultItems resultItems = Spider.create(new ConfigurablePageProcessor(Site.me(), extractRules)) + .setDownloader(new MockGithubDownloader()).get("https://github.com/code4craft/webmagic"); + assertThat(resultItems.getAll()).containsEntry("title", "code4craft/webmagic · GitHub"); + assertThat(resultItems.getAll()).containsEntry("star", " 86 "); + + } +} From 969ad1766b51d9e7ad63a3862de3fde8e159a0d4 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sun, 6 Apr 2014 21:32:20 +0800 Subject: [PATCH 055/130] change logger style to slf4j for cleaner code --- .../codecraft/webmagic/downloader/HttpClientDownloader.java | 2 +- .../java/us/codecraft/webmagic/model/PageModelExtractor.java | 4 +--- .../codecraft/webmagic/scheduler/FileCacheQueueScheduler.java | 4 +--- 3 files changed, 3 insertions(+), 7 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index bcf4a53..30c561b 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -74,7 +74,7 @@ public class HttpClientDownloader extends AbstractDownloader { } else { acceptStatCode = Sets.newHashSet(200); } - logger.info("downloading page " + request.getUrl()); + logger.info("downloading page {}" , request.getUrl()); RequestBuilder 
requestBuilder = RequestBuilder.get().setUri(request.getUrl()); if (headers != null) { for (Map.Entry headerEntry : headers.entrySet()) { diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java index 5e4da11..b7b7900 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java @@ -340,9 +340,7 @@ class PageModelExtractor { private Object convert(String value, ObjectFormatter objectFormatter) { try { Object format = objectFormatter.format(value); - if (logger.isDebugEnabled()) { - logger.debug("String " + value + " is converted to " + format); - } + logger.debug("String {} is converted to {}", value, format); return format; } catch (Exception e) { logger.error("convert " + value + " to " + objectFormatter.clazz() + " error!", e); diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java index 38e8a79..2698f73 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java @@ -149,9 +149,7 @@ public class FileCacheQueueScheduler implements Scheduler { if (!inited.get()) { init(task); } - if (logger.isDebugEnabled()) { - logger.debug("push to queue " + request.getUrl()); - } + logger.debug("push to queue {}" , request.getUrl()); if (urls.add(request.getUrl())) { queue.add(request); fileUrlWriter.println(request.getUrl()); From c1e7207869417a4f35bbabaa0216f703e78e07e5 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Mon, 7 Apr 2014 11:00:09 +0800 Subject: [PATCH 056/130] add FileCacheQueueScheduler support for cycleRetryTimes --- 
.../webmagic/scheduler/FileCacheQueueScheduler.java | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java index 2698f73..79f3b8b 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java @@ -23,7 +23,7 @@ import java.util.concurrent.atomic.AtomicInteger; * @author code4crafter@gmail.com
* @since 0.2.0 */ -public class FileCacheQueueScheduler implements Scheduler { +public class FileCacheQueueScheduler extends LocalDuplicatedRemovedScheduler { private Logger logger = LoggerFactory.getLogger(getClass()); @@ -145,16 +145,12 @@ public class FileCacheQueueScheduler implements Scheduler { } @Override - public synchronized void push(Request request, Task task) { + protected void pushWhenNoDuplicate(Request request, Task task) { if (!inited.get()) { init(task); } - logger.debug("push to queue {}" , request.getUrl()); - if (urls.add(request.getUrl())) { - queue.add(request); - fileUrlWriter.println(request.getUrl()); - } - + queue.add(request); + fileUrlWriter.println(request.getUrl()); } @Override From 37666a7151f64a8446b1b4a4a563d59a65bdbcd8 Mon Sep 17 00:00:00 2001 From: friddle Date: Mon, 7 Apr 2014 23:04:24 +0800 Subject: [PATCH 057/130] update the script --- webmagic-scripts/README.md | 0 webmagic-scripts/deploy.sh | 0 webmagic-scripts/pom.xml | 4 ++ .../codecraft/webmagic/scripts/Language.java | 4 +- .../webmagic/scripts/ScriptConsole.java | 0 .../webmagic/scripts/ScriptEnginePool.java | 0 .../webmagic/scripts/ScriptProcessor.java | 48 +++++++++++++------ .../scripts/ScriptProcessorBuilder.java | 0 .../src/main/resources/js/defines.js | 0 .../src/main/resources/js/github.js | 0 .../src/main/resources/js/oschina.js | 0 webmagic-scripts/src/main/resources/log4j.xml | 0 .../src/main/resources/python/defines.py | 13 +++++ .../src/main/resources/python/oschina.py | 4 ++ .../src/main/resources/ruby/defines.rb | 0 .../src/main/resources/ruby/github.rb | 0 .../src/main/resources/ruby/oschina.rb | 0 .../webmagic/scripts/ScriptProcessorTest.java | 8 ++++ webmagic-scripts/src/test/resouces/log4j.xml | 0 19 files changed, 66 insertions(+), 15 deletions(-) mode change 100644 => 100755 webmagic-scripts/README.md mode change 100644 => 100755 webmagic-scripts/deploy.sh mode change 100644 => 100755 webmagic-scripts/pom.xml mode change 100644 => 100755 
webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/Language.java mode change 100644 => 100755 webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptConsole.java mode change 100644 => 100755 webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptEnginePool.java mode change 100644 => 100755 webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessor.java mode change 100644 => 100755 webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessorBuilder.java mode change 100644 => 100755 webmagic-scripts/src/main/resources/js/defines.js mode change 100644 => 100755 webmagic-scripts/src/main/resources/js/github.js mode change 100644 => 100755 webmagic-scripts/src/main/resources/js/oschina.js mode change 100644 => 100755 webmagic-scripts/src/main/resources/log4j.xml create mode 100755 webmagic-scripts/src/main/resources/python/defines.py create mode 100755 webmagic-scripts/src/main/resources/python/oschina.py mode change 100644 => 100755 webmagic-scripts/src/main/resources/ruby/defines.rb mode change 100644 => 100755 webmagic-scripts/src/main/resources/ruby/github.rb mode change 100644 => 100755 webmagic-scripts/src/main/resources/ruby/oschina.rb mode change 100644 => 100755 webmagic-scripts/src/test/java/us/codecraft/webmagic/scripts/ScriptProcessorTest.java mode change 100644 => 100755 webmagic-scripts/src/test/resouces/log4j.xml diff --git a/webmagic-scripts/README.md b/webmagic-scripts/README.md old mode 100644 new mode 100755 diff --git a/webmagic-scripts/deploy.sh b/webmagic-scripts/deploy.sh old mode 100644 new mode 100755 diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml old mode 100644 new mode 100755 index 5c21160..41c79ea --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -16,6 +16,10 @@ jruby 1.7.6 + org.python + jython + 2.5.3 + commons-cli commons-cli diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/Language.java 
b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/Language.java old mode 100644 new mode 100755 index c7ddcda..2f9d22d --- a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/Language.java +++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/Language.java @@ -7,7 +7,9 @@ public enum Language { JavaScript("javascript","js/defines.js",""), - JRuby("jruby","ruby/defines.rb",""); + JRuby("jruby","ruby/defines.rb",""), + + Jython("jython","python/defines.py",""); private String engineName; diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptConsole.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptConsole.java old mode 100644 new mode 100755 diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptEnginePool.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptEnginePool.java old mode 100644 new mode 100755 diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessor.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessor.java old mode 100644 new mode 100755 index 5801851..0214e8a --- a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessor.java +++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessor.java @@ -1,6 +1,9 @@ package us.codecraft.webmagic.scripts; import org.apache.commons.io.IOUtils; +import org.jruby.RubyHash; +import org.python.core.PyDictionary; +import sun.org.mozilla.javascript.internal.NativeObject; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.processor.PageProcessor; @@ -10,6 +13,8 @@ import javax.script.ScriptEngine; import javax.script.ScriptException; import java.io.IOException; import java.io.InputStream; +import java.util.Iterator; +import java.util.Map; /** * @author code4crafter@gmail.com @@ -50,20 +55,34 @@ public class ScriptProcessor implements 
PageProcessor { context.setAttribute("page", page, ScriptContext.ENGINE_SCOPE); context.setAttribute("config", site, ScriptContext.ENGINE_SCOPE); try { - engine.eval(defines + "\n" + script, context); -// switch (language) { -// case JavaScript: -// NativeObject o = (NativeObject) engine.get("result"); -// if (o != null) { -// for (Map.Entry objectObjectEntry : o.entrySet()) { -// page.getResultItems().put(objectObjectEntry.getKey().toString(), objectObjectEntry.getValue()); -// } -// } -// break; -// case JRuby: -// Object o1 = engine.get("result"); -// break; -// } + switch (language) { + case JavaScript: + engine.eval(defines + "\n" + script, context); + NativeObject o = (NativeObject) engine.get("result"); + if (o != null) { + for (Map.Entry objectObjectEntry : o.entrySet()) { + page.getResultItems().put(objectObjectEntry.getKey().toString(), objectObjectEntry.getValue()); + } + } + break; + case JRuby: + RubyHash oRuby=(RubyHash)engine.eval(defines+"\n"+script,context); + Iterator itruby = oRuby.entrySet().iterator(); + while (itruby.hasNext()) { + Map.Entry pairs = (Map.Entry)itruby.next(); + page.getResultItems().put(pairs.getKey().toString(),pairs.getValue()); + } + break; + case Jython: + engine.eval(defines + "\n" + script, context); + PyDictionary oJython=(PyDictionary)engine.get("result"); + Iterator it = oJython.entrySet().iterator(); + while (it.hasNext()) { + Map.Entry pairs = (Map.Entry)it.next(); + page.getResultItems().put(pairs.getKey().toString(),pairs.getValue()); + } + break; + } } catch (ScriptException e) { e.printStackTrace(); } @@ -72,6 +91,7 @@ public class ScriptProcessor implements PageProcessor { } } + @Override public Site getSite() { return site; diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessorBuilder.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessorBuilder.java old mode 100644 new mode 100755 diff --git a/webmagic-scripts/src/main/resources/js/defines.js 
b/webmagic-scripts/src/main/resources/js/defines.js old mode 100644 new mode 100755 diff --git a/webmagic-scripts/src/main/resources/js/github.js b/webmagic-scripts/src/main/resources/js/github.js old mode 100644 new mode 100755 diff --git a/webmagic-scripts/src/main/resources/js/oschina.js b/webmagic-scripts/src/main/resources/js/oschina.js old mode 100644 new mode 100755 diff --git a/webmagic-scripts/src/main/resources/log4j.xml b/webmagic-scripts/src/main/resources/log4j.xml old mode 100644 new mode 100755 diff --git a/webmagic-scripts/src/main/resources/python/defines.py b/webmagic-scripts/src/main/resources/python/defines.py new file mode 100755 index 0000000..913a4b4 --- /dev/null +++ b/webmagic-scripts/src/main/resources/python/defines.py @@ -0,0 +1,13 @@ +def xpath(str): + return page.getHtml().xpath(str).toString() + +def css(str): + return page.getHtml().css(str).toString() + +def urls(str): + links=page.getHtml().links().regex(str).all() + page.addTargetRequests(links); + +def tomap(key,value): + return "hello world" + diff --git a/webmagic-scripts/src/main/resources/python/oschina.py b/webmagic-scripts/src/main/resources/python/oschina.py new file mode 100755 index 0000000..51a188b --- /dev/null +++ b/webmagic-scripts/src/main/resources/python/oschina.py @@ -0,0 +1,4 @@ +title=xpath("div[@class=BlogTitle]") +urls="http://my\\.oschina\\.net/flashsword/blog/\\d+" + +result={"title":title,"urls":urls} diff --git a/webmagic-scripts/src/main/resources/ruby/defines.rb b/webmagic-scripts/src/main/resources/ruby/defines.rb old mode 100644 new mode 100755 diff --git a/webmagic-scripts/src/main/resources/ruby/github.rb b/webmagic-scripts/src/main/resources/ruby/github.rb old mode 100644 new mode 100755 diff --git a/webmagic-scripts/src/main/resources/ruby/oschina.rb b/webmagic-scripts/src/main/resources/ruby/oschina.rb old mode 100644 new mode 100755 diff --git a/webmagic-scripts/src/test/java/us/codecraft/webmagic/scripts/ScriptProcessorTest.java 
b/webmagic-scripts/src/test/java/us/codecraft/webmagic/scripts/ScriptProcessorTest.java old mode 100644 new mode 100755 index ec3f674..23fe093 --- a/webmagic-scripts/src/test/java/us/codecraft/webmagic/scripts/ScriptProcessorTest.java +++ b/webmagic-scripts/src/test/java/us/codecraft/webmagic/scripts/ScriptProcessorTest.java @@ -22,4 +22,12 @@ public class ScriptProcessorTest { pageProcessor.getSite().setSleepTime(0); Spider.create(pageProcessor).addUrl("http://my.oschina.net/flashsword/blog").setSpawnUrl(false).run(); } + + + @Test + public void testPythonProcessor() { + ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().language(Language.Jython).scriptFromClassPathFile("python/oschina.py").build(); + pageProcessor.getSite().setSleepTime(0); + Spider.create(pageProcessor).addUrl("http://my.oschina.net/flashsword/blog").setSpawnUrl(false).run(); + } } diff --git a/webmagic-scripts/src/test/resouces/log4j.xml b/webmagic-scripts/src/test/resouces/log4j.xml old mode 100644 new mode 100755 From 933800147b9109141eef107c412f8b8bd263829d Mon Sep 17 00:00:00 2001 From: friddle Date: Mon, 7 Apr 2014 23:18:00 +0800 Subject: [PATCH 058/130] update ruby --- webmagic-scripts/src/main/resources/ruby/oschina.rb | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) mode change 100755 => 100644 webmagic-scripts/src/main/resources/ruby/oschina.rb diff --git a/webmagic-scripts/src/main/resources/ruby/oschina.rb b/webmagic-scripts/src/main/resources/ruby/oschina.rb old mode 100755 new mode 100644 index cbced0b..dbea13b --- a/webmagic-scripts/src/main/resources/ruby/oschina.rb +++ b/webmagic-scripts/src/main/resources/ruby/oschina.rb @@ -1,3 +1,6 @@ +urls "http://my\\.oschina\\.net/flashsword/blog/\\d+" title = css "div.BlogTitle h1" content = css "div.BlogContent" -urls "http://my\\.oschina\\.net/flashsword/blog/\\d+" \ No newline at end of file + +return {"title"=>title,"content"=>content} + From c2873928c855869b6f05f270d5228909bf111b96 Mon Sep 17 00:00:00 2001 From: 
"yihua.huang" Date: Wed, 9 Apr 2014 09:54:01 +0800 Subject: [PATCH 059/130] [prototype] extractrule --- asserts/page-extract-rule.bmml | 9 +++++++++ .../main/webapp/WEB-INF/pages/create_spider.ftl | 14 ++++++++++++++ 2 files changed, 23 insertions(+) create mode 100644 asserts/page-extract-rule.bmml diff --git a/asserts/page-extract-rule.bmml b/asserts/page-extract-rule.bmml new file mode 100644 index 0000000..fec8d3e --- /dev/null +++ b/asserts/page-extract-rule.bmml @@ -0,0 +1,9 @@ + + + + + A%20Web%20Page%0Ahttp%3A// + + + + \ No newline at end of file diff --git a/webmagic-avalon/webmagic-admin/src/main/webapp/WEB-INF/pages/create_spider.ftl b/webmagic-avalon/webmagic-admin/src/main/webapp/WEB-INF/pages/create_spider.ftl index e69de29..4cd838c 100644 --- a/webmagic-avalon/webmagic-admin/src/main/webapp/WEB-INF/pages/create_spider.ftl +++ b/webmagic-avalon/webmagic-admin/src/main/webapp/WEB-INF/pages/create_spider.ftl @@ -0,0 +1,14 @@ + + + + + +
+ +
+ +
+ +
+ + \ No newline at end of file From 9f5a6494a00c962232750cd0d9aeb04dce2bdf67 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 9 Apr 2014 10:44:52 +0800 Subject: [PATCH 060/130] add support for JDK6 #93 --- .../webmagic/scripts/ScriptProcessor.java | 55 ++++++++++--------- .../src/main/resources/js/oschina.js | 1 + 2 files changed, 29 insertions(+), 27 deletions(-) diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessor.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessor.java index 0214e8a..946d15e 100755 --- a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessor.java +++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessor.java @@ -55,34 +55,35 @@ public class ScriptProcessor implements PageProcessor { context.setAttribute("page", page, ScriptContext.ENGINE_SCOPE); context.setAttribute("config", site, ScriptContext.ENGINE_SCOPE); try { - switch (language) { - case JavaScript: - engine.eval(defines + "\n" + script, context); - NativeObject o = (NativeObject) engine.get("result"); - if (o != null) { - for (Map.Entry objectObjectEntry : o.entrySet()) { - page.getResultItems().put(objectObjectEntry.getKey().toString(), objectObjectEntry.getValue()); + switch (language) { + case JavaScript: + engine.eval(defines + "\n" + script, context); + NativeObject o = (NativeObject) engine.get("result"); + if (o != null) { + for (Object o1 : o.getIds()) { + String key = String.valueOf(o1); + page.getResultItems().put(key, NativeObject.getProperty(o, key)); + } } - } - break; - case JRuby: - RubyHash oRuby=(RubyHash)engine.eval(defines+"\n"+script,context); - Iterator itruby = oRuby.entrySet().iterator(); - while (itruby.hasNext()) { - Map.Entry pairs = (Map.Entry)itruby.next(); - page.getResultItems().put(pairs.getKey().toString(),pairs.getValue()); - } - break; - case Jython: - engine.eval(defines + "\n" + script, context); - PyDictionary 
oJython=(PyDictionary)engine.get("result"); - Iterator it = oJython.entrySet().iterator(); - while (it.hasNext()) { - Map.Entry pairs = (Map.Entry)it.next(); - page.getResultItems().put(pairs.getKey().toString(),pairs.getValue()); - } - break; - } + break; + case JRuby: + RubyHash oRuby = (RubyHash) engine.eval(defines + "\n" + script, context); + Iterator itruby = oRuby.entrySet().iterator(); + while (itruby.hasNext()) { + Map.Entry pairs = (Map.Entry) itruby.next(); + page.getResultItems().put(pairs.getKey().toString(), pairs.getValue()); + } + break; + case Jython: + engine.eval(defines + "\n" + script, context); + PyDictionary oJython = (PyDictionary) engine.get("result"); + Iterator it = oJython.entrySet().iterator(); + while (it.hasNext()) { + Map.Entry pairs = (Map.Entry) it.next(); + page.getResultItems().put(pairs.getKey().toString(), pairs.getValue()); + } + break; + } } catch (ScriptException e) { e.printStackTrace(); } diff --git a/webmagic-scripts/src/main/resources/js/oschina.js b/webmagic-scripts/src/main/resources/js/oschina.js index 305682e..02191c3 100755 --- a/webmagic-scripts/src/main/resources/js/oschina.js +++ b/webmagic-scripts/src/main/resources/js/oschina.js @@ -9,3 +9,4 @@ var config = { title = $("div.BlogTitle h1"), content = $("div.BlogContent") urls("http://my\\.oschina\\.net/flashsword/blog/\\d+") +config; From 02b441ad38e11cc8e23ca68010a0437f71f70f7d Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 9 Apr 2014 15:40:33 +0800 Subject: [PATCH 061/130] disable NativeObject in Rhino because it is a hotspot internal api and compile error in OpenJDK #93 --- .../webmagic/scripts/ScriptProcessor.java | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessor.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessor.java index 946d15e..1822318 100755 --- 
a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessor.java +++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessor.java @@ -3,7 +3,6 @@ package us.codecraft.webmagic.scripts; import org.apache.commons.io.IOUtils; import org.jruby.RubyHash; import org.python.core.PyDictionary; -import sun.org.mozilla.javascript.internal.NativeObject; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.processor.PageProcessor; @@ -58,13 +57,13 @@ public class ScriptProcessor implements PageProcessor { switch (language) { case JavaScript: engine.eval(defines + "\n" + script, context); - NativeObject o = (NativeObject) engine.get("result"); - if (o != null) { - for (Object o1 : o.getIds()) { - String key = String.valueOf(o1); - page.getResultItems().put(key, NativeObject.getProperty(o, key)); - } - } +// NativeObject o = (NativeObject) engine.get("result"); +// if (o != null) { +// for (Object o1 : o.getIds()) { +// String key = String.valueOf(o1); +// page.getResultItems().put(key, NativeObject.getProperty(o, key)); +// } +// } break; case JRuby: RubyHash oRuby = (RubyHash) engine.eval(defines + "\n" + script, context); From 3669e73e4a6fe0d56010f76eb63f00281158188d Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 9 Apr 2014 16:43:22 +0800 Subject: [PATCH 062/130] update News163: use Xsoup 0.2.0 syntax instead of ComboExtract --- .../us/codecraft/webmagic/model/samples/News163.java | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java index e9dfb26..45bee2f 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java @@ -3,7 +3,6 @@ package us.codecraft.webmagic.model.samples; import 
us.codecraft.webmagic.MultiPageModel; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.model.OOSpider; -import us.codecraft.webmagic.model.annotation.ComboExtract; import us.codecraft.webmagic.model.annotation.ExtractBy; import us.codecraft.webmagic.model.annotation.ExtractByUrl; import us.codecraft.webmagic.model.annotation.TargetUrl; @@ -26,9 +25,8 @@ public class News163 implements MultiPageModel { @ExtractByUrl(value = "http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html", notNull = false) private String page; - @ComboExtract(value = {@ExtractBy("//div[@class=\"ep-pages\"]//a/@href"), - @ExtractBy(value = "http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html", type = ExtractBy.Type.Regex)}, - multi = true, notNull = false) + @ExtractBy(value = "//div[@class=\"ep-pages\"]//a/regex('http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html',1)" + , multi = true, notNull = false) private List otherPage; @ExtractBy("//h1[@id=\"h1title\"]/text()") @@ -74,8 +72,8 @@ public class News163 implements MultiPageModel { } public static void main(String[] args) { - OOSpider.create(Site.me().addStartUrl("http://news.163.com/13/0802/05/958I1E330001124J_2.html"), News163.class) - .scheduler(new RedisScheduler("localhost")).clearPipeline().pipeline(new MultiPagePipeline()).pipeline(new ConsolePipeline()).run(); + OOSpider.create(Site.me(), News163.class).addUrl("http://news.163.com/13/0802/05/958I1E330001124J_2.html") + .scheduler(new RedisScheduler("localhost")).addPipeline(new MultiPagePipeline()).addPipeline(new ConsolePipeline()).run(); } } From db65dfafb831bb491f7a9cf9db21979cbd913471 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 9 Apr 2014 23:32:07 +0800 Subject: [PATCH 063/130] add baidunews sample --- .../webmagic/model/samples/BaiduNews.java | 43 +++++++++++++++++++ 1 file changed, 43 insertions(+) create mode 100644 webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/BaiduNews.java diff --git 
a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/BaiduNews.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/BaiduNews.java new file mode 100644 index 0000000..4795662 --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/BaiduNews.java @@ -0,0 +1,43 @@ +package us.codecraft.webmagic.model.samples; + +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.model.OOSpider; +import us.codecraft.webmagic.model.annotation.ExtractBy; + +/** + * @author code4crafter@gmail.com + * @date 14-4-9 + */ +public class BaiduNews { + + @ExtractBy("//h3[@class='c-title']/a/text()") + private String name; + + @ExtractBy("//div[@class='c-summary']/text()") + private String description; + + @Override + public String toString() { + return "BaiduNews{" + + "name='" + name + '\'' + + ", description='" + description + '\'' + + '}'; + } + + public static void main(String[] args) { + OOSpider ooSpider = OOSpider.create(Site.me().setSleepTime(0), BaiduNews.class); + //single download + BaiduNews baike = ooSpider.get("http://news.baidu.com/ns?tn=news&cl=2&rn=20&ct=1&fr=bks0000&ie=utf-8&word=httpclient"); + System.out.println(baike); + + ooSpider.close(); + } + + public String getName() { + return name; + } + + public String getDescription() { + return description; + } +} \ No newline at end of file From 2b023c95c2d0ccd16614914405e925363a77e93f Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 11 Apr 2014 11:43:04 +0800 Subject: [PATCH 064/130] qqmeishi demo --- .../webmagic/model/samples/QQMeishi.java | 27 +++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/QQMeishi.java diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/QQMeishi.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/QQMeishi.java new file mode 100644 index 0000000..f4f8591 --- /dev/null +++ 
b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/QQMeishi.java @@ -0,0 +1,27 @@ +package us.codecraft.webmagic.model.samples; + +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.model.ConsolePageModelPipeline; +import us.codecraft.webmagic.model.OOSpider; +import us.codecraft.webmagic.model.annotation.ExtractBy; +import us.codecraft.webmagic.model.annotation.TargetUrl; + +/** + * @author code4crafter@gmail.com + * @date 14-4-11 + */ +@TargetUrl("http://meishi.qq.com/beijing/c/all[\\-p2]*") +@ExtractBy(value = "//ul[@id=\"promos_list2\"]/li",multi = true) +public class QQMeishi { + + @ExtractBy("//div[@class=info]/a[@class=title]/h4/text()") + private String shopName; + + @ExtractBy("//div[@class=info]/a[@class=title]/text()") + private String promo; + + public static void main(String[] args) { + OOSpider.create(Site.me(), new ConsolePageModelPipeline(), QQMeishi.class).addUrl("http://meishi.qq.com/beijing/c/all").thread(4).run(); + } + +} From 094f9d1552b2439b326387f44e0b959852a0e2d1 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 12 Apr 2014 13:42:32 +0800 Subject: [PATCH 065/130] rename assets for spell mistake --- {asserts => assets}/data.plist | 0 {asserts => assets}/image1.pdf | Bin {asserts => assets}/logo-simple.jpg | Bin {asserts => assets}/logo.graffle | 0 {asserts => assets}/logo.jpg | Bin {asserts => assets}/logo2.graffle/data.plist | 0 {asserts => assets}/logo2.graffle/image1.tiff | Bin {asserts => assets}/logo3.graffle/data.plist | 0 {asserts => assets}/logo3.graffle/image1.tiff | Bin {asserts => assets}/logo3.graffle/image2.tiff | Bin {asserts => assets}/logo3.graffle/image4.tiff | Bin {asserts => assets}/logo3.graffle/image5.tiff | Bin {asserts => assets}/logo3.png | Bin {asserts => assets}/logo4.png | Bin {asserts => assets}/page-extract-rule.bmml | 0 {asserts => assets}/webmagic-create-spider.bmml | 0 {asserts => assets}/webmagic-create-spider.png | Bin {asserts => assets}/webmagic-spider-manage.bmml | 0 
{asserts => assets}/webmagic-spider-manage.png | Bin {asserts => assets}/webmagic.psd | Bin 20 files changed, 0 insertions(+), 0 deletions(-) rename {asserts => assets}/data.plist (100%) rename {asserts => assets}/image1.pdf (100%) rename {asserts => assets}/logo-simple.jpg (100%) rename {asserts => assets}/logo.graffle (100%) rename {asserts => assets}/logo.jpg (100%) rename {asserts => assets}/logo2.graffle/data.plist (100%) rename {asserts => assets}/logo2.graffle/image1.tiff (100%) rename {asserts => assets}/logo3.graffle/data.plist (100%) rename {asserts => assets}/logo3.graffle/image1.tiff (100%) rename {asserts => assets}/logo3.graffle/image2.tiff (100%) rename {asserts => assets}/logo3.graffle/image4.tiff (100%) rename {asserts => assets}/logo3.graffle/image5.tiff (100%) rename {asserts => assets}/logo3.png (100%) rename {asserts => assets}/logo4.png (100%) rename {asserts => assets}/page-extract-rule.bmml (100%) rename {asserts => assets}/webmagic-create-spider.bmml (100%) rename {asserts => assets}/webmagic-create-spider.png (100%) rename {asserts => assets}/webmagic-spider-manage.bmml (100%) rename {asserts => assets}/webmagic-spider-manage.png (100%) rename {asserts => assets}/webmagic.psd (100%) diff --git a/asserts/data.plist b/assets/data.plist similarity index 100% rename from asserts/data.plist rename to assets/data.plist diff --git a/asserts/image1.pdf b/assets/image1.pdf similarity index 100% rename from asserts/image1.pdf rename to assets/image1.pdf diff --git a/asserts/logo-simple.jpg b/assets/logo-simple.jpg similarity index 100% rename from asserts/logo-simple.jpg rename to assets/logo-simple.jpg diff --git a/asserts/logo.graffle b/assets/logo.graffle similarity index 100% rename from asserts/logo.graffle rename to assets/logo.graffle diff --git a/asserts/logo.jpg b/assets/logo.jpg similarity index 100% rename from asserts/logo.jpg rename to assets/logo.jpg diff --git a/asserts/logo2.graffle/data.plist b/assets/logo2.graffle/data.plist 
similarity index 100% rename from asserts/logo2.graffle/data.plist rename to assets/logo2.graffle/data.plist diff --git a/asserts/logo2.graffle/image1.tiff b/assets/logo2.graffle/image1.tiff similarity index 100% rename from asserts/logo2.graffle/image1.tiff rename to assets/logo2.graffle/image1.tiff diff --git a/asserts/logo3.graffle/data.plist b/assets/logo3.graffle/data.plist similarity index 100% rename from asserts/logo3.graffle/data.plist rename to assets/logo3.graffle/data.plist diff --git a/asserts/logo3.graffle/image1.tiff b/assets/logo3.graffle/image1.tiff similarity index 100% rename from asserts/logo3.graffle/image1.tiff rename to assets/logo3.graffle/image1.tiff diff --git a/asserts/logo3.graffle/image2.tiff b/assets/logo3.graffle/image2.tiff similarity index 100% rename from asserts/logo3.graffle/image2.tiff rename to assets/logo3.graffle/image2.tiff diff --git a/asserts/logo3.graffle/image4.tiff b/assets/logo3.graffle/image4.tiff similarity index 100% rename from asserts/logo3.graffle/image4.tiff rename to assets/logo3.graffle/image4.tiff diff --git a/asserts/logo3.graffle/image5.tiff b/assets/logo3.graffle/image5.tiff similarity index 100% rename from asserts/logo3.graffle/image5.tiff rename to assets/logo3.graffle/image5.tiff diff --git a/asserts/logo3.png b/assets/logo3.png similarity index 100% rename from asserts/logo3.png rename to assets/logo3.png diff --git a/asserts/logo4.png b/assets/logo4.png similarity index 100% rename from asserts/logo4.png rename to assets/logo4.png diff --git a/asserts/page-extract-rule.bmml b/assets/page-extract-rule.bmml similarity index 100% rename from asserts/page-extract-rule.bmml rename to assets/page-extract-rule.bmml diff --git a/asserts/webmagic-create-spider.bmml b/assets/webmagic-create-spider.bmml similarity index 100% rename from asserts/webmagic-create-spider.bmml rename to assets/webmagic-create-spider.bmml diff --git a/asserts/webmagic-create-spider.png b/assets/webmagic-create-spider.png similarity 
index 100% rename from asserts/webmagic-create-spider.png rename to assets/webmagic-create-spider.png diff --git a/asserts/webmagic-spider-manage.bmml b/assets/webmagic-spider-manage.bmml similarity index 100% rename from asserts/webmagic-spider-manage.bmml rename to assets/webmagic-spider-manage.bmml diff --git a/asserts/webmagic-spider-manage.png b/assets/webmagic-spider-manage.png similarity index 100% rename from asserts/webmagic-spider-manage.png rename to assets/webmagic-spider-manage.png diff --git a/asserts/webmagic.psd b/assets/webmagic.psd similarity index 100% rename from asserts/webmagic.psd rename to assets/webmagic.psd From be37d8b2165e6514f2a3ef2dc8d5a19498d84180 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 12 Apr 2014 20:03:44 +0800 Subject: [PATCH 066/130] sinablogprocessor sample --- .../webmagic/samples/SinaBlogProcesser.java | 37 --------------- .../webmagic/samples/SinaBlogProcessor.java | 47 +++++++++++++++++++ .../processor/SinablogProcessorTest.java | 6 +-- 3 files changed, 50 insertions(+), 40 deletions(-) delete mode 100644 webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java create mode 100644 webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcessor.java diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java deleted file mode 100644 index dcb6eff..0000000 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java +++ /dev/null @@ -1,37 +0,0 @@ -package us.codecraft.webmagic.samples; - -import us.codecraft.webmagic.Page; -import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.Spider; -import us.codecraft.webmagic.processor.PageProcessor; - -/** - * @author code4crafter@gmail.com
- */ -public class SinaBlogProcesser implements PageProcessor { - - private Site site; - - @Override - public void process(Page page) { - page.addTargetRequests(page.getHtml().xpath("//div[@class='articalfrontback SG_j_linedot1 clearfix']").links().all()); - page.putField("title", page.getHtml().xpath("//div[@class='articalTitle']/h2")); - page.putField("content",page.getHtml().xpath("//div[@id='articlebody']//div[@class='articalContent']")); - page.putField("id",page.getUrl().regex("http://blog\\.sina\\.com\\.cn/s/blog_(\\w+)")); - page.putField("date",page.getHtml().xpath("//div[@id='articlebody']//span[@class='time SG_txtc']").regex("\\((.*)\\)")); -// page.putField("tags",page.getHtml().xpath("//td[@class='blog_tag']/h3/a")); - } - - @Override - public Site getSite() { - if (site==null){ - site = Site.me().setDomain("blog.sina.com.cn").addStartUrl("http://blog.sina.com.cn/s/blog_4701280b0102egl0.html").setSleepTime(3000). - setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); - } - return site; - } - - public static void main(String[] args) { - Spider.create(new SinaBlogProcesser()).run(); - } -} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcessor.java new file mode 100644 index 0000000..01094aa --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcessor.java @@ -0,0 +1,47 @@ +package us.codecraft.webmagic.samples; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.processor.PageProcessor; + +/** + * @author code4crafter@gmail.com
+ */ +public class SinaBlogProcessor implements PageProcessor { + + public static final String URL_LIST = "http://blog\\.sina\\.com\\.cn/s/articlelist_1487828712_0_\\d+\\.html"; + + public static final String URL_POST = "http://blog\\.sina\\.com\\.cn/s/blog_\\w+\\.html"; + + private Site site = Site + .me() + .setDomain("blog.sina.com.cn") + .setSleepTime(3000) + .setUserAgent( + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); + + @Override + public void process(Page page) { + if (page.getUrl().regex(URL_LIST).match()) { + page.addTargetRequests(page.getHtml().xpath("//div[@class=\"articleList\"]").links().regex(URL_POST).all()); + page.addTargetRequests(page.getHtml().links().regex(URL_LIST).all()); + } else { + page.putField("title", page.getHtml().xpath("//div[@class='articalTitle']/h2")); + page.putField("content", page.getHtml().xpath("//div[@id='articlebody']//div[@class='articalContent']")); + page.putField("id", page.getUrl().regex("http://blog\\.sina\\.com\\.cn/s/blog_(\\w+)")); + page.putField("date", + page.getHtml().xpath("//div[@id='articlebody']//span[@class='time SG_txtc']").regex("\\((.*)\\)")); + } + } + + @Override + public Site getSite() { + return site; + } + + public static void main(String[] args) { + Spider.create(new SinaBlogProcessor()).addUrl("http://blog.sina.com.cn/s/articlelist_1487828712_0_1.html") + .run(); + } +} diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/SinablogProcessorTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/SinablogProcessorTest.java index 026f8d5..d7cd5d5 100644 --- a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/SinablogProcessorTest.java +++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/SinablogProcessorTest.java @@ -5,7 +5,7 @@ import org.junit.Test; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.pipeline.FilePipeline; import 
us.codecraft.webmagic.pipeline.JsonFilePipeline; -import us.codecraft.webmagic.samples.SinaBlogProcesser; +import us.codecraft.webmagic.samples.SinaBlogProcessor; import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler; import java.io.IOException; @@ -20,7 +20,7 @@ public class SinablogProcessorTest { @Ignore @Test public void test() throws IOException { - SinaBlogProcesser sinaBlogProcesser = new SinaBlogProcesser(); + SinaBlogProcessor sinaBlogProcessor = new SinaBlogProcessor(); //pipeline是抓取结束后的处理 //默认放到/data/webmagic/ftl/[domain]目录下 JsonFilePipeline pipeline = new JsonFilePipeline("/data/webmagic/"); @@ -29,7 +29,7 @@ public class SinablogProcessorTest { //ConsolePipeline输出结果到控制台 //FileCacheQueueSchedular保存url,支持断点续传,临时文件输出到/data/temp/webmagic/cache目录 //Spider.run()执行 - Spider.create(sinaBlogProcesser).pipeline(new FilePipeline()).pipeline(pipeline).scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")). + Spider.create(sinaBlogProcessor).pipeline(new FilePipeline()).pipeline(pipeline).scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")). 
run(); } } From 843e928c2c81ed5385f2f281a829a71470e2b983 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 12 Apr 2014 20:10:24 +0800 Subject: [PATCH 067/130] comments on sinablogprocessor sample --- .../java/us/codecraft/webmagic/samples/SinaBlogProcessor.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcessor.java index 01094aa..2872e02 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcessor.java @@ -23,13 +23,14 @@ public class SinaBlogProcessor implements PageProcessor { @Override public void process(Page page) { + //列表页 if (page.getUrl().regex(URL_LIST).match()) { page.addTargetRequests(page.getHtml().xpath("//div[@class=\"articleList\"]").links().regex(URL_POST).all()); page.addTargetRequests(page.getHtml().links().regex(URL_LIST).all()); + //文章页 } else { page.putField("title", page.getHtml().xpath("//div[@class='articalTitle']/h2")); page.putField("content", page.getHtml().xpath("//div[@id='articlebody']//div[@class='articalContent']")); - page.putField("id", page.getUrl().regex("http://blog\\.sina\\.com\\.cn/s/blog_(\\w+)")); page.putField("date", page.getHtml().xpath("//div[@id='articlebody']//span[@class='time SG_txtc']").regex("\\((.*)\\)")); } From 5b254e446b0a616bc91f5d9526fc83d0d2bc54cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=84=A4=E6=80=92=E7=9A=84=E7=95=AA=E8=8C=84?= Date: Sat, 12 Apr 2014 22:08:53 +0800 Subject: [PATCH 068/130] =?UTF-8?q?=E6=9B=B4=E6=96=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../us/codecraft/webmagic/downloader/selenium/WebDriverPool.java | 1 + 1 file changed, 1 insertion(+) diff --git 
a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java index 98b93a9..f628ede 100644 --- a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java +++ b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java @@ -87,4 +87,5 @@ class WebDriverPool { webDriver.quit(); } } + } From 610ac42c070ea639ea5bad574916b90963232fac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=84=A4=E6=80=92=E7=9A=84=E7=95=AA=E8=8C=84?= Date: Sat, 12 Apr 2014 22:22:07 +0800 Subject: [PATCH 069/130] =?UTF-8?q?=E6=9B=B4=E6=96=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../webmagic/samples/SinaBlogProcesser.java | 37 -------------- .../webmagic/samples/SinaBlogProcessor.java | 48 +++++++++++++++++++ 2 files changed, 48 insertions(+), 37 deletions(-) delete mode 100644 webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java create mode 100644 webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcessor.java diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java deleted file mode 100644 index dcb6eff..0000000 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java +++ /dev/null @@ -1,37 +0,0 @@ -package us.codecraft.webmagic.samples; - -import us.codecraft.webmagic.Page; -import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.Spider; -import us.codecraft.webmagic.processor.PageProcessor; - -/** - * @author code4crafter@gmail.com
- */ -public class SinaBlogProcesser implements PageProcessor { - - private Site site; - - @Override - public void process(Page page) { - page.addTargetRequests(page.getHtml().xpath("//div[@class='articalfrontback SG_j_linedot1 clearfix']").links().all()); - page.putField("title", page.getHtml().xpath("//div[@class='articalTitle']/h2")); - page.putField("content",page.getHtml().xpath("//div[@id='articlebody']//div[@class='articalContent']")); - page.putField("id",page.getUrl().regex("http://blog\\.sina\\.com\\.cn/s/blog_(\\w+)")); - page.putField("date",page.getHtml().xpath("//div[@id='articlebody']//span[@class='time SG_txtc']").regex("\\((.*)\\)")); -// page.putField("tags",page.getHtml().xpath("//td[@class='blog_tag']/h3/a")); - } - - @Override - public Site getSite() { - if (site==null){ - site = Site.me().setDomain("blog.sina.com.cn").addStartUrl("http://blog.sina.com.cn/s/blog_4701280b0102egl0.html").setSleepTime(3000). - setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); - } - return site; - } - - public static void main(String[] args) { - Spider.create(new SinaBlogProcesser()).run(); - } -} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcessor.java new file mode 100644 index 0000000..2872e02 --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcessor.java @@ -0,0 +1,48 @@ +package us.codecraft.webmagic.samples; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.processor.PageProcessor; + +/** + * @author code4crafter@gmail.com
+ */ +public class SinaBlogProcessor implements PageProcessor { + + public static final String URL_LIST = "http://blog\\.sina\\.com\\.cn/s/articlelist_1487828712_0_\\d+\\.html"; + + public static final String URL_POST = "http://blog\\.sina\\.com\\.cn/s/blog_\\w+\\.html"; + + private Site site = Site + .me() + .setDomain("blog.sina.com.cn") + .setSleepTime(3000) + .setUserAgent( + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); + + @Override + public void process(Page page) { + //列表页 + if (page.getUrl().regex(URL_LIST).match()) { + page.addTargetRequests(page.getHtml().xpath("//div[@class=\"articleList\"]").links().regex(URL_POST).all()); + page.addTargetRequests(page.getHtml().links().regex(URL_LIST).all()); + //文章页 + } else { + page.putField("title", page.getHtml().xpath("//div[@class='articalTitle']/h2")); + page.putField("content", page.getHtml().xpath("//div[@id='articlebody']//div[@class='articalContent']")); + page.putField("date", + page.getHtml().xpath("//div[@id='articlebody']//span[@class='time SG_txtc']").regex("\\((.*)\\)")); + } + } + + @Override + public Site getSite() { + return site; + } + + public static void main(String[] args) { + Spider.create(new SinaBlogProcessor()).addUrl("http://blog.sina.com.cn/s/articlelist_1487828712_0_1.html") + .run(); + } +} From 644e8d1f72c08c83348e5c31a42f0f0dfa32f07d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=84=A4=E6=80=92=E7=9A=84=E7=95=AA=E8=8C=84?= Date: Sat, 12 Apr 2014 22:32:22 +0800 Subject: [PATCH 070/130] =?UTF-8?q?=E5=90=8C=E6=AD=A5=E5=AE=98=E6=96=B9?= =?UTF-8?q?=E6=BA=90=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 5 +- {asserts => assets}/data.plist | 0 {asserts => assets}/image1.pdf | Bin {asserts => assets}/logo-simple.jpg | Bin {asserts => assets}/logo.graffle | 0 {asserts => assets}/logo.jpg | Bin {asserts => assets}/logo2.graffle/data.plist | 0 
{asserts => assets}/logo2.graffle/image1.tiff | Bin {asserts => assets}/logo3.graffle/data.plist | 0 {asserts => assets}/logo3.graffle/image1.tiff | Bin {asserts => assets}/logo3.graffle/image2.tiff | Bin {asserts => assets}/logo3.graffle/image4.tiff | Bin {asserts => assets}/logo3.graffle/image5.tiff | Bin {asserts => assets}/logo3.png | Bin {asserts => assets}/logo4.png | Bin assets/page-extract-rule.bmml | 9 + .../webmagic-create-spider.bmml | 0 .../webmagic-create-spider.png | Bin .../webmagic-spider-manage.bmml | 0 .../webmagic-spider-manage.png | Bin {asserts => assets}/webmagic.psd | Bin pom.xml | 9 + .../webapp/WEB-INF/pages/create_spider.ftl | 14 ++ .../downloader/HttpClientDownloader.java | 2 +- .../LocalDuplicatedRemovedScheduler.java | 3 +- .../us/codecraft/webmagic/selector/Html.java | 1 + webmagic-core/src/test/resources/log4j.xml | 10 - .../ConfigurablePageProcessor.java | 49 ++++ .../webmagic/configurable/ExpressionType.java | 11 + .../webmagic/configurable/ExtractRule.java | 113 +++++++++ .../webmagic/configurable/Inject.java | 15 -- .../webmagic/configurable/PropertyLoader.java | 18 -- .../ConfigurableBlogPageProcessor.java | 51 ---- .../example/PatternProcessorDemo.java | 53 +++++ .../handler/CompositePageProcessor.java | 49 ++++ .../webmagic/handler/PatternHandler.java | 113 +++++++++ .../webmagic/handler/SubPageProcessor.java | 33 +++ .../webmagic/model/ModelPageProcessor.java | 9 +- .../webmagic/model/PageModelExtractor.java | 4 +- .../webmagic/pipeline/PatternPipeline.java | 44 ++++ .../processor/PatternPageProcessor.java | 78 +++++++ .../scheduler/FileCacheQueueScheduler.java | 14 +- .../ConfigurablePageProcessorTest.java | 39 ++++ .../model/ModelPageProcessorTest.java | 45 ++++ .../webmagic/model/samples/BaiduNews.java | 43 ++++ .../webmagic/model/samples/News163.java | 10 +- .../webmagic/model/samples/QQMeishi.java | 27 +++ .../processor/SinablogProcessorTest.java | 6 +- webmagic-scripts/README.md | 0 webmagic-scripts/deploy.sh | 0 
webmagic-scripts/pom.xml | 4 + .../codecraft/webmagic/scripts/Language.java | 4 +- .../webmagic/scripts/ScriptConsole.java | 0 .../webmagic/scripts/ScriptEnginePool.java | 0 .../webmagic/scripts/ScriptProcessor.java | 46 ++-- .../scripts/ScriptProcessorBuilder.java | 0 .../src/main/resources/js/defines.js | 0 .../src/main/resources/js/github.js | 0 .../src/main/resources/js/oschina.js | 1 + webmagic-scripts/src/main/resources/log4j.xml | 0 .../src/main/resources/python/defines.py | 13 ++ .../src/main/resources/python/oschina.py | 4 + .../src/main/resources/ruby/defines.rb | 0 .../src/main/resources/ruby/github.rb | 0 .../src/main/resources/ruby/oschina.rb | 5 +- .../webmagic/scripts/ScriptProcessorTest.java | 8 + webmagic-scripts/src/test/resouces/log4j.xml | 0 zh_docs/user-manual-new.md | 221 ++++++++++++++++-- 68 files changed, 1024 insertions(+), 159 deletions(-) rename {asserts => assets}/data.plist (100%) rename {asserts => assets}/image1.pdf (100%) rename {asserts => assets}/logo-simple.jpg (100%) rename {asserts => assets}/logo.graffle (100%) rename {asserts => assets}/logo.jpg (100%) rename {asserts => assets}/logo2.graffle/data.plist (100%) rename {asserts => assets}/logo2.graffle/image1.tiff (100%) rename {asserts => assets}/logo3.graffle/data.plist (100%) rename {asserts => assets}/logo3.graffle/image1.tiff (100%) rename {asserts => assets}/logo3.graffle/image2.tiff (100%) rename {asserts => assets}/logo3.graffle/image4.tiff (100%) rename {asserts => assets}/logo3.graffle/image5.tiff (100%) rename {asserts => assets}/logo3.png (100%) rename {asserts => assets}/logo4.png (100%) create mode 100644 assets/page-extract-rule.bmml rename {asserts => assets}/webmagic-create-spider.bmml (100%) rename {asserts => assets}/webmagic-create-spider.png (100%) rename {asserts => assets}/webmagic-spider-manage.bmml (100%) rename {asserts => assets}/webmagic-spider-manage.png (100%) rename {asserts => assets}/webmagic.psd (100%) create mode 100644 
webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ConfigurablePageProcessor.java create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExpressionType.java create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExtractRule.java delete mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/Inject.java delete mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/PropertyLoader.java delete mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/example/ConfigurableBlogPageProcessor.java create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/example/PatternProcessorDemo.java create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/handler/CompositePageProcessor.java create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/handler/PatternHandler.java create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/handler/SubPageProcessor.java create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/PatternPipeline.java create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/processor/PatternPageProcessor.java create mode 100644 webmagic-extension/src/test/java/us/codecraft/webmagic/configurable/ConfigurablePageProcessorTest.java create mode 100644 webmagic-extension/src/test/java/us/codecraft/webmagic/model/ModelPageProcessorTest.java create mode 100644 webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/BaiduNews.java create mode 100644 webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/QQMeishi.java mode change 100644 => 100755 webmagic-scripts/README.md mode change 100644 => 100755 webmagic-scripts/deploy.sh mode change 100644 => 100755 webmagic-scripts/pom.xml mode change 100644 => 100755 webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/Language.java mode change 100644 => 
100755 webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptConsole.java mode change 100644 => 100755 webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptEnginePool.java mode change 100644 => 100755 webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessor.java mode change 100644 => 100755 webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessorBuilder.java mode change 100644 => 100755 webmagic-scripts/src/main/resources/js/defines.js mode change 100644 => 100755 webmagic-scripts/src/main/resources/js/github.js mode change 100644 => 100755 webmagic-scripts/src/main/resources/js/oschina.js mode change 100644 => 100755 webmagic-scripts/src/main/resources/log4j.xml create mode 100755 webmagic-scripts/src/main/resources/python/defines.py create mode 100755 webmagic-scripts/src/main/resources/python/oschina.py mode change 100644 => 100755 webmagic-scripts/src/main/resources/ruby/defines.rb mode change 100644 => 100755 webmagic-scripts/src/main/resources/ruby/github.rb mode change 100644 => 100755 webmagic-scripts/src/test/java/us/codecraft/webmagic/scripts/ScriptProcessorTest.java mode change 100644 => 100755 webmagic-scripts/src/test/resouces/log4j.xml diff --git a/README.md b/README.md index 1f4bc13..2056fba 100644 --- a/README.md +++ b/README.md @@ -78,7 +78,6 @@ public class GithubRepoPageProcessor implements PageProcessor { Spider.create(new GithubRepoPageProcessor()).addUrl("https://github.com/code4craft").thread(5).run(); } } - ``` * `page.addTargetRequests(links)` @@ -164,6 +163,10 @@ To write webmagic, I refered to the projects below : [https://groups.google.com/forum/#!forum/webmagic-java](https://groups.google.com/forum/#!forum/webmagic-java) +[http://list.qq.com/cgi-bin/qf_invite?id=023a01f505246785f77c5a5a9aff4e57ab20fcdde871e988](http://list.qq.com/cgi-bin/qf_invite?id=023a01f505246785f77c5a5a9aff4e57ab20fcdde871e988) + +QQ Group: 330192938 + [![Bitdeli 
Badge](https://d2weczhvl823v0.cloudfront.net/code4craft/webmagic/trend.png)](https://bitdeli.com/free "Bitdeli Badge") diff --git a/asserts/data.plist b/assets/data.plist similarity index 100% rename from asserts/data.plist rename to assets/data.plist diff --git a/asserts/image1.pdf b/assets/image1.pdf similarity index 100% rename from asserts/image1.pdf rename to assets/image1.pdf diff --git a/asserts/logo-simple.jpg b/assets/logo-simple.jpg similarity index 100% rename from asserts/logo-simple.jpg rename to assets/logo-simple.jpg diff --git a/asserts/logo.graffle b/assets/logo.graffle similarity index 100% rename from asserts/logo.graffle rename to assets/logo.graffle diff --git a/asserts/logo.jpg b/assets/logo.jpg similarity index 100% rename from asserts/logo.jpg rename to assets/logo.jpg diff --git a/asserts/logo2.graffle/data.plist b/assets/logo2.graffle/data.plist similarity index 100% rename from asserts/logo2.graffle/data.plist rename to assets/logo2.graffle/data.plist diff --git a/asserts/logo2.graffle/image1.tiff b/assets/logo2.graffle/image1.tiff similarity index 100% rename from asserts/logo2.graffle/image1.tiff rename to assets/logo2.graffle/image1.tiff diff --git a/asserts/logo3.graffle/data.plist b/assets/logo3.graffle/data.plist similarity index 100% rename from asserts/logo3.graffle/data.plist rename to assets/logo3.graffle/data.plist diff --git a/asserts/logo3.graffle/image1.tiff b/assets/logo3.graffle/image1.tiff similarity index 100% rename from asserts/logo3.graffle/image1.tiff rename to assets/logo3.graffle/image1.tiff diff --git a/asserts/logo3.graffle/image2.tiff b/assets/logo3.graffle/image2.tiff similarity index 100% rename from asserts/logo3.graffle/image2.tiff rename to assets/logo3.graffle/image2.tiff diff --git a/asserts/logo3.graffle/image4.tiff b/assets/logo3.graffle/image4.tiff similarity index 100% rename from asserts/logo3.graffle/image4.tiff rename to assets/logo3.graffle/image4.tiff diff --git 
a/asserts/logo3.graffle/image5.tiff b/assets/logo3.graffle/image5.tiff similarity index 100% rename from asserts/logo3.graffle/image5.tiff rename to assets/logo3.graffle/image5.tiff diff --git a/asserts/logo3.png b/assets/logo3.png similarity index 100% rename from asserts/logo3.png rename to assets/logo3.png diff --git a/asserts/logo4.png b/assets/logo4.png similarity index 100% rename from asserts/logo4.png rename to assets/logo4.png diff --git a/assets/page-extract-rule.bmml b/assets/page-extract-rule.bmml new file mode 100644 index 0000000..fec8d3e --- /dev/null +++ b/assets/page-extract-rule.bmml @@ -0,0 +1,9 @@ + + + + + A%20Web%20Page%0Ahttp%3A// + + + + \ No newline at end of file diff --git a/asserts/webmagic-create-spider.bmml b/assets/webmagic-create-spider.bmml similarity index 100% rename from asserts/webmagic-create-spider.bmml rename to assets/webmagic-create-spider.bmml diff --git a/asserts/webmagic-create-spider.png b/assets/webmagic-create-spider.png similarity index 100% rename from asserts/webmagic-create-spider.png rename to assets/webmagic-create-spider.png diff --git a/asserts/webmagic-spider-manage.bmml b/assets/webmagic-spider-manage.bmml similarity index 100% rename from asserts/webmagic-spider-manage.bmml rename to assets/webmagic-spider-manage.bmml diff --git a/asserts/webmagic-spider-manage.png b/assets/webmagic-spider-manage.png similarity index 100% rename from asserts/webmagic-spider-manage.png rename to assets/webmagic-spider-manage.png diff --git a/asserts/webmagic.psd b/assets/webmagic.psd similarity index 100% rename from asserts/webmagic.psd rename to assets/webmagic.psd diff --git a/pom.xml b/pom.xml index 085e94e..b277b38 100644 --- a/pom.xml +++ b/pom.xml @@ -182,6 +182,15 @@ UTF-8
+ + org.apache.maven.plugins + maven-jar-plugin + + + log4j.xml + + + org.apache.maven.plugins maven-source-plugin diff --git a/webmagic-avalon/webmagic-admin/src/main/webapp/WEB-INF/pages/create_spider.ftl b/webmagic-avalon/webmagic-admin/src/main/webapp/WEB-INF/pages/create_spider.ftl index e69de29..4cd838c 100644 --- a/webmagic-avalon/webmagic-admin/src/main/webapp/WEB-INF/pages/create_spider.ftl +++ b/webmagic-avalon/webmagic-admin/src/main/webapp/WEB-INF/pages/create_spider.ftl @@ -0,0 +1,14 @@ + + + + + +
+ +
+ +
+ +
+ + \ No newline at end of file diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index bcf4a53..30c561b 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -74,7 +74,7 @@ public class HttpClientDownloader extends AbstractDownloader { } else { acceptStatCode = Sets.newHashSet(200); } - logger.info("downloading page " + request.getUrl()); + logger.info("downloading page {}" , request.getUrl()); RequestBuilder requestBuilder = RequestBuilder.get().setUri(request.getUrl()); if (headers != null) { for (Map.Entry headerEntry : headers.entrySet()) { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/LocalDuplicatedRemovedScheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/LocalDuplicatedRemovedScheduler.java index c4b08f3..397199c 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/LocalDuplicatedRemovedScheduler.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/LocalDuplicatedRemovedScheduler.java @@ -23,8 +23,9 @@ public abstract class LocalDuplicatedRemovedScheduler implements Scheduler { @Override public void push(Request request, Task task) { - logger.debug("push to queue " + request.getUrl()); + logger.trace("get a candidate url {}", request.getUrl()); if (request.getExtra(Request.CYCLE_TRIED_TIMES) != null || urls.add(request.getUrl())) { + logger.debug("push to queue {}", request.getUrl()); pushWhenNoDuplicate(request, task); } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java index 614b111..34386b5 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java +++ 
b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java @@ -131,6 +131,7 @@ public class Html extends PlainText { } public Document getDocument() { + initDocument(); return document; } diff --git a/webmagic-core/src/test/resources/log4j.xml b/webmagic-core/src/test/resources/log4j.xml index 9084694..c2b5a2f 100644 --- a/webmagic-core/src/test/resources/log4j.xml +++ b/webmagic-core/src/test/resources/log4j.xml @@ -8,21 +8,11 @@ - - - - - - - - - - diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ConfigurablePageProcessor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ConfigurablePageProcessor.java new file mode 100644 index 0000000..36615d8 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ConfigurablePageProcessor.java @@ -0,0 +1,49 @@ +package us.codecraft.webmagic.configurable; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.processor.PageProcessor; + +import java.util.List; + +/** + * @author code4crafter@gmail.com
+ */ +public class ConfigurablePageProcessor implements PageProcessor { + + private Site site; + + private List extractRules; + + public ConfigurablePageProcessor(Site site, List extractRules) { + this.site = site; + this.extractRules = extractRules; + } + + @Override + public void process(Page page) { + for (ExtractRule extractRule : extractRules) { + if (extractRule.isMulti()) { + List results = page.getHtml().selectDocumentForList(extractRule.getSelector()); + if (extractRule.isNotNull() && results.size() == 0) { + page.setSkip(true); + } else { + page.getResultItems().put(extractRule.getFieldName(), results); + } + } else { + String result = page.getHtml().selectDocument(extractRule.getSelector()); + if (extractRule.isNotNull() && result == null) { + page.setSkip(true); + } else { + page.getResultItems().put(extractRule.getFieldName(), result); + } + } + } + } + + @Override + public Site getSite() { + return site; + } + +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExpressionType.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExpressionType.java new file mode 100644 index 0000000..bd84be3 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExpressionType.java @@ -0,0 +1,11 @@ +package us.codecraft.webmagic.configurable; + +/** + * @author code4crafter@gmail.com + * @date 14-4-5 + */ +public enum ExpressionType { + + XPath, Regex, Css, JsonPath; + +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExtractRule.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExtractRule.java new file mode 100644 index 0000000..82337c4 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExtractRule.java @@ -0,0 +1,113 @@ +package us.codecraft.webmagic.configurable; + +import us.codecraft.webmagic.selector.JsonPathSelector; +import us.codecraft.webmagic.selector.Selector; + +import static 
us.codecraft.webmagic.selector.Selectors.*; + +/** + * @author code4crafter@gmail.com + * @date 14-4-5 + */ +public class ExtractRule { + + private String fieldName; + + private ExpressionType expressionType; + + private String expressionValue; + + private String[] expressionParams; + + private boolean multi = false; + + private volatile Selector selector; + + private boolean notNull = false; + + public String getFieldName() { + return fieldName; + } + + public void setFieldName(String fieldName) { + this.fieldName = fieldName; + } + + public ExpressionType getExpressionType() { + return expressionType; + } + + public void setExpressionType(ExpressionType expressionType) { + this.expressionType = expressionType; + } + + public String getExpressionValue() { + return expressionValue; + } + + public void setExpressionValue(String expressionValue) { + this.expressionValue = expressionValue; + } + + public String[] getExpressionParams() { + return expressionParams; + } + + public void setExpressionParams(String[] expressionParams) { + this.expressionParams = expressionParams; + } + + public boolean isMulti() { + return multi; + } + + public void setMulti(boolean multi) { + this.multi = multi; + } + + public Selector getSelector() { + if (selector == null) { + synchronized (this) { + if (selector == null) { + selector = compileSelector(); + } + } + } + return selector; + } + + private Selector compileSelector() { + switch (expressionType) { + case Css: + if (expressionParams.length >= 1) { + return $(expressionValue, expressionParams[0]); + } else { + return $(expressionValue); + } + case XPath: + return xpath(expressionValue); + case Regex: + if (expressionParams.length >= 1) { + return regex(expressionValue, Integer.parseInt(expressionParams[0])); + } else { + return regex(expressionValue); + } + case JsonPath: + return new JsonPathSelector(expressionValue); + default: + return xpath(expressionValue); + } + } + + public void setSelector(Selector selector) { + 
this.selector = selector; + } + + public boolean isNotNull() { + return notNull; + } + + public void setNotNull(boolean notNull) { + this.notNull = notNull; + } +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/Inject.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/Inject.java deleted file mode 100644 index c6608ae..0000000 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/Inject.java +++ /dev/null @@ -1,15 +0,0 @@ -package us.codecraft.webmagic.configurable; - -import java.lang.annotation.ElementType; -import java.lang.annotation.Retention; -import java.lang.annotation.Target; - -/** - * @author yihua.huang@dianping.com - */ -@Retention(java.lang.annotation.RetentionPolicy.RUNTIME) -@Target({ElementType.FIELD}) -public @interface Inject { - - String value() default ""; -} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/PropertyLoader.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/PropertyLoader.java deleted file mode 100644 index bffbcf2..0000000 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/PropertyLoader.java +++ /dev/null @@ -1,18 +0,0 @@ -package us.codecraft.webmagic.configurable; - -import us.codecraft.webmagic.processor.PageProcessor; - -import java.util.Map; - -/** - * Inject property to object by {@link Inject} annotation. 
- * - * @author yihua.huang@dianping.com - */ -public class PropertyLoader { - - public T load(T object, Map properties) { - return object; - } - -} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/ConfigurableBlogPageProcessor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/ConfigurableBlogPageProcessor.java deleted file mode 100644 index 28d3ab0..0000000 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/ConfigurableBlogPageProcessor.java +++ /dev/null @@ -1,51 +0,0 @@ -package us.codecraft.webmagic.example; - -import java.util.List; -import us.codecraft.webmagic.Page; -import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.Spider; -import us.codecraft.webmagic.configurable.Inject; -import us.codecraft.webmagic.processor.PageProcessor; - -/** - * @author code4crafter@gmail.com
- */ -public class ConfigurableBlogPageProcessor implements PageProcessor { - - private Site site = Site.me().setDomain("my.oschina.net"); - - @Inject("linkRegex") - private String linkRegex; - - @Inject("titleXpath") - private String titleXpath; - - @Inject("contentXpath") - private String contentXpath; - - @Inject("tagsXpath") - private String tagsXpath; - - @Override - public void process(Page page) { - List links = page.getHtml().links().regex(linkRegex).all(); - page.addTargetRequests(links); - page.putField("title", page.getHtml().xpath(titleXpath).toString()); - if (page.getResultItems().get("title") == null) { - //skip this page - page.setSkip(true); - } - page.putField("content", page.getHtml().smartContent().toString()); - page.putField("tags", page.getHtml().xpath(tagsXpath).all()); - } - - @Override - public Site getSite() { - return site; - - } - - public static void main(String[] args) { - Spider.create(new ConfigurableBlogPageProcessor()).addUrl("http://my.oschina.net/flashsword/blog").thread(2).run(); - } -} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/PatternProcessorDemo.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/PatternProcessorDemo.java new file mode 100644 index 0000000..51a9484 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/PatternProcessorDemo.java @@ -0,0 +1,53 @@ +package us.codecraft.webmagic.example; + +import org.apache.log4j.Logger; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.ResultItems; +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.handler.PatternHandler; +import us.codecraft.webmagic.pipeline.PatternPipeline; +import us.codecraft.webmagic.processor.PatternPageProcessor; + +/** + * Created with IntelliJ IDEA. 
+ * User: Sebastian MA + * Date: April 04, 2014 + * Time: 21:23 + */ +public class PatternProcessorDemo { + + private static Logger log = Logger.getLogger(PatternProcessorDemo.class); + + public static void main(String... args) { + + PatternPageProcessor processor + = new PatternPageProcessor("http://item.jd.com/981821.html", + PatternPageProcessor.TARGET_PATTERN_ALL + ); + + PatternPipeline pipeline = new PatternPipeline(); + + // define a handler which handles only "http://item.jd.com/.*" + PatternHandler handler = new PatternHandler("http://item.jd.com/.*") { + + @Override + public void onExtract(Page page) { + + log.info("Extracting from " + page.getUrl()); + page.putField("test", "hello world:)"); + } + + @Override + public void onHandle(ResultItems result, Task task) { + + log.info("Handling " + result.getRequest().getUrl()); + log.info("Retrieved test=" + result.get("test")); + } + }; + + handler.register(processor, pipeline); + + Spider.create(processor).thread(5).addPipeline(pipeline).runAsync(); + } +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/CompositePageProcessor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/CompositePageProcessor.java new file mode 100644 index 0000000..ecf4aa1 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/CompositePageProcessor.java @@ -0,0 +1,49 @@ +package us.codecraft.webmagic.handler; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.processor.PageProcessor; + +import java.util.ArrayList; +import java.util.List; + +/** + * @author code4crafter@gmail.com + * @date 14-4-5 + */ +public class CompositePageProcessor implements PageProcessor { + + private Site site; + + private List subPageProcessors; + + @Override + public void process(Page page) { + for (SubPageProcessor subPageProcessor : subPageProcessors) { + if (subPageProcessor.match(page)) { + SubPageProcessor.MatchOtherProcessor 
matchOtherProcessorProcessor = subPageProcessor.process(page); + if (matchOtherProcessorProcessor == null || matchOtherProcessorProcessor != SubPageProcessor.MatchOtherProcessor.YES) { + return; + } + } + } + } + + public CompositePageProcessor setSite(Site site) { + this.site = site; + return this; + } + + public CompositePageProcessor setSubPageProcessors(SubPageProcessor... subPageProcessors) { + this.subPageProcessors = new ArrayList(); + for (SubPageProcessor subPageProcessor : subPageProcessors) { + this.subPageProcessors.add(subPageProcessor); + } + return this; + } + + @Override + public Site getSite() { + return site; + } +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/PatternHandler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/PatternHandler.java new file mode 100644 index 0000000..51e44e0 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/PatternHandler.java @@ -0,0 +1,113 @@ +package us.codecraft.webmagic.handler; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.ResultItems; +import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.pipeline.PatternPipeline; +import us.codecraft.webmagic.processor.PatternPageProcessor; + +import java.util.UUID; + +/** + * Created with IntelliJ IDEA. + * User: Sebastian MA + * Date: April 03, 2014 + * Time: 10:00 + *

+ * A PatternHandler is in charge of both page extraction and data processing by implementing + * its two abstract methods. + */ +public abstract class PatternHandler { + + /** + * identity of the handler. + */ + protected String id; + + /** + * match pattern. only matched page should be handled. + */ + protected String pattern; + + /** + * @param pattern + * url pattern to handle + */ + protected PatternHandler(String pattern) { + + this.pattern = pattern; + this.id = UUID.randomUUID().toString(); + } + + /** + * determine if the page should be handled. + */ + public boolean match(String url) { + + return url.matches(pattern); + } + + /** + * registers to both the page processor and the pipeline so the handler could take charge of + * both end of procedure. + * + * @param processor + * the processor to handle + * @param pipeline + * the pipeline to handle + */ + public void register(PatternPageProcessor processor, PatternPipeline pipeline) { + + processor.addHandler(this); + pipeline.addHandler(this); + } + + public void unregister(PatternPageProcessor processor, PatternPipeline pipeline) { + + processor.removeHandler(this); + pipeline.removeHandler(this); + } + + public boolean process(Page page) { + + if(match(page.getUrl().toString())) { + page.putField(id, true); + onExtract(page); + return true; + } else { + return false; + } + } + + public boolean process(ResultItems resultItems, Task task) { + + if(resultItems.isSkip()) { + return false; + } + + if(match(resultItems.getRequest().getUrl()) && resultItems.get(id) != null) { + onHandle(resultItems, task); + return true; + } else { + return false; + } + } + + /** + * implements this method to extract from page. + * + * @param page + * the page to extract + */ + public abstract void onExtract(Page page); + + /** + * implements this method to handle the extraction result. 
+ * + * @param result + * extraction result + * @param task + */ + public abstract void onHandle(ResultItems result, Task task); + +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/SubPageProcessor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/SubPageProcessor.java new file mode 100644 index 0000000..c880500 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/SubPageProcessor.java @@ -0,0 +1,33 @@ +package us.codecraft.webmagic.handler; + +import us.codecraft.webmagic.Page; + +/** + * @author code4crafter@gmail.com + * @date 14-4-5 + */ +public interface SubPageProcessor { + + /** + * Check whether the SubPageProcessor can process the page.

+ * Please DO NOT change page status in this method. + * + * @param page + * @return + */ + public boolean match(Page page); + + /** + * + * process the page, extract urls to fetch, extract the data and store + * + * @param page + * @return whether continue to match + */ + public MatchOtherProcessor process(Page page); + + public enum MatchOtherProcessor { + YES, NO; + } + +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPageProcessor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPageProcessor.java index 8a40dae..3a97e1d 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPageProcessor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPageProcessor.java @@ -25,8 +25,6 @@ class ModelPageProcessor implements PageProcessor { private Site site; - private Set targetUrlPatterns = new HashSet(); - public static ModelPageProcessor create(Site site, Class... clazzs) { ModelPageProcessor modelPageProcessor = new ModelPageProcessor(site); for (Class clazz : clazzs) { @@ -38,8 +36,6 @@ class ModelPageProcessor implements PageProcessor { public ModelPageProcessor addPageModel(Class clazz) { PageModelExtractor pageModelExtractor = PageModelExtractor.create(clazz); - targetUrlPatterns.addAll(pageModelExtractor.getTargetUrlPatterns()); - targetUrlPatterns.addAll(pageModelExtractor.getHelpUrlPatterns()); pageModelExtractorList.add(pageModelExtractor); return this; } @@ -55,11 +51,14 @@ class ModelPageProcessor implements PageProcessor { extractLinks(page, pageModelExtractor.getTargetUrlRegionSelector(), pageModelExtractor.getTargetUrlPatterns()); Object process = pageModelExtractor.process(page); if (process == null || (process instanceof List && ((List) process).size() == 0)) { - page.getResultItems().setSkip(true); + continue; } postProcessPageModel(pageModelExtractor.getClazz(), process); page.putField(pageModelExtractor.getClazz().getCanonicalName(), process); } + 
if (page.getResultItems().getAll().size() == 0) { + page.getResultItems().setSkip(true); + } } private void extractLinks(Page page, Selector urlRegionSelector, List urlPatterns) { diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java index 5e4da11..b7b7900 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java @@ -340,9 +340,7 @@ class PageModelExtractor { private Object convert(String value, ObjectFormatter objectFormatter) { try { Object format = objectFormatter.format(value); - if (logger.isDebugEnabled()) { - logger.debug("String " + value + " is converted to " + format); - } + logger.debug("String {} is converted to {}", value, format); return format; } catch (Exception e) { logger.error("convert " + value + " to " + objectFormatter.clazz() + " error!", e); diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/PatternPipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/PatternPipeline.java new file mode 100644 index 0000000..582b162 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/PatternPipeline.java @@ -0,0 +1,44 @@ +package us.codecraft.webmagic.pipeline; + +import us.codecraft.webmagic.ResultItems; +import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.handler.PatternHandler; + +import java.util.ArrayList; + +/** + * Created with IntelliJ IDEA. + * User: Sebastian MA + * Date: April 04, 2014 + * Time: 20:44 + */ +public class PatternPipeline implements Pipeline { + + protected ArrayList handlers = new ArrayList(); + + /** + * A handler works only if it is added to BOTH the page processor and the pipeline. + * Uses PatternHandler's register instead. 
+ * + * @param handler the pattern handler + * + * @see PatternHandler#register + */ + public void addHandler(PatternHandler handler) { + + handlers.add(handler); + } + + public void removeHandler(PatternHandler handler) { + + handlers.remove(handler); + } + + @Override + public void process(ResultItems resultItems, Task task) { + + for(PatternHandler handler : handlers) { + handler.process(resultItems, task); + } + } +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/processor/PatternPageProcessor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/processor/PatternPageProcessor.java new file mode 100644 index 0000000..d7d909c --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/processor/PatternPageProcessor.java @@ -0,0 +1,78 @@ +package us.codecraft.webmagic.processor; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.handler.PatternHandler; +import us.codecraft.webmagic.utils.UrlUtils; + +import java.util.ArrayList; +import java.util.List; + +/** + * Created with IntelliJ IDEA. + * User: Sebastian MA + * Date: April 04, 2014 + * Time: 15:36 + *

+ * A PatternPageProcessor uses PatternHandler to setup extraction rules for specific url pattern. + * + * @see us.codecraft.webmagic.handler.PatternHandler + */ +public class PatternPageProcessor implements PageProcessor { + + public static final String TARGET_PATTERN_ALL = "http://*"; + + protected Site site; + + protected String targetPattern; + + protected ArrayList handlers = new ArrayList(); + + public PatternPageProcessor(String startUrl, String targetPattern) { + + this.targetPattern = targetPattern; + + this.site = Site.me().addStartUrl(startUrl).setDomain(UrlUtils.getDomain(startUrl)); + this.targetPattern = "(" + targetPattern.replace(".", "\\.").replace("*", + "[^\"'#]*") + ")"; + + site.setUserAgent("Chrome/5.0.354.0"); + } + + @Override + public void process(Page page) { + + + List requests = page.getHtml().links().regex(targetPattern).all(); + page.addTargetRequests(requests); + for(PatternHandler handler : handlers) { + if(handler.match(page.getUrl().toString())) { + handler.process(page); + } + } + } + + /** + * A handler works only if it is added to BOTH the page processor and the pipeline. + * Uses PatternHandler's register instead. 
+ * + * @param handler the pattern handler + * + * @see PatternHandler#register + */ + public void addHandler(PatternHandler handler) { + + handlers.add(handler); + } + + public void removeHandler(PatternHandler handler) { + + handlers.remove(handler); + } + + @Override + public Site getSite() { + + return site; + } +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java index 38e8a79..79f3b8b 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java @@ -23,7 +23,7 @@ import java.util.concurrent.atomic.AtomicInteger; * @author code4crafter@gmail.com
* @since 0.2.0 */ -public class FileCacheQueueScheduler implements Scheduler { +public class FileCacheQueueScheduler extends LocalDuplicatedRemovedScheduler { private Logger logger = LoggerFactory.getLogger(getClass()); @@ -145,18 +145,12 @@ public class FileCacheQueueScheduler implements Scheduler { } @Override - public synchronized void push(Request request, Task task) { + protected void pushWhenNoDuplicate(Request request, Task task) { if (!inited.get()) { init(task); } - if (logger.isDebugEnabled()) { - logger.debug("push to queue " + request.getUrl()); - } - if (urls.add(request.getUrl())) { - queue.add(request); - fileUrlWriter.println(request.getUrl()); - } - + queue.add(request); + fileUrlWriter.println(request.getUrl()); } @Override diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/configurable/ConfigurablePageProcessorTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/configurable/ConfigurablePageProcessorTest.java new file mode 100644 index 0000000..a35fffa --- /dev/null +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/configurable/ConfigurablePageProcessorTest.java @@ -0,0 +1,39 @@ +package us.codecraft.webmagic.configurable; + +import org.junit.Test; +import us.codecraft.webmagic.ResultItems; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.downloader.MockGithubDownloader; + +import java.util.ArrayList; +import java.util.List; + +import static org.assertj.core.api.Assertions.assertThat; + +/** + * @author code4crafter@gmail.com + * @date 14-4-5 + */ +public class ConfigurablePageProcessorTest { + + @Test + public void test() throws Exception { + List extractRules = new ArrayList(); + ExtractRule extractRule = new ExtractRule(); + extractRule.setExpressionType(ExpressionType.XPath); + extractRule.setExpressionValue("//title"); + extractRule.setFieldName("title"); + extractRules.add(extractRule); + extractRule = new ExtractRule(); + 
extractRule.setExpressionType(ExpressionType.XPath); + extractRule.setExpressionValue("//ul[@class='pagehead-actions']/li[1]//a[@class='social-count js-social-count']/text()"); + extractRule.setFieldName("star"); + extractRules.add(extractRule); + ResultItems resultItems = Spider.create(new ConfigurablePageProcessor(Site.me(), extractRules)) + .setDownloader(new MockGithubDownloader()).get("https://github.com/code4craft/webmagic"); + assertThat(resultItems.getAll()).containsEntry("title", "code4craft/webmagic · GitHub"); + assertThat(resultItems.getAll()).containsEntry("star", " 86 "); + + } +} diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/ModelPageProcessorTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/ModelPageProcessorTest.java new file mode 100644 index 0000000..74f3f6a --- /dev/null +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/ModelPageProcessorTest.java @@ -0,0 +1,45 @@ +package us.codecraft.webmagic.model; + +import org.junit.Test; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.model.annotation.ExtractBy; +import us.codecraft.webmagic.model.annotation.TargetUrl; +import us.codecraft.webmagic.selector.PlainText; + +import static org.assertj.core.api.Assertions.assertThat; + +/** + * @author code4crafter@gmail.com + * @date 14-4-4 + */ +public class ModelPageProcessorTest { + + @TargetUrl("http://codecraft.us/foo") + public static class ModelFoo { + + @ExtractBy(value = "//div/@foo", notNull = true) + private String foo; + + } + + @TargetUrl("http://codecraft.us/bar") + public static class ModelBar { + + @ExtractBy(value = "//div/@bar", notNull = true) + private String bar; + + } + + @Test + public void testMultiModel_should_not_skip_when_match() throws Exception { + Page page = new Page(); + page.setRawText("
"); + page.setRequest(new Request("http://codecraft.us/foo")); + page.setUrl(PlainText.create("http://codecraft.us/foo")); + ModelPageProcessor modelPageProcessor = ModelPageProcessor.create(null, ModelFoo.class, ModelBar.class); + modelPageProcessor.process(page); + assertThat(page.getResultItems().isSkip()).isFalse(); + + } +} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/BaiduNews.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/BaiduNews.java new file mode 100644 index 0000000..4795662 --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/BaiduNews.java @@ -0,0 +1,43 @@ +package us.codecraft.webmagic.model.samples; + +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.model.OOSpider; +import us.codecraft.webmagic.model.annotation.ExtractBy; + +/** + * @author code4crafter@gmail.com + * @date 14-4-9 + */ +public class BaiduNews { + + @ExtractBy("//h3[@class='c-title']/a/text()") + private String name; + + @ExtractBy("//div[@class='c-summary']/text()") + private String description; + + @Override + public String toString() { + return "BaiduNews{" + + "name='" + name + '\'' + + ", description='" + description + '\'' + + '}'; + } + + public static void main(String[] args) { + OOSpider ooSpider = OOSpider.create(Site.me().setSleepTime(0), BaiduNews.class); + //single download + BaiduNews baike = ooSpider.get("http://news.baidu.com/ns?tn=news&cl=2&rn=20&ct=1&fr=bks0000&ie=utf-8&word=httpclient"); + System.out.println(baike); + + ooSpider.close(); + } + + public String getName() { + return name; + } + + public String getDescription() { + return description; + } +} \ No newline at end of file diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java index e9dfb26..45bee2f 100644 --- 
a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java @@ -3,7 +3,6 @@ package us.codecraft.webmagic.model.samples; import us.codecraft.webmagic.MultiPageModel; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.model.OOSpider; -import us.codecraft.webmagic.model.annotation.ComboExtract; import us.codecraft.webmagic.model.annotation.ExtractBy; import us.codecraft.webmagic.model.annotation.ExtractByUrl; import us.codecraft.webmagic.model.annotation.TargetUrl; @@ -26,9 +25,8 @@ public class News163 implements MultiPageModel { @ExtractByUrl(value = "http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html", notNull = false) private String page; - @ComboExtract(value = {@ExtractBy("//div[@class=\"ep-pages\"]//a/@href"), - @ExtractBy(value = "http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html", type = ExtractBy.Type.Regex)}, - multi = true, notNull = false) + @ExtractBy(value = "//div[@class=\"ep-pages\"]//a/regex('http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html',1)" + , multi = true, notNull = false) private List otherPage; @ExtractBy("//h1[@id=\"h1title\"]/text()") @@ -74,8 +72,8 @@ public class News163 implements MultiPageModel { } public static void main(String[] args) { - OOSpider.create(Site.me().addStartUrl("http://news.163.com/13/0802/05/958I1E330001124J_2.html"), News163.class) - .scheduler(new RedisScheduler("localhost")).clearPipeline().pipeline(new MultiPagePipeline()).pipeline(new ConsolePipeline()).run(); + OOSpider.create(Site.me(), News163.class).addUrl("http://news.163.com/13/0802/05/958I1E330001124J_2.html") + .scheduler(new RedisScheduler("localhost")).addPipeline(new MultiPagePipeline()).addPipeline(new ConsolePipeline()).run(); } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/QQMeishi.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/QQMeishi.java 
new file mode 100644 index 0000000..f4f8591 --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/QQMeishi.java @@ -0,0 +1,27 @@ +package us.codecraft.webmagic.model.samples; + +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.model.ConsolePageModelPipeline; +import us.codecraft.webmagic.model.OOSpider; +import us.codecraft.webmagic.model.annotation.ExtractBy; +import us.codecraft.webmagic.model.annotation.TargetUrl; + +/** + * @author code4crafter@gmail.com + * @date 14-4-11 + */ +@TargetUrl("http://meishi.qq.com/beijing/c/all[\\-p2]*") +@ExtractBy(value = "//ul[@id=\"promos_list2\"]/li",multi = true) +public class QQMeishi { + + @ExtractBy("//div[@class=info]/a[@class=title]/h4/text()") + private String shopName; + + @ExtractBy("//div[@class=info]/a[@class=title]/text()") + private String promo; + + public static void main(String[] args) { + OOSpider.create(Site.me(), new ConsolePageModelPipeline(), QQMeishi.class).addUrl("http://meishi.qq.com/beijing/c/all").thread(4).run(); + } + +} diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/SinablogProcessorTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/SinablogProcessorTest.java index 026f8d5..d7cd5d5 100644 --- a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/SinablogProcessorTest.java +++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/SinablogProcessorTest.java @@ -5,7 +5,7 @@ import org.junit.Test; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.pipeline.FilePipeline; import us.codecraft.webmagic.pipeline.JsonFilePipeline; -import us.codecraft.webmagic.samples.SinaBlogProcesser; +import us.codecraft.webmagic.samples.SinaBlogProcessor; import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler; import java.io.IOException; @@ -20,7 +20,7 @@ public class SinablogProcessorTest { @Ignore @Test public void test() throws IOException { - SinaBlogProcesser 
sinaBlogProcesser = new SinaBlogProcesser(); + SinaBlogProcessor sinaBlogProcessor = new SinaBlogProcessor(); //pipeline是抓取结束后的处理 //默认放到/data/webmagic/ftl/[domain]目录下 JsonFilePipeline pipeline = new JsonFilePipeline("/data/webmagic/"); @@ -29,7 +29,7 @@ public class SinablogProcessorTest { //ConsolePipeline输出结果到控制台 //FileCacheQueueSchedular保存url,支持断点续传,临时文件输出到/data/temp/webmagic/cache目录 //Spider.run()执行 - Spider.create(sinaBlogProcesser).pipeline(new FilePipeline()).pipeline(pipeline).scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")). + Spider.create(sinaBlogProcessor).pipeline(new FilePipeline()).pipeline(pipeline).scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")). run(); } } diff --git a/webmagic-scripts/README.md b/webmagic-scripts/README.md old mode 100644 new mode 100755 diff --git a/webmagic-scripts/deploy.sh b/webmagic-scripts/deploy.sh old mode 100644 new mode 100755 diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml old mode 100644 new mode 100755 index 5c21160..41c79ea --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -16,6 +16,10 @@ jruby 1.7.6 + org.python + jython + 2.5.3 + commons-cli commons-cli diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/Language.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/Language.java old mode 100644 new mode 100755 index c7ddcda..2f9d22d --- a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/Language.java +++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/Language.java @@ -7,7 +7,9 @@ public enum Language { JavaScript("javascript","js/defines.js",""), - JRuby("jruby","ruby/defines.rb",""); + JRuby("jruby","ruby/defines.rb",""), + + Jython("jython","python/defines.py",""); private String engineName; diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptConsole.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptConsole.java old mode 
100644 new mode 100755 diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptEnginePool.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptEnginePool.java old mode 100644 new mode 100755 diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessor.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessor.java old mode 100644 new mode 100755 index 5801851..1822318 --- a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessor.java +++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessor.java @@ -1,6 +1,8 @@ package us.codecraft.webmagic.scripts; import org.apache.commons.io.IOUtils; +import org.jruby.RubyHash; +import org.python.core.PyDictionary; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.processor.PageProcessor; @@ -10,6 +12,8 @@ import javax.script.ScriptEngine; import javax.script.ScriptException; import java.io.IOException; import java.io.InputStream; +import java.util.Iterator; +import java.util.Map; /** * @author code4crafter@gmail.com @@ -50,20 +54,35 @@ public class ScriptProcessor implements PageProcessor { context.setAttribute("page", page, ScriptContext.ENGINE_SCOPE); context.setAttribute("config", site, ScriptContext.ENGINE_SCOPE); try { - engine.eval(defines + "\n" + script, context); -// switch (language) { -// case JavaScript: -// NativeObject o = (NativeObject) engine.get("result"); -// if (o != null) { -// for (Map.Entry objectObjectEntry : o.entrySet()) { -// page.getResultItems().put(objectObjectEntry.getKey().toString(), objectObjectEntry.getValue()); + switch (language) { + case JavaScript: + engine.eval(defines + "\n" + script, context); +// NativeObject o = (NativeObject) engine.get("result"); +// if (o != null) { +// for (Object o1 : o.getIds()) { +// String key = String.valueOf(o1); +// page.getResultItems().put(key, 
NativeObject.getProperty(o, key)); +// } // } -// } -// break; -// case JRuby: -// Object o1 = engine.get("result"); -// break; -// } + break; + case JRuby: + RubyHash oRuby = (RubyHash) engine.eval(defines + "\n" + script, context); + Iterator itruby = oRuby.entrySet().iterator(); + while (itruby.hasNext()) { + Map.Entry pairs = (Map.Entry) itruby.next(); + page.getResultItems().put(pairs.getKey().toString(), pairs.getValue()); + } + break; + case Jython: + engine.eval(defines + "\n" + script, context); + PyDictionary oJython = (PyDictionary) engine.get("result"); + Iterator it = oJython.entrySet().iterator(); + while (it.hasNext()) { + Map.Entry pairs = (Map.Entry) it.next(); + page.getResultItems().put(pairs.getKey().toString(), pairs.getValue()); + } + break; + } } catch (ScriptException e) { e.printStackTrace(); } @@ -72,6 +91,7 @@ public class ScriptProcessor implements PageProcessor { } } + @Override public Site getSite() { return site; diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessorBuilder.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessorBuilder.java old mode 100644 new mode 100755 diff --git a/webmagic-scripts/src/main/resources/js/defines.js b/webmagic-scripts/src/main/resources/js/defines.js old mode 100644 new mode 100755 diff --git a/webmagic-scripts/src/main/resources/js/github.js b/webmagic-scripts/src/main/resources/js/github.js old mode 100644 new mode 100755 diff --git a/webmagic-scripts/src/main/resources/js/oschina.js b/webmagic-scripts/src/main/resources/js/oschina.js old mode 100644 new mode 100755 index 305682e..02191c3 --- a/webmagic-scripts/src/main/resources/js/oschina.js +++ b/webmagic-scripts/src/main/resources/js/oschina.js @@ -9,3 +9,4 @@ var config = { title = $("div.BlogTitle h1"), content = $("div.BlogContent") urls("http://my\\.oschina\\.net/flashsword/blog/\\d+") +config; diff --git a/webmagic-scripts/src/main/resources/log4j.xml 
b/webmagic-scripts/src/main/resources/log4j.xml old mode 100644 new mode 100755 diff --git a/webmagic-scripts/src/main/resources/python/defines.py b/webmagic-scripts/src/main/resources/python/defines.py new file mode 100755 index 0000000..913a4b4 --- /dev/null +++ b/webmagic-scripts/src/main/resources/python/defines.py @@ -0,0 +1,13 @@ +def xpath(str): + return page.getHtml().xpath(str).toString() + +def css(str): + return page.getHtml().css(str).toString() + +def urls(str): + links=page.getHtml().links().regex(str).all() + page.addTargetRequests(links); + +def tomap(key,value): + return "hello world" + diff --git a/webmagic-scripts/src/main/resources/python/oschina.py b/webmagic-scripts/src/main/resources/python/oschina.py new file mode 100755 index 0000000..51a188b --- /dev/null +++ b/webmagic-scripts/src/main/resources/python/oschina.py @@ -0,0 +1,4 @@ +title=xpath("div[@class=BlogTitle]") +urls="http://my\\.oschina\\.net/flashsword/blog/\\d+" + +result={"title":title,"urls":urls} diff --git a/webmagic-scripts/src/main/resources/ruby/defines.rb b/webmagic-scripts/src/main/resources/ruby/defines.rb old mode 100644 new mode 100755 diff --git a/webmagic-scripts/src/main/resources/ruby/github.rb b/webmagic-scripts/src/main/resources/ruby/github.rb old mode 100644 new mode 100755 diff --git a/webmagic-scripts/src/main/resources/ruby/oschina.rb b/webmagic-scripts/src/main/resources/ruby/oschina.rb index cbced0b..dbea13b 100644 --- a/webmagic-scripts/src/main/resources/ruby/oschina.rb +++ b/webmagic-scripts/src/main/resources/ruby/oschina.rb @@ -1,3 +1,6 @@ +urls "http://my\\.oschina\\.net/flashsword/blog/\\d+" title = css "div.BlogTitle h1" content = css "div.BlogContent" -urls "http://my\\.oschina\\.net/flashsword/blog/\\d+" \ No newline at end of file + +return {"title"=>title,"content"=>content} + diff --git a/webmagic-scripts/src/test/java/us/codecraft/webmagic/scripts/ScriptProcessorTest.java 
b/webmagic-scripts/src/test/java/us/codecraft/webmagic/scripts/ScriptProcessorTest.java old mode 100644 new mode 100755 index ec3f674..23fe093 --- a/webmagic-scripts/src/test/java/us/codecraft/webmagic/scripts/ScriptProcessorTest.java +++ b/webmagic-scripts/src/test/java/us/codecraft/webmagic/scripts/ScriptProcessorTest.java @@ -22,4 +22,12 @@ public class ScriptProcessorTest { pageProcessor.getSite().setSleepTime(0); Spider.create(pageProcessor).addUrl("http://my.oschina.net/flashsword/blog").setSpawnUrl(false).run(); } + + + @Test + public void testPythonProcessor() { + ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().language(Language.Jython).scriptFromClassPathFile("python/oschina.py").build(); + pageProcessor.getSite().setSleepTime(0); + Spider.create(pageProcessor).addUrl("http://my.oschina.net/flashsword/blog").setSpawnUrl(false).run(); + } } diff --git a/webmagic-scripts/src/test/resouces/log4j.xml b/webmagic-scripts/src/test/resouces/log4j.xml old mode 100644 new mode 100755 diff --git a/zh_docs/user-manual-new.md b/zh_docs/user-manual-new.md index 229c9a6..a8ae5c2 100644 --- a/zh_docs/user-manual-new.md +++ b/zh_docs/user-manual-new.md @@ -1,11 +1,11 @@ -WebMagic文档2.0版 +WebMagic in Action ======== WebMagic是一个简单灵活、便于二次开发的爬虫框架。除了可以便捷的实现一个爬虫,WebMagic还提供多线程功能,以及基本的分布式功能。 你可以直接使用WebMagic进行爬虫开发,也可以定制WebMagic以适应复杂项目的需要。 -## 1. 使用WebMagic +## 1. 在项目中使用WebMagic WebMagic主要包含两个jar包:`webmagic-core-{version}.jar`和`webmagic-extension-{version}.jar`。在项目中添加这两个包的依赖,即可使用WebMagic。 @@ -88,6 +88,8 @@ public class GithubRepoPageProcessor implements PageProcessor { ![runlog](http://static.oschina.net/uploads/space/2014/0403/103741_3Gf5_190591.png) +
+ ## 2.下载和编译源码 WebMagic是一个纯Java项目,如果你熟悉Maven,那么下载并编译源码是非常简单的。如果不熟悉Maven也没关系,这部分会介绍如何在Eclipse里导入这个项目。 @@ -158,21 +160,200 @@ Intellij Idea默认自带Maven支持,import项目时选择Maven项目即可。 ## 3. 基本的爬虫 -### 3.1 抽取内容(xpath, regex, css selector, jsonpath) +### 3.1 实现PageProcessor -### 3.2 发现链接 +在WebMagic里,实现一个基本的爬虫只需要编写一个类,实现`PageProcessor`接口即可。这个类基本上包含了抓取一个网站,你需要写的所有代码。 -### 3.3 处理多个页面 +以之前的`GithubRepoPageProcessor`为例,我将PageProcessor的定制分为三个部分,分别是爬虫的配置、页面元素的抽取和链接的发现。 -## 4. 使用注解 +```java +public class GithubRepoPageProcessor implements PageProcessor { -### 4.1 抽取内容(xpath, regex, css selector, jsonpath) + // 部分一:抓取网站的相关配置,包括编码、抓取间隔、重试次数等 + private Site site = Site.me().setRetryTimes(3).setSleepTime(1000); -### 4.2 发现链接 + @Override + // process是定制爬虫逻辑的核心接口,在这里编写抽取逻辑 + public void process(Page page) { + // 部分二:定义如何抽取页面信息,并保存下来 + page.putField("author", page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString()); + page.putField("name", page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString()); + if (page.getResultItems().get("name") == null) { + //skip this page + page.setSkip(true); + } + page.putField("readme", page.getHtml().xpath("//div[@id='readme']/tidyText()")); -### 4.3 处理多个页面 + // 部分三:从页面发现后续的url地址来抓取 + page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all()); + } -### 4.4 在POJO中实现复杂逻辑 + @Override + public Site getSite() { + return site; + } + + public static void main(String[] args) { + + Spider.create(new GithubRepoPageProcessor()) + //从"https://github.com/code4craft"开始抓 + .addUrl("https://github.com/code4craft") + //开启5个线程抓取 + .thread(5) + //启动爬虫 + .run(); + } +} +``` + +#### 3.1.1 爬虫的配置 + +第一部分关于爬虫的配置,包括编码、抓取间隔、超时时间、重试次数等,也包括一些模拟的参数,例如User Agent、cookie,以及代理的设置,我们会在第5章-“爬虫的配置”里进行介绍。在这里我们先简单设置一下:重试次数为3次,抓取间隔为一秒。 + +#### 3.1.2 页面元素的抽取 + +第二部分是爬虫的核心部分:对于下载到的Html页面,你如何从中抽取到你想要的信息?WebMagic里主要使用了三种抽取技术:XPath、正则表达式和CSS选择器。 + +1. 
XPath + + XPath本来是用于XML中获取元素的一种查询语言,但是用于Html也是比较方便的。例如: + + ```java + page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()") + ``` + 这段代码使用了XPath,它的意思是“查找所有class属性为'entry-title public'的h1元素,并找到它的strong子节点的a子节点,并提取a节点的文本信息”。 +对应的Html是这样子的: + + ![xpath-html](http://static.oschina.net/uploads/space/2014/0404/104607_Aqq8_190591.png) + +2. CSS选择器 + + CSS选择器是与XPath类似的语言。如果大家做过前端开发,肯定知道$('h1.entry-title')这种写法的含义。客观地说,它比XPath写起来要简单一些,但是如果写复杂一点的抽取规则,就相对要麻烦一点。 + +3. 正则表达式 + + 正则表达式则是一种通用的文本抽取语言。 + + ```java + page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all()); + ``` + + 这段代码就用到了正则表达式,它表示匹配所有"https://github.com/code4craft/webmagic"这样的链接。 + +XPath、CSS选择器和正则表达式的具体用法会在第4章“抽取工具详解”中讲到。 + +#### 3.1.3 链接的发现 + +有了处理页面的逻辑,我们的爬虫就接近完工了! + +但是现在还有一个问题:一个站点的页面是很多的,一开始我们不可能全部列举出来,于是如何发现后续的链接,是一个爬虫不可缺少的一部分。 + +```java +page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all()); +``` + +这段代码分为两部分,`page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all()`用于获取所有满足"(https://github\\.com/\\w+/\\w+)"这个正则表达式的链接,`page.addTargetRequests()`则将这些链接加入到待抓取的队列中去。 + +### 3.2 使用Selectable的链式API + +`Selectable`相关的链式API是WebMagic的一个核心功能。使用Selectable接口,你可以直接完成页面元素的链式抽取,也无需去关心抽取的细节。 + +在刚才的例子中可以看到,page.getHtml()返回的是一个`Html`对象,它实现了`Selectable`接口。这个接口包含一些重要的方法,我将它分为两类:抽取部分和获取结果部分。 + +#### 3.2.1 抽取部分API: + +| 方法 | 说明 | 示例 | +| ------------ | ------------- | ------------ | +| xpath(String xpath) | 使用XPath选择 | html.xpath("//div[@class='title']") | +| \$(String selector) | 使用Css选择器选择 | html.\$("div.title") | +| \$(String selector,String attr) | 使用Css选择器选择 | html.\$("div.title","text") | +| css(String selector) | 功能同$(),使用Css选择器选择 | html.css("div.title") | +| links() | 选择所有链接 | html.links() | +| regex(String regex) | 使用正则表达式抽取 | html.regex("\(.\*?)\") | +| regex(String regex,int group) | 使用正则表达式抽取,并指定捕获组 | html.regex("\(.\*?)\",1) | +| replace(String regex, String replacement) | 替换内容| html.replace("\","")| 
+ +这部分抽取API返回的都是一个`Selectable`接口,意思是说,抽取是支持链式调用的。下面我用一个实例来讲解链式API的使用。 + +例如,我现在要抓取github上所有的Java项目,这些项目可以在[https://github.com/search?l=Java&p=1&q=stars%3A%3E1&s=stars&type=Repositories](https://github.com/search?l=Java&p=1&q=stars%3A%3E1&s=stars&type=Repositories)搜索结果中看到。 + +为了避免抓取范围太宽,我指定只从分页部分抓取链接。这个抓取规则是比较复杂的,我要怎么写呢? + +![selectable-chain-ui](http://static.oschina.net/uploads/space/2014/0404/151454_2T01_190591.png) + +首先看到页面的html结构是这个样子的: + +![selectable-chain](http://static.oschina.net/uploads/space/2014/0404/151632_88Oq_190591.png) + +那么我可以先用CSS选择器提取出这个div,然后再取到所有的链接。为了保险起见,我再使用正则表达式限定一下提取出的URL的格式,那么最终的写法是这样子的: + +```java +List urls = page.getHtml().css("div.pagination").links().regex(".*/search/\\?l=java.*").all(); +``` + +然后,我们可以把这些URL加到抓取列表中去: + +```java +List urls = page.getHtml().css("div.pagination").links().regex(".*/search/\\?l=java.*").all(); +page.addTargetRequests(urls); +``` + +是不是比较简单?除了发现链接,Selectable的链式抽取还可以完成很多工作。我们会在第9章示例中再讲到。 + +#### 3.2.2 获取结果的API: + +当链式调用结束时,我们一般都想要拿到一个字符串类型的结果。这时候就需要用到获取结果的API了。我们知道,一条抽取规则,无论是XPath、CSS选择器或者正则表达式,总有可能抽取到多条元素。WebMagic对这些进行了统一,你可以通过不同的API获取到一个或者多个元素。 + +| 方法 | 说明 | 示例 | +| ------------ | ------------- | ------------ | +| get() | 返回一条String类型的结果 | String link= html.links().get()| +| toString() | 功能同get(),返回一条String类型的结果 | String link= html.links().toString()| +| all() | 返回所有抽取结果 | List links= html.links().all()| +| match() | 是否有匹配结果 | if (html.links().match()){ xxx; }| + +例如,我们知道页面只会有一条结果,那么可以使用selectable.get()或者selectable.toString()拿到这条结果。 + +这里selectable.toString()采用了toString()这个接口,是为了在输出以及和一些框架结合的时候,更加方便。因为一般情况下,我们都只需要选择一个元素! 
+ +selectable.all()则会获取到所有元素。 + +好了,到现在为止,再回过头看看3.1中的GithubRepoPageProcessor,可能就觉得更加清晰了吧?执行main方法,已经可以看到抓取结果在控制台输出了。 + +### 3.3 保存结果 + +好了,爬虫编写完成,现在我们可能还有一个问题:我如果想把抓取的结果保存下来,要怎么做呢?WebMagic用于保存结果的组件叫做`Pipeline`。例如我们通过“控制台输出结果”这件事也是通过一个内置的Pipeline完成的,它叫做`ConsolePipeline`。那么,我现在想要把结果用Json的格式保存下来,怎么做呢?我只需要将Pipeline的实现换成"JsonFilePipeline"就可以了。 + +```java + public static void main(String[] args) { + + Spider.create(new GithubRepoPageProcessor()) + //从"https://github.com/code4craft"开始抓 + .addUrl("https://github.com/code4craft") + .addPipeline(new JsonFilePipeline("D:\\webmagic\\")) + //开启5个线程抓取 + .thread(5) + //启动爬虫 + .run(); + } +``` + +这样子下载下来的文件就会保存在D盘的webmagic目录中了。 + +通过定制Pipeline,我们还可以实现保存结果到文件、数据库等一系列功能。这个会在第7章“抽取结果的处理”中介绍。 + +至此为止,我们已经完成了一个基本爬虫的编写,也具有了一些定制功能。 + +
+ +## 4. 抽取工具详解 + +### 4.1 XPath + +### 4.2 CSS选择器 + +### 4.3 正则表达式 + +### 4.4 JsonPath ## 5. 配置爬虫 @@ -198,25 +379,25 @@ Intellij Idea默认自带Maven支持,import项目时选择Maven项目即可。 ### 6.4 定期抓取 -## 7. 管理URL +## 7. 抽取结果的处理 -### 7.1 手动添加URL +### 7.1 输出到控制台 -### 7.2 在URL中保存信息 +### 7.2 保存到文件 -### 7.3 几种URL管理方式 +### 7.3 JSON格式输出 -### 7.4 自己管理爬虫的URL +### 7.4 自定义持久化方式(mysql/mongodb…) -## 8. 抽取结果的处理 +## 8. 管理URL -### 8.1 输出到控制台 +### 8.1 手动添加URL -### 8.2 保存到文件 +### 8.2 在URL中保存信息 -### 8.3 JSON格式输出 +### 8.3 几种URL管理方式 -### 8.4 自定义持久化方式(mysql/mongodb…) +### 8.4 自己管理爬虫的URL ## 9. 实例 From 53184f0390e5c2e8b77d3642b8475f8e95788e63 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=84=A4=E6=80=92=E7=9A=84=E7=95=AA=E8=8C=84?= Date: Sat, 12 Apr 2014 23:00:37 +0800 Subject: [PATCH 071/130] test --- .../java/us/codecraft/webmagic/samples/HuabanProcessor.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webmagic-selenium/src/test/java/us/codecraft/webmagic/samples/HuabanProcessor.java b/webmagic-selenium/src/test/java/us/codecraft/webmagic/samples/HuabanProcessor.java index fcfb068..2854a76 100644 --- a/webmagic-selenium/src/test/java/us/codecraft/webmagic/samples/HuabanProcessor.java +++ b/webmagic-selenium/src/test/java/us/codecraft/webmagic/samples/HuabanProcessor.java @@ -30,7 +30,7 @@ public class HuabanProcessor implements PageProcessor { @Override public Site getSite() { - if (site == null) { + if (null == site) { site = Site.me().setDomain("huaban.com").setSleepTime(0); } return site; From 99e12aafaa6906c6e9800fd094e3b60b05f55011 Mon Sep 17 00:00:00 2001 From: Tian Date: Sun, 13 Apr 2014 10:14:39 +0800 Subject: [PATCH 072/130] update:PatternHandler --- .../example/PatternProcessorDemo.java | 9 ++-- .../webmagic/handler/PatternHandler.java | 53 ++++++------------- .../webmagic/handler/SubPageProcessor.java | 39 +++++++------- .../webmagic/pipeline/PatternPipeline.java | 3 +- .../processor/PatternPageProcessor.java | 6 +-- 5 files changed, 44 insertions(+), 66 deletions(-) diff 
--git a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/PatternProcessorDemo.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/PatternProcessorDemo.java index 51a9484..e2303a0 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/PatternProcessorDemo.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/PatternProcessorDemo.java @@ -6,6 +6,7 @@ import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.handler.PatternHandler; +import us.codecraft.webmagic.handler.SubPageProcessor; import us.codecraft.webmagic.pipeline.PatternPipeline; import us.codecraft.webmagic.processor.PatternPageProcessor; @@ -32,21 +33,23 @@ public class PatternProcessorDemo { PatternHandler handler = new PatternHandler("http://item.jd.com/.*") { @Override - public void onExtract(Page page) { + public SubPageProcessor.MatchOtherProcessor process(Page page) { log.info("Extracting from " + page.getUrl()); page.putField("test", "hello world:)"); + return MatchOtherProcessor.YES; } @Override - public void onHandle(ResultItems result, Task task) { + public void handle(ResultItems result, Task task) { log.info("Handling " + result.getRequest().getUrl()); log.info("Retrieved test=" + result.get("test")); } }; - handler.register(processor, pipeline); + processor.addHandler(handler); + pipeline.addHandler(handler); Spider.create(processor).thread(5).addPipeline(pipeline).runAsync(); } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/PatternHandler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/PatternHandler.java index 51e44e0..4be03de 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/PatternHandler.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/PatternHandler.java @@ -3,8 +3,6 @@ package us.codecraft.webmagic.handler; import 
us.codecraft.webmagic.Page; import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.Task; -import us.codecraft.webmagic.pipeline.PatternPipeline; -import us.codecraft.webmagic.processor.PatternPageProcessor; import java.util.UUID; @@ -17,7 +15,7 @@ import java.util.UUID; * A PatternHandler is in charge of both page extraction and data processing by implementing * its two abstract methods. */ -public abstract class PatternHandler { +public abstract class PatternHandler implements SubPageProcessor { /** * identity of the handler. @@ -47,46 +45,25 @@ public abstract class PatternHandler { return url.matches(pattern); } - /** - * registers to both the page processor and the pipeline so the handler could take charge of - * both end of procedure. - * - * @param processor - * the processor to handle - * @param pipeline - * the pipeline to handle - */ - public void register(PatternPageProcessor processor, PatternPipeline pipeline) { - - processor.addHandler(this); - pipeline.addHandler(this); - } - - public void unregister(PatternPageProcessor processor, PatternPipeline pipeline) { - - processor.removeHandler(this); - pipeline.removeHandler(this); - } - - public boolean process(Page page) { + public boolean processPage(Page page) { if(match(page.getUrl().toString())) { page.putField(id, true); - onExtract(page); + process(page); return true; } else { return false; } } - public boolean process(ResultItems resultItems, Task task) { + public boolean processResult(ResultItems resultItems, Task task) { if(resultItems.isSkip()) { return false; } if(match(resultItems.getRequest().getUrl()) && resultItems.get(id) != null) { - onHandle(resultItems, task); + handle(resultItems, task); return true; } else { return false; @@ -94,20 +71,20 @@ public abstract class PatternHandler { } /** - * implements this method to extract from page. 
- * - * @param page - * the page to extract - */ - public abstract void onExtract(Page page); - - /** - * implements this method to handle the extraction result. + * override this method to handle the extraction result. This method MUST be used + * with PatternPipeline * * @param result * extraction result * @param task */ - public abstract void onHandle(ResultItems result, Task task); + public void handle(ResultItems result, Task task) { + } + + @Override + public boolean match(Page page) { + + return match(page.getUrl().toString()); + } } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/SubPageProcessor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/SubPageProcessor.java index c880500..3778a62 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/SubPageProcessor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/SubPageProcessor.java @@ -8,26 +8,27 @@ import us.codecraft.webmagic.Page; */ public interface SubPageProcessor { - /** - * Check whether the SubPageProcessor can process the page.

- * Please DO NOT change page status in this method. - * - * @param page - * @return - */ - public boolean match(Page page); + /** + * Check whether the SubPageProcessor can process the page.

+ * Please DO NOT change page status in this method. + * + * @param page + * + * @return + */ + public boolean match(Page page); - /** - * - * process the page, extract urls to fetch, extract the data and store - * - * @param page - * @return whether continue to match - */ - public MatchOtherProcessor process(Page page); + /** + * process the page, extract urls to fetch, extract the data and store + * + * @param page + * + * @return whether continue to match + */ + public MatchOtherProcessor process(Page page); - public enum MatchOtherProcessor { - YES, NO; - } + public enum MatchOtherProcessor { + YES, NO + } } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/PatternPipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/PatternPipeline.java index 582b162..c614114 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/PatternPipeline.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/PatternPipeline.java @@ -22,7 +22,6 @@ public class PatternPipeline implements Pipeline { * * @param handler the pattern handler * - * @see PatternHandler#register */ public void addHandler(PatternHandler handler) { @@ -38,7 +37,7 @@ public class PatternPipeline implements Pipeline { public void process(ResultItems resultItems, Task task) { for(PatternHandler handler : handlers) { - handler.process(resultItems, task); + handler.processResult(resultItems, task); } } } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/processor/PatternPageProcessor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/processor/PatternPageProcessor.java index d7d909c..51dbabe 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/processor/PatternPageProcessor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/processor/PatternPageProcessor.java @@ -47,18 +47,16 @@ public class PatternPageProcessor implements PageProcessor { 
page.addTargetRequests(requests); for(PatternHandler handler : handlers) { if(handler.match(page.getUrl().toString())) { - handler.process(page); + handler.processPage(page); } } } /** - * A handler works only if it is added to BOTH the page processor and the pipeline. - * Uses PatternHandler's register instead. * * @param handler the pattern handler * - * @see PatternHandler#register + * */ public void addHandler(PatternHandler handler) { From 03c251237b307ef5e5c193f165248fd7221f665d Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sun, 13 Apr 2014 10:23:00 +0800 Subject: [PATCH 073/130] add Json parse support --- webmagic-core/pom.xml | 11 ++++ .../main/java/us/codecraft/webmagic/Page.java | 18 +++++- .../us/codecraft/webmagic/selector/Json.java | 64 +++++++++++++++++++ .../webmagic/selector/JsonPathSelector.java | 2 +- .../webmagic/selector/PlainText.java | 5 ++ .../webmagic/selector/Selectable.java | 9 +++ .../java/us/codecraft/webmagic/HtmlTest.java | 4 +- .../selector/JsonPathSelectorTest.java | 0 .../codecraft/webmagic/selector/JsonTest.java | 20 ++++++ webmagic-extension/pom.xml | 9 --- .../webmagic/samples/AngularJSProcessor.java | 21 ++++++ 11 files changed, 150 insertions(+), 13 deletions(-) create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/selector/Json.java rename {webmagic-extension => webmagic-core}/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java (95%) rename {webmagic-extension => webmagic-core}/src/test/java/us/codecraft/webmagic/selector/JsonPathSelectorTest.java (100%) create mode 100644 webmagic-core/src/test/java/us/codecraft/webmagic/selector/JsonTest.java create mode 100644 webmagic-samples/src/main/java/us/codecraft/webmagic/samples/AngularJSProcessor.java diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 4bea6e2..0795a99 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -65,6 +65,17 @@ commons-io
+ + com.jayway.jsonpath + json-path + 0.8.1 + + + + com.alibaba + fastjson + + \ No newline at end of file diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java index e2d923e..3cafe62 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java @@ -2,6 +2,7 @@ package us.codecraft.webmagic; import org.apache.commons.lang3.StringUtils; import us.codecraft.webmagic.selector.Html; +import us.codecraft.webmagic.selector.Json; import us.codecraft.webmagic.selector.Selectable; import us.codecraft.webmagic.utils.UrlUtils; @@ -31,6 +32,8 @@ public class Page { private Html html; + private Json json; + private String rawText; private Selectable url; @@ -72,10 +75,23 @@ public class Page { return html; } + /** + * get json content of page + * + * @return json + * @since 0.5.0 + */ + public Json getJson() { + if (json == null) { + json = new Json(rawText); + } + return json; + } + /** * @param html * @deprecated since 0.4.0 - * The html is parse just when first time of calling {@link #getHtml()}, so use {@link #setRawText(String)} instead. + * The html is parse just when first time of calling {@link #getHtml()}, so use {@link #setRawText(String)} instead. 
*/ public void setHtml(Html html) { this.html = html; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Json.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Json.java new file mode 100644 index 0000000..ef45d00 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Json.java @@ -0,0 +1,64 @@ +package us.codecraft.webmagic.selector; + +import com.alibaba.fastjson.JSON; +import org.jsoup.parser.TokenQueue; + +import java.util.List; + +/** + * parse json + * @author code4crafter@gmail.com + * @since 0.5.0 + */ +public class Json extends PlainText { + + public Json(List strings) { + super(strings); + } + + public Json(String text) { + super(text); + } + + /** + * remove padding for JSONP + * @param padding + * @return + */ + public Json removePadding(String padding) { + String text = getText(); + TokenQueue tokenQueue = new TokenQueue(text); + tokenQueue.consumeWhitespace(); + tokenQueue.consume(padding); + tokenQueue.consumeWhitespace(); + String chompBalanced = tokenQueue.chompBalanced('(', ')'); + return new Json(chompBalanced); + } + + public T toObject(Class clazz) { + if (getText() == null) { + return null; + } + return JSON.parseObject(getText(), clazz); + } + + public List toList(Class clazz) { + if (getText() == null) { + return null; + } + return JSON.parseArray(getText(), clazz); + } + + public String getText() { + if (strings != null && strings.size() > 0) { + return strings.get(0); + } + return null; + } + + @Override + public Selectable jsonPath(String jsonPath) { + JsonPathSelector jsonPathSelector = new JsonPathSelector(jsonPath); + return selectList(jsonPathSelector,strings); + } +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java similarity index 95% rename from webmagic-extension/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java rename to 
webmagic-core/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java index 781669f..725dac5 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java @@ -22,7 +22,7 @@ public class JsonPathSelector implements Selector { public JsonPathSelector(String jsonPathStr) { this.jsonPathStr = jsonPathStr; - this.jsonPath = JsonPath.compile(jsonPathStr); + this.jsonPath = JsonPath.compile(this.jsonPathStr); } @Override diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java index 9d5c385..ca40fac 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java @@ -108,6 +108,11 @@ public class PlainText implements Selectable { return strings; } + @Override + public Selectable jsonPath(String jsonPath) { + throw new UnsupportedOperationException(); + } + @Override public String get() { if (CollectionUtils.isNotEmpty(all())) { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java index aa1bb62..cdab8bf 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java @@ -119,4 +119,13 @@ public interface Selectable { * @return multi string result */ public List all(); + + /** + * extract by JSON Path expression + * + * @param jsonPath + * @return + */ + public Selectable jsonPath(String jsonPath); + } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/HtmlTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/HtmlTest.java index c900014..fa66c3a 100644 --- 
a/webmagic-core/src/test/java/us/codecraft/webmagic/HtmlTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/HtmlTest.java @@ -1,6 +1,5 @@ package us.codecraft.webmagic; -import org.junit.Assert; import org.junit.Test; import us.codecraft.webmagic.selector.Html; @@ -14,7 +13,8 @@ public class HtmlTest { @Test public void testRegexSelector() { Html selectable = new Html("aaaaaaab"); - Assert.assertEquals("abbabbab", (selectable.regex("(.*)").replace("aa(a)", "$1bb").toString())); +// Assert.assertEquals("abbabbab", (selectable.regex("(.*)").replace("aa(a)", "$1bb").toString())); + System.out.println(selectable.regex("(.*)").replace("aa(a)", "$1bb").toString()); } diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/selector/JsonPathSelectorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/JsonPathSelectorTest.java similarity index 100% rename from webmagic-extension/src/test/java/us/codecraft/webmagic/selector/JsonPathSelectorTest.java rename to webmagic-core/src/test/java/us/codecraft/webmagic/selector/JsonPathSelectorTest.java diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/JsonTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/JsonTest.java new file mode 100644 index 0000000..89afbb6 --- /dev/null +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/JsonTest.java @@ -0,0 +1,20 @@ +package us.codecraft.webmagic.selector; + +import org.junit.Test; + +import static org.assertj.core.api.Assertions.assertThat; + +/** + * @author code4crafter@gmai.com + * @since 0.5.0 + */ +public class JsonTest { + + private String text = "callback({\"name\":\"json\"})"; + + @Test + public void testRemovePadding() throws Exception { + String name = new Json(text).removePadding("callback").jsonPath("$.name").get(); + assertThat(name).isEqualTo("json"); + } +} diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index cd8c12f..f5a4019 100644 --- 
a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -10,10 +10,6 @@ webmagic-extension - - com.alibaba - fastjson - redis.clients jedis @@ -28,11 +24,6 @@ junit junit - - com.jayway.jsonpath - json-path - 0.8.1 - \ No newline at end of file diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/AngularJSProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/AngularJSProcessor.java new file mode 100644 index 0000000..c861b03 --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/AngularJSProcessor.java @@ -0,0 +1,21 @@ +package us.codecraft.webmagic.samples; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.processor.PageProcessor; + +/** + * @author yihua.huang@dianping.com + */ +public class AngularJSProcessor implements PageProcessor{ + + @Override + public void process(Page page) { + + } + + @Override + public Site getSite() { + return null; + } +} From 84b897f83bcd9524bb57f0a5082fbe48bb6133cd Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sun, 13 Apr 2014 12:20:57 +0800 Subject: [PATCH 074/130] update AngularJSProcessor --- .../webmagic/selector/JsonPathSelector.java | 2 -- .../codecraft/webmagic/selector/JsonTest.java | 5 ++++ .../webmagic/samples/AngularJSProcessor.java | 28 +++++++++++++++++-- 3 files changed, 30 insertions(+), 5 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java index 725dac5..f9083a8 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java @@ -1,7 +1,6 @@ package us.codecraft.webmagic.selector; import com.jayway.jsonpath.JsonPath; -import us.codecraft.webmagic.utils.Experimental; import java.util.ArrayList; import java.util.List; @@ -13,7 +12,6 
@@ import java.util.List; * @author code4crafter@gmail.com
* @since 0.2.1 */ -@Experimental public class JsonPathSelector implements Selector { private String jsonPathStr; diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/JsonTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/JsonTest.java index 89afbb6..f77e30d 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/JsonTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/JsonTest.java @@ -1,6 +1,7 @@ package us.codecraft.webmagic.selector; import org.junit.Test; +import us.codecraft.webmagic.Page; import static org.assertj.core.api.Assertions.assertThat; @@ -16,5 +17,9 @@ public class JsonTest { public void testRemovePadding() throws Exception { String name = new Json(text).removePadding("callback").jsonPath("$.name").get(); assertThat(name).isEqualTo("json"); + Page page = null; + + page.getJson().jsonPath("$.name").get(); + page.getJson().removePadding("callback").jsonPath("$.name").get(); } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/AngularJSProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/AngularJSProcessor.java index c861b03..18719bd 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/AngularJSProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/AngularJSProcessor.java @@ -1,21 +1,43 @@ package us.codecraft.webmagic.samples; +import org.apache.commons.collections.CollectionUtils; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.processor.PageProcessor; +import us.codecraft.webmagic.selector.JsonPathSelector; + +import java.util.List; /** - * @author yihua.huang@dianping.com + * @author code4crafter@gmail.com + * @since 0.5.0 */ -public class AngularJSProcessor implements PageProcessor{ +public class AngularJSProcessor implements PageProcessor { + + private Site site = Site.me(); + + private static final String ARITICALE_URL = 
"http://angularjs\\.cn/api/article/\\w+"; + + private static final String LIST_URL = "http://angularjs\\.cn/api/article/latest.*"; @Override public void process(Page page) { + if (page.getUrl().regex(LIST_URL).match()) { + List ids = new JsonPathSelector("$.data._id").selectList(page.getRawText()); + if (CollectionUtils.isNotEmpty(ids)) { + for (String id : ids) { + page.addTargetRequest("http://angularjs\\.cn/api/article/" + id); + } + } + } else { + page.putField("title", new JsonPathSelector("$.title").select(page.getRawText())); + page.putField("content", new JsonPathSelector("$.content").select(page.getRawText())); + } } @Override public Site getSite() { - return null; + return site; } } From 32ba1b888941a5b1fe6e908c1b3e9c902bb80e9e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=84=A4=E6=80=92=E7=9A=84=E7=95=AA=E8=8C=84?= Date: Sun, 13 Apr 2014 12:41:15 +0800 Subject: [PATCH 075/130] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E5=87=A0=E5=A4=84?= =?UTF-8?q?=E6=B3=A8=E9=87=8A=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- webmagic-core/src/main/java/us/codecraft/webmagic/Site.java | 2 +- .../src/main/java/us/codecraft/webmagic/Spider.java | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java index e83e85f..48b43f0 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java @@ -218,7 +218,7 @@ public class Site { * * @deprecated * @see Spider#addRequest(Request...) 
- * @param startUrl + * @param startRequest * @return this */ public Site addStartRequest(Request startRequest) { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 6fe2880..8af1338 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -142,7 +142,7 @@ public class Spider implements Runnable, Task { * Set startUrls of Spider.
* Prior to startUrls of Site. * - * @param startUrls + * @param startRequests * @return this */ public Spider startRequest(List startRequests) { @@ -218,7 +218,7 @@ public class Spider implements Runnable, Task { /** * set pipelines for Spider * - * @param pipeline + * @param pipelines * @return this * @see Pipeline * @since 0.4.1 @@ -477,7 +477,7 @@ public class Spider implements Runnable, Task { /** * Add urls with information to crawl.
* - * @param urls + * @param requests * @return */ public Spider addRequest(Request... requests) { From b14f0ee47992e0456f05dd22363fb216e014a246 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sun, 13 Apr 2014 12:54:44 +0800 Subject: [PATCH 076/130] fix jsonpath in AngularJSProcessor --- .../webmagic/samples/AngularJSProcessor.java | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/AngularJSProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/AngularJSProcessor.java index 18719bd..ab560e4 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/AngularJSProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/AngularJSProcessor.java @@ -3,6 +3,7 @@ package us.codecraft.webmagic.samples; import org.apache.commons.collections.CollectionUtils; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.selector.JsonPathSelector; @@ -23,15 +24,15 @@ public class AngularJSProcessor implements PageProcessor { @Override public void process(Page page) { if (page.getUrl().regex(LIST_URL).match()) { - List ids = new JsonPathSelector("$.data._id").selectList(page.getRawText()); + List ids = new JsonPathSelector("$.data[*]._id").selectList(page.getRawText()); if (CollectionUtils.isNotEmpty(ids)) { for (String id : ids) { - page.addTargetRequest("http://angularjs\\.cn/api/article/" + id); + page.addTargetRequest("http://angularjs.cn/api/article/" + id); } } } else { - page.putField("title", new JsonPathSelector("$.title").select(page.getRawText())); - page.putField("content", new JsonPathSelector("$.content").select(page.getRawText())); + page.putField("title", new JsonPathSelector("$.data.title").select(page.getRawText())); + page.putField("content", new 
JsonPathSelector("$.data.content").select(page.getRawText())); } } @@ -40,4 +41,8 @@ public class AngularJSProcessor implements PageProcessor { public Site getSite() { return site; } + + public static void main(String[] args) { + Spider.create(new AngularJSProcessor()).addUrl("http://angularjs.cn/api/article/latest?p=1&s=20").run(); + } } From f7950ebcab37fb5e262d85f448ebbb056703c76f Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sun, 13 Apr 2014 13:00:31 +0800 Subject: [PATCH 077/130] fix tests --- .../test/java/us/codecraft/webmagic/selector/JsonTest.java | 5 ----- 1 file changed, 5 deletions(-) diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/JsonTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/JsonTest.java index f77e30d..89afbb6 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/JsonTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/JsonTest.java @@ -1,7 +1,6 @@ package us.codecraft.webmagic.selector; import org.junit.Test; -import us.codecraft.webmagic.Page; import static org.assertj.core.api.Assertions.assertThat; @@ -17,9 +16,5 @@ public class JsonTest { public void testRemovePadding() throws Exception { String name = new Json(text).removePadding("callback").jsonPath("$.name").get(); assertThat(name).isEqualTo("json"); - Page page = null; - - page.getJson().jsonPath("$.name").get(); - page.getJson().removePadding("callback").jsonPath("$.name").get(); } } From 3a79b1b64a3c1d4c46849df912874df983785278 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sun, 13 Apr 2014 23:02:34 +0800 Subject: [PATCH 078/130] [Bugfix]formatter property does not work when field is String#100 --- .../webmagic/model/PageModelExtractor.java | 26 +++++++++++-------- .../webmagic/model/samples/GithubRepo.java | 7 +++-- .../formatter/StringTemplateFormatter.java | 26 +++++++++++++++++++ 3 files changed, 44 insertions(+), 15 deletions(-) create mode 100644 
webmagic-samples/src/main/java/us/codecraft/webmagic/samples/formatter/StringTemplateFormatter.java diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java index b7b7900..3f92b28 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java @@ -76,9 +76,21 @@ class PageModelExtractor { } private void checkFormat(Field field, FieldExtractor fieldExtractor) { + //check custom formatter + Formatter formatter = field.getAnnotation(Formatter.class); + if (formatter != null && !formatter.formatter().equals(ObjectFormatter.class)) { + if (formatter != null) { + if (!formatter.formatter().equals(ObjectFormatter.class)) { + ObjectFormatter objectFormatter = initFormatter(formatter.formatter()); + objectFormatter.initParam(formatter.value()); + fieldExtractor.setObjectFormatter(objectFormatter); + return; + } + } + } if (!fieldExtractor.isMulti() && !String.class.isAssignableFrom(field.getType())) { Class fieldClazz = BasicTypeFormatter.detectBasicClass(field.getType()); - ObjectFormatter objectFormatter = getObjectFormatter(field, fieldClazz); + ObjectFormatter objectFormatter = getObjectFormatter(field, fieldClazz, formatter); if (objectFormatter == null) { throw new IllegalStateException("Can't find formatter for field " + field.getName() + " of type " + fieldClazz); } else { @@ -88,10 +100,9 @@ class PageModelExtractor { if (!List.class.isAssignableFrom(field.getType())) { throw new IllegalStateException("Field " + field.getName() + " must be list"); } - Formatter formatter = field.getAnnotation(Formatter.class); if (formatter != null) { if (!formatter.subClazz().equals(Void.class)) { - ObjectFormatter objectFormatter = getObjectFormatter(field, formatter.subClazz()); + ObjectFormatter objectFormatter = 
getObjectFormatter(field, formatter.subClazz(), formatter); if (objectFormatter == null) { throw new IllegalStateException("Can't find formatter for field " + field.getName() + " of type " + formatter.subClazz()); } else { @@ -102,14 +113,7 @@ class PageModelExtractor { } } - private ObjectFormatter getObjectFormatter(Field field, Class fieldClazz) { - Formatter formatter = field.getAnnotation(Formatter.class); - if (formatter != null) { - if (!formatter.formatter().equals(ObjectFormatter.class)) { - ObjectFormatter objectFormatter = initFormatter(formatter.formatter()); - objectFormatter.initParam(formatter.value()); - } - } + private ObjectFormatter getObjectFormatter(Field field, Class fieldClazz, Formatter formatter) { return initFormatter(ObjectFormatters.get(fieldClazz)); } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/GithubRepo.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/GithubRepo.java index e8998ec..57de3f1 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/GithubRepo.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/GithubRepo.java @@ -3,11 +3,9 @@ package us.codecraft.webmagic.model.samples; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.model.HasKey; import us.codecraft.webmagic.model.OOSpider; -import us.codecraft.webmagic.model.annotation.ExtractBy; -import us.codecraft.webmagic.model.annotation.ExtractByUrl; -import us.codecraft.webmagic.model.annotation.HelpUrl; -import us.codecraft.webmagic.model.annotation.TargetUrl; +import us.codecraft.webmagic.model.annotation.*; import us.codecraft.webmagic.pipeline.JsonFilePageModelPipeline; +import us.codecraft.webmagic.samples.formatter.StringTemplateFormatter; import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler; import java.util.List; @@ -22,6 +20,7 @@ public class GithubRepo implements HasKey { @ExtractBy(value = "//h1[@class='entry-title 
public']/strong/a/text()", notNull = true) private String name; + @Formatter(value = "author%s",formatter = StringTemplateFormatter.class) @ExtractByUrl("https://github\\.com/(\\w+)/.*") private String author; diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/formatter/StringTemplateFormatter.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/formatter/StringTemplateFormatter.java new file mode 100644 index 0000000..7b38125 --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/formatter/StringTemplateFormatter.java @@ -0,0 +1,26 @@ +package us.codecraft.webmagic.samples.formatter; + +import us.codecraft.webmagic.model.formatter.ObjectFormatter; + +/** + * @author yihua.huang@dianping.com + */ +public class StringTemplateFormatter implements ObjectFormatter { + + private String template; + + @Override + public String format(String raw) throws Exception { + return String.format(template, raw); + } + + @Override + public Class clazz() { + return String.class; + } + + @Override + public void initParam(String[] extra) { + template = extra[0]; + } +} From b249e49748bfb64c122d7a8f1c77c135f2065e87 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sun, 13 Apr 2014 23:04:09 +0800 Subject: [PATCH 079/130] [Bugfix]loop error when add TargetRequest #99 --- webmagic-core/src/main/java/us/codecraft/webmagic/Page.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java index 3cafe62..a74b608 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java @@ -127,7 +127,7 @@ public class Page { synchronized (targetRequests) { for (String s : requests) { if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) { - break; + continue; } s = UrlUtils.canonicalizeUrl(s, url.toString()); 
targetRequests.add(new Request(s).setPriority(priority)); From 4a035e729a52432bca196dd2d1e3d305888b3468 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sun, 13 Apr 2014 23:31:13 +0800 Subject: [PATCH 080/130] extension point for LocalDuplicatedRemovedScheduler #95 --- .../webmagic/scheduler/LocalDuplicatedRemovedScheduler.java | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/LocalDuplicatedRemovedScheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/LocalDuplicatedRemovedScheduler.java index 397199c..449c3f6 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/LocalDuplicatedRemovedScheduler.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/LocalDuplicatedRemovedScheduler.java @@ -24,11 +24,15 @@ public abstract class LocalDuplicatedRemovedScheduler implements Scheduler { @Override public void push(Request request, Task task) { logger.trace("get a candidate url {}", request.getUrl()); - if (request.getExtra(Request.CYCLE_TRIED_TIMES) != null || urls.add(request.getUrl())) { + if (urls.add(request.getUrl()) || shouldReserved(request)) { logger.debug("push to queue {}", request.getUrl()); pushWhenNoDuplicate(request, task); } } + protected boolean shouldReserved(Request request) { + return request.getExtra(Request.CYCLE_TRIED_TIMES) != null; + } + protected abstract void pushWhenNoDuplicate(Request request, Task task); } From ec446277b139411112dc065281c5bb0417e06c32 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Tue, 15 Apr 2014 15:30:37 +0800 Subject: [PATCH 081/130] some refactor in httpclientdownloader --- .../downloader/HttpClientDownloader.java | 54 +++++++++++-------- .../webmagic/model/samples/GithubRepo.java | 7 +-- 2 files changed, 36 insertions(+), 25 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java 
b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 30c561b..f0f53c6 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -7,6 +7,7 @@ import org.apache.http.annotation.ThreadSafe; import org.apache.http.client.config.CookieSpecs; import org.apache.http.client.config.RequestConfig; import org.apache.http.client.methods.CloseableHttpResponse; +import org.apache.http.client.methods.HttpUriRequest; import org.apache.http.client.methods.RequestBuilder; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.util.EntityUtils; @@ -75,26 +76,12 @@ public class HttpClientDownloader extends AbstractDownloader { acceptStatCode = Sets.newHashSet(200); } logger.info("downloading page {}" , request.getUrl()); - RequestBuilder requestBuilder = RequestBuilder.get().setUri(request.getUrl()); - if (headers != null) { - for (Map.Entry headerEntry : headers.entrySet()) { - requestBuilder.addHeader(headerEntry.getKey(), headerEntry.getValue()); - } - } - RequestConfig.Builder requestConfigBuilder = RequestConfig.custom() - .setConnectionRequestTimeout(site.getTimeOut()) - .setSocketTimeout(site.getTimeOut()) - .setConnectTimeout(site.getTimeOut()) - .setCookieSpec(CookieSpecs.BEST_MATCH); - if (site != null && site.getHttpProxy() != null) { - requestConfigBuilder.setProxy(site.getHttpProxy()); - } - requestBuilder.setConfig(requestConfigBuilder.build()); CloseableHttpResponse httpResponse = null; try { - httpResponse = getHttpClient(site).execute(requestBuilder.build()); + HttpUriRequest httpUriRequest = getHttpUriRequest(request, site, headers); + httpResponse = getHttpClient(site).execute(httpUriRequest); int statusCode = httpResponse.getStatusLine().getStatusCode(); - if (acceptStatCode.contains(statusCode)) { + if (statusAccept(acceptStatCode, statusCode)) { //charset if 
(charset == null) { String value = httpResponse.getEntity().getContentType().getValue(); @@ -123,6 +110,34 @@ public class HttpClientDownloader extends AbstractDownloader { } } + @Override + public void setThread(int thread) { + httpClientGenerator.setPoolSize(thread); + } + + protected boolean statusAccept(Set acceptStatCode, int statusCode) { + return acceptStatCode.contains(statusCode); + } + + protected HttpUriRequest getHttpUriRequest(Request request, Site site, Map headers) { + RequestBuilder requestBuilder = RequestBuilder.get().setUri(request.getUrl()); + if (headers != null) { + for (Map.Entry headerEntry : headers.entrySet()) { + requestBuilder.addHeader(headerEntry.getKey(), headerEntry.getValue()); + } + } + RequestConfig.Builder requestConfigBuilder = RequestConfig.custom() + .setConnectionRequestTimeout(site.getTimeOut()) + .setSocketTimeout(site.getTimeOut()) + .setConnectTimeout(site.getTimeOut()) + .setCookieSpec(CookieSpecs.BEST_MATCH); + if (site != null && site.getHttpProxy() != null) { + requestConfigBuilder.setProxy(site.getHttpProxy()); + } + requestBuilder.setConfig(requestConfigBuilder.build()); + return requestBuilder.build(); + } + protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException { String content = IOUtils.toString(httpResponse.getEntity().getContent(), charset); Page page = new Page(); @@ -132,9 +147,4 @@ public class HttpClientDownloader extends AbstractDownloader { page.setStatusCode(httpResponse.getStatusLine().getStatusCode()); return page; } - - @Override - public void setThread(int thread) { - httpClientGenerator.setPoolSize(thread); - } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/GithubRepo.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/GithubRepo.java index 57de3f1..e8998ec 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/GithubRepo.java +++ 
b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/GithubRepo.java @@ -3,9 +3,11 @@ package us.codecraft.webmagic.model.samples; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.model.HasKey; import us.codecraft.webmagic.model.OOSpider; -import us.codecraft.webmagic.model.annotation.*; +import us.codecraft.webmagic.model.annotation.ExtractBy; +import us.codecraft.webmagic.model.annotation.ExtractByUrl; +import us.codecraft.webmagic.model.annotation.HelpUrl; +import us.codecraft.webmagic.model.annotation.TargetUrl; import us.codecraft.webmagic.pipeline.JsonFilePageModelPipeline; -import us.codecraft.webmagic.samples.formatter.StringTemplateFormatter; import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler; import java.util.List; @@ -20,7 +22,6 @@ public class GithubRepo implements HasKey { @ExtractBy(value = "//h1[@class='entry-title public']/strong/a/text()", notNull = true) private String name; - @Formatter(value = "author%s",formatter = StringTemplateFormatter.class) @ExtractByUrl("https://github\\.com/(\\w+)/.*") private String author; From c8014a9ae6df3ec6c81da35a43d891c172fc733c Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Tue, 15 Apr 2014 15:34:37 +0800 Subject: [PATCH 082/130] update readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2056fba..62276eb 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -![logo](https://raw.github.com/code4craft/webmagic/master/asserts/logo.jpg) +![logo](https://raw.github.com/code4craft/webmagic/master/assets/logo.jpg) [Readme in Chinese](https://github.com/code4craft/webmagic/tree/master/zh_docs) From 163773af6bff2c3fb0e21da59db2f873f40f7a3d Mon Sep 17 00:00:00 2001 From: Bo LIANG Date: Wed, 16 Apr 2014 16:05:08 +0800 Subject: [PATCH 083/130] combine two try-catch block into one, make it cleaner. 
--- .../src/main/java/us/codecraft/webmagic/Spider.java | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 8af1338..4a7fbee 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -489,17 +489,17 @@ public class Spider implements Runnable, Task { } private void waitNewUrl() { + newUrlLock.lock(); try { - newUrlLock.lock(); //double check if (threadAlive.get() == 0 && exitWhenComplete) { return; } - try { - newUrlCondition.await(); - } catch (InterruptedException e) { - } - } finally { + newUrlCondition.await(); + } catch (InterruptedException e) { + logger.warn("waitNewUrl - interrupted, error {}", e); + } + finally { newUrlLock.unlock(); } } From 1fbfc92de2f2310d697747c7af7e5d5276a73439 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 16 Apr 2014 18:13:44 +0800 Subject: [PATCH 084/130] Inherit support of Field annotation in Model #103 --- .../webmagic/model/PageModelExtractor.java | 4 ++- .../codecraft/webmagic/utils/ClassUtils.java | 26 +++++++++++++++ .../us/codecraft/webmagic/model/BaseRepo.java | 12 +++++++ .../codecraft/webmagic/model/GithubRepo.java | 32 +++++++++++++++++++ .../webmagic/model/GithubRepoTest.java | 1 - 5 files changed, 73 insertions(+), 2 deletions(-) create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/utils/ClassUtils.java create mode 100644 webmagic-extension/src/test/java/us/codecraft/webmagic/model/BaseRepo.java create mode 100644 webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepo.java diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java index 3f92b28..8330edf 100644 --- 
a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java @@ -9,6 +9,7 @@ import us.codecraft.webmagic.model.formatter.BasicTypeFormatter; import us.codecraft.webmagic.model.formatter.ObjectFormatter; import us.codecraft.webmagic.model.formatter.ObjectFormatters; import us.codecraft.webmagic.selector.*; +import us.codecraft.webmagic.utils.ClassUtils; import us.codecraft.webmagic.utils.ExtractorUtils; import java.lang.annotation.Annotation; @@ -52,8 +53,9 @@ class PageModelExtractor { private void init(Class clazz) { this.clazz = clazz; initClassExtractors(); + clazz.getDeclaredFields() fieldExtractors = new ArrayList(); - for (Field field : clazz.getDeclaredFields()) { + for (Field field : ClassUtils.getFieldsIncludeSuperClass(clazz)) { field.setAccessible(true); FieldExtractor fieldExtractor = getAnnotationExtractBy(clazz, field); FieldExtractor fieldExtractorTmp = getAnnotationExtractCombo(clazz, field); diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/ClassUtils.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/ClassUtils.java new file mode 100644 index 0000000..ed22a4e --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/ClassUtils.java @@ -0,0 +1,26 @@ +package us.codecraft.webmagic.utils; + +import java.lang.reflect.Field; +import java.util.LinkedHashSet; +import java.util.Set; + +/** + * @author code4crafter@gmail.com + * @since 0.5.0 + */ +public abstract class ClassUtils { + + public static Set getFieldsIncludeSuperClass(Class clazz) { + Set fields = new LinkedHashSet(); + Class current = clazz; + while (current != null) { + Field[] currentFields = current.getDeclaredFields(); + for (Field currentField : currentFields) { + fields.add(currentField); + } + current = current.getSuperclass(); + } + return fields; + } + +} diff --git 
a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/BaseRepo.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/BaseRepo.java new file mode 100644 index 0000000..2d9cf94 --- /dev/null +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/BaseRepo.java @@ -0,0 +1,12 @@ +package us.codecraft.webmagic.model; + +import us.codecraft.webmagic.model.annotation.ExtractBy; + +/** + * @author code4crafter@gmail.com + */ +public class BaseRepo { + + @ExtractBy("//ul[@class='pagehead-actions']/li[1]//a[@class='social-count js-social-count']/text()") + protected int star; +} diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepo.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepo.java new file mode 100644 index 0000000..d825a1f --- /dev/null +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepo.java @@ -0,0 +1,32 @@ +package us.codecraft.webmagic.model; + +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.model.annotation.ExtractBy; +import us.codecraft.webmagic.model.annotation.HelpUrl; +import us.codecraft.webmagic.model.annotation.TargetUrl; + +/** + * @author code4crafter@gmail.com
+ * @since 0.3.2 + */ +@TargetUrl("https://github.com/\\w+/\\w+") +@HelpUrl({"https://github.com/\\w+\\?tab=repositories", "https://github.com/\\w+", "https://github.com/explore/*"}) +public class GithubRepo extends BaseRepo{ + + @ExtractBy("//ul[@class='pagehead-actions']/li[2]//a[@class='social-count']/text()") + private int fork; + + public static void main(String[] args) { + OOSpider.create(Site.me().setSleepTime(100) + , new ConsolePageModelPipeline(), GithubRepo.class) + .addUrl("https://github.com/code4craft").thread(10).run(); + } + + public int getStar() { + return star; + } + + public int getFork() { + return fork; + } +} diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepoTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepoTest.java index 85b6858..d9501a2 100644 --- a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepoTest.java +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepoTest.java @@ -5,7 +5,6 @@ import org.junit.Test; import us.codecraft.webmagic.downloader.MockGithubDownloader; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Task; -import us.codecraft.webmagic.example.GithubRepo; import us.codecraft.webmagic.pipeline.PageModelPipeline; /** From aae1ab2cd609c5f12d79ca7654dc9dbd13a77ebe Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 16 Apr 2014 18:14:13 +0800 Subject: [PATCH 085/130] fix compile error --- .../java/us/codecraft/webmagic/model/PageModelExtractor.java | 1 - 1 file changed, 1 deletion(-) diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java index 8330edf..9816c71 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java @@ -53,7 +53,6 @@ class 
PageModelExtractor { private void init(Class clazz) { this.clazz = clazz; initClassExtractors(); - clazz.getDeclaredFields() fieldExtractors = new ArrayList(); for (Field field : ClassUtils.getFieldsIncludeSuperClass(clazz)) { field.setAccessible(true); From 42bbe40a37ab64a845255c722737a1faad2e2ab7 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 16 Apr 2014 19:45:17 +0800 Subject: [PATCH 086/130] [Bugfix]Urls will be lost when call setScheduler() #104 --- .../src/main/java/us/codecraft/webmagic/Spider.java | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 4a7fbee..c9b1dd7 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -185,7 +185,12 @@ public class Spider implements Runnable, Task { */ public Spider setScheduler(Scheduler scheduler) { checkIfRunning(); + Scheduler oldScheduler = this.scheduler; this.scheduler = scheduler; + Request request; + while ((request = oldScheduler.poll(this)) != null) { + this.scheduler.push(request, this); + } return this; } From f39aa435cfc3758921eba038743adadd65566fab Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 16 Apr 2014 19:46:32 +0800 Subject: [PATCH 087/130] add null check #104 --- .../src/main/java/us/codecraft/webmagic/Spider.java | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index c9b1dd7..6d436bd 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -187,9 +187,11 @@ public class Spider implements Runnable, Task { checkIfRunning(); Scheduler oldScheduler = this.scheduler; this.scheduler = scheduler; - Request request; - while ((request 
= oldScheduler.poll(this)) != null) { - this.scheduler.push(request, this); + if (oldScheduler != null) { + Request request; + while ((request = oldScheduler.poll(this)) != null) { + this.scheduler.push(request, this); + } } return this; } From a5db6cf2929d2ca9091e15c4f7c3fbdb40fe3ead Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Thu, 17 Apr 2014 00:35:09 +0800 Subject: [PATCH 088/130] some monitor and JMX support #98 --- .../java/us/codecraft/webmagic/Spider.java | 35 +++++ .../downloader/AbstractDownloader.java | 6 + .../downloader/HttpClientDownloader.java | 5 +- .../monitor/MonitorableScheduler.java | 18 +++ .../webmagic/monitor/SpiderListener.java | 14 ++ .../webmagic/monitor/SpiderMonitor.java | 122 ++++++++++++++++++ .../webmagic/monitor/SpiderMonitorMBean.java | 14 ++ .../webmagic/monitor/SpiderStatus.java | 52 ++++++++ .../webmagic/monitor/SpiderStatusMBean.java | 22 ++++ .../LocalDuplicatedRemovedScheduler.java | 8 +- .../webmagic/scheduler/PriorityScheduler.java | 5 + .../webmagic/scheduler/QueueScheduler.java | 5 + .../scheduler/FileCacheQueueScheduler.java | 5 + .../webmagic/scheduler/RedisScheduler.java | 41 +++++- 14 files changed, 345 insertions(+), 7 deletions(-) create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/monitor/MonitorableScheduler.java create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/monitor/SpiderListener.java create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitorMBean.java create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/monitor/SpiderStatus.java create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/monitor/SpiderStatusMBean.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 6d436bd..c53afcb 100644 --- 
a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -6,6 +6,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import us.codecraft.webmagic.downloader.Downloader; import us.codecraft.webmagic.downloader.HttpClientDownloader; +import us.codecraft.webmagic.monitor.SpiderListener; import us.codecraft.webmagic.pipeline.CollectorPipeline; import us.codecraft.webmagic.pipeline.ConsolePipeline; import us.codecraft.webmagic.pipeline.Pipeline; @@ -101,6 +102,8 @@ public class Spider implements Runnable, Task { private final AtomicInteger threadAlive = new AtomicInteger(0); + private List spiderListeners; + private final AtomicLong pageCount = new AtomicLong(0); /** @@ -312,7 +315,9 @@ public class Spider implements Runnable, Task { public void run() { try { processRequest(requestFinal); + onSuccess(requestFinal); } catch (Exception e) { + onError(requestFinal); logger.error("download " + requestFinal + " error", e); } finally { threadAlive.decrementAndGet(); @@ -330,6 +335,22 @@ public class Spider implements Runnable, Task { } } + protected void onError(Request request) { + if (CollectionUtils.isNotEmpty(spiderListeners)){ + for (SpiderListener spiderListener : spiderListeners) { + spiderListener.onError(request); + } + } + } + + protected void onSuccess(Request request) { + if (CollectionUtils.isNotEmpty(spiderListeners)){ + for (SpiderListener spiderListener : spiderListeners) { + spiderListener.onSuccess(request); + } + } + } + private void checkRunningStat() { while (true) { int statNow = stat.get(); @@ -378,6 +399,7 @@ public class Spider implements Runnable, Task { protected void processRequest(Request request) { Page page = downloader.download(request, this); if (page == null) { + onError(request); sleep(site.getSleepTime()); return; } @@ -659,4 +681,17 @@ public class Spider implements Runnable, Task { public Site getSite() { return site; } + + public List 
getSpiderListeners() { + return spiderListeners; + } + + public Spider setSpiderListeners(List spiderListeners) { + this.spiderListeners = spiderListeners; + return this; + } + + public Scheduler getScheduler() { + return scheduler; + } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java index 2336856..5940c2f 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java @@ -34,6 +34,12 @@ public abstract class AbstractDownloader implements Downloader { return (Html) page.getHtml(); } + protected void onSuccess(Request request) { + } + + protected void onError(Request request) { + } + protected Page addToCycleRetry(Request request, Site site) { Page page = new Page(); Object cycleTriedTimesObject = request.getExtra(Request.CYCLE_TRIED_TIMES); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index f0f53c6..13e220f 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -87,7 +87,9 @@ public class HttpClientDownloader extends AbstractDownloader { String value = httpResponse.getEntity().getContentType().getValue(); charset = UrlUtils.getCharset(value); } - return handleResponse(request, charset, httpResponse, task); + Page page = handleResponse(request, charset, httpResponse, task); + onSuccess(request); + return page; } else { logger.warn("code error " + statusCode + "\t" + request.getUrl()); return null; @@ -97,6 +99,7 @@ public class HttpClientDownloader extends AbstractDownloader { if (site.getCycleRetryTimes() > 0) { return 
addToCycleRetry(request, site); } + onError(request); return null; } finally { try { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/monitor/MonitorableScheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/monitor/MonitorableScheduler.java new file mode 100644 index 0000000..11889ac --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/monitor/MonitorableScheduler.java @@ -0,0 +1,18 @@ +package us.codecraft.webmagic.monitor; + +import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.scheduler.Scheduler; + +/** + * The scheduler whose requests can be counted for monitor. + * + * @author code4crafter@gmail.com + * @since 0.5.0 + */ +public interface MonitorableScheduler extends Scheduler { + + public int getLeftRequestsCount(Task task); + + public int getTotalRequestsCount(Task task); + +} \ No newline at end of file diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/monitor/SpiderListener.java b/webmagic-core/src/main/java/us/codecraft/webmagic/monitor/SpiderListener.java new file mode 100644 index 0000000..7a6c687 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/monitor/SpiderListener.java @@ -0,0 +1,14 @@ +package us.codecraft.webmagic.monitor; + +import us.codecraft.webmagic.Request; + +/** + * @author code4crafer@gmail.com + * @since 0.5.0 + */ +public interface SpiderListener { + + public void onSuccess(Request request); + + public void onError(Request request); +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java new file mode 100644 index 0000000..ccf498d --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java @@ -0,0 +1,122 @@ +package us.codecraft.webmagic.monitor; + +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.processor.example.GithubRepoPageProcessor; 
+import us.codecraft.webmagic.processor.example.OschinaBlogPageProcessor; + +import javax.management.*; +import javax.management.remote.JMXConnectorServer; +import javax.management.remote.JMXConnectorServerFactory; +import javax.management.remote.JMXServiceURL; +import java.io.IOException; +import java.rmi.registry.LocateRegistry; +import java.rmi.registry.Registry; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.concurrent.atomic.AtomicInteger; + +/** + * @author code4crafer@gmail.com + * @since 0.5.0 + */ +public class SpiderMonitor implements SpiderMonitorMBean { + + private List spiderStatuses = new ArrayList(); + + @Override + public List getSpiders() { + return spiderStatuses; + } + + @Override + public SpiderStatus getSpider() { + return spiderStatuses.get(0); + } + + public void register(Spider spider) { + MonitorSpiderListener monitorSpiderListener = new MonitorSpiderListener(); + if (spider.getSpiderListeners() == null) { + List spiderListeners = new ArrayList(); + spiderListeners.add(monitorSpiderListener); + spider.setSpiderListeners(spiderListeners); + } else { + spider.getSpiderListeners().add(monitorSpiderListener); + } + spiderStatuses.add(new SpiderStatus(spider, monitorSpiderListener)); + + } + + public class MonitorSpiderListener implements SpiderListener { + + private final AtomicInteger successCount = new AtomicInteger(0); + + private final AtomicInteger errorCount = new AtomicInteger(0); + + private List errorUrls = Collections.synchronizedList(new ArrayList()); + + @Override + public void onSuccess(Request request) { + successCount.incrementAndGet(); + } + + @Override + public void onError(Request request) { + errorUrls.add(request.getUrl()); + errorCount.incrementAndGet(); + } + + public AtomicInteger getSuccessCount() { + return successCount; + } + + public AtomicInteger getErrorCount() { + return errorCount; + } + + public List getErrorUrls() { + return errorUrls; + } + } + + + public 
static void main(String[] args) throws MalformedObjectNameException, + NullPointerException, InstanceAlreadyExistsException, + MBeanRegistrationException, NotCompliantMBeanException, IOException { + + int rmiPort = 1099; + SpiderMonitor spiderMonitor = new SpiderMonitor(); + String jmxServerName = "TestJMXServer"; + + Spider oschinaSpider = Spider.create(new OschinaBlogPageProcessor()).addUrl("http://my.oschina.net/flashsword/blog").thread(2); + + spiderMonitor.register(oschinaSpider); + + Spider githubSpider = Spider.create(new GithubRepoPageProcessor()).addUrl("https://github.com/code4craft"); + + spiderMonitor.register(githubSpider); + + // jdkfolder/bin/rmiregistry.exe 9999 + Registry registry = LocateRegistry.createRegistry(rmiPort); + + MBeanServer mbs = MBeanServerFactory.createMBeanServer(jmxServerName); + //MBeanServer mbs = ManagementFactory.getPlatformMBeanServer(); + + ObjectName objName = new ObjectName(jmxServerName + ":name=" + "HelloWorld"); + mbs.registerMBean(spiderMonitor, objName); + + JMXServiceURL url = new JMXServiceURL("service:jmx:rmi:///jndi/rmi://localhost:" + rmiPort + "/" + jmxServerName); + System.out.println("JMXServiceURL: " + url.toString()); + JMXConnectorServer jmxConnServer = JMXConnectorServerFactory.newJMXConnectorServer(url, null, mbs); + jmxConnServer.start(); + + for (SpiderStatus spiderStatuse : spiderMonitor.spiderStatuses) { + objName = new ObjectName(jmxServerName + ":name=" + spiderStatuse.getName()); + mbs.registerMBean(spiderStatuse, objName); + } + + + } + +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitorMBean.java b/webmagic-core/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitorMBean.java new file mode 100644 index 0000000..8b77b33 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitorMBean.java @@ -0,0 +1,14 @@ +package us.codecraft.webmagic.monitor; + +import java.util.List; + +/** + * @author code4crafer@gmail.com + */ +public 
interface SpiderMonitorMBean { + + public List getSpiders(); + + public SpiderStatus getSpider(); + +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/monitor/SpiderStatus.java b/webmagic-core/src/main/java/us/codecraft/webmagic/monitor/SpiderStatus.java new file mode 100644 index 0000000..84d8603 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/monitor/SpiderStatus.java @@ -0,0 +1,52 @@ +package us.codecraft.webmagic.monitor; + +import us.codecraft.webmagic.Spider; + +import java.util.List; + +/** + * @author code4crafer@gmail.com + * @since 0.5.0 + */ +public class SpiderStatus implements SpiderStatusMBean{ + + private final Spider spider; + + private final SpiderMonitor.MonitorSpiderListener monitorSpiderListener; + + public SpiderStatus(Spider spider, SpiderMonitor.MonitorSpiderListener monitorSpiderListener) { + this.spider = spider; + this.monitorSpiderListener = monitorSpiderListener; + } + + public String getName() { + return spider.getUUID(); + } + + public int getLeftPages() { + if (spider.getScheduler() instanceof MonitorableScheduler) { + return ((MonitorableScheduler) spider.getScheduler()).getLeftRequestsCount(spider); + } + return -1; + } + + public int getTotalPages() { + if (spider.getScheduler() instanceof MonitorableScheduler) { + return ((MonitorableScheduler) spider.getScheduler()).getTotalRequestsCount(spider); + } + return -1; + } + + public List getErrorPages() { + return monitorSpiderListener.getErrorUrls(); + } + + public void start() { + spider.start(); + } + + public void stop() { + spider.stop(); + } + +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/monitor/SpiderStatusMBean.java b/webmagic-core/src/main/java/us/codecraft/webmagic/monitor/SpiderStatusMBean.java new file mode 100644 index 0000000..cd884a5 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/monitor/SpiderStatusMBean.java @@ -0,0 +1,22 @@ +package us.codecraft.webmagic.monitor; + +import 
java.util.List; + +/** + * @author code4crafer@gmail.com + * @since 0.5.0 + */ +public interface SpiderStatusMBean { + + public String getName(); + + public int getLeftPages(); + + public int getTotalPages(); + public List getErrorPages(); + + public void start(); + + public void stop(); + +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/LocalDuplicatedRemovedScheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/LocalDuplicatedRemovedScheduler.java index 449c3f6..2807e0f 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/LocalDuplicatedRemovedScheduler.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/LocalDuplicatedRemovedScheduler.java @@ -5,6 +5,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.monitor.MonitorableScheduler; import java.util.Set; import java.util.concurrent.ConcurrentHashMap; @@ -15,7 +16,7 @@ import java.util.concurrent.ConcurrentHashMap; * @author code4crafter@gmail.com * @since 0.5.0 */ -public abstract class LocalDuplicatedRemovedScheduler implements Scheduler { +public abstract class LocalDuplicatedRemovedScheduler implements MonitorableScheduler { protected Logger logger = LoggerFactory.getLogger(getClass()); @@ -34,5 +35,10 @@ public abstract class LocalDuplicatedRemovedScheduler implements Scheduler { return request.getExtra(Request.CYCLE_TRIED_TIMES) != null; } + @Override + public int getTotalRequestsCount(Task task) { + return urls.size(); + } + protected abstract void pushWhenNoDuplicate(Request request, Task task); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/PriorityScheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/PriorityScheduler.java index 04917ad..a57a6fb 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/PriorityScheduler.java +++ 
b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/PriorityScheduler.java @@ -60,4 +60,9 @@ public class PriorityScheduler extends LocalDuplicatedRemovedScheduler { } return priorityQueueMinus.poll(); } + + @Override + public int getLeftRequestsCount(Task task) { + return noPriorityQueue.size(); + } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java index ab288df..e2a6e75 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java @@ -29,4 +29,9 @@ public class QueueScheduler extends LocalDuplicatedRemovedScheduler { public synchronized Request poll(Task task) { return queue.poll(); } + + @Override + public int getLeftRequestsCount(Task task) { + return queue.size(); + } } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java index 79f3b8b..9d7668d 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java @@ -161,4 +161,9 @@ public class FileCacheQueueScheduler extends LocalDuplicatedRemovedScheduler { fileCursorWriter.println(cursor.incrementAndGet()); return queue.poll(); } + + @Override + public int getLeftRequestsCount(Task task) { + return queue.size(); + } } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java index cd90625..16f9147 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java +++ 
b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java @@ -7,6 +7,7 @@ import redis.clients.jedis.JedisPool; import redis.clients.jedis.JedisPoolConfig; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.monitor.MonitorableScheduler; /** * Use Redis as url scheduler for distributed crawlers.
@@ -14,7 +15,7 @@ import us.codecraft.webmagic.Task; * @author code4crafter@gmail.com
* @since 0.2.0 */ -public class RedisScheduler implements Scheduler { +public class RedisScheduler implements MonitorableScheduler { private JedisPool pool; @@ -39,10 +40,10 @@ public class RedisScheduler implements Scheduler { // if cycleRetriedTimes is set, allow duplicated. Object cycleRetriedTimes = request.getExtra(Request.CYCLE_TRIED_TIMES); // use set to remove duplicate url - if (cycleRetriedTimes != null || !jedis.sismember(SET_PREFIX + task.getUUID(), request.getUrl())) { + if (cycleRetriedTimes != null || !jedis.sismember(getSetKey(task), request.getUrl())) { // use list to store queue - jedis.rpush(QUEUE_PREFIX + task.getUUID(), request.getUrl()); - jedis.sadd(SET_PREFIX + task.getUUID(), request.getUrl()); + jedis.rpush(getQueueKey(task), request.getUrl()); + jedis.sadd(getSetKey(task), request.getUrl()); if (request.getExtras() != null) { String field = DigestUtils.shaHex(request.getUrl()); String value = JSON.toJSONString(request); @@ -58,7 +59,7 @@ public class RedisScheduler implements Scheduler { public synchronized Request poll(Task task) { Jedis jedis = pool.getResource(); try { - String url = jedis.lpop(QUEUE_PREFIX + task.getUUID()); + String url = jedis.lpop(getQueueKey(task)); if (url == null) { return null; } @@ -75,4 +76,34 @@ public class RedisScheduler implements Scheduler { pool.returnResource(jedis); } } + + protected String getSetKey(Task task) { + return SET_PREFIX + task.getUUID(); + } + + protected String getQueueKey(Task task) { + return QUEUE_PREFIX + task.getUUID(); + } + + @Override + public int getLeftRequestsCount(Task task) { + Jedis jedis = pool.getResource(); + try { + Long size = jedis.llen(getQueueKey(task)); + return size.intValue(); + } finally { + pool.returnResource(jedis); + } + } + + @Override + public int getTotalRequestsCount(Task task) { + Jedis jedis = pool.getResource(); + try { + Long size = jedis.scard(getQueueKey(task)); + return size.intValue(); + } finally { + pool.returnResource(jedis); + } + } } From 
27b37e8164c53f7079fadc0793d09553dd66bd3c Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Thu, 17 Apr 2014 08:12:37 +0800 Subject: [PATCH 089/130] extension point and sample for JMX support #98 --- .../webmagic/monitor/SpiderMonitor.java | 81 +++++++++++-------- .../webmagic/monitor/SpiderMonitorMBean.java | 14 ---- .../webmagic/monitor/SpiderStatus.java | 36 +++++++-- .../webmagic/monitor/SpiderStatusMBean.java | 14 +++- .../webmagic/monitor/CustomSpiderStatus.java | 19 +++++ .../monitor/CustomSpiderStatusMBean.java | 10 +++ .../webmagic/monitor/SpiderMonitorTest.java | 32 ++++++++ 7 files changed, 149 insertions(+), 57 deletions(-) delete mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitorMBean.java create mode 100644 webmagic-core/src/test/java/us/codecraft/webmagic/monitor/CustomSpiderStatus.java create mode 100644 webmagic-core/src/test/java/us/codecraft/webmagic/monitor/CustomSpiderStatusMBean.java create mode 100644 webmagic-core/src/test/java/us/codecraft/webmagic/monitor/SpiderMonitorTest.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java index ccf498d..2c11d71 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java @@ -21,31 +21,39 @@ import java.util.concurrent.atomic.AtomicInteger; * @author code4crafer@gmail.com * @since 0.5.0 */ -public class SpiderMonitor implements SpiderMonitorMBean { +public class SpiderMonitor { - private List spiderStatuses = new ArrayList(); + private List spiderStatuses = new ArrayList(); - @Override - public List getSpiders() { + public List getSpiders() { return spiderStatuses; } - @Override - public SpiderStatus getSpider() { + public SpiderStatusMBean getSpider() { return spiderStatuses.get(0); } - public void register(Spider spider) { - 
MonitorSpiderListener monitorSpiderListener = new MonitorSpiderListener(); - if (spider.getSpiderListeners() == null) { - List spiderListeners = new ArrayList(); - spiderListeners.add(monitorSpiderListener); - spider.setSpiderListeners(spiderListeners); - } else { - spider.getSpiderListeners().add(monitorSpiderListener); + public SpiderMonitor register(Spider... spiders) { + for (Spider spider : spiders) { + MonitorSpiderListener monitorSpiderListener = new MonitorSpiderListener(); + if (spider.getSpiderListeners() == null) { + List spiderListeners = new ArrayList(); + spiderListeners.add(monitorSpiderListener); + spider.setSpiderListeners(spiderListeners); + } else { + spider.getSpiderListeners().add(monitorSpiderListener); + } + spiderStatuses.add(getSpiderStatusMBean(spider, monitorSpiderListener)); } - spiderStatuses.add(new SpiderStatus(spider, monitorSpiderListener)); + return this; + } + protected SpiderStatusMBean getSpiderStatusMBean(Spider spider, MonitorSpiderListener monitorSpiderListener) { + return new SpiderStatus(spider, monitorSpiderListener); + } + + public static SpiderMonitor create(){ + return new SpiderMonitor(); } public class MonitorSpiderListener implements SpiderListener { @@ -81,42 +89,45 @@ public class SpiderMonitor implements SpiderMonitorMBean { } - public static void main(String[] args) throws MalformedObjectNameException, - NullPointerException, InstanceAlreadyExistsException, - MBeanRegistrationException, NotCompliantMBeanException, IOException { - - int rmiPort = 1099; - SpiderMonitor spiderMonitor = new SpiderMonitor(); - String jmxServerName = "TestJMXServer"; - - Spider oschinaSpider = Spider.create(new OschinaBlogPageProcessor()).addUrl("http://my.oschina.net/flashsword/blog").thread(2); - - spiderMonitor.register(oschinaSpider); - - Spider githubSpider = Spider.create(new GithubRepoPageProcessor()).addUrl("https://github.com/code4craft"); - - spiderMonitor.register(githubSpider); + public void jmxStart() throws IOException, 
JMException { + jmxStart(14721); + } + public void jmxStart(int rmiPort) throws IOException, JMException { + String jmxServerName = "WebMagic"; // jdkfolder/bin/rmiregistry.exe 9999 Registry registry = LocateRegistry.createRegistry(rmiPort); MBeanServer mbs = MBeanServerFactory.createMBeanServer(jmxServerName); //MBeanServer mbs = ManagementFactory.getPlatformMBeanServer(); - ObjectName objName = new ObjectName(jmxServerName + ":name=" + "HelloWorld"); - mbs.registerMBean(spiderMonitor, objName); + ObjectName objName; JMXServiceURL url = new JMXServiceURL("service:jmx:rmi:///jndi/rmi://localhost:" + rmiPort + "/" + jmxServerName); System.out.println("JMXServiceURL: " + url.toString()); JMXConnectorServer jmxConnServer = JMXConnectorServerFactory.newJMXConnectorServer(url, null, mbs); jmxConnServer.start(); - for (SpiderStatus spiderStatuse : spiderMonitor.spiderStatuses) { - objName = new ObjectName(jmxServerName + ":name=" + spiderStatuse.getName()); - mbs.registerMBean(spiderStatuse, objName); + for (SpiderStatusMBean spiderStatus : spiderStatuses) { + objName = new ObjectName(jmxServerName + ":name=" + spiderStatus.getName()); + mbs.registerMBean(spiderStatus, objName); } + } + public static void main(String[] args) throws JMException, + NullPointerException, + IOException { + + Spider oschinaSpider = Spider.create(new OschinaBlogPageProcessor()) + .addUrl("http://my.oschina.net/flashsword/blog").thread(2); + Spider githubSpider = Spider.create(new GithubRepoPageProcessor()) + .addUrl("https://github.com/code4craft"); + + SpiderMonitor spiderMonitor = new SpiderMonitor(); + spiderMonitor.register(oschinaSpider, githubSpider); + spiderMonitor.jmxStart(); + } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitorMBean.java b/webmagic-core/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitorMBean.java deleted file mode 100644 index 8b77b33..0000000 --- 
a/webmagic-core/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitorMBean.java +++ /dev/null @@ -1,14 +0,0 @@ -package us.codecraft.webmagic.monitor; - -import java.util.List; - -/** - * @author code4crafer@gmail.com - */ -public interface SpiderMonitorMBean { - - public List getSpiders(); - - public SpiderStatus getSpider(); - -} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/monitor/SpiderStatus.java b/webmagic-core/src/main/java/us/codecraft/webmagic/monitor/SpiderStatus.java index 84d8603..6b49cbb 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/monitor/SpiderStatus.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/monitor/SpiderStatus.java @@ -1,5 +1,7 @@ package us.codecraft.webmagic.monitor; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import us.codecraft.webmagic.Spider; import java.util.List; @@ -8,11 +10,13 @@ import java.util.List; * @author code4crafer@gmail.com * @since 0.5.0 */ -public class SpiderStatus implements SpiderStatusMBean{ +public class SpiderStatus implements SpiderStatusMBean { - private final Spider spider; + protected final Spider spider; - private final SpiderMonitor.MonitorSpiderListener monitorSpiderListener; + protected Logger logger = LoggerFactory.getLogger(getClass()); + + protected final SpiderMonitor.MonitorSpiderListener monitorSpiderListener; public SpiderStatus(Spider spider, SpiderMonitor.MonitorSpiderListener monitorSpiderListener) { this.spider = spider; @@ -23,24 +27,46 @@ public class SpiderStatus implements SpiderStatusMBean{ return spider.getUUID(); } - public int getLeftPages() { + public int getLeftPageCount() { if (spider.getScheduler() instanceof MonitorableScheduler) { return ((MonitorableScheduler) spider.getScheduler()).getLeftRequestsCount(spider); } + logger.warn("Get leftPageCount fail, try to use a Scheduler implement MonitorableScheduler for monitor count!"); return -1; } - public int getTotalPages() { + public int getTotalPageCount() { if 
(spider.getScheduler() instanceof MonitorableScheduler) { return ((MonitorableScheduler) spider.getScheduler()).getTotalRequestsCount(spider); } + logger.warn("Get totalPageCount fail, try to use a Scheduler implement MonitorableScheduler for monitor count!"); return -1; } + @Override + public int getSuccessPageCount() { + return monitorSpiderListener.getSuccessCount().get(); + } + + @Override + public int getErrorPageCount() { + return monitorSpiderListener.getErrorCount().get(); + } + public List getErrorPages() { return monitorSpiderListener.getErrorUrls(); } + @Override + public String getStatus() { + return spider.getStatus().name(); + } + + @Override + public int getThread() { + return spider.getThreadAlive(); + } + public void start() { spider.start(); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/monitor/SpiderStatusMBean.java b/webmagic-core/src/main/java/us/codecraft/webmagic/monitor/SpiderStatusMBean.java index cd884a5..156b653 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/monitor/SpiderStatusMBean.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/monitor/SpiderStatusMBean.java @@ -10,13 +10,21 @@ public interface SpiderStatusMBean { public String getName(); - public int getLeftPages(); + public String getStatus(); + + public int getThread(); + + public int getTotalPageCount(); + + public int getLeftPageCount(); + + public int getSuccessPageCount(); + + public int getErrorPageCount(); - public int getTotalPages(); public List getErrorPages(); public void start(); public void stop(); - } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/monitor/CustomSpiderStatus.java b/webmagic-core/src/test/java/us/codecraft/webmagic/monitor/CustomSpiderStatus.java new file mode 100644 index 0000000..a0b7f77 --- /dev/null +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/monitor/CustomSpiderStatus.java @@ -0,0 +1,19 @@ +package us.codecraft.webmagic.monitor; + +import us.codecraft.webmagic.Spider; + +/** 
+ * @author code4crafer@gmail.com + */ +public class CustomSpiderStatus extends SpiderStatus implements CustomSpiderStatusMBean { + + public CustomSpiderStatus(Spider spider, SpiderMonitor.MonitorSpiderListener monitorSpiderListener) { + super(spider, monitorSpiderListener); + } + + + @Override + public String getSchedulerName() { + return spider.getScheduler().getClass().getName(); + } +} diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/monitor/CustomSpiderStatusMBean.java b/webmagic-core/src/test/java/us/codecraft/webmagic/monitor/CustomSpiderStatusMBean.java new file mode 100644 index 0000000..104b27d --- /dev/null +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/monitor/CustomSpiderStatusMBean.java @@ -0,0 +1,10 @@ +package us.codecraft.webmagic.monitor; + +/** + * @author code4crafer@gmail.com + */ +public interface CustomSpiderStatusMBean extends SpiderStatusMBean { + + public String getSchedulerName(); + +} diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/monitor/SpiderMonitorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/monitor/SpiderMonitorTest.java new file mode 100644 index 0000000..450f3fe --- /dev/null +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/monitor/SpiderMonitorTest.java @@ -0,0 +1,32 @@ +package us.codecraft.webmagic.monitor; + +import org.junit.Test; +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.processor.example.GithubRepoPageProcessor; +import us.codecraft.webmagic.processor.example.OschinaBlogPageProcessor; + +/** + * @author code4crafer@gmail.com + * @since 0.5.0 + */ +public class SpiderMonitorTest { + + @Test + public void testInherit() throws Exception { + SpiderMonitor spiderMonitor = new SpiderMonitor(){ + @Override + protected SpiderStatusMBean getSpiderStatusMBean(Spider spider, MonitorSpiderListener monitorSpiderListener) { + return new CustomSpiderStatus(spider, monitorSpiderListener); + } + }; + + Spider oschinaSpider = Spider.create(new 
OschinaBlogPageProcessor()) + .addUrl("http://my.oschina.net/flashsword/blog").thread(2); + Spider githubSpider = Spider.create(new GithubRepoPageProcessor()) + .addUrl("https://github.com/code4craft"); + + spiderMonitor.register(oschinaSpider, githubSpider); + spiderMonitor.jmxStart(); + + } +} From 023c2ac84e2c42bbee6035bbd4bd4f7e009cf820 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Thu, 17 Apr 2014 16:44:32 +0800 Subject: [PATCH 090/130] spider config draft --- .../src/main/resouces/spider-config-draft.xml | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 webmagic-extension/src/main/resouces/spider-config-draft.xml diff --git a/webmagic-extension/src/main/resouces/spider-config-draft.xml b/webmagic-extension/src/main/resouces/spider-config-draft.xml new file mode 100644 index 0000000..85aee4d --- /dev/null +++ b/webmagic-extension/src/main/resouces/spider-config-draft.xml @@ -0,0 +1,29 @@ + + + + utf-8 + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file From 08fa3b01c14225a52c78f806055c4a6abf37ce85 Mon Sep 17 00:00:00 2001 From: Bo LIANG Date: Thu, 17 Apr 2014 17:53:12 +0800 Subject: [PATCH 091/130] when download error, throw an exception instead of calling onError and returning peacefully. 
#105 --- .../src/main/java/us/codecraft/webmagic/Spider.java | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index c53afcb..a03dee1 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -1,9 +1,11 @@ package us.codecraft.webmagic; import com.google.common.collect.Lists; + import org.apache.commons.collections.CollectionUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; + import us.codecraft.webmagic.downloader.Downloader; import us.codecraft.webmagic.downloader.HttpClientDownloader; import us.codecraft.webmagic.monitor.SpiderListener; @@ -318,7 +320,7 @@ public class Spider implements Runnable, Task { onSuccess(requestFinal); } catch (Exception e) { onError(requestFinal); - logger.error("download " + requestFinal + " error", e); + logger.error("process request " + requestFinal + " error", e); } finally { threadAlive.decrementAndGet(); pageCount.incrementAndGet(); @@ -399,9 +401,8 @@ public class Spider implements Runnable, Task { protected void processRequest(Request request) { Page page = downloader.download(request, this); if (page == null) { - onError(request); sleep(site.getSleepTime()); - return; + throw new IllegalStateException("download error"); } // for cycle retry if (page.isNeedCycleRetry()) { From b06aa489fba007ba1e634f792045f0b52caf3c80 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 18 Apr 2014 17:48:26 +0800 Subject: [PATCH 092/130] [BugFix]Only one url from sourceRegion can be extracted #107 --- .../us/codecraft/webmagic/selector/PlainText.java | 10 ++++++++++ .../codecraft/webmagic/selector/Selectable.java | 15 +++++++++++++++ .../webmagic/model/ModelPageProcessor.java | 4 +--- 3 files changed, 26 insertions(+), 3 deletions(-) diff --git 
a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java index ca40fac..efa38d8 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java @@ -122,6 +122,16 @@ public class PlainText implements Selectable { } } + @Override + public Selectable select(Selector selector) { + return select(selector, strings); + } + + @Override + public Selectable selectList(Selector selector) { + return selectList(selector, strings); + } + @Override public String toString() { return get(); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java index cdab8bf..2cc4ed9 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java @@ -128,4 +128,19 @@ public interface Selectable { */ public Selectable jsonPath(String jsonPath); + /** + * extract by custom selector + * + * @param selector + * @return + */ + public Selectable select(Selector selector); + + /** + * extract by custom selector + * + * @param selector + * @return + */ + public Selectable selectList(Selector selector); } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPageProcessor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPageProcessor.java index 3a97e1d..6bfe88d 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPageProcessor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPageProcessor.java @@ -7,9 +7,7 @@ import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.selector.Selector; import java.util.ArrayList; -import java.util.HashSet; import java.util.List; -import 
java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -66,7 +64,7 @@ class ModelPageProcessor implements PageProcessor { if (urlRegionSelector == null) { links = page.getHtml().links().all(); } else { - links = urlRegionSelector.selectList(page.getHtml().toString()); + links = page.getHtml().selectList(urlRegionSelector).links().all(); } for (String link : links) { for (Pattern targetUrlPattern : urlPatterns) { From 8ba2da146cc21e460efcf92c1c62af5da750122d Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Thu, 24 Apr 2014 10:51:37 +0800 Subject: [PATCH 093/130] request method #108 and more cookie #109 config --- .../java/us/codecraft/webmagic/Request.java | 17 ++++++ .../main/java/us/codecraft/webmagic/Site.java | 60 ++++++++++++++++--- .../webmagic/constant/HttpConstant.java | 35 +++++++++++ .../downloader/HttpClientDownloader.java | 23 ++++++- .../downloader/HttpClientGenerator.java | 13 ++-- .../LocalDuplicatedRemovedScheduler.java | 6 +- 6 files changed, 138 insertions(+), 16 deletions(-) create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/constant/HttpConstant.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java index 142a20c..aeca08f 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java @@ -21,6 +21,8 @@ public class Request implements Serializable { private String url; + private String method; + /** * Store additional information in extras. */ @@ -106,10 +108,25 @@ public class Request implements Serializable { this.url = url; } + /** + * The http method of the request. Get for default. 
+ * @return httpMethod + * @see us.codecraft.webmagic.constant.HttpConstant.Method + * @since 0.5.0 + */ + public String getMethod() { + return method; + } + + public void setMethod(String method) { + this.method = method; + } + @Override public String toString() { return "Request{" + "url='" + url + '\'' + + ", method='" + method + '\'' + ", extras=" + extras + ", priority=" + priority + '}'; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java index 48b43f0..3a5dd33 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java @@ -1,5 +1,7 @@ package us.codecraft.webmagic; +import com.google.common.collect.HashBasedTable; +import com.google.common.collect.Table; import org.apache.http.HttpHost; import us.codecraft.webmagic.utils.UrlUtils; @@ -18,7 +20,9 @@ public class Site { private String userAgent; - private Map cookies = new LinkedHashMap(); + private Map defaultCookies = new LinkedHashMap(); + + private Table cookies = HashBasedTable.create(); private String charset; @@ -45,6 +49,10 @@ public class Site { private boolean useGzip = true; + /** + * @see us.codecraft.webmagic.constant.HttpConstant.Header + * @deprecated + */ public static interface HeaderConst { public static final String REFERER = "Referer"; @@ -72,7 +80,20 @@ public class Site { * @return this */ public Site addCookie(String name, String value) { - cookies.put(name, value); + defaultCookies.put(name, value); + return this; + } + + /** + * Add a cookie with specific domain. 
+ * + * @param domain + * @param name + * @param value + * @return + */ + public Site addCookie(String domain, String name, String value) { + cookies.put(domain, name, value); return this; } @@ -93,6 +114,25 @@ public class Site { * @return get cookies */ public Map getCookies() { + return defaultCookies; + } + + /** + * get cookies of all domains + * + * @return get cookies + */ + public Map> getAllCookies() { + return cookies.columnMap(); + } + + /** + * get cookies + * + * @return get cookies + */ + public Table getaCookies() { + cookies.columnMap(); return cookies; } @@ -203,10 +243,10 @@ public class Site { * Add a url to start url.
* Because urls are more a Spider's property than Site, move it to {@link Spider#addUrl(String...)}} * - * @deprecated - * @see Spider#addUrl(String...) * @param startUrl * @return this + * @see Spider#addUrl(String...) + * @deprecated */ public Site addStartUrl(String startUrl) { return addStartRequest(new Request(startUrl)); @@ -216,10 +256,10 @@ public class Site { * Add a url to start url.
* Because urls are more a Spider's property than Site, move it to {@link Spider#addRequest(Request...)}} * - * @deprecated - * @see Spider#addRequest(Request...) * @param startRequest * @return this + * @see Spider#addRequest(Request...) + * @deprecated */ public Site addStartRequest(Request startRequest) { this.startRequests.add(startRequest); @@ -312,6 +352,7 @@ public class Site { /** * set up httpProxy for this site + * * @param httpProxy * @return */ @@ -364,7 +405,8 @@ public class Site { if (acceptStatCode != null ? !acceptStatCode.equals(site.acceptStatCode) : site.acceptStatCode != null) return false; if (charset != null ? !charset.equals(site.charset) : site.charset != null) return false; - if (cookies != null ? !cookies.equals(site.cookies) : site.cookies != null) return false; + if (defaultCookies != null ? !defaultCookies.equals(site.defaultCookies) : site.defaultCookies != null) + return false; if (domain != null ? !domain.equals(site.domain) : site.domain != null) return false; if (headers != null ? !headers.equals(site.headers) : site.headers != null) return false; if (startRequests != null ? !startRequests.equals(site.startRequests) : site.startRequests != null) @@ -378,7 +420,7 @@ public class Site { public int hashCode() { int result = domain != null ? domain.hashCode() : 0; result = 31 * result + (userAgent != null ? userAgent.hashCode() : 0); - result = 31 * result + (cookies != null ? cookies.hashCode() : 0); + result = 31 * result + (defaultCookies != null ? defaultCookies.hashCode() : 0); result = 31 * result + (charset != null ? charset.hashCode() : 0); result = 31 * result + (startRequests != null ? 
startRequests.hashCode() : 0); result = 31 * result + sleepTime; @@ -395,7 +437,7 @@ public class Site { return "Site{" + "domain='" + domain + '\'' + ", userAgent='" + userAgent + '\'' + - ", cookies=" + cookies + + ", cookies=" + defaultCookies + ", charset='" + charset + '\'' + ", startRequests=" + startRequests + ", sleepTime=" + sleepTime + diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/constant/HttpConstant.java b/webmagic-core/src/main/java/us/codecraft/webmagic/constant/HttpConstant.java new file mode 100644 index 0000000..52f7ecb --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/constant/HttpConstant.java @@ -0,0 +1,35 @@ +package us.codecraft.webmagic.constant; + +/** + * Some constants of Http protocal. + * @author code4crafer@gmail.com + * @since 0.5.0 + */ +public abstract class HttpConstant { + + public static abstract class Method { + + public static final String GET = "GET"; + + public static final String HEAD = "HEAD"; + + public static final String POST = "POST"; + + public static final String PUT = "PUT"; + + public static final String DELETE = "DELETE"; + + public static final String TRACE = "TRACE"; + + public static final String CONNECT = "CONNECT"; + + } + + public static abstract class Header { + + public static final String REFERER = "Referer"; + + public static final String USER_AGENT = "User-Agent"; + } + +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 13e220f..4fecf32 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -17,6 +17,7 @@ import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Task; +import 
us.codecraft.webmagic.constant.HttpConstant; import us.codecraft.webmagic.selector.PlainText; import us.codecraft.webmagic.utils.UrlUtils; @@ -75,7 +76,7 @@ public class HttpClientDownloader extends AbstractDownloader { } else { acceptStatCode = Sets.newHashSet(200); } - logger.info("downloading page {}" , request.getUrl()); + logger.info("downloading page {}", request.getUrl()); CloseableHttpResponse httpResponse = null; try { HttpUriRequest httpUriRequest = getHttpUriRequest(request, site, headers); @@ -123,7 +124,7 @@ public class HttpClientDownloader extends AbstractDownloader { } protected HttpUriRequest getHttpUriRequest(Request request, Site site, Map headers) { - RequestBuilder requestBuilder = RequestBuilder.get().setUri(request.getUrl()); + RequestBuilder requestBuilder = selectRequestMethod(request.getMethod()).setUri(request.getUrl()); if (headers != null) { for (Map.Entry headerEntry : headers.entrySet()) { requestBuilder.addHeader(headerEntry.getKey(), headerEntry.getValue()); @@ -141,6 +142,24 @@ public class HttpClientDownloader extends AbstractDownloader { return requestBuilder.build(); } + protected RequestBuilder selectRequestMethod(String method) { + if (method == null || method.equalsIgnoreCase(HttpConstant.Method.GET)) { + //default get + return RequestBuilder.get(); + } else if (method.equalsIgnoreCase(HttpConstant.Method.POST)) { + return RequestBuilder.post(); + } else if (method.equalsIgnoreCase(HttpConstant.Method.HEAD)) { + return RequestBuilder.head(); + } else if (method.equalsIgnoreCase(HttpConstant.Method.PUT)) { + return RequestBuilder.put(); + } else if (method.equalsIgnoreCase(HttpConstant.Method.DELETE)) { + return RequestBuilder.delete(); + } else if (method.equalsIgnoreCase(HttpConstant.Method.TRACE)) { + return RequestBuilder.trace(); + } + throw new IllegalArgumentException("Illegal HTTP Method " + method); + } + protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws 
IOException { String content = IOUtils.toString(httpResponse.getEntity().getContent(), charset); Page page = new Page(); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java index edb3a49..136d9c5 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java @@ -36,7 +36,7 @@ public class HttpClientGenerator { connectionManager.setDefaultMaxPerRoute(100); } - public HttpClientGenerator setPoolSize(int poolSize){ + public HttpClientGenerator setPoolSize(int poolSize) { connectionManager.setMaxTotal(poolSize); return this; } @@ -76,10 +76,15 @@ public class HttpClientGenerator { private void generateCookie(HttpClientBuilder httpClientBuilder, Site site) { CookieStore cookieStore = new BasicCookieStore(); - if (site.getCookies() != null) { - for (Map.Entry cookieEntry : site.getCookies().entrySet()) { + for (Map.Entry cookieEntry : site.getCookies().entrySet()) { + BasicClientCookie cookie = new BasicClientCookie(cookieEntry.getKey(), cookieEntry.getValue()); + cookie.setDomain(site.getDomain()); + cookieStore.addCookie(cookie); + } + for (Map.Entry> domainEntry : site.getAllCookies().entrySet()) { + for (Map.Entry cookieEntry : domainEntry.getValue().entrySet()) { BasicClientCookie cookie = new BasicClientCookie(cookieEntry.getKey(), cookieEntry.getValue()); - cookie.setDomain(site.getDomain()); + cookie.setDomain(domainEntry.getKey()); cookieStore.addCookie(cookie); } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/LocalDuplicatedRemovedScheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/LocalDuplicatedRemovedScheduler.java index 2807e0f..015aa47 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/LocalDuplicatedRemovedScheduler.java 
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/LocalDuplicatedRemovedScheduler.java @@ -25,12 +25,16 @@ public abstract class LocalDuplicatedRemovedScheduler implements MonitorableSche @Override public void push(Request request, Task task) { logger.trace("get a candidate url {}", request.getUrl()); - if (urls.add(request.getUrl()) || shouldReserved(request)) { + if (isDuplicate(request) || shouldReserved(request)) { logger.debug("push to queue {}", request.getUrl()); pushWhenNoDuplicate(request, task); } } + protected boolean isDuplicate(Request request) { + return urls.add(request.getUrl()); + } + protected boolean shouldReserved(Request request) { return request.getExtra(Request.CYCLE_TRIED_TIMES) != null; } From e1aaf1dd11a7c2d2b6c1a869f1df27e28483fee7 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Thu, 24 Apr 2014 11:05:49 +0800 Subject: [PATCH 094/130] fix mistake of guava Table #109 --- webmagic-core/src/main/java/us/codecraft/webmagic/Site.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java index 3a5dd33..4c84fb3 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java @@ -123,7 +123,7 @@ public class Site { * @return get cookies */ public Map> getAllCookies() { - return cookies.columnMap(); + return cookies.rowMap(); } /** From f49bb877c858617fa74cfd2c464adbfe616e45de Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Thu, 24 Apr 2014 11:38:13 +0800 Subject: [PATCH 095/130] clean some code #109 --- .../src/main/java/us/codecraft/webmagic/Site.java | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java index 4c84fb3..25afde9 100644 --- 
a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java @@ -126,16 +126,6 @@ public class Site { return cookies.rowMap(); } - /** - * get cookies - * - * @return get cookies - */ - public Table getaCookies() { - cookies.columnMap(); - return cookies; - } - /** * get user agent * From 95d3802e772f6a17c14446c8c9801001b1f536b8 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Thu, 24 Apr 2014 11:48:58 +0800 Subject: [PATCH 096/130] add formdata support for post request #108 --- .../webmagic/downloader/HttpClientDownloader.java | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 4fecf32..0e170f4 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -3,6 +3,7 @@ package us.codecraft.webmagic.downloader; import com.google.common.collect.Sets; import org.apache.commons.io.IOUtils; import org.apache.http.HttpResponse; +import org.apache.http.NameValuePair; import org.apache.http.annotation.ThreadSafe; import org.apache.http.client.config.CookieSpecs; import org.apache.http.client.config.RequestConfig; @@ -124,7 +125,7 @@ public class HttpClientDownloader extends AbstractDownloader { } protected HttpUriRequest getHttpUriRequest(Request request, Site site, Map headers) { - RequestBuilder requestBuilder = selectRequestMethod(request.getMethod()).setUri(request.getUrl()); + RequestBuilder requestBuilder = selectRequestMethod(request).setUri(request.getUrl()); if (headers != null) { for (Map.Entry headerEntry : headers.entrySet()) { requestBuilder.addHeader(headerEntry.getKey(), headerEntry.getValue()); @@ -142,12 +143,18 @@ public class HttpClientDownloader 
extends AbstractDownloader { return requestBuilder.build(); } - protected RequestBuilder selectRequestMethod(String method) { + protected RequestBuilder selectRequestMethod(Request request) { + String method = request.getMethod(); if (method == null || method.equalsIgnoreCase(HttpConstant.Method.GET)) { //default get return RequestBuilder.get(); } else if (method.equalsIgnoreCase(HttpConstant.Method.POST)) { - return RequestBuilder.post(); + RequestBuilder requestBuilder = RequestBuilder.post(); + NameValuePair[] nameValuePair = (NameValuePair[]) request.getExtra("nameValuePair"); + if (nameValuePair.length > 0) { + requestBuilder.addParameters(nameValuePair); + } + return requestBuilder; } else if (method.equalsIgnoreCase(HttpConstant.Method.HEAD)) { return RequestBuilder.head(); } else if (method.equalsIgnoreCase(HttpConstant.Method.PUT)) { From ced79630d324e199abb9c3becf93aab3defa307e Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 25 Apr 2014 11:10:35 +0800 Subject: [PATCH 097/130] specify jndi and jmx #98 --- .../webmagic/monitor/SpiderMonitor.java | 44 ++++++++++++------- 1 file changed, 29 insertions(+), 15 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java index 2c11d71..d16aa5f 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java @@ -5,11 +5,14 @@ import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.example.GithubRepoPageProcessor; import us.codecraft.webmagic.processor.example.OschinaBlogPageProcessor; -import javax.management.*; +import javax.management.JMException; +import javax.management.MBeanServer; +import javax.management.ObjectName; import javax.management.remote.JMXConnectorServer; import javax.management.remote.JMXConnectorServerFactory; import 
javax.management.remote.JMXServiceURL; import java.io.IOException; +import java.lang.management.ManagementFactory; import java.rmi.registry.LocateRegistry; import java.rmi.registry.Registry; import java.util.ArrayList; @@ -23,6 +26,7 @@ import java.util.concurrent.atomic.AtomicInteger; */ public class SpiderMonitor { + public static final int RMI_PORT = 14721; private List spiderStatuses = new ArrayList(); public List getSpiders() { @@ -52,7 +56,7 @@ public class SpiderMonitor { return new SpiderStatus(spider, monitorSpiderListener); } - public static SpiderMonitor create(){ + public static SpiderMonitor create() { return new SpiderMonitor(); } @@ -89,31 +93,41 @@ public class SpiderMonitor { } - public void jmxStart() throws IOException, JMException { - jmxStart(14721); + public SpiderMonitor jndiStart(int port) throws IOException, JMException { + Registry registry = LocateRegistry.createRegistry(port); + return this; } - public void jmxStart(int rmiPort) throws IOException, JMException { - String jmxServerName = "WebMagic"; - // jdkfolder/bin/rmiregistry.exe 9999 - Registry registry = LocateRegistry.createRegistry(rmiPort); + public SpiderMonitor jndiStart() throws IOException, JMException { + return jndiStart(RMI_PORT); + } - MBeanServer mbs = MBeanServerFactory.createMBeanServer(jmxServerName); - //MBeanServer mbs = ManagementFactory.getPlatformMBeanServer(); + public SpiderMonitor jmxStart() throws IOException, JMException { + return jmxStart("localhost", RMI_PORT); + } + + public SpiderMonitor jmxStart(String jndiServer, int rmiPort) throws IOException, JMException { + String jmxServerName = "WebMagic"; + + // start JNDI + MBeanServer localServer = ManagementFactory.getPlatformMBeanServer(); ObjectName objName; - JMXServiceURL url = new JMXServiceURL("service:jmx:rmi:///jndi/rmi://localhost:" + rmiPort + "/" + jmxServerName); + JMXServiceURL url = new JMXServiceURL("service:jmx:rmi:///jndi/rmi://" + jndiServer + ":" + rmiPort + "/" + jmxServerName); 
System.out.println("JMXServiceURL: " + url.toString()); - JMXConnectorServer jmxConnServer = JMXConnectorServerFactory.newJMXConnectorServer(url, null, mbs); + System.out.println("Please replace localhost of your ip if you want to connect it in remote server."); + JMXConnectorServer jmxConnServer = JMXConnectorServerFactory.newJMXConnectorServer(url, null, localServer); jmxConnServer.start(); + for (SpiderStatusMBean spiderStatus : spiderStatuses) { objName = new ObjectName(jmxServerName + ":name=" + spiderStatus.getName()); - mbs.registerMBean(spiderStatus, objName); + localServer.registerMBean(spiderStatus, objName); } - } + return this; + } public static void main(String[] args) throws JMException, NullPointerException, @@ -126,7 +140,7 @@ public class SpiderMonitor { SpiderMonitor spiderMonitor = new SpiderMonitor(); spiderMonitor.register(oschinaSpider, githubSpider); - spiderMonitor.jmxStart(); + spiderMonitor.jndiStart().jmxStart(); } From 30af23d003ba744cf5d15a947636b095644bdbb0 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 25 Apr 2014 11:25:52 +0800 Subject: [PATCH 098/130] split monitor to server and client mode #98 --- .../webmagic/monitor/SpiderMonitor.java | 91 ++++++++++++++++--- 1 file changed, 76 insertions(+), 15 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java index d16aa5f..d4243c1 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java @@ -26,7 +26,20 @@ import java.util.concurrent.atomic.AtomicInteger; */ public class SpiderMonitor { - public static final int RMI_PORT = 14721; + private enum Type { + Server, Client, Local; + } + + private static final int DEFAULT_SERVER_PORT = 14721; + + private static final String DEFAULT_SERVER_HOST = "localhost"; + + private int serverPort; + + private 
String serverHost; + + private Type type = Type.Local; + private List spiderStatuses = new ArrayList(); public List getSpiders() { @@ -37,6 +50,11 @@ public class SpiderMonitor { return spiderStatuses.get(0); } + /** + * Register spider for monitor. + * @param spiders + * @return + */ public SpiderMonitor register(Spider... spiders) { for (Spider spider : spiders) { MonitorSpiderListener monitorSpiderListener = new MonitorSpiderListener(); @@ -93,17 +111,59 @@ public class SpiderMonitor { } - public SpiderMonitor jndiStart(int port) throws IOException, JMException { + /** + * Start monitor as server mode. + * @param port + * @return + * @throws IOException + * @throws JMException + */ + public SpiderMonitor server(int port) throws IOException, JMException { Registry registry = LocateRegistry.createRegistry(port); + serverPort = port; + serverHost = "localhost"; + type = Type.Server; return this; } - public SpiderMonitor jndiStart() throws IOException, JMException { - return jndiStart(RMI_PORT); + /** + * Start monitor as server mode. + * @return + * @throws IOException + * @throws JMException + */ + public SpiderMonitor server() throws IOException, JMException { + return server(DEFAULT_SERVER_PORT); + } + + + /** + * Start monitor as client mode. + * @param serverHost + * @param serverPort + * @return + * @throws IOException + * @throws JMException + */ + public SpiderMonitor client(String serverHost, int serverPort) throws IOException, JMException { + type = Type.Client; + this.serverHost = serverHost; + this.serverPort = serverPort; + return this; + } + + /** + * Start monitor as client mode. 
+ * @return + * @throws IOException + * @throws JMException + */ + public SpiderMonitor client() throws IOException, JMException { + return client(DEFAULT_SERVER_HOST, DEFAULT_SERVER_PORT); } public SpiderMonitor jmxStart() throws IOException, JMException { - return jmxStart("localhost", RMI_PORT); + return jmxStart("localhost", DEFAULT_SERVER_PORT); } public SpiderMonitor jmxStart(String jndiServer, int rmiPort) throws IOException, JMException { @@ -114,12 +174,13 @@ public class SpiderMonitor { ObjectName objName; - JMXServiceURL url = new JMXServiceURL("service:jmx:rmi:///jndi/rmi://" + jndiServer + ":" + rmiPort + "/" + jmxServerName); - System.out.println("JMXServiceURL: " + url.toString()); - System.out.println("Please replace localhost of your ip if you want to connect it in remote server."); - JMXConnectorServer jmxConnServer = JMXConnectorServerFactory.newJMXConnectorServer(url, null, localServer); - jmxConnServer.start(); - + if (type != Type.Local) { + JMXServiceURL url = new JMXServiceURL("service:jmx:rmi:///jndi/rmi://" + jndiServer + ":" + rmiPort + "/" + jmxServerName); + System.out.println("JMXServiceURL: " + url.toString()); + System.out.println("Please replace localhost of your ip if you want to connect it in remote server."); + JMXConnectorServer jmxConnServer = JMXConnectorServerFactory.newJMXConnectorServer(url, null, localServer); + jmxConnServer.start(); + } for (SpiderStatusMBean spiderStatus : spiderStatuses) { objName = new ObjectName(jmxServerName + ":name=" + spiderStatus.getName()); @@ -129,9 +190,7 @@ public class SpiderMonitor { return this; } - public static void main(String[] args) throws JMException, - NullPointerException, - IOException { + public static void main(String[] args) throws Exception { Spider oschinaSpider = Spider.create(new OschinaBlogPageProcessor()) .addUrl("http://my.oschina.net/flashsword/blog").thread(2); @@ -140,7 +199,9 @@ public class SpiderMonitor { SpiderMonitor spiderMonitor = new SpiderMonitor(); 
spiderMonitor.register(oschinaSpider, githubSpider); - spiderMonitor.jndiStart().jmxStart(); + spiderMonitor.jmxStart(); + oschinaSpider.start(); + githubSpider.start(); } From ad6a273b12cadbc263fd6f5e86299030e0e0225f Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 25 Apr 2014 11:28:35 +0800 Subject: [PATCH 099/130] update test url --- .../codecraft/webmagic/downloader/HttpClientDownloaderTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java index e6fe5ae..ab84665 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java @@ -30,7 +30,7 @@ public class HttpClientDownloaderTest { @Test public void testDownloader() { HttpClientDownloader httpClientDownloader = new HttpClientDownloader(); - Html html = httpClientDownloader.download("http://www.oschina.net"); + Html html = httpClientDownloader.download("https://github.com"); assertTrue(!html.getText().isEmpty()); } From d61f65cef84cad0ada34e8744c1d1d5a8e311cf0 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 25 Apr 2014 11:31:43 +0800 Subject: [PATCH 100/130] update mbean to mxbean #98 --- .../us/codecraft/webmagic/monitor/SpiderMonitor.java | 11 ++++++----- .../us/codecraft/webmagic/monitor/SpiderStatus.java | 2 +- ...SpiderStatusMBean.java => SpiderStatusMXBean.java} | 2 +- .../webmagic/monitor/CustomSpiderStatus.java | 2 +- ...StatusMBean.java => CustomSpiderStatusMXBean.java} | 2 +- .../codecraft/webmagic/monitor/SpiderMonitorTest.java | 2 +- 6 files changed, 11 insertions(+), 10 deletions(-) rename webmagic-core/src/main/java/us/codecraft/webmagic/monitor/{SpiderStatusMBean.java => SpiderStatusMXBean.java} (92%) rename 
webmagic-core/src/test/java/us/codecraft/webmagic/monitor/{CustomSpiderStatusMBean.java => CustomSpiderStatusMXBean.java} (63%) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java index d4243c1..4a02db1 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java @@ -40,13 +40,13 @@ public class SpiderMonitor { private Type type = Type.Local; - private List spiderStatuses = new ArrayList(); + private List spiderStatuses = new ArrayList(); - public List getSpiders() { + public List getSpiders() { return spiderStatuses; } - public SpiderStatusMBean getSpider() { + public SpiderStatusMXBean getSpider() { return spiderStatuses.get(0); } @@ -70,7 +70,7 @@ public class SpiderMonitor { return this; } - protected SpiderStatusMBean getSpiderStatusMBean(Spider spider, MonitorSpiderListener monitorSpiderListener) { + protected SpiderStatusMXBean getSpiderStatusMBean(Spider spider, MonitorSpiderListener monitorSpiderListener) { return new SpiderStatus(spider, monitorSpiderListener); } @@ -182,7 +182,7 @@ public class SpiderMonitor { jmxConnServer.start(); } - for (SpiderStatusMBean spiderStatus : spiderStatuses) { + for (SpiderStatusMXBean spiderStatus : spiderStatuses) { objName = new ObjectName(jmxServerName + ":name=" + spiderStatus.getName()); localServer.registerMBean(spiderStatus, objName); } @@ -199,6 +199,7 @@ public class SpiderMonitor { SpiderMonitor spiderMonitor = new SpiderMonitor(); spiderMonitor.register(oschinaSpider, githubSpider); + // spiderMonitor.jmxStart(); oschinaSpider.start(); githubSpider.start(); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/monitor/SpiderStatus.java b/webmagic-core/src/main/java/us/codecraft/webmagic/monitor/SpiderStatus.java index 6b49cbb..889555c 100644 --- 
a/webmagic-core/src/main/java/us/codecraft/webmagic/monitor/SpiderStatus.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/monitor/SpiderStatus.java @@ -10,7 +10,7 @@ import java.util.List; * @author code4crafer@gmail.com * @since 0.5.0 */ -public class SpiderStatus implements SpiderStatusMBean { +public class SpiderStatus implements SpiderStatusMXBean { protected final Spider spider; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/monitor/SpiderStatusMBean.java b/webmagic-core/src/main/java/us/codecraft/webmagic/monitor/SpiderStatusMXBean.java similarity index 92% rename from webmagic-core/src/main/java/us/codecraft/webmagic/monitor/SpiderStatusMBean.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/monitor/SpiderStatusMXBean.java index 156b653..cc0f040 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/monitor/SpiderStatusMBean.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/monitor/SpiderStatusMXBean.java @@ -6,7 +6,7 @@ import java.util.List; * @author code4crafer@gmail.com * @since 0.5.0 */ -public interface SpiderStatusMBean { +public interface SpiderStatusMXBean { public String getName(); diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/monitor/CustomSpiderStatus.java b/webmagic-core/src/test/java/us/codecraft/webmagic/monitor/CustomSpiderStatus.java index a0b7f77..75679da 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/monitor/CustomSpiderStatus.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/monitor/CustomSpiderStatus.java @@ -5,7 +5,7 @@ import us.codecraft.webmagic.Spider; /** * @author code4crafer@gmail.com */ -public class CustomSpiderStatus extends SpiderStatus implements CustomSpiderStatusMBean { +public class CustomSpiderStatus extends SpiderStatus implements CustomSpiderStatusMXBean { public CustomSpiderStatus(Spider spider, SpiderMonitor.MonitorSpiderListener monitorSpiderListener) { super(spider, monitorSpiderListener); diff --git 
a/webmagic-core/src/test/java/us/codecraft/webmagic/monitor/CustomSpiderStatusMBean.java b/webmagic-core/src/test/java/us/codecraft/webmagic/monitor/CustomSpiderStatusMXBean.java similarity index 63% rename from webmagic-core/src/test/java/us/codecraft/webmagic/monitor/CustomSpiderStatusMBean.java rename to webmagic-core/src/test/java/us/codecraft/webmagic/monitor/CustomSpiderStatusMXBean.java index 104b27d..5dd8ace 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/monitor/CustomSpiderStatusMBean.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/monitor/CustomSpiderStatusMXBean.java @@ -3,7 +3,7 @@ package us.codecraft.webmagic.monitor; /** * @author code4crafer@gmail.com */ -public interface CustomSpiderStatusMBean extends SpiderStatusMBean { +public interface CustomSpiderStatusMXBean extends SpiderStatusMXBean { public String getSchedulerName(); diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/monitor/SpiderMonitorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/monitor/SpiderMonitorTest.java index 450f3fe..d1065f9 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/monitor/SpiderMonitorTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/monitor/SpiderMonitorTest.java @@ -15,7 +15,7 @@ public class SpiderMonitorTest { public void testInherit() throws Exception { SpiderMonitor spiderMonitor = new SpiderMonitor(){ @Override - protected SpiderStatusMBean getSpiderStatusMBean(Spider spider, MonitorSpiderListener monitorSpiderListener) { + protected SpiderStatusMXBean getSpiderStatusMBean(Spider spider, MonitorSpiderListener monitorSpiderListener) { return new CustomSpiderStatus(spider, monitorSpiderListener); } }; From 11ba5beb42b4dca1cf37a3f73234c417c27dfd97 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 25 Apr 2014 13:17:13 +0800 Subject: [PATCH 101/130] [refactor]move monitor to webmagic-extension #98 --- .../java/us/codecraft/webmagic/Request.java | 2 +- 
.../main/java/us/codecraft/webmagic/Site.java | 2 +- .../java/us/codecraft/webmagic/Spider.java | 1 - .../{monitor => }/SpiderListener.java | 6 ++-- .../downloader/HttpClientDownloader.java | 2 +- .../LocalDuplicatedRemovedScheduler.java | 1 - .../MonitorableScheduler.java | 3 +- .../{constant => utils}/HttpConstant.java | 2 +- .../webmagic/example/MonitorExample.java | 31 ++++++++++++++++ .../webmagic/monitor/SpiderMonitor.java | 25 +++++++++++-- .../webmagic/monitor/SpiderStatus.java | 1 + .../webmagic/monitor/SpiderStatusMXBean.java | 0 .../webmagic/scheduler/RedisScheduler.java | 1 - .../us/codecraft/webmagic/utils/IPUtils.java | 36 +++++++++++++++++++ .../webmagic/monitor/CustomSpiderStatus.java | 0 .../monitor/CustomSpiderStatusMXBean.java | 0 .../webmagic/monitor/SpiderMonitorTest.java | 0 .../codecraft/webmagic/utils/IPUtilsTest.java | 14 ++++++++ 18 files changed, 112 insertions(+), 15 deletions(-) rename webmagic-core/src/main/java/us/codecraft/webmagic/{monitor => }/SpiderListener.java (63%) rename webmagic-core/src/main/java/us/codecraft/webmagic/{monitor => scheduler}/MonitorableScheduler.java (77%) rename webmagic-core/src/main/java/us/codecraft/webmagic/{constant => utils}/HttpConstant.java (94%) create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/example/MonitorExample.java rename {webmagic-core => webmagic-extension}/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java (88%) rename {webmagic-core => webmagic-extension}/src/main/java/us/codecraft/webmagic/monitor/SpiderStatus.java (97%) rename {webmagic-core => webmagic-extension}/src/main/java/us/codecraft/webmagic/monitor/SpiderStatusMXBean.java (100%) create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/utils/IPUtils.java rename {webmagic-core => webmagic-extension}/src/test/java/us/codecraft/webmagic/monitor/CustomSpiderStatus.java (100%) rename {webmagic-core => 
webmagic-extension}/src/test/java/us/codecraft/webmagic/monitor/CustomSpiderStatusMXBean.java (100%) rename {webmagic-core => webmagic-extension}/src/test/java/us/codecraft/webmagic/monitor/SpiderMonitorTest.java (100%) create mode 100644 webmagic-extension/src/test/java/us/codecraft/webmagic/utils/IPUtilsTest.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java index aeca08f..1f8a194 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java @@ -111,7 +111,7 @@ public class Request implements Serializable { /** * The http method of the request. Get for default. * @return httpMethod - * @see us.codecraft.webmagic.constant.HttpConstant.Method + * @see us.codecraft.webmagic.utils.HttpConstant.Method * @since 0.5.0 */ public String getMethod() { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java index 25afde9..a7c7bf8 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java @@ -50,7 +50,7 @@ public class Site { private boolean useGzip = true; /** - * @see us.codecraft.webmagic.constant.HttpConstant.Header + * @see us.codecraft.webmagic.utils.HttpConstant.Header * @deprecated */ public static interface HeaderConst { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index a03dee1..68b2e11 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -8,7 +8,6 @@ import org.slf4j.LoggerFactory; import us.codecraft.webmagic.downloader.Downloader; import us.codecraft.webmagic.downloader.HttpClientDownloader; -import us.codecraft.webmagic.monitor.SpiderListener; 
import us.codecraft.webmagic.pipeline.CollectorPipeline; import us.codecraft.webmagic.pipeline.ConsolePipeline; import us.codecraft.webmagic.pipeline.Pipeline; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/monitor/SpiderListener.java b/webmagic-core/src/main/java/us/codecraft/webmagic/SpiderListener.java similarity index 63% rename from webmagic-core/src/main/java/us/codecraft/webmagic/monitor/SpiderListener.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/SpiderListener.java index 7a6c687..0678180 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/monitor/SpiderListener.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/SpiderListener.java @@ -1,8 +1,8 @@ -package us.codecraft.webmagic.monitor; - -import us.codecraft.webmagic.Request; +package us.codecraft.webmagic; /** + * Listener of Spider on page processing. Used for monitor and such on. + * * @author code4crafer@gmail.com * @since 0.5.0 */ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 0e170f4..eeae70e 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -18,7 +18,7 @@ import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Task; -import us.codecraft.webmagic.constant.HttpConstant; +import us.codecraft.webmagic.utils.HttpConstant; import us.codecraft.webmagic.selector.PlainText; import us.codecraft.webmagic.utils.UrlUtils; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/LocalDuplicatedRemovedScheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/LocalDuplicatedRemovedScheduler.java index 015aa47..1ec128b 100644 --- 
a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/LocalDuplicatedRemovedScheduler.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/LocalDuplicatedRemovedScheduler.java @@ -5,7 +5,6 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; -import us.codecraft.webmagic.monitor.MonitorableScheduler; import java.util.Set; import java.util.concurrent.ConcurrentHashMap; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/monitor/MonitorableScheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/MonitorableScheduler.java similarity index 77% rename from webmagic-core/src/main/java/us/codecraft/webmagic/monitor/MonitorableScheduler.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/MonitorableScheduler.java index 11889ac..ca76dfa 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/monitor/MonitorableScheduler.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/MonitorableScheduler.java @@ -1,7 +1,6 @@ -package us.codecraft.webmagic.monitor; +package us.codecraft.webmagic.scheduler; import us.codecraft.webmagic.Task; -import us.codecraft.webmagic.scheduler.Scheduler; /** * The scheduler whose requests can be counted for monitor. 
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/constant/HttpConstant.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/HttpConstant.java similarity index 94% rename from webmagic-core/src/main/java/us/codecraft/webmagic/constant/HttpConstant.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/utils/HttpConstant.java index 52f7ecb..2a76ecc 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/constant/HttpConstant.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/HttpConstant.java @@ -1,4 +1,4 @@ -package us.codecraft.webmagic.constant; +package us.codecraft.webmagic.utils; /** * Some constants of Http protocal. diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/MonitorExample.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/MonitorExample.java new file mode 100644 index 0000000..0ff145e --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/MonitorExample.java @@ -0,0 +1,31 @@ +package us.codecraft.webmagic.example; + +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.monitor.SpiderMonitor; +import us.codecraft.webmagic.processor.example.GithubRepoPageProcessor; +import us.codecraft.webmagic.processor.example.OschinaBlogPageProcessor; + +/** + * @author code4crafer@gmail.com + */ +public class MonitorExample { + + public static void main(String[] args) throws Exception { + + Spider oschinaSpider = Spider.create(new OschinaBlogPageProcessor()) + .addUrl("http://my.oschina.net/flashsword/blog").thread(2); + Spider githubSpider = Spider.create(new GithubRepoPageProcessor()) + .addUrl("https://github.com/code4craft"); + + SpiderMonitor spiderMonitor = new SpiderMonitor(); + spiderMonitor.register(oschinaSpider, githubSpider); + //If you want to connect it from remote, use spiderMonitor.server().jmxStart(); + //ONLY ONE server can start for a machine. 
+ //Others will be registered + spiderMonitor.server().server(); + spiderMonitor.jmxStart(); + oschinaSpider.start(); + githubSpider.start(); + + } +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java similarity index 88% rename from webmagic-core/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java rename to webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java index 4a02db1..ba9baea 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java @@ -1,9 +1,13 @@ package us.codecraft.webmagic.monitor; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.SpiderListener; import us.codecraft.webmagic.processor.example.GithubRepoPageProcessor; import us.codecraft.webmagic.processor.example.OschinaBlogPageProcessor; +import us.codecraft.webmagic.utils.IPUtils; import javax.management.JMException; import javax.management.MBeanServer; @@ -15,6 +19,7 @@ import java.io.IOException; import java.lang.management.ManagementFactory; import java.rmi.registry.LocateRegistry; import java.rmi.registry.Registry; +import java.rmi.server.ExportException; import java.util.ArrayList; import java.util.Collections; import java.util.List; @@ -30,6 +35,8 @@ public class SpiderMonitor { Server, Client, Local; } + private Logger logger = LoggerFactory.getLogger(getClass()); + private static final int DEFAULT_SERVER_PORT = 14721; private static final String DEFAULT_SERVER_HOST = "localhost"; @@ -52,6 +59,7 @@ public class SpiderMonitor { /** * Register spider for monitor. + * * @param spiders * @return */ @@ -113,13 +121,18 @@ public class SpiderMonitor { /** * Start monitor as server mode. 
+ * * @param port * @return * @throws IOException * @throws JMException */ public SpiderMonitor server(int port) throws IOException, JMException { - Registry registry = LocateRegistry.createRegistry(port); + try { + Registry registry = LocateRegistry.createRegistry(port); + } catch (ExportException e) { + logger.warn("Start server fail, maybe the address is in using.", e); + } serverPort = port; serverHost = "localhost"; type = Type.Server; @@ -128,6 +141,7 @@ public class SpiderMonitor { /** * Start monitor as server mode. + * * @return * @throws IOException * @throws JMException @@ -139,6 +153,7 @@ public class SpiderMonitor { /** * Start monitor as client mode. + * * @param serverHost * @param serverPort * @return @@ -154,6 +169,7 @@ public class SpiderMonitor { /** * Start monitor as client mode. + * * @return * @throws IOException * @throws JMException @@ -167,7 +183,7 @@ public class SpiderMonitor { } public SpiderMonitor jmxStart(String jndiServer, int rmiPort) throws IOException, JMException { - String jmxServerName = "WebMagic"; + String jmxServerName = "WebMagic-"+ IPUtils.getFirstNoLoopbackIPAddresses(); // start JNDI MBeanServer localServer = ManagementFactory.getPlatformMBeanServer(); @@ -199,7 +215,10 @@ public class SpiderMonitor { SpiderMonitor spiderMonitor = new SpiderMonitor(); spiderMonitor.register(oschinaSpider, githubSpider); - // + //If you want to connect it from remote, use spiderMonitor.server().jmxStart(); + //ONLY ONE server can start for a machine. 
+ //Others will be registered + spiderMonitor.server().server(); spiderMonitor.jmxStart(); oschinaSpider.start(); githubSpider.start(); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/monitor/SpiderStatus.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderStatus.java similarity index 97% rename from webmagic-core/src/main/java/us/codecraft/webmagic/monitor/SpiderStatus.java rename to webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderStatus.java index 889555c..af08526 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/monitor/SpiderStatus.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderStatus.java @@ -3,6 +3,7 @@ package us.codecraft.webmagic.monitor; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.scheduler.MonitorableScheduler; import java.util.List; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/monitor/SpiderStatusMXBean.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderStatusMXBean.java similarity index 100% rename from webmagic-core/src/main/java/us/codecraft/webmagic/monitor/SpiderStatusMXBean.java rename to webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderStatusMXBean.java diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java index 16f9147..cd3a0b6 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java @@ -7,7 +7,6 @@ import redis.clients.jedis.JedisPool; import redis.clients.jedis.JedisPoolConfig; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; -import us.codecraft.webmagic.monitor.MonitorableScheduler; /** * Use Redis as url scheduler 
for distributed crawlers.
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/IPUtils.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/IPUtils.java new file mode 100644 index 0000000..3d41696 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/IPUtils.java @@ -0,0 +1,36 @@ +package us.codecraft.webmagic.utils; + +import java.net.Inet6Address; +import java.net.InetAddress; +import java.net.NetworkInterface; +import java.net.SocketException; +import java.util.Enumeration; + +/** + * @author code4crafer@gmail.com + * @since 0.5.0 + */ +public abstract class IPUtils { + + public static String getFirstNoLoopbackIPAddresses() throws SocketException { + + Enumeration networkInterfaces = NetworkInterface.getNetworkInterfaces(); + + InetAddress localAddress = null; + while (networkInterfaces.hasMoreElements()) { + NetworkInterface networkInterface = networkInterfaces.nextElement(); + Enumeration inetAddresses = networkInterface.getInetAddresses(); + while (inetAddresses.hasMoreElements()) { + InetAddress address = inetAddresses.nextElement(); + if (!address.isLoopbackAddress() && !Inet6Address.class.isInstance(address)) { + return address.getHostAddress(); + } else if (!address.isLoopbackAddress()) { + localAddress = address; + } + } + } + + return localAddress.getHostAddress(); + } + +} diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/monitor/CustomSpiderStatus.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/monitor/CustomSpiderStatus.java similarity index 100% rename from webmagic-core/src/test/java/us/codecraft/webmagic/monitor/CustomSpiderStatus.java rename to webmagic-extension/src/test/java/us/codecraft/webmagic/monitor/CustomSpiderStatus.java diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/monitor/CustomSpiderStatusMXBean.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/monitor/CustomSpiderStatusMXBean.java similarity index 100% rename from 
webmagic-core/src/test/java/us/codecraft/webmagic/monitor/CustomSpiderStatusMXBean.java rename to webmagic-extension/src/test/java/us/codecraft/webmagic/monitor/CustomSpiderStatusMXBean.java diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/monitor/SpiderMonitorTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/monitor/SpiderMonitorTest.java similarity index 100% rename from webmagic-core/src/test/java/us/codecraft/webmagic/monitor/SpiderMonitorTest.java rename to webmagic-extension/src/test/java/us/codecraft/webmagic/monitor/SpiderMonitorTest.java diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/utils/IPUtilsTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/utils/IPUtilsTest.java new file mode 100644 index 0000000..9d78fb9 --- /dev/null +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/utils/IPUtilsTest.java @@ -0,0 +1,14 @@ +package us.codecraft.webmagic.utils; + +import org.junit.Test; + +/** + * @author code4crafer@gmail.com + */ +public class IPUtilsTest { + + @Test + public void testGetFirstNoLoopbackIPAddresses() throws Exception { + System.out.println(IPUtils.getFirstNoLoopbackIPAddresses()); + } +} From acb63d55d71a3e704a0445020104ae1b3c11af79 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 25 Apr 2014 13:26:08 +0800 Subject: [PATCH 102/130] some check and example #98 --- .../webmagic/example/MonitorExample.java | 5 ++-- .../webmagic/monitor/SpiderMonitor.java | 25 ++++++++++++++++--- 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/MonitorExample.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/MonitorExample.java index 0ff145e..d22a16e 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/MonitorExample.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/MonitorExample.java @@ -7,6 +7,7 @@ import 
us.codecraft.webmagic.processor.example.OschinaBlogPageProcessor; /** * @author code4crafer@gmail.com + * @since 0.5.0 */ public class MonitorExample { @@ -21,8 +22,8 @@ public class MonitorExample { spiderMonitor.register(oschinaSpider, githubSpider); //If you want to connect it from remote, use spiderMonitor.server().jmxStart(); //ONLY ONE server can start for a machine. - //Others will be registered - spiderMonitor.server().server(); + //Others will be registered without start a server. + //You can also register a server by spiderMonitor.client(host,port).jmxStart(). spiderMonitor.jmxStart(); oschinaSpider.start(); githubSpider.start(); diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java index ba9baea..193ff94 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java @@ -23,6 +23,7 @@ import java.rmi.server.ExportException; import java.util.ArrayList; import java.util.Collections; import java.util.List; +import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicInteger; /** @@ -35,6 +36,10 @@ public class SpiderMonitor { Server, Client, Local; } + private static AtomicInteger serialNumber = new AtomicInteger(); + + private AtomicBoolean started = new AtomicBoolean(false); + private Logger logger = LoggerFactory.getLogger(getClass()); private static final int DEFAULT_SERVER_PORT = 14721; @@ -150,6 +155,17 @@ public class SpiderMonitor { return server(DEFAULT_SERVER_PORT); } + /** + * Local mode: the monitor will be bound to the JVM instance.

+ * Use jconsole to check your application. + * + * @return + */ + public SpiderMonitor local() { + this.type = Type.Local; + return this; + } + /** * Start monitor as client mode. @@ -183,7 +199,11 @@ public class SpiderMonitor { } public SpiderMonitor jmxStart(String jndiServer, int rmiPort) throws IOException, JMException { - String jmxServerName = "WebMagic-"+ IPUtils.getFirstNoLoopbackIPAddresses(); + if (!started.compareAndSet(false, true)) { + logger.error("Monitor has already started!"); + return this; + } + String jmxServerName = "WebMagic-" + IPUtils.getFirstNoLoopbackIPAddresses() + "-" + serialNumber.incrementAndGet(); // start JNDI MBeanServer localServer = ManagementFactory.getPlatformMBeanServer(); @@ -218,8 +238,7 @@ public class SpiderMonitor { //If you want to connect it from remote, use spiderMonitor.server().jmxStart(); //ONLY ONE server can start for a machine. //Others will be registered - spiderMonitor.server().server(); - spiderMonitor.jmxStart(); + spiderMonitor.server().jmxStart(); oschinaSpider.start(); githubSpider.start(); From f973889cdac05a92b61916c8693efc5ad3390652 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 25 Apr 2014 15:48:05 +0800 Subject: [PATCH 103/130] refactor subpageprossor etc. 
#94 --- .../example/PatternProcessorDemo.java | 56 ------------ .../example/PatternProcessorExample.java | 66 ++++++++++++++ .../handler/CompositePageProcessor.java | 17 +++- .../webmagic/handler/CompositePipeline.java | 42 +++++++++ .../webmagic/handler/PatternHandler.java | 90 ------------------- .../webmagic/handler/PatternProcessor.java | 13 +++ .../handler/PatternRequestMatcher.java | 37 ++++++++ .../webmagic/handler/RequestMatcher.java | 24 +++++ .../webmagic/handler/SubPageProcessor.java | 18 +--- .../webmagic/handler/SubPipeline.java | 21 +++++ .../webmagic/pipeline/PatternPipeline.java | 43 --------- .../processor/PatternPageProcessor.java | 76 ---------------- 12 files changed, 218 insertions(+), 285 deletions(-) delete mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/example/PatternProcessorDemo.java create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/example/PatternProcessorExample.java create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/handler/CompositePipeline.java delete mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/handler/PatternHandler.java create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/handler/PatternProcessor.java create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/handler/PatternRequestMatcher.java create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/handler/RequestMatcher.java create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/handler/SubPipeline.java delete mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/PatternPipeline.java delete mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/processor/PatternPageProcessor.java diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/PatternProcessorDemo.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/PatternProcessorDemo.java deleted file mode 
100644 index e2303a0..0000000 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/PatternProcessorDemo.java +++ /dev/null @@ -1,56 +0,0 @@ -package us.codecraft.webmagic.example; - -import org.apache.log4j.Logger; -import us.codecraft.webmagic.Page; -import us.codecraft.webmagic.ResultItems; -import us.codecraft.webmagic.Spider; -import us.codecraft.webmagic.Task; -import us.codecraft.webmagic.handler.PatternHandler; -import us.codecraft.webmagic.handler.SubPageProcessor; -import us.codecraft.webmagic.pipeline.PatternPipeline; -import us.codecraft.webmagic.processor.PatternPageProcessor; - -/** - * Created with IntelliJ IDEA. - * User: Sebastian MA - * Date: April 04, 2014 - * Time: 21:23 - */ -public class PatternProcessorDemo { - - private static Logger log = Logger.getLogger(PatternProcessorDemo.class); - - public static void main(String... args) { - - PatternPageProcessor processor - = new PatternPageProcessor("http://item.jd.com/981821.html", - PatternPageProcessor.TARGET_PATTERN_ALL - ); - - PatternPipeline pipeline = new PatternPipeline(); - - // define a handler which handles only "http://item.jd.com/.*" - PatternHandler handler = new PatternHandler("http://item.jd.com/.*") { - - @Override - public SubPageProcessor.MatchOtherProcessor process(Page page) { - - log.info("Extracting from " + page.getUrl()); - page.putField("test", "hello world:)"); - return MatchOtherProcessor.YES; - } - - @Override - public void handle(ResultItems result, Task task) { - - log.info("Handling " + result.getRequest().getUrl()); - log.info("Retrieved test=" + result.get("test")); - } - }; - - processor.addHandler(handler); - pipeline.addHandler(handler); - - Spider.create(processor).thread(5).addPipeline(pipeline).runAsync(); - } -} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/PatternProcessorExample.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/PatternProcessorExample.java new file mode 100644 index 
0000000..84b3164 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/PatternProcessorExample.java @@ -0,0 +1,66 @@ +package us.codecraft.webmagic.example; + +import org.apache.log4j.Logger; +import us.codecraft.webmagic.*; +import us.codecraft.webmagic.handler.CompositePageProcessor; +import us.codecraft.webmagic.handler.CompositePipeline; +import us.codecraft.webmagic.handler.PatternProcessor; +import us.codecraft.webmagic.handler.RequestMatcher; + +/** + * Created with IntelliJ IDEA. + * User: Sebastian MA + * Date: April 04, 2014 + * Time: 21:23 + */ +public class PatternProcessorExample { + + private static Logger log = Logger.getLogger(PatternProcessorExample.class); + + public static void main(String... args) { + + // define a patternProcessor which handles only "http://item.jd.com/.*" + PatternProcessor githubRepoProcessor = new PatternProcessor("https://github\\.com/[\\w\\-]+/[\\w\\-]+") { + + @Override + public RequestMatcher.MatchOther processPage(Page page) { + page.putField("reponame", page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString()); + return RequestMatcher.MatchOther.YES; + } + + @Override + public RequestMatcher.MatchOther processResult(ResultItems resultItems, Task task) { + log.info("Extracting from repo" + resultItems.getRequest()); + System.out.println(resultItems.get("reponame")); + return RequestMatcher.MatchOther.YES; + } + }; + + PatternProcessor githubUserProcessor = new PatternProcessor("https://github\\.com/[\\w\\-]+") { + + @Override + public RequestMatcher.MatchOther processPage(Page page) { + log.info("Extracting from " + page.getUrl()); + page.addTargetRequests(page.getHtml().links().regex("https://github\\.com/[\\w\\-]+/[\\w\\-]+").all()); + page.addTargetRequests(page.getHtml().links().regex("https://github\\.com/[\\w\\-]+").all()); + page.putField("username", page.getHtml().xpath("//span[@class='vcard-fullname']/text()").toString()); + return 
RequestMatcher.MatchOther.YES; + } + + @Override + public RequestMatcher.MatchOther processResult(ResultItems resultItems, Task task) { + System.out.println(resultItems.get("username")); + return RequestMatcher.MatchOther.YES; + } + }; + + CompositePageProcessor pageProcessor = new CompositePageProcessor(Site.me().setDomain("github.com")); + CompositePipeline pipeline = new CompositePipeline(); + + pageProcessor.setSubPageProcessors(githubRepoProcessor, githubUserProcessor); + pipeline.setSubPipeline(githubRepoProcessor, githubUserProcessor); + + Spider.create(pageProcessor).addUrl("https://github.com/code4craft").thread(5).addPipeline(pipeline).runAsync(); + } + +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/CompositePageProcessor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/CompositePageProcessor.java index ecf4aa1..2073445 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/CompositePageProcessor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/CompositePageProcessor.java @@ -15,14 +15,18 @@ public class CompositePageProcessor implements PageProcessor { private Site site; - private List subPageProcessors; + private List subPageProcessors = new ArrayList(); + + public CompositePageProcessor(Site site) { + this.site = site; + } @Override public void process(Page page) { for (SubPageProcessor subPageProcessor : subPageProcessors) { - if (subPageProcessor.match(page)) { - SubPageProcessor.MatchOtherProcessor matchOtherProcessorProcessor = subPageProcessor.process(page); - if (matchOtherProcessorProcessor == null || matchOtherProcessorProcessor != SubPageProcessor.MatchOtherProcessor.YES) { + if (subPageProcessor.match(page.getRequest())) { + SubPageProcessor.MatchOther matchOtherProcessorProcessor = subPageProcessor.processPage(page); + if (matchOtherProcessorProcessor == null || matchOtherProcessorProcessor != SubPageProcessor.MatchOther.YES) { return; } } @@ 
-34,6 +38,11 @@ public class CompositePageProcessor implements PageProcessor { return this; } + public CompositePageProcessor addSubPageProcessor(SubPageProcessor subPageProcessor) { + this.subPageProcessors.add(subPageProcessor); + return this; + } + public CompositePageProcessor setSubPageProcessors(SubPageProcessor... subPageProcessors) { this.subPageProcessors = new ArrayList(); for (SubPageProcessor subPageProcessor : subPageProcessors) { diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/CompositePipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/CompositePipeline.java new file mode 100644 index 0000000..3f09eee --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/CompositePipeline.java @@ -0,0 +1,42 @@ +package us.codecraft.webmagic.handler; + +import us.codecraft.webmagic.ResultItems; +import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.pipeline.Pipeline; + +import java.util.ArrayList; +import java.util.List; + +/** + * @author code4crafer@gmail.com + */ +public class CompositePipeline implements Pipeline { + + private List subPipelines = new ArrayList(); + + @Override + public void process(ResultItems resultItems, Task task) { + for (SubPipeline subPipeline : subPipelines) { + if (subPipeline.match(resultItems.getRequest())) { + RequestMatcher.MatchOther matchOtherProcessorProcessor = subPipeline.processResult(resultItems, task); + if (matchOtherProcessorProcessor == null || matchOtherProcessorProcessor != RequestMatcher.MatchOther.YES) { + return; + } + } + } + } + + public CompositePipeline addSubPipeline(SubPipeline subPipeline) { + this.subPipelines.add(subPipeline); + return this; + } + + public CompositePipeline setSubPipeline(SubPipeline... 
subPipelines) { + this.subPipelines = new ArrayList(); + for (SubPipeline subPipeline : subPipelines) { + this.subPipelines.add(subPipeline); + } + return this; + } + +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/PatternHandler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/PatternHandler.java deleted file mode 100644 index 4be03de..0000000 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/PatternHandler.java +++ /dev/null @@ -1,90 +0,0 @@ -package us.codecraft.webmagic.handler; - -import us.codecraft.webmagic.Page; -import us.codecraft.webmagic.ResultItems; -import us.codecraft.webmagic.Task; - -import java.util.UUID; - -/** - * Created with IntelliJ IDEA. - * User: Sebastian MA - * Date: April 03, 2014 - * Time: 10:00 - *

- * A PatternHandler is in charge of both page extraction and data processing by implementing - * its two abstract methods. - */ -public abstract class PatternHandler implements SubPageProcessor { - - /** - * identity of the handler. - */ - protected String id; - - /** - * match pattern. only matched page should be handled. - */ - protected String pattern; - - /** - * @param pattern - * url pattern to handle - */ - protected PatternHandler(String pattern) { - - this.pattern = pattern; - this.id = UUID.randomUUID().toString(); - } - - /** - * determine if the page should be handled. - */ - public boolean match(String url) { - - return url.matches(pattern); - } - - public boolean processPage(Page page) { - - if(match(page.getUrl().toString())) { - page.putField(id, true); - process(page); - return true; - } else { - return false; - } - } - - public boolean processResult(ResultItems resultItems, Task task) { - - if(resultItems.isSkip()) { - return false; - } - - if(match(resultItems.getRequest().getUrl()) && resultItems.get(id) != null) { - handle(resultItems, task); - return true; - } else { - return false; - } - } - - /** - * override this method to handle the extraction result. 
this method MUST use - * with PatternPipeline - * - * @param result - * extraction result - * @param task - */ - public void handle(ResultItems result, Task task) { - - } - - @Override - public boolean match(Page page) { - - return match(page.getUrl().toString()); - } -} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/PatternProcessor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/PatternProcessor.java new file mode 100644 index 0000000..f9ef286 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/PatternProcessor.java @@ -0,0 +1,13 @@ +package us.codecraft.webmagic.handler; + +/** + * @author code4crafer@gmail.com + */ +public abstract class PatternProcessor extends PatternRequestMatcher implements SubPipeline, SubPageProcessor { + /** + * @param pattern url pattern to handle + */ + public PatternProcessor(String pattern) { + super(pattern); + } +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/PatternRequestMatcher.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/PatternRequestMatcher.java new file mode 100644 index 0000000..5c0f31a --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/PatternRequestMatcher.java @@ -0,0 +1,37 @@ +package us.codecraft.webmagic.handler; + +import us.codecraft.webmagic.Request; + +import java.util.regex.Pattern; + +/** + * Created with IntelliJ IDEA. + * User: Sebastian MA + * Date: April 03, 2014 + * Time: 10:00 + *

+ * A PatternRequestMatcher decides whether a request should be handled by matching + * the request url against a regular expression pattern. + */ +public abstract class PatternRequestMatcher implements RequestMatcher { + + /** + * match pattern. only matched page should be handled. + */ + protected String pattern; + + private Pattern patternCompiled; + + /** + * @param pattern url pattern to handle + */ + public PatternRequestMatcher(String pattern) { + this.pattern = pattern; + this.patternCompiled = Pattern.compile(pattern); + } + + @Override + public boolean match(Request request) { + return patternCompiled.matcher(request.getUrl()).find(); + } +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/RequestMatcher.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/RequestMatcher.java new file mode 100644 index 0000000..31b9a78 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/RequestMatcher.java @@ -0,0 +1,24 @@ +package us.codecraft.webmagic.handler; + +import us.codecraft.webmagic.Request; + +/** + * @author code4crafer@gmail.com + * @since 0.5.0 + */ +public interface RequestMatcher { + + /** + * Check whether to process the page.

+ * Please DO NOT change page status in this method. + * + * @param page + * + * @return + */ + public boolean match(Request page); + + public enum MatchOther { + YES, NO + } +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/SubPageProcessor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/SubPageProcessor.java index 3778a62..1b6e283 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/SubPageProcessor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/SubPageProcessor.java @@ -6,17 +6,7 @@ import us.codecraft.webmagic.Page; * @author code4crafter@gmail.com * @date 14-4-5 */ -public interface SubPageProcessor { - - /** - * Check whether the SubPageProcessor can process the page.

- * Please DO NOT change page status in this method. - * - * @param page - * - * @return - */ - public boolean match(Page page); +public interface SubPageProcessor extends RequestMatcher { /** * process the page, extract urls to fetch, extract the data and store @@ -25,10 +15,6 @@ public interface SubPageProcessor { * * @return whether continue to match */ - public MatchOtherProcessor process(Page page); - - public enum MatchOtherProcessor { - YES, NO - } + public MatchOther processPage(Page page); } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/SubPipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/SubPipeline.java new file mode 100644 index 0000000..4045608 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/SubPipeline.java @@ -0,0 +1,21 @@ +package us.codecraft.webmagic.handler; + +import us.codecraft.webmagic.ResultItems; +import us.codecraft.webmagic.Task; + +/** + * @author code4crafer@gmail.com + * @since 0.5.0 + */ +public interface SubPipeline extends RequestMatcher { + + /** + * process the result items + * + * @param resultItems + * @param task + * @return whether continue to match + */ + public MatchOther processResult(ResultItems resultItems, Task task); + +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/PatternPipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/PatternPipeline.java deleted file mode 100644 index c614114..0000000 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/PatternPipeline.java +++ /dev/null @@ -1,43 +0,0 @@ -package us.codecraft.webmagic.pipeline; - -import us.codecraft.webmagic.ResultItems; -import us.codecraft.webmagic.Task; -import us.codecraft.webmagic.handler.PatternHandler; - -import java.util.ArrayList; - -/** - * Created with IntelliJ IDEA.
- * User: Sebastian MA - * Date: April 04, 2014 - * Time: 20:44 - */ -public class PatternPipeline implements Pipeline { - - protected ArrayList handlers = new ArrayList(); - - /** - * A handler works only if it is added to BOTH the page processor and the pipeline. - * Uses PatternHandler's register instead. - * - * @param handler the pattern handler - * - */ - public void addHandler(PatternHandler handler) { - - handlers.add(handler); - } - - public void removeHandler(PatternHandler handler) { - - handlers.remove(handler); - } - - @Override - public void process(ResultItems resultItems, Task task) { - - for(PatternHandler handler : handlers) { - handler.processResult(resultItems, task); - } - } -} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/processor/PatternPageProcessor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/processor/PatternPageProcessor.java deleted file mode 100644 index 51dbabe..0000000 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/processor/PatternPageProcessor.java +++ /dev/null @@ -1,76 +0,0 @@ -package us.codecraft.webmagic.processor; - -import us.codecraft.webmagic.Page; -import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.handler.PatternHandler; -import us.codecraft.webmagic.utils.UrlUtils; - -import java.util.ArrayList; -import java.util.List; - -/** - * Created with IntelliJ IDEA. - * User: Sebastian MA - * Date: April 04, 2014 - * Time: 15:36 - *

- * A PatternPageProcessor uses PatternHandler to setup extraction rules for specific url pattern. - * - * @see us.codecraft.webmagic.handler.PatternHandler - */ -public class PatternPageProcessor implements PageProcessor { - - public static final String TARGET_PATTERN_ALL = "http://*"; - - protected Site site; - - protected String targetPattern; - - protected ArrayList handlers = new ArrayList(); - - public PatternPageProcessor(String startUrl, String targetPattern) { - - this.targetPattern = targetPattern; - - this.site = Site.me().addStartUrl(startUrl).setDomain(UrlUtils.getDomain(startUrl)); - this.targetPattern = "(" + targetPattern.replace(".", "\\.").replace("*", - "[^\"'#]*") + ")"; - - site.setUserAgent("Chrome/5.0.354.0"); - } - - @Override - public void process(Page page) { - - - List requests = page.getHtml().links().regex(targetPattern).all(); - page.addTargetRequests(requests); - for(PatternHandler handler : handlers) { - if(handler.match(page.getUrl().toString())) { - handler.processPage(page); - } - } - } - - /** - * - * @param handler the pattern handler - * - * - */ - public void addHandler(PatternHandler handler) { - - handlers.add(handler); - } - - public void removeHandler(PatternHandler handler) { - - handlers.remove(handler); - } - - @Override - public Site getSite() { - - return site; - } -} From 4738ae2d14a0d2419026cf5b2e9990cf3f3e06f1 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 25 Apr 2014 16:04:41 +0800 Subject: [PATCH 104/130] change url find to match #94 --- .../codecraft/webmagic/example/PatternProcessorExample.java | 6 +++--- .../codecraft/webmagic/handler/PatternRequestMatcher.java | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/PatternProcessorExample.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/PatternProcessorExample.java index 84b3164..f6b2e9b 100644 --- 
a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/PatternProcessorExample.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/PatternProcessorExample.java @@ -31,7 +31,7 @@ public class PatternProcessorExample { @Override public RequestMatcher.MatchOther processResult(ResultItems resultItems, Task task) { log.info("Extracting from repo" + resultItems.getRequest()); - System.out.println(resultItems.get("reponame")); + System.out.println("Repo name: "+resultItems.get("reponame")); return RequestMatcher.MatchOther.YES; } }; @@ -49,12 +49,12 @@ public class PatternProcessorExample { @Override public RequestMatcher.MatchOther processResult(ResultItems resultItems, Task task) { - System.out.println(resultItems.get("username")); + System.out.println("User name: "+resultItems.get("username")); return RequestMatcher.MatchOther.YES; } }; - CompositePageProcessor pageProcessor = new CompositePageProcessor(Site.me().setDomain("github.com")); + CompositePageProcessor pageProcessor = new CompositePageProcessor(Site.me().setDomain("github.com").setRetryTimes(0).setSleepTime(0)); CompositePipeline pipeline = new CompositePipeline(); pageProcessor.setSubPageProcessors(githubRepoProcessor, githubUserProcessor); diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/PatternRequestMatcher.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/PatternRequestMatcher.java index 5c0f31a..9201a4c 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/PatternRequestMatcher.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/PatternRequestMatcher.java @@ -32,6 +32,6 @@ public abstract class PatternRequestMatcher implements RequestMatcher { @Override public boolean match(Request request) { - return patternCompiled.matcher(request.getUrl()).find(); + return patternCompiled.matcher(request.getUrl()).matches(); } } From 0336f4cdb4c71ed81bce53d7fb413503cdb312d5 Mon Sep 17 00:00:00 
2001 From: "yihua.huang" Date: Fri, 25 Apr 2014 16:06:29 +0800 Subject: [PATCH 105/130] remove IllegalStateException when download error for less error log --- webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 68b2e11..7f075b1 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -401,7 +401,7 @@ public class Spider implements Runnable, Task { Page page = downloader.download(request, this); if (page == null) { sleep(site.getSleepTime()); - throw new IllegalStateException("download error"); + onError(request); } // for cycle retry if (page.isNeedCycleRetry()) { From 179baa7a227b97f6465c5140224df718bfe64123 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 25 Apr 2014 16:07:41 +0800 Subject: [PATCH 106/130] return when page is null --- webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java | 1 + .../us/codecraft/webmagic/example/PatternProcessorExample.java | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 7f075b1..6560a1b 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -402,6 +402,7 @@ public class Spider implements Runnable, Task { if (page == null) { sleep(site.getSleepTime()); onError(request); + return; } // for cycle retry if (page.isNeedCycleRetry()) { diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/PatternProcessorExample.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/PatternProcessorExample.java index f6b2e9b..8ecb08f 100644 --- 
a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/PatternProcessorExample.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/PatternProcessorExample.java @@ -54,7 +54,7 @@ public class PatternProcessorExample { } }; - CompositePageProcessor pageProcessor = new CompositePageProcessor(Site.me().setDomain("github.com").setRetryTimes(0).setSleepTime(0)); + CompositePageProcessor pageProcessor = new CompositePageProcessor(Site.me().setDomain("github.com").setRetryTimes(3)); CompositePipeline pipeline = new CompositePipeline(); pageProcessor.setSubPageProcessors(githubRepoProcessor, githubUserProcessor); From c6661899fd4b06f1590613c5c42503c46e9b909c Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 25 Apr 2014 17:33:48 +0800 Subject: [PATCH 107/130] new thread pool #110 --- .../java/us/codecraft/webmagic/Spider.java | 27 +++---- .../example/GithubRepoPageProcessor.java | 3 +- .../webmagic/selector/thread/ThreadPool.java | 73 +++++++++++++++++++ .../codecraft/webmagic/utils/ThreadUtils.java | 1 - .../webmagic/monitor/SpiderMonitor.java | 2 +- 5 files changed, 86 insertions(+), 20 deletions(-) create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/selector/thread/ThreadPool.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 6560a1b..0d7d5be 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -1,11 +1,9 @@ package us.codecraft.webmagic; import com.google.common.collect.Lists; - import org.apache.commons.collections.CollectionUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; - import us.codecraft.webmagic.downloader.Downloader; import us.codecraft.webmagic.downloader.HttpClientDownloader; import us.codecraft.webmagic.pipeline.CollectorPipeline; @@ -15,7 +13,7 @@ import 
us.codecraft.webmagic.pipeline.ResultItemsCollectorPipeline; import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.scheduler.QueueScheduler; import us.codecraft.webmagic.scheduler.Scheduler; -import us.codecraft.webmagic.utils.ThreadUtils; +import us.codecraft.webmagic.selector.thread.ThreadPool; import us.codecraft.webmagic.utils.UrlUtils; import java.io.Closeable; @@ -79,7 +77,7 @@ public class Spider implements Runnable, Task { protected Logger logger = LoggerFactory.getLogger(getClass()); - protected ExecutorService executorService; + protected ThreadPool threadPool; protected int threadNum = 1; @@ -101,8 +99,6 @@ public class Spider implements Runnable, Task { private Condition newUrlCondition = newUrlLock.newCondition(); - private final AtomicInteger threadAlive = new AtomicInteger(0); - private List spiderListeners; private final AtomicLong pageCount = new AtomicLong(0); @@ -283,8 +279,8 @@ public class Spider implements Runnable, Task { pipelines.add(new ConsolePipeline()); } downloader.setThread(threadNum); - if (executorService == null || executorService.isShutdown()) { - executorService = ThreadUtils.newFixedThreadPool(threadNum); + if (threadPool == null || threadPool.isShutdown()) { + threadPool = new ThreadPool(threadNum); } if (startRequests != null) { for (Request request : startRequests) { @@ -292,7 +288,6 @@ public class Spider implements Runnable, Task { } startRequests.clear(); } - threadAlive.set(0); } @Override @@ -303,15 +298,14 @@ public class Spider implements Runnable, Task { while (!Thread.currentThread().isInterrupted() && stat.get() == STAT_RUNNING) { Request request = scheduler.poll(this); if (request == null) { - if (threadAlive.get() == 0 && exitWhenComplete) { + if (threadPool.getThreadAlive() == 0 && exitWhenComplete) { break; } // wait until new url added waitNewUrl(); } else { final Request requestFinal = request; - threadAlive.incrementAndGet(); - executorService.execute(new Runnable() { + 
threadPool.execute(new Runnable() { @Override public void run() { try { @@ -321,7 +315,6 @@ public class Spider implements Runnable, Task { onError(requestFinal); logger.error("process request " + requestFinal + " error", e); } finally { - threadAlive.decrementAndGet(); pageCount.incrementAndGet(); signalNewUrl(); } @@ -370,7 +363,7 @@ public class Spider implements Runnable, Task { for (Pipeline pipeline : pipelines) { destroyEach(pipeline); } - executorService.shutdown(); + threadPool.shutdown(); } private void destroyEach(Object object) { @@ -522,7 +515,7 @@ public class Spider implements Runnable, Task { newUrlLock.lock(); try { //double check - if (threadAlive.get() == 0 && exitWhenComplete) { + if (threadPool.getThreadAlive() == 0 && exitWhenComplete) { return; } newUrlCondition.await(); @@ -644,7 +637,7 @@ public class Spider implements Runnable, Task { * @since 0.4.1 */ public int getThreadAlive() { - return threadAlive.get(); + return threadPool.getThreadAlive(); } /** @@ -674,7 +667,7 @@ public class Spider implements Runnable, Task { } public Spider setExecutorService(ExecutorService executorService) { - this.executorService = executorService; + this.threadPool.setExecutorService(executorService); return this; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/GithubRepoPageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/GithubRepoPageProcessor.java index c512265..f4ae058 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/GithubRepoPageProcessor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/GithubRepoPageProcessor.java @@ -11,11 +11,12 @@ import us.codecraft.webmagic.processor.PageProcessor; */ public class GithubRepoPageProcessor implements PageProcessor { - private Site site = Site.me().setRetryTimes(3).setSleepTime(1000); + private Site site = Site.me().setRetryTimes(3).setSleepTime(0); @Override public void process(Page 
page) { page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all()); + page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+)").all()); page.putField("author", page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString()); page.putField("name", page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString()); if (page.getResultItems().get("name")==null){ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/thread/ThreadPool.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/thread/ThreadPool.java new file mode 100644 index 0000000..0548919 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/thread/ThreadPool.java @@ -0,0 +1,73 @@ +package us.codecraft.webmagic.selector.thread; + +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.locks.Condition; +import java.util.concurrent.locks.ReentrantLock; + +/** + * @author code4crafer@gmail.com + * @since 0.5.0 + */ +public class ThreadPool { + + private int threadNum; + + private int threadAlive; + + private ReentrantLock reentrantLock = new ReentrantLock(); + + private Condition condition = reentrantLock.newCondition(); + + public ThreadPool(int threadNum) { + this.threadNum = threadNum; + this.executorService = Executors.newFixedThreadPool(threadNum); + } + + public ThreadPool(int threadNum, ExecutorService executorService) { + this.threadNum = threadNum; + this.executorService = executorService; + } + + public void setExecutorService(ExecutorService executorService) { + this.executorService = executorService; + } + + public int getThreadAlive() { + return threadAlive; + } + + public int getThreadNum() { + return threadNum; + } + + private ExecutorService executorService; + + public void execute(Runnable runnable) { + try { + reentrantLock.lock(); + while (threadAlive >= threadNum) { + try { + condition.await(); + 
} catch (InterruptedException e) { + } + } + threadAlive++; + executorService.execute(runnable); + } finally { + condition.notify(); + threadAlive--; + reentrantLock.unlock(); + } + } + + public boolean isShutdown() { + return executorService.isShutdown(); + } + + public void shutdown() { + executorService.shutdown(); + } + + +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/ThreadUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/ThreadUtils.java index cdfe6d0..d8d2122 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/ThreadUtils.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/ThreadUtils.java @@ -19,7 +19,6 @@ public class ThreadUtils { } if (threadSize == 1) { return MoreExecutors.sameThreadExecutor(); - } return new ThreadPoolExecutor(threadSize - 1, threadSize - 1, 0L, TimeUnit.MILLISECONDS, new SynchronousQueue(), new ThreadPoolExecutor.CallerRunsPolicy()); diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java index 193ff94..0783b7e 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java @@ -240,7 +240,7 @@ public class SpiderMonitor { //Others will be registered spiderMonitor.server().jmxStart(); oschinaSpider.start(); - githubSpider.start(); + githubSpider.thread(10).start(); } From cdc423f2bf0e853729aaae41edf2565bad982f62 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 25 Apr 2014 17:41:41 +0800 Subject: [PATCH 108/130] log --- .../java/us/codecraft/webmagic/selector/thread/ThreadPool.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/thread/ThreadPool.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/thread/ThreadPool.java index 
0548919..3e90282 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/thread/ThreadPool.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/thread/ThreadPool.java @@ -53,10 +53,11 @@ public class ThreadPool { } } threadAlive++; + System.out.println(threadAlive); executorService.execute(runnable); } finally { - condition.notify(); threadAlive--; + condition.signal(); reentrantLock.unlock(); } } From 018061d2cdf26db11c3bb6804f39a84cf26a47b6 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 25 Apr 2014 18:01:02 +0800 Subject: [PATCH 109/130] fix error in thread pool --- .../webmagic/selector/thread/ThreadPool.java | 46 +++++++++++++------ 1 file changed, 32 insertions(+), 14 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/thread/ThreadPool.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/thread/ThreadPool.java index 3e90282..00df89a 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/thread/ThreadPool.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/thread/ThreadPool.java @@ -2,6 +2,7 @@ package us.codecraft.webmagic.selector.thread; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; +import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.locks.Condition; import java.util.concurrent.locks.ReentrantLock; @@ -13,7 +14,7 @@ public class ThreadPool { private int threadNum; - private int threadAlive; + private AtomicInteger threadAlive = new AtomicInteger(); private ReentrantLock reentrantLock = new ReentrantLock(); @@ -34,7 +35,7 @@ public class ThreadPool { } public int getThreadAlive() { - return threadAlive; + return threadAlive.get(); } public int getThreadNum() { @@ -43,22 +44,39 @@ public class ThreadPool { private ExecutorService executorService; - public void execute(Runnable runnable) { + public void execute(final Runnable runnable) { try { - reentrantLock.lock(); - while 
(threadAlive >= threadNum) { - try { - condition.await(); - } catch (InterruptedException e) { + + if (threadAlive.get() >= threadNum) { + reentrantLock.lock(); + while (threadAlive.get() >= threadNum) { + try { + condition.await(); + } catch (InterruptedException e) { + } } } - threadAlive++; - System.out.println(threadAlive); - executorService.execute(runnable); + threadAlive.incrementAndGet(); + executorService.execute(new Runnable() { + @Override + public void run() { + try { + runnable.run(); + } finally { + try { + reentrantLock.lock(); + threadAlive.decrementAndGet(); + condition.signal(); + } finally { + reentrantLock.unlock(); + } + } + } + }); } finally { - threadAlive--; - condition.signal(); - reentrantLock.unlock(); + if (reentrantLock.isLocked()) { + reentrantLock.unlock(); + } } } From 375e64e84521366dabbf1cc5bc08828dd25b8a81 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 25 Apr 2014 18:10:14 +0800 Subject: [PATCH 110/130] more monitor status --- .../src/main/java/us/codecraft/webmagic/Spider.java | 12 ++++++++---- .../us/codecraft/webmagic/monitor/SpiderStatus.java | 12 ++++++++++++ .../webmagic/monitor/SpiderStatusMXBean.java | 5 +++++ 3 files changed, 25 insertions(+), 4 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 0d7d5be..5016724 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -18,10 +18,7 @@ import us.codecraft.webmagic.utils.UrlUtils; import java.io.Closeable; import java.io.IOException; -import java.util.ArrayList; -import java.util.Collection; -import java.util.List; -import java.util.UUID; +import java.util.*; import java.util.concurrent.ExecutorService; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicLong; @@ -103,6 +100,8 @@ public class Spider implements Runnable, Task { private 
final AtomicLong pageCount = new AtomicLong(0); + private Date startTime; + /** * create a spider with pageProcessor. * @@ -288,6 +287,7 @@ public class Spider implements Runnable, Task { } startRequests.clear(); } + startTime = new Date(); } @Override @@ -685,6 +685,10 @@ public class Spider implements Runnable, Task { return this; } + public Date getStartTime() { + return startTime; + } + public Scheduler getScheduler() { return scheduler; } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderStatus.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderStatus.java index af08526..a87c040 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderStatus.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderStatus.java @@ -5,6 +5,7 @@ import org.slf4j.LoggerFactory; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.scheduler.MonitorableScheduler; +import java.util.Date; import java.util.List; /** @@ -76,4 +77,15 @@ public class SpiderStatus implements SpiderStatusMXBean { spider.stop(); } + @Override + public Date getStartTime() { + return spider.getStartTime(); + } + + @Override + public int getPagePerSecond() { + int runSeconds = (int) (System.currentTimeMillis() - getStartTime().getTime()) / 1000; + return getSuccessPageCount() / runSeconds; + } + } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderStatusMXBean.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderStatusMXBean.java index cc0f040..e49ff8f 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderStatusMXBean.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderStatusMXBean.java @@ -1,5 +1,6 @@ package us.codecraft.webmagic.monitor; +import java.util.Date; import java.util.List; /** @@ -27,4 +28,8 @@ public interface SpiderStatusMXBean { public void start(); public void stop(); + 
+ public Date getStartTime(); + + public int getPagePerSecond(); } From 05eb7831b68fbe4971d2aa3765d172ef3b5bd04f Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 25 Apr 2014 18:27:40 +0800 Subject: [PATCH 111/130] refactor and comments #110 --- .../java/us/codecraft/webmagic/Spider.java | 40 +++++++++++++++---- ...readPool.java => CountableThreadPool.java} | 12 ++++-- .../codecraft/webmagic/utils/ThreadUtils.java | 6 +++ 3 files changed, 47 insertions(+), 11 deletions(-) rename webmagic-core/src/main/java/us/codecraft/webmagic/selector/thread/{ThreadPool.java => CountableThreadPool.java} (84%) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 5016724..0c5b147 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -13,7 +13,7 @@ import us.codecraft.webmagic.pipeline.ResultItemsCollectorPipeline; import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.scheduler.QueueScheduler; import us.codecraft.webmagic.scheduler.Scheduler; -import us.codecraft.webmagic.selector.thread.ThreadPool; +import us.codecraft.webmagic.selector.thread.CountableThreadPool; import us.codecraft.webmagic.utils.UrlUtils; import java.io.Closeable; @@ -74,7 +74,9 @@ public class Spider implements Runnable, Task { protected Logger logger = LoggerFactory.getLogger(getClass()); - protected ThreadPool threadPool; + protected CountableThreadPool threadPool; + + protected ExecutorService executorService; protected int threadNum = 1; @@ -279,7 +281,11 @@ public class Spider implements Runnable, Task { } downloader.setThread(threadNum); if (threadPool == null || threadPool.isShutdown()) { - threadPool = new ThreadPool(threadNum); + if (executorService != null && !executorService.isShutdown()) { + threadPool = new CountableThreadPool(threadNum, executorService); + } else { + threadPool = 
new CountableThreadPool(threadNum); + } } if (startRequests != null) { for (Request request : startRequests) { @@ -330,7 +336,7 @@ public class Spider implements Runnable, Task { } protected void onError(Request request) { - if (CollectionUtils.isNotEmpty(spiderListeners)){ + if (CollectionUtils.isNotEmpty(spiderListeners)) { for (SpiderListener spiderListener : spiderListeners) { spiderListener.onError(request); } @@ -338,7 +344,7 @@ public class Spider implements Runnable, Task { } protected void onSuccess(Request request) { - if (CollectionUtils.isNotEmpty(spiderListeners)){ + if (CollectionUtils.isNotEmpty(spiderListeners)) { for (SpiderListener spiderListener : spiderListeners) { spiderListener.onSuccess(request); } @@ -521,8 +527,7 @@ public class Spider implements Runnable, Task { newUrlCondition.await(); } catch (InterruptedException e) { logger.warn("waitNewUrl - interrupted, error {}", e); - } - finally { + } finally { newUrlLock.unlock(); } } @@ -563,6 +568,21 @@ public class Spider implements Runnable, Task { return this; } + /** + * start with more than one threads + * + * @param threadNum + * @return this + */ + public Spider thread(ExecutorService executorService, int threadNum) { + checkIfRunning(); + this.threadNum = threadNum; + if (threadNum <= 0) { + throw new IllegalArgumentException("threadNum should be more than one!"); + } + return this; + } + public boolean isExitWhenComplete() { return exitWhenComplete; } @@ -637,6 +657,9 @@ public class Spider implements Runnable, Task { * @since 0.4.1 */ public int getThreadAlive() { + if (threadPool == null) { + return 0; + } return threadPool.getThreadAlive(); } @@ -667,7 +690,8 @@ public class Spider implements Runnable, Task { } public Spider setExecutorService(ExecutorService executorService) { - this.threadPool.setExecutorService(executorService); + checkIfRunning(); + this.executorService = executorService; return this; } diff --git 
a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/thread/ThreadPool.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/thread/CountableThreadPool.java similarity index 84% rename from webmagic-core/src/main/java/us/codecraft/webmagic/selector/thread/ThreadPool.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/selector/thread/CountableThreadPool.java index 00df89a..0121cf2 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/thread/ThreadPool.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/thread/CountableThreadPool.java @@ -7,10 +7,16 @@ import java.util.concurrent.locks.Condition; import java.util.concurrent.locks.ReentrantLock; /** + * Thread pool for workers.

+ * Use {@link java.util.concurrent.ExecutorService} as inner implement.

+ * New feature:

+ * 1. Block when thread pool is full to avoid poll many urls but not process.

+ * 2. Count of thread alive for monitor. + * * @author code4crafer@gmail.com * @since 0.5.0 */ -public class ThreadPool { +public class CountableThreadPool { private int threadNum; @@ -20,12 +26,12 @@ public class ThreadPool { private Condition condition = reentrantLock.newCondition(); - public ThreadPool(int threadNum) { + public CountableThreadPool(int threadNum) { this.threadNum = threadNum; this.executorService = Executors.newFixedThreadPool(threadNum); } - public ThreadPool(int threadNum, ExecutorService executorService) { + public CountableThreadPool(int threadNum, ExecutorService executorService) { this.threadNum = threadNum; this.executorService = executorService; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/ThreadUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/ThreadUtils.java index d8d2122..5c4d346 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/ThreadUtils.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/ThreadUtils.java @@ -13,6 +13,12 @@ import java.util.concurrent.TimeUnit; */ public class ThreadUtils { + /** + * @Deprecated + * @param threadSize + * @return + * @see us.codecraft.webmagic.selector.thread.CountableThreadPool + */ public static ExecutorService newFixedThreadPool(int threadSize) { if (threadSize <= 0) { throw new IllegalArgumentException("ThreadSize must be greater than 0!"); From b1258f4f160a84d080fc281568bfba01d3279044 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 25 Apr 2014 18:30:49 +0800 Subject: [PATCH 112/130] remove some test --- .../java/us/codecraft/webmagic/scripts/ScriptProcessorTest.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/webmagic-scripts/src/test/java/us/codecraft/webmagic/scripts/ScriptProcessorTest.java b/webmagic-scripts/src/test/java/us/codecraft/webmagic/scripts/ScriptProcessorTest.java index 23fe093..ffeb9c9 100755 --- a/webmagic-scripts/src/test/java/us/codecraft/webmagic/scripts/ScriptProcessorTest.java 
+++ b/webmagic-scripts/src/test/java/us/codecraft/webmagic/scripts/ScriptProcessorTest.java @@ -1,5 +1,6 @@ package us.codecraft.webmagic.scripts; +import org.junit.Ignore; import org.junit.Test; import us.codecraft.webmagic.Spider; @@ -7,6 +8,7 @@ import us.codecraft.webmagic.Spider; * @author code4crafter@gmail.com * @since 0.4.1 */ +@Ignore public class ScriptProcessorTest { @Test From 3a666fcebf40516a116ca6b290e255648af4e186 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 25 Apr 2014 18:36:07 +0800 Subject: [PATCH 113/130] add sample of 36kr #110 --- .../webmagic/model/samples/Kr36NewsModel.java | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Kr36NewsModel.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Kr36NewsModel.java index 936f132..a9e3f3a 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Kr36NewsModel.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Kr36NewsModel.java @@ -1,14 +1,19 @@ package us.codecraft.webmagic.model.samples; import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.model.OOSpider; +import us.codecraft.webmagic.monitor.SpiderMonitor; import us.codecraft.webmagic.pipeline.PageModelPipeline; import us.codecraft.webmagic.model.annotation.ExtractBy; import us.codecraft.webmagic.model.annotation.ExtractByUrl; import us.codecraft.webmagic.model.annotation.HelpUrl; import us.codecraft.webmagic.model.annotation.TargetUrl; +import javax.management.JMException; +import java.io.IOException; + /** * @author code4crafter@gmail.com
*/ @@ -25,14 +30,17 @@ public class Kr36NewsModel { @ExtractByUrl private String url; - public static void main(String[] args) { + public static void main(String[] args) throws IOException, JMException { //Just for benchmark - OOSpider.create(Site.me().addStartUrl("http://www.36kr.com/").setSleepTime(0), new PageModelPipeline() { + Spider thread = OOSpider.create(Site.me().addStartUrl("http://www.36kr.com/").setSleepTime(0), new PageModelPipeline() { @Override public void process(Object o, Task task) { } - },Kr36NewsModel.class).thread(20).run(); + }, Kr36NewsModel.class).thread(20); + thread.run(); + SpiderMonitor spiderMonitor = SpiderMonitor.create(); + spiderMonitor.register(thread).jmxStart(); } public String getTitle() { From 17e95f2a7feb713fcacd61f9ade7b140d9587f87 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 25 Apr 2014 18:39:01 +0800 Subject: [PATCH 114/130] comments --- .../selector/thread/CountableThreadPool.java | 2 +- .../ConfigurablePageProcessor.java | 2 ++ .../webmagic/monitor/SpiderMonitor.java | 22 ++----------------- 3 files changed, 5 insertions(+), 21 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/thread/CountableThreadPool.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/thread/CountableThreadPool.java index 0121cf2..b20ff15 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/thread/CountableThreadPool.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/thread/CountableThreadPool.java @@ -10,7 +10,7 @@ import java.util.concurrent.locks.ReentrantLock; * Thread pool for workers.

* Use {@link java.util.concurrent.ExecutorService} as inner implement.

* New feature:

- * 1. Block when thread pool is full to avoid poll many urls but not process.

+ * 1. Block when thread pool is full to avoid poll many urls without process.

* 2. Count of thread alive for monitor. * * @author code4crafer@gmail.com diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ConfigurablePageProcessor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ConfigurablePageProcessor.java index 36615d8..902dfdd 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ConfigurablePageProcessor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ConfigurablePageProcessor.java @@ -3,12 +3,14 @@ package us.codecraft.webmagic.configurable; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.processor.PageProcessor; +import us.codecraft.webmagic.utils.Experimental; import java.util.List; /** * @author code4crafter@gmail.com
*/ +@Experimental public class ConfigurablePageProcessor implements PageProcessor { private Site site; diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java index 0783b7e..ea9b374 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java @@ -5,8 +5,7 @@ import org.slf4j.LoggerFactory; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.SpiderListener; -import us.codecraft.webmagic.processor.example.GithubRepoPageProcessor; -import us.codecraft.webmagic.processor.example.OschinaBlogPageProcessor; +import us.codecraft.webmagic.utils.Experimental; import us.codecraft.webmagic.utils.IPUtils; import javax.management.JMException; @@ -30,6 +29,7 @@ import java.util.concurrent.atomic.AtomicInteger; * @author code4crafer@gmail.com * @since 0.5.0 */ +@Experimental public class SpiderMonitor { private enum Type { @@ -226,22 +226,4 @@ public class SpiderMonitor { return this; } - public static void main(String[] args) throws Exception { - - Spider oschinaSpider = Spider.create(new OschinaBlogPageProcessor()) - .addUrl("http://my.oschina.net/flashsword/blog").thread(2); - Spider githubSpider = Spider.create(new GithubRepoPageProcessor()) - .addUrl("https://github.com/code4craft"); - - SpiderMonitor spiderMonitor = new SpiderMonitor(); - spiderMonitor.register(oschinaSpider, githubSpider); - //If you want to connect it from remote, use spiderMonitor.server().jmxStart(); - //ONLY ONE server can start for a machine. 
- //Others will be registered - spiderMonitor.server().jmxStart(); - oschinaSpider.start(); - githubSpider.thread(10).start(); - - } - } From c7afdb516e0cda23a03017efc427a388eed997d4 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 25 Apr 2014 18:44:33 +0800 Subject: [PATCH 115/130] remove thread utils #110 --- .../codecraft/webmagic/utils/ThreadUtils.java | 32 ------------------- 1 file changed, 32 deletions(-) delete mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/utils/ThreadUtils.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/ThreadUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/ThreadUtils.java deleted file mode 100644 index 5c4d346..0000000 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/ThreadUtils.java +++ /dev/null @@ -1,32 +0,0 @@ -package us.codecraft.webmagic.utils; - -import com.google.common.util.concurrent.MoreExecutors; - -import java.util.concurrent.ExecutorService; -import java.util.concurrent.SynchronousQueue; -import java.util.concurrent.ThreadPoolExecutor; -import java.util.concurrent.TimeUnit; - -/** - * @author code4crafer@gmail.com - * @since 0.1.0 - */ -public class ThreadUtils { - - /** - * @Deprecated - * @param threadSize - * @return - * @see us.codecraft.webmagic.selector.thread.CountableThreadPool - */ - public static ExecutorService newFixedThreadPool(int threadSize) { - if (threadSize <= 0) { - throw new IllegalArgumentException("ThreadSize must be greater than 0!"); - } - if (threadSize == 1) { - return MoreExecutors.sameThreadExecutor(); - } - return new ThreadPoolExecutor(threadSize - 1, threadSize - 1, 0L, TimeUnit.MILLISECONDS, - new SynchronousQueue(), new ThreadPoolExecutor.CallerRunsPolicy()); - } -} From 964e637264d8eba7f738f32466d45ad4505e8cfc Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 25 Apr 2014 18:46:31 +0800 Subject: [PATCH 116/130] fix ut #110 --- .../src/main/java/us/codecraft/webmagic/worker/Worker.java | 4 ++-- 1 file 
changed, 2 insertions(+), 2 deletions(-) diff --git a/webmagic-avalon/webmagic-worker/src/main/java/us/codecraft/webmagic/worker/Worker.java b/webmagic-avalon/webmagic-worker/src/main/java/us/codecraft/webmagic/worker/Worker.java index a65c94b..312500b 100644 --- a/webmagic-avalon/webmagic-worker/src/main/java/us/codecraft/webmagic/worker/Worker.java +++ b/webmagic-avalon/webmagic-worker/src/main/java/us/codecraft/webmagic/worker/Worker.java @@ -1,11 +1,11 @@ package us.codecraft.webmagic.worker; import us.codecraft.webmagic.Spider; -import us.codecraft.webmagic.utils.ThreadUtils; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; /** * Container of Spiders. @@ -33,7 +33,7 @@ public class Worker { } protected ExecutorService initExecutorService() { - return ThreadUtils.newFixedThreadPool(poolSize); + return Executors.newFixedThreadPool(poolSize); } public void addSpider(Spider spider) { From 5ecd909ef2282705cb8f6fb3c2accea2f283354d Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 25 Apr 2014 19:37:55 +0800 Subject: [PATCH 117/130] add timeout for wait/notify #111 --- .../java/us/codecraft/webmagic/Spider.java | 14 +++++- .../selector/thread/CountableThreadPool.java | 43 +++++++++---------- .../us/codecraft/webmagic/SpiderTest.java | 2 +- 3 files changed, 35 insertions(+), 24 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 0c5b147..81cf179 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -20,6 +20,7 @@ import java.io.Closeable; import java.io.IOException; import java.util.*; import java.util.concurrent.ExecutorService; +import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicLong; 
import java.util.concurrent.locks.Condition; @@ -104,6 +105,8 @@ public class Spider implements Runnable, Task { private Date startTime; + private int emptySleepTime = 30000; + /** * create a spider with pageProcessor. * @@ -524,7 +527,7 @@ public class Spider implements Runnable, Task { if (threadPool.getThreadAlive() == 0 && exitWhenComplete) { return; } - newUrlCondition.await(); + newUrlCondition.await(emptySleepTime, TimeUnit.MILLISECONDS); } catch (InterruptedException e) { logger.warn("waitNewUrl - interrupted, error {}", e); } finally { @@ -716,4 +719,13 @@ public class Spider implements Runnable, Task { public Scheduler getScheduler() { return scheduler; } + + /** + * Set wait time when no url is polled.

+ * + * @param emptySleepTime In MILLISECONDS. + */ + public void setEmptySleepTime(int emptySleepTime) { + this.emptySleepTime = emptySleepTime; + } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/thread/CountableThreadPool.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/thread/CountableThreadPool.java index b20ff15..ac41668 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/thread/CountableThreadPool.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/thread/CountableThreadPool.java @@ -51,9 +51,10 @@ public class CountableThreadPool { private ExecutorService executorService; public void execute(final Runnable runnable) { - try { - if (threadAlive.get() >= threadNum) { + + if (threadAlive.get() >= threadNum) { + try { reentrantLock.lock(); while (threadAlive.get() >= threadNum) { try { @@ -61,29 +62,27 @@ public class CountableThreadPool { } catch (InterruptedException e) { } } - } - threadAlive.incrementAndGet(); - executorService.execute(new Runnable() { - @Override - public void run() { - try { - runnable.run(); - } finally { - try { - reentrantLock.lock(); - threadAlive.decrementAndGet(); - condition.signal(); - } finally { - reentrantLock.unlock(); - } - } - } - }); - } finally { - if (reentrantLock.isLocked()) { + } finally { reentrantLock.unlock(); } } + threadAlive.incrementAndGet(); + executorService.execute(new Runnable() { + @Override + public void run() { + try { + runnable.run(); + } finally { + try { + reentrantLock.lock(); + threadAlive.decrementAndGet(); + condition.signal(); + } finally { + reentrantLock.unlock(); + } + } + } + }); } public boolean isShutdown() { diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/SpiderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/SpiderTest.java index 9d950ae..ba29387 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/SpiderTest.java +++ 
b/webmagic-core/src/test/java/us/codecraft/webmagic/SpiderTest.java @@ -37,7 +37,7 @@ public class SpiderTest { @Test public void testWaitAndNotify() throws InterruptedException { for (int i = 0; i < 10000; i++) { - System.out.println("round" + i); + System.out.println("round " + i); testRound(); } } From 2770811a10ee57838f3ca5ae10f36a7e6ab44bca Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 26 Apr 2014 11:24:22 +0800 Subject: [PATCH 118/130] update monitor example --- .../processor/example/OschinaBlogPageProcessor.java | 2 +- .../java/us/codecraft/webmagic/example/MonitorExample.java | 7 ++++--- .../us/codecraft/webmagic/model/samples/Kr36NewsModel.java | 4 ++-- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/OschinaBlogPageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/OschinaBlogPageProcessor.java index aac0ac1..053c155 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/OschinaBlogPageProcessor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/OschinaBlogPageProcessor.java @@ -34,6 +34,6 @@ public class OschinaBlogPageProcessor implements PageProcessor { } public static void main(String[] args) { - Spider.create(new OschinaBlogPageProcessor()).addUrl("http://my.oschina.net/flashsword/blog").thread(2).run(); + Spider.create(new OschinaBlogPageProcessor()).addUrl("http://my.oschina.net/flashsword/blog").run(); } } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/MonitorExample.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/MonitorExample.java index d22a16e..0763fdd 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/MonitorExample.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/MonitorExample.java @@ -14,17 +14,18 @@ public class MonitorExample { public static void main(String[] 
args) throws Exception { Spider oschinaSpider = Spider.create(new OschinaBlogPageProcessor()) - .addUrl("http://my.oschina.net/flashsword/blog").thread(2); + .addUrl("http://my.oschina.net/flashsword/blog"); Spider githubSpider = Spider.create(new GithubRepoPageProcessor()) .addUrl("https://github.com/code4craft"); SpiderMonitor spiderMonitor = new SpiderMonitor(); - spiderMonitor.register(oschinaSpider, githubSpider); + spiderMonitor.register(oschinaSpider); + spiderMonitor.register(githubSpider); //If you want to connect it from remote, use spiderMonitor.server().jmxStart(); //ONLY ONE server can start for a machine. //Others will be registered without start a server. //You can also register a server by spiderMonitor.client(host,port).jmxStart(). - spiderMonitor.jmxStart(); + spiderMonitor.server().jmxStart(); oschinaSpider.start(); githubSpider.start(); diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Kr36NewsModel.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Kr36NewsModel.java index a9e3f3a..3dcc5f9 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Kr36NewsModel.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Kr36NewsModel.java @@ -38,9 +38,9 @@ public class Kr36NewsModel { } }, Kr36NewsModel.class).thread(20); - thread.run(); + thread.start(); SpiderMonitor spiderMonitor = SpiderMonitor.create(); - spiderMonitor.register(thread).jmxStart(); + spiderMonitor.server().register(thread).jmxStart(); } public String getTitle() { From 04fde8203b175a259ee102b2cfa94bd4aa36a298 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 26 Apr 2014 11:44:14 +0800 Subject: [PATCH 119/130] add control for monitor --- .../java/us/codecraft/webmagic/monitor/SpiderMonitor.java | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java 
b/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java index ea9b374..265efc7 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java @@ -52,6 +52,8 @@ public class SpiderMonitor { private Type type = Type.Local; + private JMXConnectorServer jmxConnServer; + private List spiderStatuses = new ArrayList(); public List getSpiders() { @@ -214,7 +216,7 @@ public class SpiderMonitor { JMXServiceURL url = new JMXServiceURL("service:jmx:rmi:///jndi/rmi://" + jndiServer + ":" + rmiPort + "/" + jmxServerName); System.out.println("JMXServiceURL: " + url.toString()); System.out.println("Please replace localhost of your ip if you want to connect it in remote server."); - JMXConnectorServer jmxConnServer = JMXConnectorServerFactory.newJMXConnectorServer(url, null, localServer); + jmxConnServer = JMXConnectorServerFactory.newJMXConnectorServer(url, null, localServer); jmxConnServer.start(); } @@ -222,6 +224,8 @@ public class SpiderMonitor { objName = new ObjectName(jmxServerName + ":name=" + spiderStatus.getName()); localServer.registerMBean(spiderStatus, objName); } + objName = new ObjectName(jmxServerName + ":name=WebMagicMonitor"); + localServer.registerMBean(jmxConnServer, objName); return this; } From ab4d36806e211f71cee2982801ee1f2a0fe7254b Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 26 Apr 2014 11:45:21 +0800 Subject: [PATCH 120/130] clean code --- .../java/us/codecraft/webmagic/monitor/SpiderMonitor.java | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java index 265efc7..b9a9d20 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java +++ 
b/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java @@ -52,8 +52,6 @@ public class SpiderMonitor { private Type type = Type.Local; - private JMXConnectorServer jmxConnServer; - private List spiderStatuses = new ArrayList(); public List getSpiders() { @@ -216,16 +214,16 @@ public class SpiderMonitor { JMXServiceURL url = new JMXServiceURL("service:jmx:rmi:///jndi/rmi://" + jndiServer + ":" + rmiPort + "/" + jmxServerName); System.out.println("JMXServiceURL: " + url.toString()); System.out.println("Please replace localhost of your ip if you want to connect it in remote server."); - jmxConnServer = JMXConnectorServerFactory.newJMXConnectorServer(url, null, localServer); + JMXConnectorServer jmxConnServer = JMXConnectorServerFactory.newJMXConnectorServer(url, null, localServer); jmxConnServer.start(); + objName = new ObjectName(jmxServerName + ":name=WebMagicMonitor"); + localServer.registerMBean(jmxConnServer, objName); } for (SpiderStatusMXBean spiderStatus : spiderStatuses) { objName = new ObjectName(jmxServerName + ":name=" + spiderStatus.getName()); localServer.registerMBean(spiderStatus, objName); } - objName = new ObjectName(jmxServerName + ":name=WebMagicMonitor"); - localServer.registerMBean(jmxConnServer, objName); return this; } From 86a45a6643dee003043614928dc093f598c767cf Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 26 Apr 2014 18:14:25 +0800 Subject: [PATCH 121/130] change SpiderMonitor to singleton #98 --- .../webmagic/example/MonitorExample.java | 2 +- .../webmagic/monitor/SpiderMonitor.java | 56 +++++++++++-------- .../webmagic/model/samples/Kr36NewsModel.java | 2 +- 3 files changed, 35 insertions(+), 25 deletions(-) diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/MonitorExample.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/MonitorExample.java index 0763fdd..60f62b0 100644 --- 
a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/MonitorExample.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/MonitorExample.java @@ -18,7 +18,7 @@ public class MonitorExample { Spider githubSpider = Spider.create(new GithubRepoPageProcessor()) .addUrl("https://github.com/code4craft"); - SpiderMonitor spiderMonitor = new SpiderMonitor(); + SpiderMonitor spiderMonitor = SpiderMonitor.instance(); spiderMonitor.register(oschinaSpider); spiderMonitor.register(githubSpider); //If you want to connect it from remote, use spiderMonitor.server().jmxStart(); diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java index b9a9d20..10ca508 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java @@ -8,9 +8,7 @@ import us.codecraft.webmagic.SpiderListener; import us.codecraft.webmagic.utils.Experimental; import us.codecraft.webmagic.utils.IPUtils; -import javax.management.JMException; -import javax.management.MBeanServer; -import javax.management.ObjectName; +import javax.management.*; import javax.management.remote.JMXConnectorServer; import javax.management.remote.JMXConnectorServerFactory; import javax.management.remote.JMXServiceURL; @@ -36,7 +34,7 @@ public class SpiderMonitor { Server, Client, Local; } - private static AtomicInteger serialNumber = new AtomicInteger(); + private static SpiderMonitor INSTANCE = new SpiderMonitor(); private AtomicBoolean started = new AtomicBoolean(false); @@ -48,6 +46,10 @@ public class SpiderMonitor { private int serverPort; + private MBeanServer mbeanServer; + + private String jmxServerName; + private String serverHost; private Type type = Type.Local; @@ -62,13 +64,16 @@ public class SpiderMonitor { return spiderStatuses.get(0); } + private 
SpiderMonitor() { + } + /** * Register spider for monitor. * * @param spiders * @return */ - public SpiderMonitor register(Spider... spiders) { + public synchronized SpiderMonitor register(Spider... spiders) throws JMException { for (Spider spider : spiders) { MonitorSpiderListener monitorSpiderListener = new MonitorSpiderListener(); if (spider.getSpiderListeners() == null) { @@ -78,7 +83,11 @@ public class SpiderMonitor { } else { spider.getSpiderListeners().add(monitorSpiderListener); } - spiderStatuses.add(getSpiderStatusMBean(spider, monitorSpiderListener)); + SpiderStatusMXBean spiderStatusMBean = getSpiderStatusMBean(spider, monitorSpiderListener); + if (started.get()) { + registerMBean(spiderStatusMBean); + } + spiderStatuses.add(spiderStatusMBean); } return this; } @@ -87,8 +96,8 @@ public class SpiderMonitor { return new SpiderStatus(spider, monitorSpiderListener); } - public static SpiderMonitor create() { - return new SpiderMonitor(); + public static SpiderMonitor instance() { + return INSTANCE; } public class MonitorSpiderListener implements SpiderListener { @@ -132,7 +141,7 @@ public class SpiderMonitor { * @throws IOException * @throws JMException */ - public SpiderMonitor server(int port) throws IOException, JMException { + public synchronized SpiderMonitor server(int port) throws IOException, JMException { try { Registry registry = LocateRegistry.createRegistry(port); } catch (ExportException e) { @@ -161,7 +170,7 @@ public class SpiderMonitor { * * @return */ - public SpiderMonitor local() { + public synchronized SpiderMonitor local() { this.type = Type.Local; return this; } @@ -176,7 +185,7 @@ public class SpiderMonitor { * @throws IOException * @throws JMException */ - public SpiderMonitor client(String serverHost, int serverPort) throws IOException, JMException { + public synchronized SpiderMonitor client(String serverHost, int serverPort) throws IOException, JMException { type = Type.Client; this.serverHost = serverHost; this.serverPort = 
serverPort; @@ -194,38 +203,39 @@ public class SpiderMonitor { return client(DEFAULT_SERVER_HOST, DEFAULT_SERVER_PORT); } - public SpiderMonitor jmxStart() throws IOException, JMException { - return jmxStart("localhost", DEFAULT_SERVER_PORT); - } - - public SpiderMonitor jmxStart(String jndiServer, int rmiPort) throws IOException, JMException { + public synchronized SpiderMonitor jmxStart() throws IOException, JMException { if (!started.compareAndSet(false, true)) { logger.error("Monitor has already started!"); return this; } - String jmxServerName = "WebMagic-" + IPUtils.getFirstNoLoopbackIPAddresses() + "-" + serialNumber.incrementAndGet(); + jmxServerName = "WebMagic-" + IPUtils.getFirstNoLoopbackIPAddresses(); // start JNDI - MBeanServer localServer = ManagementFactory.getPlatformMBeanServer(); + mbeanServer = ManagementFactory.getPlatformMBeanServer(); ObjectName objName; if (type != Type.Local) { - JMXServiceURL url = new JMXServiceURL("service:jmx:rmi:///jndi/rmi://" + jndiServer + ":" + rmiPort + "/" + jmxServerName); + JMXServiceURL url = new JMXServiceURL("service:jmx:rmi:///jndi/rmi://" + serverHost + ":" + serverPort + "/" + jmxServerName); System.out.println("JMXServiceURL: " + url.toString()); System.out.println("Please replace localhost of your ip if you want to connect it in remote server."); - JMXConnectorServer jmxConnServer = JMXConnectorServerFactory.newJMXConnectorServer(url, null, localServer); + JMXConnectorServer jmxConnServer = JMXConnectorServerFactory.newJMXConnectorServer(url, null, mbeanServer); jmxConnServer.start(); objName = new ObjectName(jmxServerName + ":name=WebMagicMonitor"); - localServer.registerMBean(jmxConnServer, objName); + mbeanServer.registerMBean(jmxConnServer, objName); } for (SpiderStatusMXBean spiderStatus : spiderStatuses) { - objName = new ObjectName(jmxServerName + ":name=" + spiderStatus.getName()); - localServer.registerMBean(spiderStatus, objName); + registerMBean(spiderStatus); } return this; } + protected 
void registerMBean(SpiderStatusMXBean spiderStatus) throws MalformedObjectNameException, InstanceAlreadyExistsException, MBeanRegistrationException, NotCompliantMBeanException { + ObjectName objName; + objName = new ObjectName(jmxServerName + ":name=" + spiderStatus.getName()); + mbeanServer.registerMBean(spiderStatus, objName); + } + } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Kr36NewsModel.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Kr36NewsModel.java index 3dcc5f9..6208f68 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Kr36NewsModel.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Kr36NewsModel.java @@ -39,7 +39,7 @@ public class Kr36NewsModel { } }, Kr36NewsModel.class).thread(20); thread.start(); - SpiderMonitor spiderMonitor = SpiderMonitor.create(); + SpiderMonitor spiderMonitor = SpiderMonitor.instance(); spiderMonitor.server().register(thread).jmxStart(); } From 94a67165e140f64b1f3fd6485422bed117249122 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 26 Apr 2014 20:17:52 +0800 Subject: [PATCH 122/130] remove jmx server for simplify #98 --- .../webmagic/example/MonitorExample.java | 12 +- .../webmagic/monitor/SpiderMonitor.java | 141 +----------------- .../webmagic/monitor/SpiderMonitorTest.java | 1 - .../webmagic/model/samples/Kr36NewsModel.java | 2 +- 4 files changed, 8 insertions(+), 148 deletions(-) diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/MonitorExample.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/MonitorExample.java index 60f62b0..55fa0c8 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/MonitorExample.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/MonitorExample.java @@ -18,16 +18,8 @@ public class MonitorExample { Spider githubSpider = Spider.create(new GithubRepoPageProcessor()) 
.addUrl("https://github.com/code4craft"); - SpiderMonitor spiderMonitor = SpiderMonitor.instance(); - spiderMonitor.register(oschinaSpider); - spiderMonitor.register(githubSpider); - //If you want to connect it from remote, use spiderMonitor.server().jmxStart(); - //ONLY ONE server can start for a machine. - //Others will be registered without start a server. - //You can also register a server by spiderMonitor.client(host,port).jmxStart(). - spiderMonitor.server().jmxStart(); - oschinaSpider.start(); - githubSpider.start(); + SpiderMonitor.instance().register(oschinaSpider); + SpiderMonitor.instance().register(githubSpider); } } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java index 10ca508..a870c1d 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java @@ -6,17 +6,9 @@ import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.SpiderListener; import us.codecraft.webmagic.utils.Experimental; -import us.codecraft.webmagic.utils.IPUtils; import javax.management.*; -import javax.management.remote.JMXConnectorServer; -import javax.management.remote.JMXConnectorServerFactory; -import javax.management.remote.JMXServiceURL; -import java.io.IOException; import java.lang.management.ManagementFactory; -import java.rmi.registry.LocateRegistry; -import java.rmi.registry.Registry; -import java.rmi.server.ExportException; import java.util.ArrayList; import java.util.Collections; import java.util.List; @@ -30,41 +22,21 @@ import java.util.concurrent.atomic.AtomicInteger; @Experimental public class SpiderMonitor { - private enum Type { - Server, Client, Local; - } - private static SpiderMonitor INSTANCE = new SpiderMonitor(); private AtomicBoolean started = new AtomicBoolean(false); 
private Logger logger = LoggerFactory.getLogger(getClass()); - private static final int DEFAULT_SERVER_PORT = 14721; - - private static final String DEFAULT_SERVER_HOST = "localhost"; - - private int serverPort; - private MBeanServer mbeanServer; private String jmxServerName; - private String serverHost; - - private Type type = Type.Local; - private List spiderStatuses = new ArrayList(); - public List getSpiders() { - return spiderStatuses; - } - - public SpiderStatusMXBean getSpider() { - return spiderStatuses.get(0); - } - - private SpiderMonitor() { + protected SpiderMonitor() { + jmxServerName = "WebMagic"; + mbeanServer = ManagementFactory.getPlatformMBeanServer(); } /** @@ -84,9 +56,7 @@ public class SpiderMonitor { spider.getSpiderListeners().add(monitorSpiderListener); } SpiderStatusMXBean spiderStatusMBean = getSpiderStatusMBean(spider, monitorSpiderListener); - if (started.get()) { - registerMBean(spiderStatusMBean); - } + registerMBean(spiderStatusMBean); spiderStatuses.add(spiderStatusMBean); } return this; @@ -132,109 +102,8 @@ public class SpiderMonitor { } } - - /** - * Start monitor as server mode. - * - * @param port - * @return - * @throws IOException - * @throws JMException - */ - public synchronized SpiderMonitor server(int port) throws IOException, JMException { - try { - Registry registry = LocateRegistry.createRegistry(port); - } catch (ExportException e) { - logger.warn("Start server fail, maybe the address is in using.", e); - } - serverPort = port; - serverHost = "localhost"; - type = Type.Server; - return this; - } - - /** - * Start monitor as server mode. - * - * @return - * @throws IOException - * @throws JMException - */ - public SpiderMonitor server() throws IOException, JMException { - return server(DEFAULT_SERVER_PORT); - } - - /** - * Local mode: the monitor will be bound to the JVM instance.

- * Use jconsole to check your application. - * - * @return - */ - public synchronized SpiderMonitor local() { - this.type = Type.Local; - return this; - } - - - /** - * Start monitor as client mode. - * - * @param serverHost - * @param serverPort - * @return - * @throws IOException - * @throws JMException - */ - public synchronized SpiderMonitor client(String serverHost, int serverPort) throws IOException, JMException { - type = Type.Client; - this.serverHost = serverHost; - this.serverPort = serverPort; - return this; - } - - /** - * Start monitor as client mode. - * - * @return - * @throws IOException - * @throws JMException - */ - public SpiderMonitor client() throws IOException, JMException { - return client(DEFAULT_SERVER_HOST, DEFAULT_SERVER_PORT); - } - - public synchronized SpiderMonitor jmxStart() throws IOException, JMException { - if (!started.compareAndSet(false, true)) { - logger.error("Monitor has already started!"); - return this; - } - jmxServerName = "WebMagic-" + IPUtils.getFirstNoLoopbackIPAddresses(); - - // start JNDI - mbeanServer = ManagementFactory.getPlatformMBeanServer(); - - ObjectName objName; - - if (type != Type.Local) { - JMXServiceURL url = new JMXServiceURL("service:jmx:rmi:///jndi/rmi://" + serverHost + ":" + serverPort + "/" + jmxServerName); - System.out.println("JMXServiceURL: " + url.toString()); - System.out.println("Please replace localhost of your ip if you want to connect it in remote server."); - JMXConnectorServer jmxConnServer = JMXConnectorServerFactory.newJMXConnectorServer(url, null, mbeanServer); - jmxConnServer.start(); - objName = new ObjectName(jmxServerName + ":name=WebMagicMonitor"); - mbeanServer.registerMBean(jmxConnServer, objName); - } - - for (SpiderStatusMXBean spiderStatus : spiderStatuses) { - registerMBean(spiderStatus); - } - - return this; - } - protected void registerMBean(SpiderStatusMXBean spiderStatus) throws MalformedObjectNameException, InstanceAlreadyExistsException, 
MBeanRegistrationException, NotCompliantMBeanException { - ObjectName objName; - objName = new ObjectName(jmxServerName + ":name=" + spiderStatus.getName()); + ObjectName objName = new ObjectName(jmxServerName + ":name=" + spiderStatus.getName()); mbeanServer.registerMBean(spiderStatus, objName); } diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/monitor/SpiderMonitorTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/monitor/SpiderMonitorTest.java index d1065f9..3baa0d6 100644 --- a/webmagic-extension/src/test/java/us/codecraft/webmagic/monitor/SpiderMonitorTest.java +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/monitor/SpiderMonitorTest.java @@ -26,7 +26,6 @@ public class SpiderMonitorTest { .addUrl("https://github.com/code4craft"); spiderMonitor.register(oschinaSpider, githubSpider); - spiderMonitor.jmxStart(); } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Kr36NewsModel.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Kr36NewsModel.java index 6208f68..a1ef3fd 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Kr36NewsModel.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Kr36NewsModel.java @@ -40,7 +40,7 @@ public class Kr36NewsModel { }, Kr36NewsModel.class).thread(20); thread.start(); SpiderMonitor spiderMonitor = SpiderMonitor.instance(); - spiderMonitor.server().register(thread).jmxStart(); + spiderMonitor.register(thread); } public String getTitle() { From b0fb1c3e10c6c66c2a21ae96033cd458a7329a10 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sun, 27 Apr 2014 08:22:33 +0800 Subject: [PATCH 123/130] remove copy-dependcies plugin for m2e error --- .gitignore | 3 +- pom.xml | 40 +++++++++---------- .../webmagic/example/MonitorExample.java | 3 +- webmagic-samples/pom.xml | 19 +++++++++ webmagic-scripts/pom.xml | 19 --------- 5 files changed, 43 insertions(+), 41 deletions(-) diff --git 
a/.gitignore b/.gitignore index c0dc326..d7d63fe 100644 --- a/.gitignore +++ b/.gitignore @@ -2,4 +2,5 @@ target *.iml out/ .idea - +.classpath +.project diff --git a/pom.xml b/pom.xml index b277b38..e867743 100644 --- a/pom.xml +++ b/pom.xml @@ -154,26 +154,26 @@ UTF-8
- - org.apache.maven.plugins - maven-dependency-plugin - 2.8 - - - copy-dependencies - package - - copy-dependencies - - - ${project.build.directory}/lib - false - false - true - - - - + + + + + + + + + + + + + + + + + + + + org.apache.maven.plugins maven-resources-plugin diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/MonitorExample.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/MonitorExample.java index 55fa0c8..734f042 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/MonitorExample.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/MonitorExample.java @@ -20,6 +20,7 @@ public class MonitorExample { SpiderMonitor.instance().register(oschinaSpider); SpiderMonitor.instance().register(githubSpider); - + oschinaSpider.start(); + githubSpider.start(); } } diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index 3868dda..c4487b0 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -34,6 +34,25 @@ true + + org.apache.maven.plugins + maven-dependency-plugin + + + copy-dependencies + package + + copy-dependencies + + + ${project.build.directory}/lib + false + false + true + + + + org.apache.maven.plugins maven-jar-plugin diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index 41c79ea..4838638 100755 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -44,25 +44,6 @@ - - org.apache.maven.plugins - maven-dependency-plugin - - - copy-dependencies - package - - copy-dependencies - - - ${project.build.directory}/lib - false - false - true - - - - maven-compiler-plugin From 110412297925415549cca50cfc0be78c30360171 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sun, 27 Apr 2014 09:30:01 +0800 Subject: [PATCH 124/130] more abstraction in scheduler --- .../scheduler/DuplicatedRemoveScheduler.java | 45 ++++++++++++++++++ .../LocalDuplicatedRemoveScheduler.java | 34 ++++++++++++++ 
.../LocalDuplicatedRemovedScheduler.java | 47 ------------------- .../webmagic/scheduler/PriorityScheduler.java | 2 +- .../webmagic/scheduler/QueueScheduler.java | 2 +- .../scheduler/FileCacheQueueScheduler.java | 2 +- .../webmagic/scheduler/RedisScheduler.java | 44 ++++++++++++----- 7 files changed, 113 insertions(+), 63 deletions(-) create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/DuplicatedRemoveScheduler.java create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/LocalDuplicatedRemoveScheduler.java delete mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/LocalDuplicatedRemovedScheduler.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/DuplicatedRemoveScheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/DuplicatedRemoveScheduler.java new file mode 100644 index 0000000..7b319b6 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/DuplicatedRemoveScheduler.java @@ -0,0 +1,45 @@ +package us.codecraft.webmagic.scheduler; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Task; + +/** + * Remove duplicate urls and only push urls which are not duplicate.

+ * + * @author code4crafer@gmail.com + * @since 0.5.0 + */ +public abstract class DuplicatedRemoveScheduler implements Scheduler { + + protected Logger logger = LoggerFactory.getLogger(getClass()); + + @Override + public void push(Request request, Task task) { + logger.trace("get a candidate url {}", request.getUrl()); + if (isDuplicate(request, task) || shouldReserved(request)) { + logger.debug("push to queue {}", request.getUrl()); + pushWhenNoDuplicate(request, task); + } + } + + /** + * Reset duplicate check. + */ + public abstract void resetDuplicateCheck(Task task); + + /** + * @param request + * @return + */ + protected abstract boolean isDuplicate(Request request, Task task); + + protected boolean shouldReserved(Request request) { + return request.getExtra(Request.CYCLE_TRIED_TIMES) != null; + } + + protected void pushWhenNoDuplicate(Request request, Task task) { + + } +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/LocalDuplicatedRemoveScheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/LocalDuplicatedRemoveScheduler.java new file mode 100644 index 0000000..c127c98 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/LocalDuplicatedRemoveScheduler.java @@ -0,0 +1,34 @@ +package us.codecraft.webmagic.scheduler; + +import com.google.common.collect.Sets; +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Task; + +import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; + +/** + * Base Scheduler with duplicated urls removed by hash set.

+ * + * @author code4crafter@gmail.com + * @since 0.5.0 + */ +public abstract class LocalDuplicatedRemoveScheduler extends DuplicatedRemoveScheduler implements MonitorableScheduler { + + private Set urls = Sets.newSetFromMap(new ConcurrentHashMap()); + + @Override + public void resetDuplicateCheck(Task task) { + urls.clear(); + } + + @Override + protected boolean isDuplicate(Request request, Task task) { + return urls.add(request.getUrl()); + } + + @Override + public int getTotalRequestsCount(Task task) { + return urls.size(); + } +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/LocalDuplicatedRemovedScheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/LocalDuplicatedRemovedScheduler.java deleted file mode 100644 index 1ec128b..0000000 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/LocalDuplicatedRemovedScheduler.java +++ /dev/null @@ -1,47 +0,0 @@ -package us.codecraft.webmagic.scheduler; - -import com.google.common.collect.Sets; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import us.codecraft.webmagic.Request; -import us.codecraft.webmagic.Task; - -import java.util.Set; -import java.util.concurrent.ConcurrentHashMap; - -/** - * Base Scheduler with duplicated urls removed locally. 
- * - * @author code4crafter@gmail.com - * @since 0.5.0 - */ -public abstract class LocalDuplicatedRemovedScheduler implements MonitorableScheduler { - - protected Logger logger = LoggerFactory.getLogger(getClass()); - - private Set urls = Sets.newSetFromMap(new ConcurrentHashMap()); - - @Override - public void push(Request request, Task task) { - logger.trace("get a candidate url {}", request.getUrl()); - if (isDuplicate(request) || shouldReserved(request)) { - logger.debug("push to queue {}", request.getUrl()); - pushWhenNoDuplicate(request, task); - } - } - - protected boolean isDuplicate(Request request) { - return urls.add(request.getUrl()); - } - - protected boolean shouldReserved(Request request) { - return request.getExtra(Request.CYCLE_TRIED_TIMES) != null; - } - - @Override - public int getTotalRequestsCount(Task task) { - return urls.size(); - } - - protected abstract void pushWhenNoDuplicate(Request request, Task task); -} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/PriorityScheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/PriorityScheduler.java index a57a6fb..38c9b6c 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/PriorityScheduler.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/PriorityScheduler.java @@ -17,7 +17,7 @@ import java.util.concurrent.PriorityBlockingQueue; * @since 0.2.1 */ @ThreadSafe -public class PriorityScheduler extends LocalDuplicatedRemovedScheduler { +public class PriorityScheduler extends LocalDuplicatedRemoveScheduler { public static final int INITIAL_CAPACITY = 5; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java index e2a6e75..511d8a0 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java +++ 
b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java @@ -16,7 +16,7 @@ import java.util.concurrent.LinkedBlockingQueue; * @since 0.1.0 */ @ThreadSafe -public class QueueScheduler extends LocalDuplicatedRemovedScheduler { +public class QueueScheduler extends LocalDuplicatedRemoveScheduler { private BlockingQueue queue = new LinkedBlockingQueue(); diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java index 9d7668d..4215ab8 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java @@ -23,7 +23,7 @@ import java.util.concurrent.atomic.AtomicInteger; * @author code4crafter@gmail.com
* @since 0.2.0 */ -public class FileCacheQueueScheduler extends LocalDuplicatedRemovedScheduler { +public class FileCacheQueueScheduler extends LocalDuplicatedRemoveScheduler { private Logger logger = LoggerFactory.getLogger(getClass()); diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java index cd3a0b6..dc2ee2e 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java @@ -14,7 +14,7 @@ import us.codecraft.webmagic.Task; * @author code4crafter@gmail.com
* @since 0.2.0 */ -public class RedisScheduler implements MonitorableScheduler { +public class RedisScheduler extends DuplicatedRemoveScheduler implements MonitorableScheduler { private JedisPool pool; @@ -33,21 +33,39 @@ public class RedisScheduler implements MonitorableScheduler { } @Override - public synchronized void push(Request request, Task task) { + public void resetDuplicateCheck(Task task) { Jedis jedis = pool.getResource(); try { - // if cycleRetriedTimes is set, allow duplicated. - Object cycleRetriedTimes = request.getExtra(Request.CYCLE_TRIED_TIMES); - // use set to remove duplicate url - if (cycleRetriedTimes != null || !jedis.sismember(getSetKey(task), request.getUrl())) { - // use list to store queue - jedis.rpush(getQueueKey(task), request.getUrl()); + jedis.del(getSetKey(task)); + } finally { + pool.returnResource(jedis); + } + } + + @Override + protected boolean isDuplicate(Request request, Task task) { + Jedis jedis = pool.getResource(); + try { + boolean isDuplicate = !jedis.sismember(getSetKey(task), request.getUrl()); + if (!isDuplicate) { jedis.sadd(getSetKey(task), request.getUrl()); - if (request.getExtras() != null) { - String field = DigestUtils.shaHex(request.getUrl()); - String value = JSON.toJSONString(request); - jedis.hset((ITEM_PREFIX + task.getUUID()), field, value); - } + } + return isDuplicate; + } finally { + pool.returnResource(jedis); + } + + } + + @Override + protected void pushWhenNoDuplicate(Request request, Task task) { + Jedis jedis = pool.getResource(); + try { + jedis.rpush(getQueueKey(task), request.getUrl()); + if (request.getExtras() != null) { + String field = DigestUtils.shaHex(request.getUrl()); + String value = JSON.toJSONString(request); + jedis.hset((ITEM_PREFIX + task.getUUID()), field, value); } } finally { pool.returnResource(jedis); From dc3c1757729d64a1da91bf095adb3bbd2f7cdf0d Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sun, 27 Apr 2014 10:50:35 +0800 Subject: [PATCH 125/130] docs --- README.md | 
2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 62276eb..df25293 100644 --- a/README.md +++ b/README.md @@ -110,6 +110,8 @@ public class GithubRepo { ### Docs and samples: +Documents: [http://webmagic.io/docs/](http://webmagic.io/docs/) + The architecture of webmagic (refered to [Scrapy](http://scrapy.org/)) ![image](http://code4craft.github.io/images/posts/webmagic.png) From 7ff83bb11a90e64fdc754c02001cd548b2fc9e63 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sun, 27 Apr 2014 12:52:12 +0800 Subject: [PATCH 126/130] [maven-release-plugin] prepare release WebMagic-0.5.0 --- pom.xml | 4 ++-- webmagic-avalon/forger/pom.xml | 4 ++-- webmagic-avalon/pom.xml | 7 +++---- webmagic-avalon/webmagic-admin/pom.xml | 2 +- webmagic-avalon/webmagic-avalon-common/pom.xml | 4 ++-- webmagic-avalon/webmagic-worker/pom.xml | 2 +- webmagic-core/pom.xml | 2 +- webmagic-extension/pom.xml | 2 +- webmagic-samples/pom.xml | 2 +- webmagic-saxon/pom.xml | 2 +- webmagic-scripts/pom.xml | 2 +- webmagic-selenium/pom.xml | 2 +- 12 files changed, 17 insertions(+), 18 deletions(-) diff --git a/pom.xml b/pom.xml index e867743..5481c43 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ 7 us.codecraft - 0.5.0-SNAPSHOT + 0.5.0 4.0.0 pom @@ -38,7 +38,7 @@ scm:git:git@github.com:code4craft/webmagic.git scm:git:git@github.com:code4craft/webmagic.git git@github.com:code4craft/webmagic.git - HEAD + WebMagic-0.5.0 diff --git a/webmagic-avalon/forger/pom.xml b/webmagic-avalon/forger/pom.xml index 9738c10..f7115d4 100644 --- a/webmagic-avalon/forger/pom.xml +++ b/webmagic-avalon/forger/pom.xml @@ -7,7 +7,7 @@ us.codecraft forger - 0.1.0-SNAPSHOT + 0.1.0 4.0.0 jar @@ -30,7 +30,7 @@ scm:git:git@github.com:code4craft/forger.git scm:git:git@github.com:code4craft/forger.git git@github.com:code4craft/forger.git - HEAD + WebMagic-0.5.0 diff --git a/webmagic-avalon/pom.xml b/webmagic-avalon/pom.xml index e74af11..962d6be 100644 --- a/webmagic-avalon/pom.xml +++ 
b/webmagic-avalon/pom.xml @@ -1,10 +1,9 @@ - + webmagic-parent us.codecraft - 0.5.0-SNAPSHOT + 0.5.0 4.0.0 @@ -43,7 +42,7 @@ us.codecraft forger - 0.1.0-SNAPSHOT + 0.1.0 diff --git a/webmagic-avalon/webmagic-admin/pom.xml b/webmagic-avalon/webmagic-admin/pom.xml index b3a05b8..020ca8a 100644 --- a/webmagic-avalon/webmagic-admin/pom.xml +++ b/webmagic-avalon/webmagic-admin/pom.xml @@ -3,7 +3,7 @@ webmagic-avalon us.codecraft - 0.5.0-SNAPSHOT + 0.5.0 4.0.0 diff --git a/webmagic-avalon/webmagic-avalon-common/pom.xml b/webmagic-avalon/webmagic-avalon-common/pom.xml index ed0bc23..32eb8b4 100644 --- a/webmagic-avalon/webmagic-avalon-common/pom.xml +++ b/webmagic-avalon/webmagic-avalon-common/pom.xml @@ -3,7 +3,7 @@ webmagic-avalon us.codecraft - 0.5.0-SNAPSHOT + 0.5.0 4.0.0 @@ -26,7 +26,7 @@ us.codecraft forger - 0.1.0-SNAPSHOT + 0.1.0 diff --git a/webmagic-avalon/webmagic-worker/pom.xml b/webmagic-avalon/webmagic-worker/pom.xml index 84e1d73..f085c82 100644 --- a/webmagic-avalon/webmagic-worker/pom.xml +++ b/webmagic-avalon/webmagic-worker/pom.xml @@ -3,7 +3,7 @@ webmagic-avalon us.codecraft - 0.5.0-SNAPSHOT + 0.5.0 4.0.0 diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 0795a99..eb47d59 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.5.0-SNAPSHOT + 0.5.0 4.0.0 diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index f5a4019..c6629a4 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.5.0-SNAPSHOT + 0.5.0 4.0.0 diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index c4487b0..4769c21 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.5.0-SNAPSHOT + 0.5.0 4.0.0 diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index f63c21f..a0aa2cb 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -3,7 +3,7 @@ 
webmagic-parent us.codecraft - 0.5.0-SNAPSHOT + 0.5.0 4.0.0 diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index 4838638..943165e 100755 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.5.0-SNAPSHOT + 0.5.0 4.0.0 diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index 01577ce..56d55d3 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.5.0-SNAPSHOT + 0.5.0 4.0.0 From c25b32f1ca6bf35be24d2a794c1481d127ccad37 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sun, 27 Apr 2014 12:52:27 +0800 Subject: [PATCH 127/130] [maven-release-plugin] prepare for next development iteration --- pom.xml | 4 ++-- webmagic-avalon/forger/pom.xml | 4 ++-- webmagic-avalon/pom.xml | 4 ++-- webmagic-avalon/webmagic-admin/pom.xml | 2 +- webmagic-avalon/webmagic-avalon-common/pom.xml | 4 ++-- webmagic-avalon/webmagic-worker/pom.xml | 2 +- webmagic-core/pom.xml | 2 +- webmagic-extension/pom.xml | 2 +- webmagic-samples/pom.xml | 2 +- webmagic-saxon/pom.xml | 2 +- webmagic-scripts/pom.xml | 2 +- webmagic-selenium/pom.xml | 2 +- 12 files changed, 16 insertions(+), 16 deletions(-) diff --git a/pom.xml b/pom.xml index 5481c43..2fea5b4 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ 7 us.codecraft - 0.5.0 + 0.5.1-SNAPSHOT 4.0.0 pom @@ -38,7 +38,7 @@ scm:git:git@github.com:code4craft/webmagic.git scm:git:git@github.com:code4craft/webmagic.git git@github.com:code4craft/webmagic.git - WebMagic-0.5.0 + HEAD diff --git a/webmagic-avalon/forger/pom.xml b/webmagic-avalon/forger/pom.xml index f7115d4..44b42f9 100644 --- a/webmagic-avalon/forger/pom.xml +++ b/webmagic-avalon/forger/pom.xml @@ -7,7 +7,7 @@ us.codecraft forger - 0.1.0 + 0.1.1-SNAPSHOT 4.0.0 jar @@ -30,7 +30,7 @@ scm:git:git@github.com:code4craft/forger.git scm:git:git@github.com:code4craft/forger.git git@github.com:code4craft/forger.git - WebMagic-0.5.0 + HEAD diff --git 
a/webmagic-avalon/pom.xml b/webmagic-avalon/pom.xml index 962d6be..8232103 100644 --- a/webmagic-avalon/pom.xml +++ b/webmagic-avalon/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.5.0 + 0.5.1-SNAPSHOT 4.0.0 @@ -42,7 +42,7 @@ us.codecraft forger - 0.1.0 + 0.1.1-SNAPSHOT diff --git a/webmagic-avalon/webmagic-admin/pom.xml b/webmagic-avalon/webmagic-admin/pom.xml index 020ca8a..ed364c1 100644 --- a/webmagic-avalon/webmagic-admin/pom.xml +++ b/webmagic-avalon/webmagic-admin/pom.xml @@ -3,7 +3,7 @@ webmagic-avalon us.codecraft - 0.5.0 + 0.5.1-SNAPSHOT 4.0.0 diff --git a/webmagic-avalon/webmagic-avalon-common/pom.xml b/webmagic-avalon/webmagic-avalon-common/pom.xml index 32eb8b4..3b06899 100644 --- a/webmagic-avalon/webmagic-avalon-common/pom.xml +++ b/webmagic-avalon/webmagic-avalon-common/pom.xml @@ -3,7 +3,7 @@ webmagic-avalon us.codecraft - 0.5.0 + 0.5.1-SNAPSHOT 4.0.0 @@ -26,7 +26,7 @@ us.codecraft forger - 0.1.0 + 0.1.1-SNAPSHOT diff --git a/webmagic-avalon/webmagic-worker/pom.xml b/webmagic-avalon/webmagic-worker/pom.xml index f085c82..ebc5174 100644 --- a/webmagic-avalon/webmagic-worker/pom.xml +++ b/webmagic-avalon/webmagic-worker/pom.xml @@ -3,7 +3,7 @@ webmagic-avalon us.codecraft - 0.5.0 + 0.5.1-SNAPSHOT 4.0.0 diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index eb47d59..407f266 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.5.0 + 0.5.1-SNAPSHOT 4.0.0 diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index c6629a4..3c97d4b 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.5.0 + 0.5.1-SNAPSHOT 4.0.0 diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index 4769c21..ee71ce8 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.5.0 + 0.5.1-SNAPSHOT 4.0.0 diff --git a/webmagic-saxon/pom.xml 
b/webmagic-saxon/pom.xml index a0aa2cb..8ebc8af 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.5.0 + 0.5.1-SNAPSHOT 4.0.0 diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index 943165e..0272c57 100755 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.5.0 + 0.5.1-SNAPSHOT 4.0.0 diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index 56d55d3..36a3fc0 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.5.0 + 0.5.1-SNAPSHOT 4.0.0 From 028f5e8755aa7de767d52baf69f8c1c1bc64619e Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sun, 27 Apr 2014 14:52:44 +0800 Subject: [PATCH 128/130] readme --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index df25293..7f3c394 100644 --- a/README.md +++ b/README.md @@ -128,6 +128,7 @@ Lisenced under [Apache 2.0 lisence](http://opensource.org/licenses/Apache-2.0) Thanks these people for commiting source code, reporting bugs or suggesting for new feature: +* [ccliangbo](https://github.com/ccliangbo) * [yuany](https://github.com/yuany) * [yxssfxwzy](https://github.com/yxssfxwzy) * [linkerlin](https://github.com/linkerlin) @@ -141,8 +142,9 @@ Thanks these people for commiting source code, reporting bugs or suggesting for * [ywooer](https://github.com/ywooer) * [yyw258520](https://github.com/yyw258520) * [perfecking](https://github.com/perfecking) -* [ccliangbo](https://github.com/ccliangbo) * [lidongyang](http://my.oschina.net/lidongyang) +* [seveniu](https://github.com/seveniu) +* [sebastian1118](https://github.com/sebastian1118) ### Thanks: From c892eadb5610121e9756935fe78bf8badb863f41 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sun, 27 Apr 2014 14:54:39 +0800 Subject: [PATCH 129/130] contributor --- README.md | 2 +- zh_docs/README.md | 6 ++++-- 2 files 
changed, 5 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 7f3c394..cebaecd 100644 --- a/README.md +++ b/README.md @@ -169,7 +169,7 @@ To write webmagic, I refered to the projects below : [http://list.qq.com/cgi-bin/qf_invite?id=023a01f505246785f77c5a5a9aff4e57ab20fcdde871e988](http://list.qq.com/cgi-bin/qf_invite?id=023a01f505246785f77c5a5a9aff4e57ab20fcdde871e988) -QQ Group: 330192938 +QQ Group: 373225642 [![Bitdeli Badge](https://d2weczhvl823v0.cloudfront.net/code4craft/webmagic/trend.png)](https://bitdeli.com/free "Bitdeli Badge") diff --git a/zh_docs/README.md b/zh_docs/README.md index 8d40752..b336367 100644 --- a/zh_docs/README.md +++ b/zh_docs/README.md @@ -175,6 +175,7 @@ webmagic遵循[Apache 2.0协议](http://opensource.org/licenses/Apache-2.0) 以下是为WebMagic提交过代码或者issue的朋友: +* [ccliangbo](https://github.com/ccliangbo) * [yuany](https://github.com/yuany) * [yxssfxwzy](https://github.com/yxssfxwzy) * [linkerlin](https://github.com/linkerlin) @@ -188,8 +189,9 @@ webmagic遵循[Apache 2.0协议](http://opensource.org/licenses/Apache-2.0) * [ywooer](https://github.com/ywooer) * [yyw258520](https://github.com/yyw258520) * [perfecking](https://github.com/perfecking) -* [ccliangbo](https://github.com/ccliangbo) * [lidongyang](http://my.oschina.net/lidongyang) +* [seveniu](https://github.com/seveniu) +* [sebastian1118](https://github.com/sebastian1118) ### 邮件组: @@ -201,4 +203,4 @@ QQ: ### QQ群: -330192938 +373225642 From 42a2676e8c8a17039b796ae5b696d7b1bf56465b Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sun, 27 Apr 2014 14:56:21 +0800 Subject: [PATCH 130/130] update version --- pom.xml | 2 +- webmagic-avalon/pom.xml | 2 +- webmagic-avalon/webmagic-admin/pom.xml | 2 +- webmagic-avalon/webmagic-avalon-common/pom.xml | 2 +- webmagic-avalon/webmagic-worker/pom.xml | 2 +- webmagic-core/pom.xml | 2 +- webmagic-extension/pom.xml | 2 +- webmagic-samples/pom.xml | 2 +- webmagic-saxon/pom.xml | 2 +- webmagic-scripts/pom.xml | 2 +- webmagic-selenium/pom.xml | 2 
+- 11 files changed, 11 insertions(+), 11 deletions(-) diff --git a/pom.xml b/pom.xml index 2fea5b4..9bfc505 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ 7 us.codecraft - 0.5.1-SNAPSHOT + 0.5.0 4.0.0 pom diff --git a/webmagic-avalon/pom.xml b/webmagic-avalon/pom.xml index 8232103..39a5ac5 100644 --- a/webmagic-avalon/pom.xml +++ b/webmagic-avalon/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.5.1-SNAPSHOT + 0.5.0 4.0.0 diff --git a/webmagic-avalon/webmagic-admin/pom.xml b/webmagic-avalon/webmagic-admin/pom.xml index ed364c1..020ca8a 100644 --- a/webmagic-avalon/webmagic-admin/pom.xml +++ b/webmagic-avalon/webmagic-admin/pom.xml @@ -3,7 +3,7 @@ webmagic-avalon us.codecraft - 0.5.1-SNAPSHOT + 0.5.0 4.0.0 diff --git a/webmagic-avalon/webmagic-avalon-common/pom.xml b/webmagic-avalon/webmagic-avalon-common/pom.xml index 3b06899..0125b19 100644 --- a/webmagic-avalon/webmagic-avalon-common/pom.xml +++ b/webmagic-avalon/webmagic-avalon-common/pom.xml @@ -3,7 +3,7 @@ webmagic-avalon us.codecraft - 0.5.1-SNAPSHOT + 0.5.0 4.0.0 diff --git a/webmagic-avalon/webmagic-worker/pom.xml b/webmagic-avalon/webmagic-worker/pom.xml index ebc5174..f085c82 100644 --- a/webmagic-avalon/webmagic-worker/pom.xml +++ b/webmagic-avalon/webmagic-worker/pom.xml @@ -3,7 +3,7 @@ webmagic-avalon us.codecraft - 0.5.1-SNAPSHOT + 0.5.0 4.0.0 diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 407f266..eb47d59 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.5.1-SNAPSHOT + 0.5.0 4.0.0 diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index 3c97d4b..c6629a4 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.5.1-SNAPSHOT + 0.5.0 4.0.0 diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index ee71ce8..4769c21 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -3,7 +3,7 @@ webmagic-parent 
us.codecraft - 0.5.1-SNAPSHOT + 0.5.0 4.0.0 diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index 8ebc8af..a0aa2cb 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.5.1-SNAPSHOT + 0.5.0 4.0.0 diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index 0272c57..943165e 100755 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.5.1-SNAPSHOT + 0.5.0 4.0.0 diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index 36a3fc0..56d55d3 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.5.1-SNAPSHOT + 0.5.0 4.0.0