From eae37c868b51711eb9963b03e0a516916ca74b66 Mon Sep 17 00:00:00 2001
From: "yihua.huang"
Date: Tue, 10 Jun 2014 17:38:54 +0800
Subject: [PATCH] new sample

---
 .../java/us/codecraft/webmagic/Spider.java    |  2 +-
 .../AlexanderMcqueenGoodsProcessor.java       | 88 +++++++++++++++++++
 2 files changed, 89 insertions(+), 1 deletion(-)
 create mode 100644 webmagic-samples/src/main/java/us/codecraft/webmagic/samples/AlexanderMcqueenGoodsProcessor.java

diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
index 07aad87..2e00743 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
@@ -325,7 +325,7 @@ public class Spider implements Runnable, Task {
                             onError(requestFinal);
                             logger.error("process request " + requestFinal + " error", e);
                         } finally {
-                            if (site.getHttpProxyPool().isEnable()) {
+                            if (site.getHttpProxyPool() != null && site.getHttpProxyPool().isEnable()) {
                                 site.returnHttpProxyToPool((HttpHost) requestFinal.getExtra(Request.PROXY), (Integer) requestFinal
                                         .getExtra(Request.STATUS_CODE));
                             }
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/AlexanderMcqueenGoodsProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/AlexanderMcqueenGoodsProcessor.java
new file mode 100644
index 0000000..af9c01e
--- /dev/null
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/AlexanderMcqueenGoodsProcessor.java
@@ -0,0 +1,88 @@
+package us.codecraft.webmagic.samples;
+
+import us.codecraft.webmagic.Page;
+import us.codecraft.webmagic.Site;
+import us.codecraft.webmagic.Spider;
+import us.codecraft.webmagic.processor.PageProcessor;
+import us.codecraft.webmagic.scheduler.PriorityScheduler;
+
+/**
+ * Sample crawler for www.alexandermcqueen.cn that extracts product ("goods") fields.
+ * <p>
+ * Pages whose URL matches {@link #URL_POST} are scraped for product fields; on every other
+ * page only the outgoing links are queued for crawling.
+ *
+ * @author code4crafer@gmail.com
+ */
+public class AlexanderMcqueenGoodsProcessor implements PageProcessor {
+
+    private final Site site = Site.me().setRetryTimes(3).setSleepTime(0);
+
+    /** Regex matching any URL under http://www.alexandermcqueen.cn/. */
+    public static final String URL_LIST = "http://www\\.alexandermcqueen\\.cn/.*";
+
+    /** Regex matching the URLs that {@link #process(Page)} scrapes as product pages. */
+    public static final String URL_POST = "http://www\\.alexandermcqueen\\.cn/cn/\\w+/.*\\.html";
+
+    /**
+     * Extracts product fields from a page matching {@link #URL_POST}; for any other page,
+     * queues its links that match {@link #URL_POST} or {@link #URL_LIST}.
+     *
+     * @param page the page to process
+     */
+    @Override
+    public void process(Page page) {
+        if (page.getUrl().regex(URL_POST).match()) {
+            // Store the extracted text, not the Selectable: xpath() always returns a
+            // Selectable object, so a null check on the stored Selectable could never
+            // fire and pages without a product title were never skipped.
+            page.putField("goodsName", page.getHtml().xpath("//div[@id='description']/h1/tidyText()").toString());
+            if (page.getResultItems().get("goodsName") == null) {
+                page.setSkip(true);
+            }
+            page.putField("currency", page.getHtml().xpath("//div[@id='description']//div[@class='itemBoxPrice']/span//span[@class='currency']/tidyText()"));
+            page.putField("goodsPrice", page.getHtml().xpath("//div[@id='description']//div[@class='itemBoxPrice']/span//span[@class='priceValue']/tidyText()"));
+            page.putField("description", page.getHtml()
+                    .xpath("//div[@id='tabbedDescription']//div[@class='tabbedDescription']//ul[@id='tabs']//li[@id='tab_description']/div[@id='description_pane']/tidyText()"));
+            page.putField("material", page.getHtml()
+                    .xpath("//div[@id='tabbedDescription']" +
+                            "//div[@class='tabbedDescription']" +
+                            "//ul[@id='tabs']" +
+                            "//li[@id='tab_description']" +
+                            "//div[@class='productProperty']" +
+                            "//div[@class='productPropertyRow']/span[2]/tidyText()"));
+            page.putField("goodsCode", page.getHtml()
+                    .xpath("//div[@id='tabbedDescription']" +
+                            "//div[@class='tabbedDescription']" +
+                            "//ul[@id='tabs']" +
+                            "//li[@id='tab_description']" +
+                            "//div[@class='productProperty']" +
+                            "//div[@class='productPropertyRow']//span[@id='modelFabricColorContainer']/tidyText()"));
+            page.putField("goodsSize", page.getHtml()
+                    .xpath("//div[@id='sizesContainer']//div[@id='sizes']//ul[@class='SizeW']"));
+            page.putField("goodsColors", page.getHtml()
+                    .xpath("//div[@id='colors']/ul/html()"));
+        } else {
+            // The second argument is presumably the request priority: 1000 for links matching
+            // URL_POST, 1 for other site links (main() installs a PriorityScheduler).
+            page.addTargetRequests(page.getHtml().links().regex(URL_POST).all(), 1000);
+            page.addTargetRequests(page.getHtml().links().regex(URL_LIST).all(), 1);
+        }
+    }
+
+    /** Returns the {@link Site} settings created for this processor (3 retries, no sleep). */
+    @Override
+    public Site getSite() {
+        return site;
+    }
+
+    /**
+     * Runs the spider with a {@link PriorityScheduler} and 5 threads, seeded with the sitemap URL.
+     *
+     * @param args ignored
+     */
+    public static void main(String[] args) {
+        Spider.create(new AlexanderMcqueenGoodsProcessor()).setScheduler(new PriorityScheduler())
+                .addUrl("http://www.alexandermcqueen.cn/sitemap.asp?tskay=E2F1A848").thread(5).run();
+    }
+}