From 2b34dc9d3fe23861d6cfc117274c3710e64e7230 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Thu, 18 Jul 2013 17:22:26 +0800 Subject: [PATCH] add retry --- .gitignore | 1 + webmagic-core/pom.xml | 2 +- .../main/java/us/codecraft/webmagic/Site.java | 19 +++++++ .../downloader/HttpClientDownloader.java | 28 ++++++++-- webmagic-samples/pom.xml | 32 +++++++++++ .../samples/DianpingIndexProcessor.java | 53 +++++++++++++++++++ .../webmagic/samples/DianpingProcessor.java | 30 ++++++----- .../webmagic/samples/GlobalProcessor.java | 13 +++-- .../webmagic/samples/GuoxueProcessor.java | 20 +++++++ .../processor/DiaoyuwengProcessorTest.java | 2 - 10 files changed, 175 insertions(+), 25 deletions(-) create mode 100644 webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingIndexProcessor.java create mode 100644 webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GuoxueProcessor.java diff --git a/.gitignore b/.gitignore index 0af075f..cd33b61 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ target/* *.iml +out/ diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index c0ef6a1..df482f7 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -12,7 +12,7 @@ org.apache.httpcomponents httpclient - 4.2.1 + 4.2.4 diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java index 676584a..2c6118c 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java @@ -24,6 +24,8 @@ public class Site { private int sleepTime = 3000; + private int retryTimes = 0; + private static final Set DEFAULT_STATUS_CODE_SET = new HashSet(); private Set acceptStatCode = DEFAULT_STATUS_CODE_SET; @@ -183,6 +185,23 @@ public class Site { return sleepTime; } + /** + * 获取重新下载的次数,默认为0 + * @return 重新下载的次数 + */ + public int getRetryTimes() { + return retryTimes; + } + + /** + * 设置获取重新下载的次数,默认为0 + * @return this + */ + public Site setRetryTimes(int retryTimes) { + this.retryTimes = retryTimes; + return this; + } + @Override public boolean equals(Object o) { if (this == o) return true; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index d2c2d62..e4ae0ff 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -16,11 +16,13 @@ import us.codecraft.webmagic.selector.Html; import us.codecraft.webmagic.selector.PlainText; import us.codecraft.webmagic.utils.UrlUtils; +import java.io.IOException; + /** * @author code4crafter@gmail.com
- * Date: 13-4-21 - * Time: 下午12:15 + * Date: 13-4-21 + * Time: 下午12:15 */ public class HttpClientDownloader implements Downloader { @@ -34,11 +36,27 @@ public class HttpClientDownloader implements Downloader { String charset = site.getCharset(); try { HttpGet httpGet = new HttpGet(request.getUrl()); - HttpResponse httpResponse = httpClient.execute(httpGet); + HttpResponse httpResponse = null; + int tried = 0; + boolean retry; + do { + try { + httpResponse = httpClient.execute(httpGet); + retry = false; + } catch (IOException e) { + tried++; + if (tried > site.getRetryTimes()) { + logger.warn("download page " + request.getUrl() + " error", e); + return null; + } + logger.info("download page " + request.getUrl() + " error, retry the "+tried+" time!"); + retry = true; + } + } while (retry); int statusCode = httpResponse.getStatusLine().getStatusCode(); if (site.getAcceptStatCode().contains(statusCode)) { //charset - if (charset == null){ + if (charset == null) { String value = httpResponse.getEntity().getContentType().getValue(); charset = new PlainText(value).regex("charset=([^\\s]+)").toString(); } @@ -52,7 +70,7 @@ public class HttpClientDownloader implements Downloader { page.setRequest(request); return page; } else { - logger.warn("code error " + statusCode); + logger.warn("code error " + statusCode + "\t" + request.getUrl()); } } catch (Exception e) { logger.warn("download page " + request.getUrl() + " error", e); diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index 4e345a2..f1f6806 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -39,6 +39,25 @@ 1.6 + + org.apache.maven.plugins + maven-dependency-plugin + + + copy-dependencies + package + + copy-dependencies + + + ${project.build.directory}/lib + false + false + true + + + + org.apache.maven.plugins maven-resources-plugin @@ -70,6 +89,19 @@ + + org.apache.maven.plugins + maven-jar-plugin + + + + true + ./lib/ + us.codecraft.webmagic.samples.DianpingIndexProcessor + + + + org.apache.maven.plugins maven-release-plugin diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingIndexProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingIndexProcessor.java new file mode 100644 index 0000000..1f5da51 --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingIndexProcessor.java @@ -0,0 +1,53 @@ +package us.codecraft.webmagic.samples; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.processor.PageProcessor; + +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * @author code4crafter@gmail.com
+ * Date: 13-4-21 Time: 下午8:08 + */ +public class DianpingIndexProcessor implements PageProcessor { + @Override + public void process(Page page) { + if (page.getUrl().toString().equals("http://www.dianping.com/citylist")) { + page.addTargetRequests(page.getHtml().links().regex("http://www\\.dianping\\.com/\\w+$").toStrings()); + return; + } + Pattern p = Pattern.compile("http://www\\.dianping\\.com/\\w+"); + Matcher matcher = p.matcher(page.getUrl().toString()); + if (matcher.matches()) { + page.addTargetRequests(page.getHtml().xpath("//li[@class='term-list-item']//a/@href").regex("http://www\\.dianping\\.com/search/.*").toStrings()); + } else { + p = Pattern.compile("http://www\\.dianping\\.com/search/.*"); + matcher = p.matcher(page.getUrl().toString()); + if (matcher.matches()) { + String result = page.getHtml().regex("您要查看的内容不存在").toString(); + if (result != null) { + System.err.println("No!Url not exist!" + page.getUrl()); + } + } + } + } + + @Override + public Site getSite() { + return Site.me().setDomain("www.dianping.com").addStartUrl("http://www.dianping.com/citylist") + .setSleepTime(0).setUserAgent("I'm a performance tester created by yihua.huang"); + } + + public static void main(String[] args) { + int sleepTime = 0; + if (args.length > 0) { + sleepTime = Integer.parseInt(args[0]); + } + DianpingIndexProcessor dianpingProcessor = new DianpingIndexProcessor(); + dianpingProcessor.getSite().setSleepTime(sleepTime); + Spider.create(dianpingProcessor).thread(10).run(); + } +} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingProcessor.java index 33ac3d7..056da0a 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingProcessor.java @@ -1,7 +1,7 @@ package us.codecraft.webmagic.samples; -import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; @@ -9,30 +9,36 @@ import java.util.List; /** * @author code4crafter@gmail.com
- * Date: 13-4-21 - * Time: 下午8:08 + * Date: 13-4-21 + * Time: 下午8:08 */ public class DianpingProcessor implements PageProcessor { + + private Site site; + @Override public void process(Page page) { - List requests = page.getHtml().links().regex(".*shop.*").toStrings(); + List requests = page.getHtml().links().regex("http://info-search-web121361\\.alpha\\.dp:8080/search/.*").toStrings(); page.addTargetRequests(requests); - requests = page.getHtml().regex(".*search/category/.*").toStrings(); - page.addTargetRequests(requests); - if (page.getUrl().toString().contains("shop")) { - page.putField("title", page.getHtml().xpath("//h1[@class='shop-title']")); - page.putField("content", page.getHtml().smartContent()); - } } @Override public Site getSite() { - return Site.me().setDomain("www.dianping.com").addStartUrl("http://www.dianping.com/"). - setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); + if (site == null) { + site = Site.me().setDomain("info-search-web361.alpha.dp:8080").addStartUrl("http://info11-search-web361.alpha.dp:8080/search/category/1/0"). + setSleepTime(100). + setUserAgent("I'm a performance tester created by yihua.huang"); + } + return site; } public static void main(String[] args) { + int sleepTime = 0; + if (args.length > 0) { + sleepTime = Integer.parseInt(args[0]); + } DianpingProcessor dianpingProcessor = new DianpingProcessor(); + dianpingProcessor.getSite().setSleepTime(sleepTime).setRetryTimes(10); Spider.create(dianpingProcessor).run(); } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GlobalProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GlobalProcessor.java index 07f0101..383422f 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GlobalProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GlobalProcessor.java @@ -22,7 +22,7 @@ public class GlobalProcessor implements PageProcessor { @Override public void process(Page page) { - final List requests = page.getHtml().links().regex(".*book\\.douban\\.com.*").toStrings(); + final List requests = page.getHtml().links().toStrings(); page.addTargetRequests(requests); } @@ -30,16 +30,19 @@ public class GlobalProcessor implements PageProcessor { @Override public Site getSite() { if (site==null){ - site = Site.me().setDomain("douban.com").addStartUrl("http://book.douban.com/").setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); + site = Site.me().setDomain("www.2345.com") + .addStartUrl("http://www.2345.com/").addStartUrl("http://hao.360.cn/") + .addStartUrl("http://www.baidu.com/s?wd=%E7%BD%91%E7%AB%99%E5%AF%BC%E8%88%AA&rsv_spt=1&issp=1&rsv_bp=0&ie=utf-8&tn=80039098_oem_dg&rsv_n=2&rsv_sug3=6&rsv_sug4=698&rsv_sug=0&rsv_sug1=3") + .setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); } return site; } public static void main(String[] args) { Spider.create(new GlobalProcessor()).thread(10) - .scheduler(new FileCacheQueueScheduler("/data/webmagic/github")) - .downloader(new FileDownloader("/data/webmagic/douban", new HttpClientDownloader())) - .pipeline(new FilePipeline("/data/webmagic/douban")) + .scheduler(new FileCacheQueueScheduler("/data/webmagic/test")) + .downloader(new FileDownloader("/data/webmagic/test", new HttpClientDownloader())) + .pipeline(new FilePipeline("/data/webmagic/test")) .run(); } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GuoxueProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GuoxueProcessor.java new file mode 100644 index 0000000..54d995e --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GuoxueProcessor.java @@ -0,0 +1,20 @@ +package us.codecraft.webmagic.samples; + +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.pipeline.FilePipeline; +import us.codecraft.webmagic.processor.SimplePageProcessor; +import us.codecraft.webmagic.schedular.FileCacheQueueScheduler; + +/** + * @author yihua.huang@dianping.com
+ * @date: 13-7-14
+ * Time: 上午8:33
+ */ +public class GuoxueProcessor { + + public static void main(String[] args) { + SimplePageProcessor simplePageProcessor = new SimplePageProcessor("http://www.guoxue123.cn/", "http://www.guoxue123.cn/*"); + simplePageProcessor.getSite().setCharset("GBK").setSleepTime(500); + Spider.create(simplePageProcessor).pipeline(new FilePipeline("/data/webmagic/")).scheduler(new FileCacheQueueScheduler("/data/webmagic/")).run(); + } +} diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java index 33bcf9c..5680d12 100644 --- a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java +++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java @@ -1,6 +1,5 @@ package us.codecraft.webmagic.processor; -import org.junit.Ignore; import org.junit.Test; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.pipeline.FilePipeline; @@ -17,7 +16,6 @@ import java.io.IOException; */ public class DiaoyuwengProcessorTest { - @Ignore @Test public void test() throws IOException { DiaoyuwengProcessor diaoyuwengProcessor = new DiaoyuwengProcessor();