diff --git a/.gitignore b/.gitignore
index 0af075f..cd33b61 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
target/*
*.iml
+out/
diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml
index c0ef6a1..df482f7 100644
--- a/webmagic-core/pom.xml
+++ b/webmagic-core/pom.xml
@@ -12,7 +12,7 @@
org.apache.httpcomponents
httpclient
- 4.2.1
+ 4.2.4
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java
index 676584a..2c6118c 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java
@@ -24,6 +24,8 @@ public class Site {
private int sleepTime = 3000;
+ private int retryTimes = 0;
+
private static final Set DEFAULT_STATUS_CODE_SET = new HashSet();
private Set acceptStatCode = DEFAULT_STATUS_CODE_SET;
@@ -183,6 +185,23 @@ public class Site {
return sleepTime;
}
+ /**
+ * Returns the number of times a download is retried; the default is 0.
+ * @return the number of download retries
+ */
+ public int getRetryTimes() {
+ return retryTimes;
+ }
+
+ /**
+ * Sets the number of times a download is retried to {@code retryTimes}; the default is 0.
+ * @return this
+ */
+ public Site setRetryTimes(int retryTimes) {
+ this.retryTimes = retryTimes;
+ return this;
+ }
+
@Override
public boolean equals(Object o) {
if (this == o) return true;
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
index d2c2d62..e4ae0ff 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
@@ -16,11 +16,13 @@ import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.PlainText;
import us.codecraft.webmagic.utils.UrlUtils;
+import java.io.IOException;
+
/**
* @author code4crafter@gmail.com
- * Date: 13-4-21
- * Time: 下午12:15
+ * Date: 13-4-21
+ * Time: 下午12:15
*/
public class HttpClientDownloader implements Downloader {
@@ -34,11 +36,27 @@ public class HttpClientDownloader implements Downloader {
String charset = site.getCharset();
try {
HttpGet httpGet = new HttpGet(request.getUrl());
- HttpResponse httpResponse = httpClient.execute(httpGet);
+ HttpResponse httpResponse = null;
+ int tried = 0;
+ boolean retry;
+ do {
+ try {
+ httpResponse = httpClient.execute(httpGet);
+ retry = false;
+ } catch (IOException e) {
+ tried++;
+ if (tried > site.getRetryTimes()) {
+ logger.warn("download page " + request.getUrl() + " error", e);
+ return null;
+ }
+ logger.info("download page " + request.getUrl() + " error, retry the "+tried+" time!");
+ retry = true;
+ }
+ } while (retry);
int statusCode = httpResponse.getStatusLine().getStatusCode();
if (site.getAcceptStatCode().contains(statusCode)) {
//charset
- if (charset == null){
+ if (charset == null) {
String value = httpResponse.getEntity().getContentType().getValue();
charset = new PlainText(value).regex("charset=([^\\s]+)").toString();
}
@@ -52,7 +70,7 @@ public class HttpClientDownloader implements Downloader {
page.setRequest(request);
return page;
} else {
- logger.warn("code error " + statusCode);
+ logger.warn("code error " + statusCode + "\t" + request.getUrl());
}
} catch (Exception e) {
logger.warn("download page " + request.getUrl() + " error", e);
diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml
index 4e345a2..f1f6806 100644
--- a/webmagic-samples/pom.xml
+++ b/webmagic-samples/pom.xml
@@ -39,6 +39,25 @@
1.6
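+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-dependency-plugin</artifactId>
+                <executions>
+                    <execution>
+                        <id>copy-dependencies</id>
+                        <phase>package</phase>
+                        <goals>
+                            <goal>copy-dependencies</goal>
+                        </goals>
+                        <configuration>
+                            <outputDirectory>${project.build.directory}/lib</outputDirectory>
+                            <overWriteReleases>false</overWriteReleases>
+                            <overWriteSnapshots>false</overWriteSnapshots>
+                            <overWriteIfNewer>true</overWriteIfNewer>
+                        </configuration>
+                    </execution>
+                </executions>
+            </plugin>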
org.apache.maven.plugins
maven-resources-plugin
@@ -70,6 +89,19 @@
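+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-jar-plugin</artifactId>
+                <configuration>
+                    <archive>
+                        <manifest>
+                            <addClasspath>true</addClasspath>
+                            <classpathPrefix>./lib/</classpathPrefix>
+                            <mainClass>us.codecraft.webmagic.samples.DianpingIndexProcessor</mainClass>
+                        </manifest>
+                    </archive>
+                </configuration>
+            </plugin>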
org.apache.maven.plugins
maven-release-plugin
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingIndexProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingIndexProcessor.java
new file mode 100644
index 0000000..1f5da51
--- /dev/null
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingIndexProcessor.java
@@ -0,0 +1,53 @@
+package us.codecraft.webmagic.samples;
+
+import us.codecraft.webmagic.Page;
+import us.codecraft.webmagic.Site;
+import us.codecraft.webmagic.Spider;
+import us.codecraft.webmagic.processor.PageProcessor;
+
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * @author code4crafter@gmail.com
+ * Date: 13-4-21 Time: 下午8:08
+ */
+public class DianpingIndexProcessor implements PageProcessor {
+ @Override
+ public void process(Page page) {
+ if (page.getUrl().toString().equals("http://www.dianping.com/citylist")) {
+ page.addTargetRequests(page.getHtml().links().regex("http://www\\.dianping\\.com/\\w+$").toStrings());
+ return;
+ }
+ Pattern p = Pattern.compile("http://www\\.dianping\\.com/\\w+");
+ Matcher matcher = p.matcher(page.getUrl().toString());
+ if (matcher.matches()) {
+ page.addTargetRequests(page.getHtml().xpath("//li[@class='term-list-item']//a/@href").regex("http://www\\.dianping\\.com/search/.*").toStrings());
+ } else {
+ p = Pattern.compile("http://www\\.dianping\\.com/search/.*");
+ matcher = p.matcher(page.getUrl().toString());
+ if (matcher.matches()) {
+ String result = page.getHtml().regex("您要查看的内容不存在").toString();
+ if (result != null) {
+ System.err.println("No!Url not exist!" + page.getUrl());
+ }
+ }
+ }
+ }
+
+ @Override
+ public Site getSite() {
+ return Site.me().setDomain("www.dianping.com").addStartUrl("http://www.dianping.com/citylist")
+ .setSleepTime(0).setUserAgent("I'm a performance tester created by yihua.huang");
+ }
+
+ public static void main(String[] args) {
+ int sleepTime = 0;
+ if (args.length > 0) {
+ sleepTime = Integer.parseInt(args[0]);
+ }
+ DianpingIndexProcessor dianpingProcessor = new DianpingIndexProcessor();
+ dianpingProcessor.getSite().setSleepTime(sleepTime);
+ Spider.create(dianpingProcessor).thread(10).run();
+ }
+}
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingProcessor.java
index 33ac3d7..056da0a 100644
--- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingProcessor.java
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingProcessor.java
@@ -1,7 +1,7 @@
package us.codecraft.webmagic.samples;
-import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Page;
+import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
@@ -9,30 +9,36 @@ import java.util.List;
/**
* @author code4crafter@gmail.com
- * Date: 13-4-21
- * Time: 下午8:08
+ * Date: 13-4-21
+ * Time: 下午8:08
*/
public class DianpingProcessor implements PageProcessor {
+
+ private Site site;
+
@Override
public void process(Page page) {
- List requests = page.getHtml().links().regex(".*shop.*").toStrings();
+ List requests = page.getHtml().links().regex("http://info-search-web121361\\.alpha\\.dp:8080/search/.*").toStrings();
page.addTargetRequests(requests);
- requests = page.getHtml().regex(".*search/category/.*").toStrings();
- page.addTargetRequests(requests);
- if (page.getUrl().toString().contains("shop")) {
- page.putField("title", page.getHtml().xpath("//h1[@class='shop-title']"));
- page.putField("content", page.getHtml().smartContent());
- }
}
@Override
public Site getSite() {
- return Site.me().setDomain("www.dianping.com").addStartUrl("http://www.dianping.com/").
- setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
+ if (site == null) {
+ site = Site.me().setDomain("info-search-web361.alpha.dp:8080").addStartUrl("http://info11-search-web361.alpha.dp:8080/search/category/1/0").
+ setSleepTime(100).
+ setUserAgent("I'm a performance tester created by yihua.huang");
+ }
+ return site;
}
public static void main(String[] args) {
+ int sleepTime = 0;
+ if (args.length > 0) {
+ sleepTime = Integer.parseInt(args[0]);
+ }
DianpingProcessor dianpingProcessor = new DianpingProcessor();
+ dianpingProcessor.getSite().setSleepTime(sleepTime).setRetryTimes(10);
Spider.create(dianpingProcessor).run();
}
}
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GlobalProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GlobalProcessor.java
index 07f0101..383422f 100644
--- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GlobalProcessor.java
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GlobalProcessor.java
@@ -22,7 +22,7 @@ public class GlobalProcessor implements PageProcessor {
@Override
public void process(Page page) {
- final List requests = page.getHtml().links().regex(".*book\\.douban\\.com.*").toStrings();
+ final List requests = page.getHtml().links().toStrings();
page.addTargetRequests(requests);
}
@@ -30,16 +30,19 @@ public class GlobalProcessor implements PageProcessor {
@Override
public Site getSite() {
if (site==null){
- site = Site.me().setDomain("douban.com").addStartUrl("http://book.douban.com/").setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
+ site = Site.me().setDomain("www.2345.com")
+ .addStartUrl("http://www.2345.com/").addStartUrl("http://hao.360.cn/")
+ .addStartUrl("http://www.baidu.com/s?wd=%E7%BD%91%E7%AB%99%E5%AF%BC%E8%88%AA&rsv_spt=1&issp=1&rsv_bp=0&ie=utf-8&tn=80039098_oem_dg&rsv_n=2&rsv_sug3=6&rsv_sug4=698&rsv_sug=0&rsv_sug1=3")
+ .setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
}
return site;
}
public static void main(String[] args) {
Spider.create(new GlobalProcessor()).thread(10)
- .scheduler(new FileCacheQueueScheduler("/data/webmagic/github"))
- .downloader(new FileDownloader("/data/webmagic/douban", new HttpClientDownloader()))
- .pipeline(new FilePipeline("/data/webmagic/douban"))
+ .scheduler(new FileCacheQueueScheduler("/data/webmagic/test"))
+ .downloader(new FileDownloader("/data/webmagic/test", new HttpClientDownloader()))
+ .pipeline(new FilePipeline("/data/webmagic/test"))
.run();
}
}
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GuoxueProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GuoxueProcessor.java
new file mode 100644
index 0000000..54d995e
--- /dev/null
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GuoxueProcessor.java
@@ -0,0 +1,31 @@
+package us.codecraft.webmagic.samples;
+
+import us.codecraft.webmagic.Spider;
+import us.codecraft.webmagic.pipeline.FilePipeline;
+import us.codecraft.webmagic.processor.SimplePageProcessor;
+import us.codecraft.webmagic.schedular.FileCacheQueueScheduler;
+
+/**
+ * Sample launcher that runs a spider over a {@link SimplePageProcessor} set up for guoxue123.cn.
+ *
+ * @author yihua.huang@dianping.com
+ * Date: 13-7-14 Time: 8:33 AM
+ */
+public class GuoxueProcessor {
+
+    /**
+     * Creates the processor, sets its site charset to GBK and its sleep time to 500, then runs
+     * a spider whose file pipeline and file-cache scheduler both use {@code /data/webmagic/}.
+     *
+     * @param args not used
+     */
+    public static void main(String[] args) {
+        final String dataDir = "/data/webmagic/";
+        SimplePageProcessor guoxueProcessor = new SimplePageProcessor("http://www.guoxue123.cn/", "http://www.guoxue123.cn/*");
+        guoxueProcessor.getSite().setCharset("GBK").setSleepTime(500);
+        Spider.create(guoxueProcessor)
+                .pipeline(new FilePipeline(dataDir))
+                .scheduler(new FileCacheQueueScheduler(dataDir))
+                .run();
+    }
+}
diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java
index 33bcf9c..5680d12 100644
--- a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java
+++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java
@@ -1,6 +1,5 @@
package us.codecraft.webmagic.processor;
-import org.junit.Ignore;
import org.junit.Test;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.FilePipeline;
@@ -17,7 +16,6 @@ import java.io.IOException;
*/
public class DiaoyuwengProcessorTest {
- @Ignore
@Test
public void test() throws IOException {
DiaoyuwengProcessor diaoyuwengProcessor = new DiaoyuwengProcessor();