add retry
parent
5c79550fd9
commit
2b34dc9d3f
|
@ -1,2 +1,3 @@
|
||||||
target/*
|
target/*
|
||||||
*.iml
|
*.iml
|
||||||
|
out/
|
||||||
|
|
|
@ -12,7 +12,7 @@
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.httpcomponents</groupId>
|
<groupId>org.apache.httpcomponents</groupId>
|
||||||
<artifactId>httpclient</artifactId>
|
<artifactId>httpclient</artifactId>
|
||||||
<version>4.2.1</version>
|
<version>4.2.4</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
|
|
|
@ -24,6 +24,8 @@ public class Site {
|
||||||
|
|
||||||
private int sleepTime = 3000;
|
private int sleepTime = 3000;
|
||||||
|
|
||||||
|
private int retryTimes = 0;
|
||||||
|
|
||||||
private static final Set<Integer> DEFAULT_STATUS_CODE_SET = new HashSet<Integer>();
|
private static final Set<Integer> DEFAULT_STATUS_CODE_SET = new HashSet<Integer>();
|
||||||
|
|
||||||
private Set<Integer> acceptStatCode = DEFAULT_STATUS_CODE_SET;
|
private Set<Integer> acceptStatCode = DEFAULT_STATUS_CODE_SET;
|
||||||
|
@ -183,6 +185,23 @@ public class Site {
|
||||||
return sleepTime;
|
return sleepTime;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 获取重新下载的次数,默认为0
|
||||||
|
* @return 重新下载的次数
|
||||||
|
*/
|
||||||
|
public int getRetryTimes() {
|
||||||
|
return retryTimes;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 设置获取重新下载的次数,默认为0
|
||||||
|
* @return this
|
||||||
|
*/
|
||||||
|
public Site setRetryTimes(int retryTimes) {
|
||||||
|
this.retryTimes = retryTimes;
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean equals(Object o) {
|
public boolean equals(Object o) {
|
||||||
if (this == o) return true;
|
if (this == o) return true;
|
||||||
|
|
|
@ -16,6 +16,8 @@ import us.codecraft.webmagic.selector.Html;
|
||||||
import us.codecraft.webmagic.selector.PlainText;
|
import us.codecraft.webmagic.selector.PlainText;
|
||||||
import us.codecraft.webmagic.utils.UrlUtils;
|
import us.codecraft.webmagic.utils.UrlUtils;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
|
@ -34,11 +36,27 @@ public class HttpClientDownloader implements Downloader {
|
||||||
String charset = site.getCharset();
|
String charset = site.getCharset();
|
||||||
try {
|
try {
|
||||||
HttpGet httpGet = new HttpGet(request.getUrl());
|
HttpGet httpGet = new HttpGet(request.getUrl());
|
||||||
HttpResponse httpResponse = httpClient.execute(httpGet);
|
HttpResponse httpResponse = null;
|
||||||
|
int tried = 0;
|
||||||
|
boolean retry;
|
||||||
|
do {
|
||||||
|
try {
|
||||||
|
httpResponse = httpClient.execute(httpGet);
|
||||||
|
retry = false;
|
||||||
|
} catch (IOException e) {
|
||||||
|
tried++;
|
||||||
|
if (tried > site.getRetryTimes()) {
|
||||||
|
logger.warn("download page " + request.getUrl() + " error", e);
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
logger.info("download page " + request.getUrl() + " error, retry the "+tried+" time!");
|
||||||
|
retry = true;
|
||||||
|
}
|
||||||
|
} while (retry);
|
||||||
int statusCode = httpResponse.getStatusLine().getStatusCode();
|
int statusCode = httpResponse.getStatusLine().getStatusCode();
|
||||||
if (site.getAcceptStatCode().contains(statusCode)) {
|
if (site.getAcceptStatCode().contains(statusCode)) {
|
||||||
//charset
|
//charset
|
||||||
if (charset == null){
|
if (charset == null) {
|
||||||
String value = httpResponse.getEntity().getContentType().getValue();
|
String value = httpResponse.getEntity().getContentType().getValue();
|
||||||
charset = new PlainText(value).regex("charset=([^\\s]+)").toString();
|
charset = new PlainText(value).regex("charset=([^\\s]+)").toString();
|
||||||
}
|
}
|
||||||
|
@ -52,7 +70,7 @@ public class HttpClientDownloader implements Downloader {
|
||||||
page.setRequest(request);
|
page.setRequest(request);
|
||||||
return page;
|
return page;
|
||||||
} else {
|
} else {
|
||||||
logger.warn("code error " + statusCode);
|
logger.warn("code error " + statusCode + "\t" + request.getUrl());
|
||||||
}
|
}
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
logger.warn("download page " + request.getUrl() + " error", e);
|
logger.warn("download page " + request.getUrl() + " error", e);
|
||||||
|
|
|
@ -39,6 +39,25 @@
|
||||||
<target>1.6</target>
|
<target>1.6</target>
|
||||||
</configuration>
|
</configuration>
|
||||||
</plugin>
|
</plugin>
|
||||||
|
<plugin>
|
||||||
|
<groupId>org.apache.maven.plugins</groupId>
|
||||||
|
<artifactId>maven-dependency-plugin</artifactId>
|
||||||
|
<executions>
|
||||||
|
<execution>
|
||||||
|
<id>copy-dependencies</id>
|
||||||
|
<phase>package</phase>
|
||||||
|
<goals>
|
||||||
|
<goal>copy-dependencies</goal>
|
||||||
|
</goals>
|
||||||
|
<configuration>
|
||||||
|
<outputDirectory>${project.build.directory}/lib</outputDirectory>
|
||||||
|
<overWriteReleases>false</overWriteReleases>
|
||||||
|
<overWriteSnapshots>false</overWriteSnapshots>
|
||||||
|
<overWriteIfNewer>true</overWriteIfNewer>
|
||||||
|
</configuration>
|
||||||
|
</execution>
|
||||||
|
</executions>
|
||||||
|
</plugin>
|
||||||
<plugin>
|
<plugin>
|
||||||
<groupId>org.apache.maven.plugins</groupId>
|
<groupId>org.apache.maven.plugins</groupId>
|
||||||
<artifactId>maven-resources-plugin</artifactId>
|
<artifactId>maven-resources-plugin</artifactId>
|
||||||
|
@ -70,6 +89,19 @@
|
||||||
</execution>
|
</execution>
|
||||||
</executions>
|
</executions>
|
||||||
</plugin>
|
</plugin>
|
||||||
|
<plugin>
|
||||||
|
<groupId>org.apache.maven.plugins</groupId>
|
||||||
|
<artifactId>maven-jar-plugin</artifactId>
|
||||||
|
<configuration>
|
||||||
|
<archive>
|
||||||
|
<manifest>
|
||||||
|
<addClasspath>true</addClasspath>
|
||||||
|
<classpathPrefix>./lib/</classpathPrefix>
|
||||||
|
<mainClass>us.codecraft.webmagic.samples.DianpingIndexProcessor</mainClass>
|
||||||
|
</manifest>
|
||||||
|
</archive>
|
||||||
|
</configuration>
|
||||||
|
</plugin>
|
||||||
<plugin>
|
<plugin>
|
||||||
<groupId>org.apache.maven.plugins</groupId>
|
<groupId>org.apache.maven.plugins</groupId>
|
||||||
<artifactId>maven-release-plugin</artifactId>
|
<artifactId>maven-release-plugin</artifactId>
|
||||||
|
|
|
@ -0,0 +1,53 @@
|
||||||
|
package us.codecraft.webmagic.samples;
|
||||||
|
|
||||||
|
import us.codecraft.webmagic.Page;
|
||||||
|
import us.codecraft.webmagic.Site;
|
||||||
|
import us.codecraft.webmagic.Spider;
|
||||||
|
import us.codecraft.webmagic.processor.PageProcessor;
|
||||||
|
|
||||||
|
import java.util.regex.Matcher;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author code4crafter@gmail.com <br>
|
||||||
|
* Date: 13-4-21 Time: 下午8:08
|
||||||
|
*/
|
||||||
|
public class DianpingIndexProcessor implements PageProcessor {
|
||||||
|
@Override
|
||||||
|
public void process(Page page) {
|
||||||
|
if (page.getUrl().toString().equals("http://www.dianping.com/citylist")) {
|
||||||
|
page.addTargetRequests(page.getHtml().links().regex("http://www\\.dianping\\.com/\\w+$").toStrings());
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
Pattern p = Pattern.compile("http://www\\.dianping\\.com/\\w+");
|
||||||
|
Matcher matcher = p.matcher(page.getUrl().toString());
|
||||||
|
if (matcher.matches()) {
|
||||||
|
page.addTargetRequests(page.getHtml().xpath("//li[@class='term-list-item']//a/@href").regex("http://www\\.dianping\\.com/search/.*").toStrings());
|
||||||
|
} else {
|
||||||
|
p = Pattern.compile("http://www\\.dianping\\.com/search/.*");
|
||||||
|
matcher = p.matcher(page.getUrl().toString());
|
||||||
|
if (matcher.matches()) {
|
||||||
|
String result = page.getHtml().regex("您要查看的内容不存在").toString();
|
||||||
|
if (result != null) {
|
||||||
|
System.err.println("No!Url not exist!" + page.getUrl());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Site getSite() {
|
||||||
|
return Site.me().setDomain("www.dianping.com").addStartUrl("http://www.dianping.com/citylist")
|
||||||
|
.setSleepTime(0).setUserAgent("I'm a performance tester created by yihua.huang");
|
||||||
|
}
|
||||||
|
|
||||||
|
public static void main(String[] args) {
|
||||||
|
int sleepTime = 0;
|
||||||
|
if (args.length > 0) {
|
||||||
|
sleepTime = Integer.parseInt(args[0]);
|
||||||
|
}
|
||||||
|
DianpingIndexProcessor dianpingProcessor = new DianpingIndexProcessor();
|
||||||
|
dianpingProcessor.getSite().setSleepTime(sleepTime);
|
||||||
|
Spider.create(dianpingProcessor).thread(10).run();
|
||||||
|
}
|
||||||
|
}
|
|
@ -1,7 +1,7 @@
|
||||||
package us.codecraft.webmagic.samples;
|
package us.codecraft.webmagic.samples;
|
||||||
|
|
||||||
import us.codecraft.webmagic.Site;
|
|
||||||
import us.codecraft.webmagic.Page;
|
import us.codecraft.webmagic.Page;
|
||||||
|
import us.codecraft.webmagic.Site;
|
||||||
import us.codecraft.webmagic.Spider;
|
import us.codecraft.webmagic.Spider;
|
||||||
import us.codecraft.webmagic.processor.PageProcessor;
|
import us.codecraft.webmagic.processor.PageProcessor;
|
||||||
|
|
||||||
|
@ -13,26 +13,32 @@ import java.util.List;
|
||||||
* Time: 下午8:08
|
* Time: 下午8:08
|
||||||
*/
|
*/
|
||||||
public class DianpingProcessor implements PageProcessor {
|
public class DianpingProcessor implements PageProcessor {
|
||||||
|
|
||||||
|
private Site site;
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void process(Page page) {
|
public void process(Page page) {
|
||||||
List<String> requests = page.getHtml().links().regex(".*shop.*").toStrings();
|
List<String> requests = page.getHtml().links().regex("http://info-search-web121361\\.alpha\\.dp:8080/search/.*").toStrings();
|
||||||
page.addTargetRequests(requests);
|
page.addTargetRequests(requests);
|
||||||
requests = page.getHtml().regex(".*search/category/.*").toStrings();
|
|
||||||
page.addTargetRequests(requests);
|
|
||||||
if (page.getUrl().toString().contains("shop")) {
|
|
||||||
page.putField("title", page.getHtml().xpath("//h1[@class='shop-title']"));
|
|
||||||
page.putField("content", page.getHtml().smartContent());
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Site getSite() {
|
public Site getSite() {
|
||||||
return Site.me().setDomain("www.dianping.com").addStartUrl("http://www.dianping.com/").
|
if (site == null) {
|
||||||
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
|
site = Site.me().setDomain("info-search-web361.alpha.dp:8080").addStartUrl("http://info11-search-web361.alpha.dp:8080/search/category/1/0").
|
||||||
|
setSleepTime(100).
|
||||||
|
setUserAgent("I'm a performance tester created by yihua.huang");
|
||||||
|
}
|
||||||
|
return site;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static void main(String[] args) {
|
public static void main(String[] args) {
|
||||||
|
int sleepTime = 0;
|
||||||
|
if (args.length > 0) {
|
||||||
|
sleepTime = Integer.parseInt(args[0]);
|
||||||
|
}
|
||||||
DianpingProcessor dianpingProcessor = new DianpingProcessor();
|
DianpingProcessor dianpingProcessor = new DianpingProcessor();
|
||||||
|
dianpingProcessor.getSite().setSleepTime(sleepTime).setRetryTimes(10);
|
||||||
Spider.create(dianpingProcessor).run();
|
Spider.create(dianpingProcessor).run();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -22,7 +22,7 @@ public class GlobalProcessor implements PageProcessor {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void process(Page page) {
|
public void process(Page page) {
|
||||||
final List<String> requests = page.getHtml().links().regex(".*book\\.douban\\.com.*").toStrings();
|
final List<String> requests = page.getHtml().links().toStrings();
|
||||||
page.addTargetRequests(requests);
|
page.addTargetRequests(requests);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -30,16 +30,19 @@ public class GlobalProcessor implements PageProcessor {
|
||||||
@Override
|
@Override
|
||||||
public Site getSite() {
|
public Site getSite() {
|
||||||
if (site==null){
|
if (site==null){
|
||||||
site = Site.me().setDomain("douban.com").addStartUrl("http://book.douban.com/").setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
|
site = Site.me().setDomain("www.2345.com")
|
||||||
|
.addStartUrl("http://www.2345.com/").addStartUrl("http://hao.360.cn/")
|
||||||
|
.addStartUrl("http://www.baidu.com/s?wd=%E7%BD%91%E7%AB%99%E5%AF%BC%E8%88%AA&rsv_spt=1&issp=1&rsv_bp=0&ie=utf-8&tn=80039098_oem_dg&rsv_n=2&rsv_sug3=6&rsv_sug4=698&rsv_sug=0&rsv_sug1=3")
|
||||||
|
.setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
|
||||||
}
|
}
|
||||||
return site;
|
return site;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static void main(String[] args) {
|
public static void main(String[] args) {
|
||||||
Spider.create(new GlobalProcessor()).thread(10)
|
Spider.create(new GlobalProcessor()).thread(10)
|
||||||
.scheduler(new FileCacheQueueScheduler("/data/webmagic/github"))
|
.scheduler(new FileCacheQueueScheduler("/data/webmagic/test"))
|
||||||
.downloader(new FileDownloader("/data/webmagic/douban", new HttpClientDownloader()))
|
.downloader(new FileDownloader("/data/webmagic/test", new HttpClientDownloader()))
|
||||||
.pipeline(new FilePipeline("/data/webmagic/douban"))
|
.pipeline(new FilePipeline("/data/webmagic/test"))
|
||||||
.run();
|
.run();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,20 @@
|
||||||
|
package us.codecraft.webmagic.samples;
|
||||||
|
|
||||||
|
import us.codecraft.webmagic.Spider;
|
||||||
|
import us.codecraft.webmagic.pipeline.FilePipeline;
|
||||||
|
import us.codecraft.webmagic.processor.SimplePageProcessor;
|
||||||
|
import us.codecraft.webmagic.schedular.FileCacheQueueScheduler;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author yihua.huang@dianping.com <br>
|
||||||
|
* @date: 13-7-14 <br>
|
||||||
|
* Time: 上午8:33 <br>
|
||||||
|
*/
|
||||||
|
public class GuoxueProcessor {
|
||||||
|
|
||||||
|
public static void main(String[] args) {
|
||||||
|
SimplePageProcessor simplePageProcessor = new SimplePageProcessor("http://www.guoxue123.cn/", "http://www.guoxue123.cn/*");
|
||||||
|
simplePageProcessor.getSite().setCharset("GBK").setSleepTime(500);
|
||||||
|
Spider.create(simplePageProcessor).pipeline(new FilePipeline("/data/webmagic/")).scheduler(new FileCacheQueueScheduler("/data/webmagic/")).run();
|
||||||
|
}
|
||||||
|
}
|
|
@ -1,6 +1,5 @@
|
||||||
package us.codecraft.webmagic.processor;
|
package us.codecraft.webmagic.processor;
|
||||||
|
|
||||||
import org.junit.Ignore;
|
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
import us.codecraft.webmagic.Spider;
|
import us.codecraft.webmagic.Spider;
|
||||||
import us.codecraft.webmagic.pipeline.FilePipeline;
|
import us.codecraft.webmagic.pipeline.FilePipeline;
|
||||||
|
@ -17,7 +16,6 @@ import java.io.IOException;
|
||||||
*/
|
*/
|
||||||
public class DiaoyuwengProcessorTest {
|
public class DiaoyuwengProcessorTest {
|
||||||
|
|
||||||
@Ignore
|
|
||||||
@Test
|
@Test
|
||||||
public void test() throws IOException {
|
public void test() throws IOException {
|
||||||
DiaoyuwengProcessor diaoyuwengProcessor = new DiaoyuwengProcessor();
|
DiaoyuwengProcessor diaoyuwengProcessor = new DiaoyuwengProcessor();
|
||||||
|
|
Loading…
Reference in New Issue