add retry
parent
5c79550fd9
commit
2b34dc9d3f
|
@ -1,2 +1,3 @@
|
|||
target/*
|
||||
*.iml
|
||||
out/
|
||||
|
|
|
@ -12,7 +12,7 @@
|
|||
<dependency>
|
||||
<groupId>org.apache.httpcomponents</groupId>
|
||||
<artifactId>httpclient</artifactId>
|
||||
<version>4.2.1</version>
|
||||
<version>4.2.4</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
|
|
|
@ -24,6 +24,8 @@ public class Site {
|
|||
|
||||
private int sleepTime = 3000;
|
||||
|
||||
private int retryTimes = 0;
|
||||
|
||||
private static final Set<Integer> DEFAULT_STATUS_CODE_SET = new HashSet<Integer>();
|
||||
|
||||
private Set<Integer> acceptStatCode = DEFAULT_STATUS_CODE_SET;
|
||||
|
@ -183,6 +185,23 @@ public class Site {
|
|||
return sleepTime;
|
||||
}
|
||||
|
||||
/**
|
||||
* 获取重新下载的次数,默认为0
|
||||
* @return 重新下载的次数
|
||||
*/
|
||||
public int getRetryTimes() {
|
||||
return retryTimes;
|
||||
}
|
||||
|
||||
/**
|
||||
* 设置获取重新下载的次数,默认为0
|
||||
* @return this
|
||||
*/
|
||||
public Site setRetryTimes(int retryTimes) {
|
||||
this.retryTimes = retryTimes;
|
||||
return this;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
if (this == o) return true;
|
||||
|
|
|
@ -16,11 +16,13 @@ import us.codecraft.webmagic.selector.Html;
|
|||
import us.codecraft.webmagic.selector.PlainText;
|
||||
import us.codecraft.webmagic.utils.UrlUtils;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-4-21
|
||||
* Time: 下午12:15
|
||||
* Date: 13-4-21
|
||||
* Time: 下午12:15
|
||||
*/
|
||||
public class HttpClientDownloader implements Downloader {
|
||||
|
||||
|
@ -34,11 +36,27 @@ public class HttpClientDownloader implements Downloader {
|
|||
String charset = site.getCharset();
|
||||
try {
|
||||
HttpGet httpGet = new HttpGet(request.getUrl());
|
||||
HttpResponse httpResponse = httpClient.execute(httpGet);
|
||||
HttpResponse httpResponse = null;
|
||||
int tried = 0;
|
||||
boolean retry;
|
||||
do {
|
||||
try {
|
||||
httpResponse = httpClient.execute(httpGet);
|
||||
retry = false;
|
||||
} catch (IOException e) {
|
||||
tried++;
|
||||
if (tried > site.getRetryTimes()) {
|
||||
logger.warn("download page " + request.getUrl() + " error", e);
|
||||
return null;
|
||||
}
|
||||
logger.info("download page " + request.getUrl() + " error, retry the "+tried+" time!");
|
||||
retry = true;
|
||||
}
|
||||
} while (retry);
|
||||
int statusCode = httpResponse.getStatusLine().getStatusCode();
|
||||
if (site.getAcceptStatCode().contains(statusCode)) {
|
||||
//charset
|
||||
if (charset == null){
|
||||
if (charset == null) {
|
||||
String value = httpResponse.getEntity().getContentType().getValue();
|
||||
charset = new PlainText(value).regex("charset=([^\\s]+)").toString();
|
||||
}
|
||||
|
@ -52,7 +70,7 @@ public class HttpClientDownloader implements Downloader {
|
|||
page.setRequest(request);
|
||||
return page;
|
||||
} else {
|
||||
logger.warn("code error " + statusCode);
|
||||
logger.warn("code error " + statusCode + "\t" + request.getUrl());
|
||||
}
|
||||
} catch (Exception e) {
|
||||
logger.warn("download page " + request.getUrl() + " error", e);
|
||||
|
|
|
@ -39,6 +39,25 @@
|
|||
<target>1.6</target>
|
||||
</configuration>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-dependency-plugin</artifactId>
|
||||
<executions>
|
||||
<execution>
|
||||
<id>copy-dependencies</id>
|
||||
<phase>package</phase>
|
||||
<goals>
|
||||
<goal>copy-dependencies</goal>
|
||||
</goals>
|
||||
<configuration>
|
||||
<outputDirectory>${project.build.directory}/lib</outputDirectory>
|
||||
<overWriteReleases>false</overWriteReleases>
|
||||
<overWriteSnapshots>false</overWriteSnapshots>
|
||||
<overWriteIfNewer>true</overWriteIfNewer>
|
||||
</configuration>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-resources-plugin</artifactId>
|
||||
|
@ -70,6 +89,19 @@
|
|||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-jar-plugin</artifactId>
|
||||
<configuration>
|
||||
<archive>
|
||||
<manifest>
|
||||
<addClasspath>true</addClasspath>
|
||||
<classpathPrefix>./lib/</classpathPrefix>
|
||||
<mainClass>us.codecraft.webmagic.samples.DianpingIndexProcessor</mainClass>
|
||||
</manifest>
|
||||
</archive>
|
||||
</configuration>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-release-plugin</artifactId>
|
||||
|
|
|
@ -0,0 +1,53 @@
|
|||
package us.codecraft.webmagic.samples;
|
||||
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.Spider;
|
||||
import us.codecraft.webmagic.processor.PageProcessor;
|
||||
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-4-21 Time: 下午8:08
|
||||
*/
|
||||
public class DianpingIndexProcessor implements PageProcessor {
|
||||
@Override
|
||||
public void process(Page page) {
|
||||
if (page.getUrl().toString().equals("http://www.dianping.com/citylist")) {
|
||||
page.addTargetRequests(page.getHtml().links().regex("http://www\\.dianping\\.com/\\w+$").toStrings());
|
||||
return;
|
||||
}
|
||||
Pattern p = Pattern.compile("http://www\\.dianping\\.com/\\w+");
|
||||
Matcher matcher = p.matcher(page.getUrl().toString());
|
||||
if (matcher.matches()) {
|
||||
page.addTargetRequests(page.getHtml().xpath("//li[@class='term-list-item']//a/@href").regex("http://www\\.dianping\\.com/search/.*").toStrings());
|
||||
} else {
|
||||
p = Pattern.compile("http://www\\.dianping\\.com/search/.*");
|
||||
matcher = p.matcher(page.getUrl().toString());
|
||||
if (matcher.matches()) {
|
||||
String result = page.getHtml().regex("您要查看的内容不存在").toString();
|
||||
if (result != null) {
|
||||
System.err.println("No!Url not exist!" + page.getUrl());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public Site getSite() {
|
||||
return Site.me().setDomain("www.dianping.com").addStartUrl("http://www.dianping.com/citylist")
|
||||
.setSleepTime(0).setUserAgent("I'm a performance tester created by yihua.huang");
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
int sleepTime = 0;
|
||||
if (args.length > 0) {
|
||||
sleepTime = Integer.parseInt(args[0]);
|
||||
}
|
||||
DianpingIndexProcessor dianpingProcessor = new DianpingIndexProcessor();
|
||||
dianpingProcessor.getSite().setSleepTime(sleepTime);
|
||||
Spider.create(dianpingProcessor).thread(10).run();
|
||||
}
|
||||
}
|
|
@ -1,7 +1,7 @@
|
|||
package us.codecraft.webmagic.samples;
|
||||
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.Spider;
|
||||
import us.codecraft.webmagic.processor.PageProcessor;
|
||||
|
||||
|
@ -9,30 +9,36 @@ import java.util.List;
|
|||
|
||||
/**
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-4-21
|
||||
* Time: 下午8:08
|
||||
* Date: 13-4-21
|
||||
* Time: 下午8:08
|
||||
*/
|
||||
public class DianpingProcessor implements PageProcessor {
|
||||
|
||||
private Site site;
|
||||
|
||||
@Override
|
||||
public void process(Page page) {
|
||||
List<String> requests = page.getHtml().links().regex(".*shop.*").toStrings();
|
||||
List<String> requests = page.getHtml().links().regex("http://info-search-web121361\\.alpha\\.dp:8080/search/.*").toStrings();
|
||||
page.addTargetRequests(requests);
|
||||
requests = page.getHtml().regex(".*search/category/.*").toStrings();
|
||||
page.addTargetRequests(requests);
|
||||
if (page.getUrl().toString().contains("shop")) {
|
||||
page.putField("title", page.getHtml().xpath("//h1[@class='shop-title']"));
|
||||
page.putField("content", page.getHtml().smartContent());
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public Site getSite() {
|
||||
return Site.me().setDomain("www.dianping.com").addStartUrl("http://www.dianping.com/").
|
||||
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
|
||||
if (site == null) {
|
||||
site = Site.me().setDomain("info-search-web361.alpha.dp:8080").addStartUrl("http://info11-search-web361.alpha.dp:8080/search/category/1/0").
|
||||
setSleepTime(100).
|
||||
setUserAgent("I'm a performance tester created by yihua.huang");
|
||||
}
|
||||
return site;
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
int sleepTime = 0;
|
||||
if (args.length > 0) {
|
||||
sleepTime = Integer.parseInt(args[0]);
|
||||
}
|
||||
DianpingProcessor dianpingProcessor = new DianpingProcessor();
|
||||
dianpingProcessor.getSite().setSleepTime(sleepTime).setRetryTimes(10);
|
||||
Spider.create(dianpingProcessor).run();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -22,7 +22,7 @@ public class GlobalProcessor implements PageProcessor {
|
|||
|
||||
@Override
|
||||
public void process(Page page) {
|
||||
final List<String> requests = page.getHtml().links().regex(".*book\\.douban\\.com.*").toStrings();
|
||||
final List<String> requests = page.getHtml().links().toStrings();
|
||||
page.addTargetRequests(requests);
|
||||
|
||||
}
|
||||
|
@ -30,16 +30,19 @@ public class GlobalProcessor implements PageProcessor {
|
|||
@Override
|
||||
public Site getSite() {
|
||||
if (site==null){
|
||||
site = Site.me().setDomain("douban.com").addStartUrl("http://book.douban.com/").setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
|
||||
site = Site.me().setDomain("www.2345.com")
|
||||
.addStartUrl("http://www.2345.com/").addStartUrl("http://hao.360.cn/")
|
||||
.addStartUrl("http://www.baidu.com/s?wd=%E7%BD%91%E7%AB%99%E5%AF%BC%E8%88%AA&rsv_spt=1&issp=1&rsv_bp=0&ie=utf-8&tn=80039098_oem_dg&rsv_n=2&rsv_sug3=6&rsv_sug4=698&rsv_sug=0&rsv_sug1=3")
|
||||
.setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
|
||||
}
|
||||
return site;
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
Spider.create(new GlobalProcessor()).thread(10)
|
||||
.scheduler(new FileCacheQueueScheduler("/data/webmagic/github"))
|
||||
.downloader(new FileDownloader("/data/webmagic/douban", new HttpClientDownloader()))
|
||||
.pipeline(new FilePipeline("/data/webmagic/douban"))
|
||||
.scheduler(new FileCacheQueueScheduler("/data/webmagic/test"))
|
||||
.downloader(new FileDownloader("/data/webmagic/test", new HttpClientDownloader()))
|
||||
.pipeline(new FilePipeline("/data/webmagic/test"))
|
||||
.run();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,20 @@
|
|||
package us.codecraft.webmagic.samples;
|
||||
|
||||
import us.codecraft.webmagic.Spider;
|
||||
import us.codecraft.webmagic.pipeline.FilePipeline;
|
||||
import us.codecraft.webmagic.processor.SimplePageProcessor;
|
||||
import us.codecraft.webmagic.schedular.FileCacheQueueScheduler;
|
||||
|
||||
/**
|
||||
* @author yihua.huang@dianping.com <br>
|
||||
* @date: 13-7-14 <br>
|
||||
* Time: 上午8:33 <br>
|
||||
*/
|
||||
public class GuoxueProcessor {
|
||||
|
||||
public static void main(String[] args) {
|
||||
SimplePageProcessor simplePageProcessor = new SimplePageProcessor("http://www.guoxue123.cn/", "http://www.guoxue123.cn/*");
|
||||
simplePageProcessor.getSite().setCharset("GBK").setSleepTime(500);
|
||||
Spider.create(simplePageProcessor).pipeline(new FilePipeline("/data/webmagic/")).scheduler(new FileCacheQueueScheduler("/data/webmagic/")).run();
|
||||
}
|
||||
}
|
|
@ -1,6 +1,5 @@
|
|||
package us.codecraft.webmagic.processor;
|
||||
|
||||
import org.junit.Ignore;
|
||||
import org.junit.Test;
|
||||
import us.codecraft.webmagic.Spider;
|
||||
import us.codecraft.webmagic.pipeline.FilePipeline;
|
||||
|
@ -17,7 +16,6 @@ import java.io.IOException;
|
|||
*/
|
||||
public class DiaoyuwengProcessorTest {
|
||||
|
||||
@Ignore
|
||||
@Test
|
||||
public void test() throws IOException {
|
||||
DiaoyuwengProcessor diaoyuwengProcessor = new DiaoyuwengProcessor();
|
||||
|
|
Loading…
Reference in New Issue