From 067f3ea0cbe17570a8ffb09575ecf2a740cf1851 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 14 Aug 2013 13:30:09 +0800 Subject: [PATCH] add some null pointer check for httpclientdownloader --- .../downloader/HttpClientDownloader.java | 30 ++++++++++++++----- .../webmagic/downloader/HttpClientPool.java | 24 ++++++++++----- 2 files changed, 39 insertions(+), 15 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index dd805c6..33207c8 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -17,6 +17,8 @@ import us.codecraft.webmagic.selector.PlainText; import us.codecraft.webmagic.utils.UrlUtils; import java.io.IOException; +import java.util.HashSet; +import java.util.Set; /** @@ -34,10 +36,23 @@ public class HttpClientDownloader implements Downloader { @Override public Page download(Request request, Task task) { - Site site = task.getSite(); + Site site = null; + if (task != null) { + site = task.getSite(); + } + int retryTimes = 0; + Set acceptStatCode; + String charset = null; + if (site != null) { + retryTimes = site.getRetryTimes(); + acceptStatCode = site.getAcceptStatCode(); + charset = site.getCharset(); + } else { + acceptStatCode = new HashSet(); + acceptStatCode.add(200); + } logger.info("downloading page " + request.getUrl()); HttpClient httpClient = HttpClientPool.getInstance(poolSize).getClient(site); - String charset = site.getCharset(); try { HttpGet httpGet = new HttpGet(request.getUrl()); HttpResponse httpResponse = null; @@ -49,7 +64,8 @@ public class HttpClientDownloader implements Downloader { retry = false; } catch (IOException e) { tried++; - if (tried > site.getRetryTimes()) { + + if (tried > retryTimes) { logger.warn("download page " + request.getUrl() + " error", e); return null; } @@ -58,7 +74,7 @@ public class HttpClientDownloader implements Downloader { } } while (retry); int statusCode = httpResponse.getStatusLine().getStatusCode(); - if (site.getAcceptStatCode().contains(statusCode)) { + if (acceptStatCode.contains(statusCode)) { //charset if (charset == null) { String value = httpResponse.getEntity().getContentType().getValue(); @@ -66,7 +82,7 @@ public class HttpClientDownloader implements Downloader { } // handleGzip(httpResponse); - return handleResponse(request, charset, httpResponse,task); + return handleResponse(request, charset, httpResponse, task); } else { logger.warn("code error " + statusCode + "\t" + request.getUrl()); } @@ -76,7 +92,7 @@ public class HttpClientDownloader implements Downloader { return null; } - protected Page handleResponse(Request request, String charset, HttpResponse httpResponse,Task task) throws IOException { + protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException { String content = IOUtils.toString(httpResponse.getEntity().getContent(), charset); Page page = new Page(); @@ -88,7 +104,7 @@ public class HttpClientDownloader implements Downloader { @Override public void setThread(int thread) { - poolSize=thread; + poolSize = thread; } private void handleGzip(HttpResponse httpResponse) { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java index a412f74..c6e2652 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java @@ -50,24 +50,30 @@ public class HttpClientPool { private HttpClient generateClient(Site site) { HttpParams params = new BasicHttpParams(); - params.setParameter(CoreProtocolPNames.USER_AGENT, site.getUserAgent()); + if (site != null && site.getUserAgent() != null) { + params.setParameter(CoreProtocolPNames.USER_AGENT, site.getUserAgent()); + } params.setIntParameter(CoreConnectionPNames.SO_TIMEOUT, 1000); params.setIntParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, 2000); HttpProtocolParamBean paramsBean = new HttpProtocolParamBean(params); paramsBean.setVersion(HttpVersion.HTTP_1_1); - paramsBean.setContentCharset(site.getCharset()); + if (site != null && site.getCharset() != null) { + paramsBean.setContentCharset(site.getCharset()); + } paramsBean.setUseExpectContinue(false); SchemeRegistry schemeRegistry = new SchemeRegistry(); schemeRegistry.register(new Scheme("http", 80, PlainSocketFactory.getSocketFactory())); - schemeRegistry.register(new Scheme("https", 443, SSLSocketFactory.getSocketFactory())); + schemeRegistry.register(new Scheme("https", 443, SSLSocketFactory.getSocketFactory())); PoolingClientConnectionManager connectionManager = new PoolingClientConnectionManager(schemeRegistry); connectionManager.setMaxTotal(poolSize); connectionManager.setDefaultMaxPerRoute(100); DefaultHttpClient httpClient = new DefaultHttpClient(connectionManager, params); - generateCookie(httpClient, site); + if (site != null) { + generateCookie(httpClient, site); + } httpClient.getParams().setIntParameter("http.socket.timeout", 60000); httpClient.getParams().setParameter(ClientPNames.COOKIE_POLICY, CookiePolicy.BEST_MATCH); return httpClient; @@ -75,10 +81,12 @@ public class HttpClientPool { private void generateCookie(DefaultHttpClient httpClient, Site site) { CookieStore cookieStore = new BasicCookieStore(); - for (Map.Entry cookieEntry : site.getCookies().entrySet()) { - BasicClientCookie cookie = new BasicClientCookie(cookieEntry.getKey(), cookieEntry.getValue()); - cookie.setDomain(site.getDomain()); - cookieStore.addCookie(cookie); + if (site.getCookies() != null) { + for (Map.Entry cookieEntry : site.getCookies().entrySet()) { + BasicClientCookie cookie = new BasicClientCookie(cookieEntry.getKey(), cookieEntry.getValue()); + cookie.setDomain(site.getDomain()); + cookieStore.addCookie(cookie); + } } httpClient.setCookieStore(cookieStore); }