add response headers to Page #508
parent
ba000b364c
commit
f23e138c72
|
@ -8,6 +8,7 @@ import us.codecraft.webmagic.utils.UrlUtils;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Object storing extracted result and urls to fetch.<br>
|
* Object storing extracted result and urls to fetch.<br>
|
||||||
|
@ -38,6 +39,8 @@ public class Page {
|
||||||
|
|
||||||
private Selectable url;
|
private Selectable url;
|
||||||
|
|
||||||
|
private Map<String,List<String>> headers;
|
||||||
|
|
||||||
private int statusCode;
|
private int statusCode;
|
||||||
|
|
||||||
private boolean needCycleRetry;
|
private boolean needCycleRetry;
|
||||||
|
@ -210,6 +213,14 @@ public class Page {
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Returns the header map currently stored on this page.
 * <p>
 * The internal map is returned directly (not a copy). The field has no
 * initializer, so this is {@code null} until {@code setHeaders} has been called.
 *
 * @return header names mapped to their list of values, or {@code null} if never set
 */
public Map<String, List<String>> getHeaders() {
|
||||||
|
return headers;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Replaces the header map stored on this page.
 * <p>
 * The given map is stored by reference; no defensive copy is made.
 *
 * @param headers header names mapped to their list of values
 */
public void setHeaders(Map<String, List<String>> headers) {
|
||||||
|
this.headers = headers;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String toString() {
|
public String toString() {
|
||||||
return "Page{" +
|
return "Page{" +
|
||||||
|
@ -217,7 +228,9 @@ public class Page {
|
||||||
", resultItems=" + resultItems +
|
", resultItems=" + resultItems +
|
||||||
", rawText='" + rawText + '\'' +
|
", rawText='" + rawText + '\'' +
|
||||||
", url=" + url +
|
", url=" + url +
|
||||||
|
", headers=" + headers +
|
||||||
", statusCode=" + statusCode +
|
", statusCode=" + statusCode +
|
||||||
|
", needCycleRetry=" + needCycleRetry +
|
||||||
", targetRequests=" + targetRequests +
|
", targetRequests=" + targetRequests +
|
||||||
'}';
|
'}';
|
||||||
}
|
}
|
||||||
|
|
|
@ -18,7 +18,6 @@ public class Request implements Serializable {
|
||||||
private static final long serialVersionUID = 2062192774891352043L;
|
private static final long serialVersionUID = 2062192774891352043L;
|
||||||
|
|
||||||
public static final String CYCLE_TRIED_TIMES = "_cycle_tried_times";
|
public static final String CYCLE_TRIED_TIMES = "_cycle_tried_times";
|
||||||
public static final String PROXY = "proxy";
|
|
||||||
|
|
||||||
private String url;
|
private String url;
|
||||||
|
|
||||||
|
|
|
@ -39,16 +39,6 @@ public class Site {
|
||||||
|
|
||||||
private boolean useGzip = true;
|
private boolean useGzip = true;
|
||||||
|
|
||||||
/**
|
|
||||||
* @see us.codecraft.webmagic.utils.HttpConstant.Header
|
|
||||||
* @deprecated
|
|
||||||
*/
|
|
||||||
public static interface HeaderConst {
|
|
||||||
|
|
||||||
public static final String REFERER = "Referer";
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
static {
|
static {
|
||||||
DEFAULT_STATUS_CODE_SET.add(200);
|
DEFAULT_STATUS_CODE_SET.add(200);
|
||||||
}
|
}
|
||||||
|
|
|
@ -23,6 +23,7 @@ import us.codecraft.webmagic.proxy.Proxy;
|
||||||
import us.codecraft.webmagic.proxy.ProxyProvider;
|
import us.codecraft.webmagic.proxy.ProxyProvider;
|
||||||
import us.codecraft.webmagic.selector.PlainText;
|
import us.codecraft.webmagic.selector.PlainText;
|
||||||
import us.codecraft.webmagic.utils.CharsetUtils;
|
import us.codecraft.webmagic.utils.CharsetUtils;
|
||||||
|
import us.codecraft.webmagic.utils.HttpClientUtils;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.nio.charset.Charset;
|
import java.nio.charset.Charset;
|
||||||
|
@ -49,6 +50,8 @@ public class HttpClientDownloader extends AbstractDownloader {
|
||||||
|
|
||||||
private ProxyProvider proxyProvider;
|
private ProxyProvider proxyProvider;
|
||||||
|
|
||||||
|
private boolean responseHeader = true;
|
||||||
|
|
||||||
public void setHttpUriRequestConverter(HttpUriRequestConverter httpUriRequestConverter) {
|
public void setHttpUriRequestConverter(HttpUriRequestConverter httpUriRequestConverter) {
|
||||||
this.httpUriRequestConverter = httpUriRequestConverter;
|
this.httpUriRequestConverter = httpUriRequestConverter;
|
||||||
}
|
}
|
||||||
|
@ -88,13 +91,12 @@ public class HttpClientDownloader extends AbstractDownloader {
|
||||||
HttpContext httpContext = new BasicHttpContext();
|
HttpContext httpContext = new BasicHttpContext();
|
||||||
if (proxyProvider != null) {
|
if (proxyProvider != null) {
|
||||||
proxy = proxyProvider.getProxy(task);
|
proxy = proxyProvider.getProxy(task);
|
||||||
request.putExtra(Request.PROXY, proxy);
|
|
||||||
AuthState authState = new AuthState();
|
AuthState authState = new AuthState();
|
||||||
authState.update(new BasicScheme(), new UsernamePasswordCredentials(proxy.getUsername(), proxy.getPassword()));
|
authState.update(new BasicScheme(), new UsernamePasswordCredentials(proxy.getUsername(), proxy.getPassword()));
|
||||||
httpContext.setAttribute(HttpClientContext.PROXY_AUTH_STATE, authState);
|
httpContext.setAttribute(HttpClientContext.PROXY_AUTH_STATE, authState);
|
||||||
}
|
}
|
||||||
HttpUriRequest httpUriRequest = httpUriRequestConverter.convert(request, site, proxy);
|
|
||||||
CloseableHttpClient httpClient = getHttpClient(site);
|
CloseableHttpClient httpClient = getHttpClient(site);
|
||||||
|
HttpUriRequest httpUriRequest = httpUriRequestConverter.convert(request, site, proxy);
|
||||||
try {
|
try {
|
||||||
httpResponse = httpClient.execute(httpUriRequest, httpContext);
|
httpResponse = httpClient.execute(httpUriRequest, httpContext);
|
||||||
statusCode = httpResponse.getStatusLine().getStatusCode();
|
statusCode = httpResponse.getStatusLine().getStatusCode();
|
||||||
|
@ -133,10 +135,13 @@ public class HttpClientDownloader extends AbstractDownloader {
|
||||||
page.setUrl(new PlainText(request.getUrl()));
|
page.setUrl(new PlainText(request.getUrl()));
|
||||||
page.setRequest(request);
|
page.setRequest(request);
|
||||||
page.setStatusCode(httpResponse.getStatusLine().getStatusCode());
|
page.setStatusCode(httpResponse.getStatusLine().getStatusCode());
|
||||||
|
if (responseHeader) {
|
||||||
|
page.setHeaders(HttpClientUtils.convertHeaders(httpResponse.getAllHeaders()));
|
||||||
|
}
|
||||||
return page;
|
return page;
|
||||||
}
|
}
|
||||||
|
|
||||||
protected String getContent(String charset, HttpResponse httpResponse) throws IOException {
|
private String getContent(String charset, HttpResponse httpResponse) throws IOException {
|
||||||
if (charset == null) {
|
if (charset == null) {
|
||||||
byte[] contentBytes = IOUtils.toByteArray(httpResponse.getEntity().getContent());
|
byte[] contentBytes = IOUtils.toByteArray(httpResponse.getEntity().getContent());
|
||||||
String htmlCharset = getHtmlCharset(httpResponse, contentBytes);
|
String htmlCharset = getHtmlCharset(httpResponse, contentBytes);
|
||||||
|
@ -151,7 +156,7 @@ public class HttpClientDownloader extends AbstractDownloader {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
protected String getHtmlCharset(HttpResponse httpResponse, byte[] contentBytes) throws IOException {
|
private String getHtmlCharset(HttpResponse httpResponse, byte[] contentBytes) throws IOException {
|
||||||
return CharsetUtils.detectCharset(httpResponse.getEntity().getContentType().getValue(), contentBytes);
|
return CharsetUtils.detectCharset(httpResponse.getEntity().getContentType().getValue(), contentBytes);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -39,7 +39,7 @@ public class HttpUriRequestConverter {
|
||||||
requestConfigBuilder.setConnectionRequestTimeout(site.getTimeOut())
|
requestConfigBuilder.setConnectionRequestTimeout(site.getTimeOut())
|
||||||
.setSocketTimeout(site.getTimeOut())
|
.setSocketTimeout(site.getTimeOut())
|
||||||
.setConnectTimeout(site.getTimeOut())
|
.setConnectTimeout(site.getTimeOut())
|
||||||
.setCookieSpec(CookieSpecs.BEST_MATCH);
|
.setCookieSpec(CookieSpecs.STANDARD);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (proxy != null) {
|
if (proxy != null) {
|
||||||
|
|
|
@ -7,7 +7,7 @@ import org.apache.http.HttpResponse;
|
||||||
* Date: 17/3/20
|
* Date: 17/3/20
|
||||||
* Time: 下午10:52
|
* Time: 下午10:52
|
||||||
*/
|
*/
|
||||||
public interface BannedChecker {
|
public interface ResponseChecker {
|
||||||
|
|
||||||
boolean isBanned(HttpResponse httpResponse);
|
boolean isBanned(HttpResponse httpResponse);
|
||||||
}
|
}
|
|
@ -0,0 +1,28 @@
|
||||||
|
package us.codecraft.webmagic.utils;
|
||||||
|
|
||||||
|
import org.apache.http.Header;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author code4crafter@gmail.com
|
||||||
|
* Date: 17/3/27
|
||||||
|
*/
|
||||||
|
public abstract class HttpClientUtils {
|
||||||
|
|
||||||
|
public static Map<String,List<String>> convertHeaders(Header[] headers){
|
||||||
|
Map<String,List<String>> results = new HashMap<String, List<String>>();
|
||||||
|
for (Header header : headers) {
|
||||||
|
List<String> list = results.get(header.getName());
|
||||||
|
if (list == null) {
|
||||||
|
list = new ArrayList<String>();
|
||||||
|
results.put(header.getName(), list);
|
||||||
|
}
|
||||||
|
list.add(header.getValue());
|
||||||
|
}
|
||||||
|
return results;
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue