add response headers to Page #508
parent
ba000b364c
commit
f23e138c72
|
@ -8,6 +8,7 @@ import us.codecraft.webmagic.utils.UrlUtils;
|
|||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Object storing extracted result and urls to fetch.<br>
|
||||
|
@ -38,6 +39,8 @@ public class Page {
|
|||
|
||||
private Selectable url;
|
||||
|
||||
private Map<String,List<String>> headers;
|
||||
|
||||
private int statusCode;
|
||||
|
||||
private boolean needCycleRetry;
|
||||
|
@ -210,6 +213,14 @@ public class Page {
|
|||
return this;
|
||||
}
|
||||
|
||||
public Map<String, List<String>> getHeaders() {
|
||||
return headers;
|
||||
}
|
||||
|
||||
public void setHeaders(Map<String, List<String>> headers) {
|
||||
this.headers = headers;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "Page{" +
|
||||
|
@ -217,7 +228,9 @@ public class Page {
|
|||
", resultItems=" + resultItems +
|
||||
", rawText='" + rawText + '\'' +
|
||||
", url=" + url +
|
||||
", headers=" + headers +
|
||||
", statusCode=" + statusCode +
|
||||
", needCycleRetry=" + needCycleRetry +
|
||||
", targetRequests=" + targetRequests +
|
||||
'}';
|
||||
}
|
||||
|
|
|
@ -18,7 +18,6 @@ public class Request implements Serializable {
|
|||
private static final long serialVersionUID = 2062192774891352043L;
|
||||
|
||||
public static final String CYCLE_TRIED_TIMES = "_cycle_tried_times";
|
||||
public static final String PROXY = "proxy";
|
||||
|
||||
private String url;
|
||||
|
||||
|
|
|
@ -39,16 +39,6 @@ public class Site {
|
|||
|
||||
private boolean useGzip = true;
|
||||
|
||||
/**
|
||||
* @see us.codecraft.webmagic.utils.HttpConstant.Header
|
||||
* @deprecated
|
||||
*/
|
||||
public static interface HeaderConst {
|
||||
|
||||
public static final String REFERER = "Referer";
|
||||
}
|
||||
|
||||
|
||||
static {
|
||||
DEFAULT_STATUS_CODE_SET.add(200);
|
||||
}
|
||||
|
|
|
@ -23,6 +23,7 @@ import us.codecraft.webmagic.proxy.Proxy;
|
|||
import us.codecraft.webmagic.proxy.ProxyProvider;
|
||||
import us.codecraft.webmagic.selector.PlainText;
|
||||
import us.codecraft.webmagic.utils.CharsetUtils;
|
||||
import us.codecraft.webmagic.utils.HttpClientUtils;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.charset.Charset;
|
||||
|
@ -49,6 +50,8 @@ public class HttpClientDownloader extends AbstractDownloader {
|
|||
|
||||
private ProxyProvider proxyProvider;
|
||||
|
||||
private boolean responseHeader = true;
|
||||
|
||||
public void setHttpUriRequestConverter(HttpUriRequestConverter httpUriRequestConverter) {
|
||||
this.httpUriRequestConverter = httpUriRequestConverter;
|
||||
}
|
||||
|
@ -88,13 +91,12 @@ public class HttpClientDownloader extends AbstractDownloader {
|
|||
HttpContext httpContext = new BasicHttpContext();
|
||||
if (proxyProvider != null) {
|
||||
proxy = proxyProvider.getProxy(task);
|
||||
request.putExtra(Request.PROXY, proxy);
|
||||
AuthState authState = new AuthState();
|
||||
authState.update(new BasicScheme(), new UsernamePasswordCredentials(proxy.getUsername(), proxy.getPassword()));
|
||||
httpContext.setAttribute(HttpClientContext.PROXY_AUTH_STATE, authState);
|
||||
}
|
||||
HttpUriRequest httpUriRequest = httpUriRequestConverter.convert(request, site, proxy);
|
||||
CloseableHttpClient httpClient = getHttpClient(site);
|
||||
HttpUriRequest httpUriRequest = httpUriRequestConverter.convert(request, site, proxy);
|
||||
try {
|
||||
httpResponse = httpClient.execute(httpUriRequest, httpContext);
|
||||
statusCode = httpResponse.getStatusLine().getStatusCode();
|
||||
|
@ -133,10 +135,13 @@ public class HttpClientDownloader extends AbstractDownloader {
|
|||
page.setUrl(new PlainText(request.getUrl()));
|
||||
page.setRequest(request);
|
||||
page.setStatusCode(httpResponse.getStatusLine().getStatusCode());
|
||||
if (responseHeader) {
|
||||
page.setHeaders(HttpClientUtils.convertHeaders(httpResponse.getAllHeaders()));
|
||||
}
|
||||
return page;
|
||||
}
|
||||
|
||||
protected String getContent(String charset, HttpResponse httpResponse) throws IOException {
|
||||
private String getContent(String charset, HttpResponse httpResponse) throws IOException {
|
||||
if (charset == null) {
|
||||
byte[] contentBytes = IOUtils.toByteArray(httpResponse.getEntity().getContent());
|
||||
String htmlCharset = getHtmlCharset(httpResponse, contentBytes);
|
||||
|
@ -151,7 +156,7 @@ public class HttpClientDownloader extends AbstractDownloader {
|
|||
}
|
||||
}
|
||||
|
||||
protected String getHtmlCharset(HttpResponse httpResponse, byte[] contentBytes) throws IOException {
|
||||
private String getHtmlCharset(HttpResponse httpResponse, byte[] contentBytes) throws IOException {
|
||||
return CharsetUtils.detectCharset(httpResponse.getEntity().getContentType().getValue(), contentBytes);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -39,7 +39,7 @@ public class HttpUriRequestConverter {
|
|||
requestConfigBuilder.setConnectionRequestTimeout(site.getTimeOut())
|
||||
.setSocketTimeout(site.getTimeOut())
|
||||
.setConnectTimeout(site.getTimeOut())
|
||||
.setCookieSpec(CookieSpecs.BEST_MATCH);
|
||||
.setCookieSpec(CookieSpecs.STANDARD);
|
||||
}
|
||||
|
||||
if (proxy != null) {
|
||||
|
|
|
@ -7,7 +7,7 @@ import org.apache.http.HttpResponse;
|
|||
* Date: 17/3/20
|
||||
* Time: 下午10:52
|
||||
*/
|
||||
public interface BannedChecker {
|
||||
public interface ResponseChecker {
|
||||
|
||||
boolean isBanned(HttpResponse httpResponse);
|
||||
}
|
|
@ -0,0 +1,28 @@
|
|||
package us.codecraft.webmagic.utils;
|
||||
|
||||
import org.apache.http.Header;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com
|
||||
* Date: 17/3/27
|
||||
*/
|
||||
public abstract class HttpClientUtils {
|
||||
|
||||
public static Map<String,List<String>> convertHeaders(Header[] headers){
|
||||
Map<String,List<String>> results = new HashMap<String, List<String>>();
|
||||
for (Header header : headers) {
|
||||
List<String> list = results.get(header.getName());
|
||||
if (list == null) {
|
||||
list = new ArrayList<String>();
|
||||
results.put(header.getName(), list);
|
||||
}
|
||||
list.add(header.getValue());
|
||||
}
|
||||
return results;
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue