add cookie support & add docs
parent
8cef8774cb
commit
0ae7adf324
|
@ -10,6 +10,7 @@ import java.util.Map;
|
||||||
import java.util.concurrent.ConcurrentHashMap;
|
import java.util.concurrent.ConcurrentHashMap;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
* Page保存了抓取的结果,并可定义下一次抓取的链接内容。
|
||||||
* Author: code4crafter@gmail.com
|
* Author: code4crafter@gmail.com
|
||||||
* Date: 13-4-21
|
* Date: 13-4-21
|
||||||
* Time: 上午11:22
|
* Time: 上午11:22
|
||||||
|
@ -65,7 +66,7 @@ public class Page {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public void addTargetRequests(String requestString) {
|
public void addTargetRequest(String requestString) {
|
||||||
if (StringUtils.isBlank(requestString) || requestString.equals("#")) {
|
if (StringUtils.isBlank(requestString) || requestString.equals("#")) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
@ -75,6 +76,12 @@ public class Page {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void addTargetRequest(Request request) {
|
||||||
|
synchronized (targetRequests) {
|
||||||
|
targetRequests.add(request);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
public Selectable getUrl() {
|
public Selectable getUrl() {
|
||||||
return url;
|
return url;
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,7 +1,22 @@
|
||||||
package us.codecraft.webmagic;
|
package us.codecraft.webmagic;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Request对象是
|
* Request对象封装了待抓取的url信息。<br/>
|
||||||
|
* 在PageProcessor中,Request对象可以通过{@link us.codecraft.webmagic.Page#getRequest()} 获取。<br/>
|
||||||
|
* Request对象包含一个extra属性,可以写入一些必须的上下文,这个特性在某些场合会有用。<br/>
|
||||||
|
* <pre>
|
||||||
|
* Example:
|
||||||
|
* 抓取<a href="${link}">${linktext}</a>时,希望提取链接link,并保存linktext的信息。
|
||||||
|
* 在上一个页面:
|
||||||
|
* public void process(Page page){
|
||||||
|
* Request request = new Request(link,linktext);
|
||||||
|
* page.addTargetRequest(request);
|
||||||
|
* }
|
||||||
|
* 在下一个页面:
|
||||||
|
* public void process(Page page){
|
||||||
|
* String linktext = (String)page.getRequest().getExtra()[0];
|
||||||
|
* }
|
||||||
|
* </pre>
|
||||||
* Author: code4crafter@gmail.com
|
* Author: code4crafter@gmail.com
|
||||||
* Date: 13-4-21
|
* Date: 13-4-21
|
||||||
* Time: 上午11:37
|
* Time: 上午11:37
|
||||||
|
@ -12,15 +27,28 @@ public class Request {
|
||||||
|
|
||||||
private Object[] extra;
|
private Object[] extra;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 构建一个request对象
|
||||||
|
* @param url 必须参数,待抓取的url
|
||||||
|
* @param extra 额外参数,可以保存一些需要的上下文信息
|
||||||
|
*/
|
||||||
public Request(String url, Object... extra) {
|
public Request(String url, Object... extra) {
|
||||||
this.url = url;
|
this.url = url;
|
||||||
this.extra = extra;
|
this.extra = extra;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 获取预存的对象
|
||||||
|
* @return object[] 预存的对象数组
|
||||||
|
*/
|
||||||
public Object[] getExtra() {
|
public Object[] getExtra() {
|
||||||
return extra;
|
return extra;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 获取待抓取的url
|
||||||
|
* @return url 待抓取的url
|
||||||
|
*/
|
||||||
public String getUrl() {
|
public String getUrl() {
|
||||||
return url;
|
return url;
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,9 +1,9 @@
|
||||||
package us.codecraft.webmagic;
|
package us.codecraft.webmagic;
|
||||||
|
|
||||||
import java.util.HashSet;
|
import java.util.*;
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
* Site定义一个待抓取的站点的各种信息。
|
||||||
* Author: code4crafter@gmail.com
|
* Author: code4crafter@gmail.com
|
||||||
* Date: 13-4-21
|
* Date: 13-4-21
|
||||||
* Time: 下午12:13
|
* Time: 下午12:13
|
||||||
|
@ -14,11 +14,11 @@ public class Site {
|
||||||
|
|
||||||
private String userAgent;
|
private String userAgent;
|
||||||
|
|
||||||
private String cookie;
|
private Map<String,String> cookies = new LinkedHashMap<String, String>();
|
||||||
|
|
||||||
private String encoding;
|
private String encoding;
|
||||||
|
|
||||||
private String startUrl;
|
private List<String> startUrls;
|
||||||
|
|
||||||
private int sleepTime = 3000;
|
private int sleepTime = 3000;
|
||||||
|
|
||||||
|
@ -34,8 +34,8 @@ public class Site {
|
||||||
return new Site();
|
return new Site();
|
||||||
}
|
}
|
||||||
|
|
||||||
public Site setCookie(String cookie) {
|
public Site setCookie(String name,String value) {
|
||||||
this.cookie = cookie;
|
cookies.put(name,value);
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -44,8 +44,8 @@ public class Site {
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getCookie() {
|
public Map<String,String> getCookies() {
|
||||||
return cookie;
|
return cookies;
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getUserAgent() {
|
public String getUserAgent() {
|
||||||
|
@ -79,12 +79,12 @@ public class Site {
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getStartUrl() {
|
public List<String> getStartUrls() {
|
||||||
return startUrl;
|
return startUrls;
|
||||||
}
|
}
|
||||||
|
|
||||||
public Site setStartUrl(String startUrl) {
|
public Site setStartUrl(String startUrl) {
|
||||||
this.startUrl = startUrl;
|
this.startUrls.add(startUrl);
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -106,8 +106,8 @@ public class Site {
|
||||||
|
|
||||||
if (acceptStatCode != null ? !acceptStatCode.equals(site.acceptStatCode) : site.acceptStatCode != null)
|
if (acceptStatCode != null ? !acceptStatCode.equals(site.acceptStatCode) : site.acceptStatCode != null)
|
||||||
return false;
|
return false;
|
||||||
if (cookie != null ? !cookie.equals(site.cookie) : site.cookie != null) return false;
|
|
||||||
if (!domain.equals(site.domain)) return false;
|
if (!domain.equals(site.domain)) return false;
|
||||||
|
if (!startUrls.equals(site.startUrls)) return false;
|
||||||
if (encoding != null ? !encoding.equals(site.encoding) : site.encoding != null) return false;
|
if (encoding != null ? !encoding.equals(site.encoding) : site.encoding != null) return false;
|
||||||
if (userAgent != null ? !userAgent.equals(site.userAgent) : site.userAgent != null) return false;
|
if (userAgent != null ? !userAgent.equals(site.userAgent) : site.userAgent != null) return false;
|
||||||
|
|
||||||
|
@ -117,8 +117,8 @@ public class Site {
|
||||||
@Override
|
@Override
|
||||||
public int hashCode() {
|
public int hashCode() {
|
||||||
int result = domain.hashCode();
|
int result = domain.hashCode();
|
||||||
|
result = 31 * result + (startUrls != null ? startUrls.hashCode() : 0);
|
||||||
result = 31 * result + (userAgent != null ? userAgent.hashCode() : 0);
|
result = 31 * result + (userAgent != null ? userAgent.hashCode() : 0);
|
||||||
result = 31 * result + (cookie != null ? cookie.hashCode() : 0);
|
|
||||||
result = 31 * result + (encoding != null ? encoding.hashCode() : 0);
|
result = 31 * result + (encoding != null ? encoding.hashCode() : 0);
|
||||||
result = 31 * result + (acceptStatCode != null ? acceptStatCode.hashCode() : 0);
|
result = 31 * result + (acceptStatCode != null ? acceptStatCode.hashCode() : 0);
|
||||||
return result;
|
return result;
|
||||||
|
|
|
@ -36,7 +36,9 @@ public class Spider implements Runnable {
|
||||||
|
|
||||||
public Spider processor(PageProcessor pageProcessor) {
|
public Spider processor(PageProcessor pageProcessor) {
|
||||||
this.pageProcessor = pageProcessor;
|
this.pageProcessor = pageProcessor;
|
||||||
schedular.push(new Request(pageProcessor.getSite().getStartUrl()), pageProcessor.getSite());
|
for (String startUrl : pageProcessor.getSite().getStartUrls()) {
|
||||||
|
schedular.push(new Request(startUrl), pageProcessor.getSite());
|
||||||
|
}
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -5,7 +5,7 @@ import us.codecraft.webmagic.Request;
|
||||||
import us.codecraft.webmagic.Site;
|
import us.codecraft.webmagic.Site;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Downloader是webmagic抓取页面的核心接口。
|
* Downloader是webmagic下载页面的接口。webmagic默认使用了Apache HttpComponents(HttpClient)作为下载器,一般情况,你无需自己实现这个接口。
|
||||||
* Author: code4crafter@gmail.com
|
* Author: code4crafter@gmail.com
|
||||||
* Date: 13-4-21
|
* Date: 13-4-21
|
||||||
* Time: 下午12:14
|
* Time: 下午12:14
|
||||||
|
@ -13,10 +13,11 @@ import us.codecraft.webmagic.Site;
|
||||||
public interface Downloader {
|
public interface Downloader {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
* 下载页面,并保存信息到Page对象中。
|
||||||
*
|
*
|
||||||
* @param request
|
* @param request
|
||||||
* @param site
|
* @param site
|
||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
public Page download(Request request,Site site);
|
public Page download(Request request, Site site);
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,17 +1,22 @@
|
||||||
package us.codecraft.webmagic.downloader;
|
package us.codecraft.webmagic.downloader;
|
||||||
|
|
||||||
import org.apache.http.HttpVersion;
|
import org.apache.http.HttpVersion;
|
||||||
|
import org.apache.http.client.CookieStore;
|
||||||
import org.apache.http.client.HttpClient;
|
import org.apache.http.client.HttpClient;
|
||||||
import org.apache.http.client.params.ClientPNames;
|
import org.apache.http.client.params.ClientPNames;
|
||||||
import org.apache.http.client.params.CookiePolicy;
|
import org.apache.http.client.params.CookiePolicy;
|
||||||
import org.apache.http.conn.scheme.PlainSocketFactory;
|
import org.apache.http.conn.scheme.PlainSocketFactory;
|
||||||
import org.apache.http.conn.scheme.Scheme;
|
import org.apache.http.conn.scheme.Scheme;
|
||||||
import org.apache.http.conn.scheme.SchemeRegistry;
|
import org.apache.http.conn.scheme.SchemeRegistry;
|
||||||
|
import org.apache.http.impl.client.BasicCookieStore;
|
||||||
import org.apache.http.impl.client.DefaultHttpClient;
|
import org.apache.http.impl.client.DefaultHttpClient;
|
||||||
import org.apache.http.impl.conn.PoolingClientConnectionManager;
|
import org.apache.http.impl.conn.PoolingClientConnectionManager;
|
||||||
|
import org.apache.http.impl.cookie.BasicClientCookie;
|
||||||
import org.apache.http.params.*;
|
import org.apache.http.params.*;
|
||||||
import us.codecraft.webmagic.Site;
|
import us.codecraft.webmagic.Site;
|
||||||
|
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Author: code4crafter@gmail.com
|
* Author: code4crafter@gmail.com
|
||||||
* Date: 13-4-21
|
* Date: 13-4-21
|
||||||
|
@ -50,15 +55,23 @@ public class HttpClientPool {
|
||||||
schemeRegistry.register(new Scheme("http", 80, PlainSocketFactory.getSocketFactory()));
|
schemeRegistry.register(new Scheme("http", 80, PlainSocketFactory.getSocketFactory()));
|
||||||
|
|
||||||
PoolingClientConnectionManager connectionManager = new PoolingClientConnectionManager(schemeRegistry);
|
PoolingClientConnectionManager connectionManager = new PoolingClientConnectionManager(schemeRegistry);
|
||||||
connectionManager.setMaxTotal(100);
|
connectionManager.setMaxTotal(poolSize);
|
||||||
connectionManager.setDefaultMaxPerRoute(100);
|
connectionManager.setDefaultMaxPerRoute(100);
|
||||||
HttpClient httpClient = new DefaultHttpClient(connectionManager, params);
|
DefaultHttpClient httpClient = new DefaultHttpClient(connectionManager, params);
|
||||||
|
generateCookie(httpClient, site);
|
||||||
httpClient.getParams().setIntParameter("http.socket.timeout", 60000);
|
httpClient.getParams().setIntParameter("http.socket.timeout", 60000);
|
||||||
httpClient.getParams().setParameter(ClientPNames.COOKIE_POLICY, CookiePolicy.BEST_MATCH);
|
httpClient.getParams().setParameter(ClientPNames.COOKIE_POLICY, CookiePolicy.BEST_MATCH);
|
||||||
return httpClient;
|
return httpClient;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void pushBack(HttpClient httpClient) {
|
private void generateCookie(DefaultHttpClient httpClient, Site site) {
|
||||||
|
CookieStore cookieStore = new BasicCookieStore();
|
||||||
|
for (Map.Entry<String, String> cookieEntry : site.getCookies().entrySet()) {
|
||||||
|
BasicClientCookie cookie = new BasicClientCookie(cookieEntry.getKey(), cookieEntry.getValue());
|
||||||
|
cookie.setDomain(site.getDomain());
|
||||||
|
cookieStore.addCookie(cookie);
|
||||||
|
}
|
||||||
|
httpClient.setCookieStore(cookieStore);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,23 @@
|
||||||
|
package us.codecraft.webmagic.downloader;
|
||||||
|
|
||||||
|
import org.junit.Assert;
|
||||||
|
import org.junit.Test;
|
||||||
|
import us.codecraft.webmagic.Page;
|
||||||
|
import us.codecraft.webmagic.Request;
|
||||||
|
import us.codecraft.webmagic.Site;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Author: code4crafter@gmail.com
|
||||||
|
* Date: 13-6-18
|
||||||
|
* Time: 上午8:22
|
||||||
|
*/
|
||||||
|
public class HttpClientDownloaderTest {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testCookie() {
|
||||||
|
Site site = Site.me().setDomain("www.diandian.com").setCookie("t", "yct7q7e6v319wpg4cpxqduu5m77lcgix");
|
||||||
|
HttpClientDownloader httpClientDownloader = new HttpClientDownloader();
|
||||||
|
Page download = httpClientDownloader.download(new Request("http://www.diandian.com"), site);
|
||||||
|
Assert.assertTrue(download.getHtml().toString().contains("flashsword30"));
|
||||||
|
}
|
||||||
|
}
|
|
@ -14,7 +14,7 @@ public class KaichibaProcessor implements PageProcessor {
|
||||||
public void process(Page page) {
|
public void process(Page page) {
|
||||||
//http://progressdaily.diandian.com/post/2013-01-24/40046867275
|
//http://progressdaily.diandian.com/post/2013-01-24/40046867275
|
||||||
int i = Integer.valueOf(page.getUrl().r("shop/(\\d+)").toString()) + 1;
|
int i = Integer.valueOf(page.getUrl().r("shop/(\\d+)").toString()) + 1;
|
||||||
page.addTargetRequests("http://kaichiba.com/shop/"+i);
|
page.addTargetRequest("http://kaichiba.com/shop/" + i);
|
||||||
page.putField("title",page.getHtml().x("//Title"));
|
page.putField("title",page.getHtml().x("//Title"));
|
||||||
page.putField("items", page.getHtml().xs("//li[@class=\"foodTitle\"]").rp("^\\s+", "").rp("\\s+$", "").rp("<span>.*?</span>", ""));
|
page.putField("items", page.getHtml().xs("//li[@class=\"foodTitle\"]").rp("^\\s+", "").rp("\\s+$", "").rp("<span>.*?</span>", ""));
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue