add cookie support & add docs
parent
8cef8774cb
commit
0ae7adf324
|
@ -10,6 +10,7 @@ import java.util.Map;
|
|||
import java.util.concurrent.ConcurrentHashMap;
|
||||
|
||||
/**
|
||||
* Page保存了抓取的结果,并可定义下一次抓取的链接内容。
|
||||
* Author: code4crafter@gmail.com
|
||||
* Date: 13-4-21
|
||||
* Time: 上午11:22
|
||||
|
@ -65,7 +66,7 @@ public class Page {
|
|||
}
|
||||
}
|
||||
|
||||
public void addTargetRequests(String requestString) {
|
||||
public void addTargetRequest(String requestString) {
|
||||
if (StringUtils.isBlank(requestString) || requestString.equals("#")) {
|
||||
return;
|
||||
}
|
||||
|
@ -75,6 +76,12 @@ public class Page {
|
|||
}
|
||||
}
|
||||
|
||||
public void addTargetRequest(Request request) {
|
||||
synchronized (targetRequests) {
|
||||
targetRequests.add(request);
|
||||
}
|
||||
}
|
||||
|
||||
public Selectable getUrl() {
|
||||
return url;
|
||||
}
|
||||
|
|
|
@ -1,7 +1,22 @@
|
|||
package us.codecraft.webmagic;
|
||||
|
||||
/**
|
||||
* Request对象是
|
||||
* Request对象封装了待抓取的url信息。<br/>
|
||||
* 在PageProcessor中,Request对象可以通过{@link us.codecraft.webmagic.Page#getRequest()} 获取。<br/>
|
||||
* Request对象包含一个extra属性,可以写入一些必须的上下文,这个特性在某些场合会有用。<br/>
|
||||
* <pre>
|
||||
* Example:
|
||||
* 抓取<a href="${link}">${linktext}</a>时,希望提取链接link,并保存linktext的信息。
|
||||
* 在上一个页面:
|
||||
* public void process(Page page){
|
||||
* Request request = new Request(link,linktext);
|
||||
* page.addTargetRequest(request);
|
||||
* }
|
||||
* 在下一个页面:
|
||||
* public void process(Page page){
|
||||
* String linktext = (String)page.getRequest().getExtra()[0];
|
||||
* }
|
||||
* </pre>
|
||||
* Author: code4crafter@gmail.com
|
||||
* Date: 13-4-21
|
||||
* Time: 上午11:37
|
||||
|
@ -12,15 +27,28 @@ public class Request {
|
|||
|
||||
private Object[] extra;
|
||||
|
||||
/**
 * Creates a request object.
 *
 * @param url   required; the URL to be fetched
 * @param extra optional extra arguments that can hold any context information needed later
 */
public Request(String url, Object... extra) {
    this.extra = extra;
    this.url = url;
}
|
||||
|
||||
/**
|
||||
* 获取预存的对象
|
||||
* @return object[] 预存的对象数组
|
||||
*/
|
||||
public Object[] getExtra() {
|
||||
return extra;
|
||||
}
|
||||
|
||||
/**
|
||||
* 获取待抓取的url
|
||||
* @return url 待抓取的url
|
||||
*/
|
||||
public String getUrl() {
|
||||
return url;
|
||||
}
|
||||
|
|
|
@ -1,9 +1,9 @@
|
|||
package us.codecraft.webmagic;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Site定义一个待抓取的站点的各种信息。
|
||||
* Author: code4crafter@gmail.com
|
||||
* Date: 13-4-21
|
||||
* Time: 下午12:13
|
||||
|
@ -14,11 +14,11 @@ public class Site {
|
|||
|
||||
private String userAgent;
|
||||
|
||||
private String cookie;
|
||||
private Map<String,String> cookies = new LinkedHashMap<String, String>();
|
||||
|
||||
private String encoding;
|
||||
|
||||
private String startUrl;
|
||||
// Start URLs of the site. Initialised eagerly (like the cookies map) so that
// setStartUrl(...) can append to it and getStartUrls() never hands out null.
private List<String> startUrls = new ArrayList<String>();
|
||||
|
||||
private int sleepTime = 3000;
|
||||
|
||||
|
@ -34,8 +34,8 @@ public class Site {
|
|||
return new Site();
|
||||
}
|
||||
|
||||
public Site setCookie(String cookie) {
|
||||
this.cookie = cookie;
|
||||
public Site setCookie(String name,String value) {
|
||||
cookies.put(name,value);
|
||||
return this;
|
||||
}
|
||||
|
||||
|
@ -44,8 +44,8 @@ public class Site {
|
|||
return this;
|
||||
}
|
||||
|
||||
public String getCookie() {
|
||||
return cookie;
|
||||
public Map<String,String> getCookies() {
|
||||
return cookies;
|
||||
}
|
||||
|
||||
public String getUserAgent() {
|
||||
|
@ -79,12 +79,12 @@ public class Site {
|
|||
return this;
|
||||
}
|
||||
|
||||
public String getStartUrl() {
|
||||
return startUrl;
|
||||
public List<String> getStartUrls() {
|
||||
return startUrls;
|
||||
}
|
||||
|
||||
public Site setStartUrl(String startUrl) {
|
||||
this.startUrl = startUrl;
|
||||
this.startUrls.add(startUrl);
|
||||
return this;
|
||||
}
|
||||
|
||||
|
@ -106,8 +106,8 @@ public class Site {
|
|||
|
||||
if (acceptStatCode != null ? !acceptStatCode.equals(site.acceptStatCode) : site.acceptStatCode != null)
|
||||
return false;
|
||||
if (cookie != null ? !cookie.equals(site.cookie) : site.cookie != null) return false;
|
||||
if (!domain.equals(site.domain)) return false;
|
||||
if (!startUrls.equals(site.startUrls)) return false;
|
||||
if (encoding != null ? !encoding.equals(site.encoding) : site.encoding != null) return false;
|
||||
if (userAgent != null ? !userAgent.equals(site.userAgent) : site.userAgent != null) return false;
|
||||
|
||||
|
@ -117,8 +117,8 @@ public class Site {
|
|||
@Override
|
||||
public int hashCode() {
|
||||
int result = domain.hashCode();
|
||||
result = 31 * result + (startUrls != null ? startUrls.hashCode() : 0);
|
||||
result = 31 * result + (userAgent != null ? userAgent.hashCode() : 0);
|
||||
result = 31 * result + (cookie != null ? cookie.hashCode() : 0);
|
||||
result = 31 * result + (encoding != null ? encoding.hashCode() : 0);
|
||||
result = 31 * result + (acceptStatCode != null ? acceptStatCode.hashCode() : 0);
|
||||
return result;
|
||||
|
|
|
@ -36,7 +36,9 @@ public class Spider implements Runnable {
|
|||
|
||||
public Spider processor(PageProcessor pageProcessor) {
|
||||
this.pageProcessor = pageProcessor;
|
||||
schedular.push(new Request(pageProcessor.getSite().getStartUrl()), pageProcessor.getSite());
|
||||
for (String startUrl : pageProcessor.getSite().getStartUrls()) {
|
||||
schedular.push(new Request(startUrl), pageProcessor.getSite());
|
||||
}
|
||||
return this;
|
||||
}
|
||||
|
||||
|
|
|
@ -5,7 +5,7 @@ import us.codecraft.webmagic.Request;
|
|||
import us.codecraft.webmagic.Site;
|
||||
|
||||
/**
|
||||
* Downloader是webmagic抓取页面的核心接口。
|
||||
* Downloader是webmagic下载页面的接口。webmagic默认使用了HttpComponent作为下载器,一般情况,你无需自己实现这个接口。
|
||||
* Author: code4crafter@gmail.com
|
||||
* Date: 13-4-21
|
||||
* Time: 下午12:14
|
||||
|
@ -13,10 +13,11 @@ import us.codecraft.webmagic.Site;
|
|||
public interface Downloader {
|
||||
|
||||
/**
|
||||
* 下载页面,并保存信息到Page对象中。
|
||||
*
|
||||
* @param request
|
||||
* @param site
|
||||
* @return
|
||||
*/
|
||||
public Page download(Request request,Site site);
|
||||
public Page download(Request request, Site site);
|
||||
}
|
||||
|
|
|
@ -1,17 +1,22 @@
|
|||
package us.codecraft.webmagic.downloader;
|
||||
|
||||
import org.apache.http.HttpVersion;
|
||||
import org.apache.http.client.CookieStore;
|
||||
import org.apache.http.client.HttpClient;
|
||||
import org.apache.http.client.params.ClientPNames;
|
||||
import org.apache.http.client.params.CookiePolicy;
|
||||
import org.apache.http.conn.scheme.PlainSocketFactory;
|
||||
import org.apache.http.conn.scheme.Scheme;
|
||||
import org.apache.http.conn.scheme.SchemeRegistry;
|
||||
import org.apache.http.impl.client.BasicCookieStore;
|
||||
import org.apache.http.impl.client.DefaultHttpClient;
|
||||
import org.apache.http.impl.conn.PoolingClientConnectionManager;
|
||||
import org.apache.http.impl.cookie.BasicClientCookie;
|
||||
import org.apache.http.params.*;
|
||||
import us.codecraft.webmagic.Site;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Author: code4crafter@gmail.com
|
||||
* Date: 13-4-21
|
||||
|
@ -50,15 +55,23 @@ public class HttpClientPool {
|
|||
schemeRegistry.register(new Scheme("http", 80, PlainSocketFactory.getSocketFactory()));
|
||||
|
||||
PoolingClientConnectionManager connectionManager = new PoolingClientConnectionManager(schemeRegistry);
|
||||
connectionManager.setMaxTotal(100);
|
||||
connectionManager.setMaxTotal(poolSize);
|
||||
connectionManager.setDefaultMaxPerRoute(100);
|
||||
HttpClient httpClient = new DefaultHttpClient(connectionManager, params);
|
||||
DefaultHttpClient httpClient = new DefaultHttpClient(connectionManager, params);
|
||||
generateCookie(httpClient, site);
|
||||
httpClient.getParams().setIntParameter("http.socket.timeout", 60000);
|
||||
httpClient.getParams().setParameter(ClientPNames.COOKIE_POLICY, CookiePolicy.BEST_MATCH);
|
||||
return httpClient;
|
||||
}
|
||||
|
||||
public void pushBack(HttpClient httpClient) {
|
||||
|
||||
private void generateCookie(DefaultHttpClient httpClient, Site site) {
|
||||
CookieStore cookieStore = new BasicCookieStore();
|
||||
for (Map.Entry<String, String> cookieEntry : site.getCookies().entrySet()) {
|
||||
BasicClientCookie cookie = new BasicClientCookie(cookieEntry.getKey(), cookieEntry.getValue());
|
||||
cookie.setDomain(site.getDomain());
|
||||
cookieStore.addCookie(cookie);
|
||||
}
|
||||
httpClient.setCookieStore(cookieStore);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -0,0 +1,23 @@
|
|||
package us.codecraft.webmagic.downloader;
|
||||
|
||||
import org.junit.Assert;
|
||||
import org.junit.Test;
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.Request;
|
||||
import us.codecraft.webmagic.Site;
|
||||
|
||||
/**
|
||||
* Author: code4crafer@gmail.com
|
||||
* Date: 13-6-18
|
||||
* Time: 上午8:22
|
||||
*/
|
||||
public class HttpClientDownloaderTest {
|
||||
|
||||
@Test
|
||||
public void testCookie() {
|
||||
Site site = Site.me().setDomain("www.diandian.com").setCookie("t", "yct7q7e6v319wpg4cpxqduu5m77lcgix");
|
||||
HttpClientDownloader httpClientDownloader = new HttpClientDownloader();
|
||||
Page download = httpClientDownloader.download(new Request("http://www.diandian.com"), site);
|
||||
Assert.assertTrue(download.getHtml().toString().contains("flashsword30"));
|
||||
}
|
||||
}
|
|
@ -14,7 +14,7 @@ public class KaichibaProcessor implements PageProcessor {
|
|||
public void process(Page page) {
|
||||
//http://progressdaily.diandian.com/post/2013-01-24/40046867275
|
||||
int i = Integer.valueOf(page.getUrl().r("shop/(\\d+)").toString()) + 1;
|
||||
page.addTargetRequests("http://kaichiba.com/shop/"+i);
|
||||
page.addTargetRequest("http://kaichiba.com/shop/" + i);
|
||||
page.putField("title",page.getHtml().x("//Title"));
|
||||
page.putField("items", page.getHtml().xs("//li[@class=\"foodTitle\"]").rp("^\\s+", "").rp("\\s+$", "").rp("<span>.*?</span>", ""));
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue