add cookie support & add docs

master
yihua.huang 2013-06-18 08:32:11 +08:00
parent 8cef8774cb
commit 0ae7adf324
8 changed files with 97 additions and 23 deletions

View File

@ -10,6 +10,7 @@ import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
/**
* Page
* Author: code4crafter@gmail.com
* Date: 13-4-21
* Time: 11:22
@ -65,7 +66,7 @@ public class Page {
}
}
public void addTargetRequests(String requestString) {
public void addTargetRequest(String requestString) {
if (StringUtils.isBlank(requestString) || requestString.equals("#")) {
return;
}
@ -75,6 +76,12 @@ public class Page {
}
}
/**
* Adds an already constructed request to this page's target requests.
* The add is performed while holding the monitor of the targetRequests collection.
*
* @param request the request to append to targetRequests
*/
public void addTargetRequest(Request request) {
synchronized (targetRequests) {
targetRequests.add(request);
}
}
public Selectable getUrl() {
return url;
}

View File

@ -1,7 +1,22 @@
package us.codecraft.webmagic;
/**
* Request
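* A Request wraps the url of a page that is waiting to be crawled.<br/>
* Inside a PageProcessor, the current Request can be obtained via {@link us.codecraft.webmagic.Page#getRequest()}.<br/>
* A Request also carries an extra array that can hold context to pass along with the url.<br/>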
* <pre>
* Example:
* When crawling <a href="${link}">${linktext}</a>, follow link and keep linktext for later use.
*
* public void process(Page page){
* Request request = new Request(link,linktext);
* page.addTargetRequest(request);
* }
*
* public void process(Page page){
* String linktext = (String)page.getRequest().getExtra()[0];
* }
* </pre>
* Author: code4crafter@gmail.com
* Date: 13-4-21
* Time: 11:37
@ -12,15 +27,28 @@ public class Request {
private Object[] extra;
/**
* Creates a request for the given url, optionally carrying extra values.
*
* @param url url of the request
* @param extra zero or more extra values stored with the request; the varargs array is kept as passed
*/
public Request(String url, Object... extra) {
this.url = url;
this.extra = extra;
}
/**
* Returns the extra values stored with this request.
*
* @return the extra array held by this request (the stored array itself, not a copy)
*/
public Object[] getExtra() {
return extra;
}
/**
* Returns the url of this request.
*
* @return the url this request was created with
*/
public String getUrl() {
return url;
}

View File

@ -1,9 +1,9 @@
package us.codecraft.webmagic;
import java.util.HashSet;
import java.util.Set;
import java.util.*;
/**
* Site
* Author: code4crafter@gmail.com
* Date: 13-4-21
* Time: 12:13
@ -14,11 +14,11 @@ public class Site {
private String userAgent;
private String cookie;
private Map<String,String> cookies = new LinkedHashMap<String, String>();
private String encoding;
private String startUrl;
// Initialised eagerly: setStartUrl(String) appends to this list and getStartUrls() hands it
// to callers that iterate it, so it must never be null.
private List<String> startUrls = new ArrayList<String>();
private int sleepTime = 3000;
@ -34,8 +34,8 @@ public class Site {
return new Site();
}
public Site setCookie(String cookie) {
this.cookie = cookie;
public Site setCookie(String name,String value) {
cookies.put(name,value);
return this;
}
@ -44,8 +44,8 @@ public class Site {
return this;
}
public String getCookie() {
return cookie;
public Map<String,String> getCookies() {
return cookies;
}
public String getUserAgent() {
@ -79,12 +79,12 @@ public class Site {
return this;
}
public String getStartUrl() {
return startUrl;
public List<String> getStartUrls() {
return startUrls;
}
public Site setStartUrl(String startUrl) {
this.startUrl = startUrl;
this.startUrls.add(startUrl);
return this;
}
@ -106,8 +106,8 @@ public class Site {
if (acceptStatCode != null ? !acceptStatCode.equals(site.acceptStatCode) : site.acceptStatCode != null)
return false;
if (cookie != null ? !cookie.equals(site.cookie) : site.cookie != null) return false;
if (!domain.equals(site.domain)) return false;
if (!startUrls.equals(site.startUrls)) return false;
if (encoding != null ? !encoding.equals(site.encoding) : site.encoding != null) return false;
if (userAgent != null ? !userAgent.equals(site.userAgent) : site.userAgent != null) return false;
@ -117,8 +117,8 @@ public class Site {
@Override
public int hashCode() {
int result = domain.hashCode();
result = 31 * result + (startUrls != null ? startUrls.hashCode() : 0);
result = 31 * result + (userAgent != null ? userAgent.hashCode() : 0);
result = 31 * result + (cookie != null ? cookie.hashCode() : 0);
result = 31 * result + (encoding != null ? encoding.hashCode() : 0);
result = 31 * result + (acceptStatCode != null ? acceptStatCode.hashCode() : 0);
return result;

View File

@ -36,7 +36,9 @@ public class Spider implements Runnable {
/**
* Sets the PageProcessor used by this spider and pushes requests for its site's
* start url(s) onto schedular, paired with that site.
* NOTE(review): the loop requires getStartUrls() to return a non-null list.
*
* @param pageProcessor processor whose Site supplies the start url(s)
* @return this spider, so calls can be chained
*/
public Spider processor(PageProcessor pageProcessor) {
this.pageProcessor = pageProcessor;
schedular.push(new Request(pageProcessor.getSite().getStartUrl()), pageProcessor.getSite());
for (String startUrl : pageProcessor.getSite().getStartUrls()) {
schedular.push(new Request(startUrl), pageProcessor.getSite());
}
return this;
}

View File

@ -5,7 +5,7 @@ import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
/**
* Downloader is the interface webmagic uses to download pages.
* The default implementation in webmagic is built on HttpComponents.
* Author: code4crafter@gmail.com
* Date: 13-4-21
* Time: 12:14
@ -13,10 +13,11 @@ import us.codecraft.webmagic.Site;
public interface Downloader {
/**
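* Downloads the page for the given request and returns it as a Page.
*
* @param request the request to download
* @param site the site settings to use for the download
* @return the downloaded page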
*/
public Page download(Request request,Site site);
public Page download(Request request, Site site);
}

View File

@ -1,17 +1,22 @@
package us.codecraft.webmagic.downloader;
import org.apache.http.HttpVersion;
import org.apache.http.client.CookieStore;
import org.apache.http.client.HttpClient;
import org.apache.http.client.params.ClientPNames;
import org.apache.http.client.params.CookiePolicy;
import org.apache.http.conn.scheme.PlainSocketFactory;
import org.apache.http.conn.scheme.Scheme;
import org.apache.http.conn.scheme.SchemeRegistry;
import org.apache.http.impl.client.BasicCookieStore;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.conn.PoolingClientConnectionManager;
import org.apache.http.impl.cookie.BasicClientCookie;
import org.apache.http.params.*;
import us.codecraft.webmagic.Site;
import java.util.Map;
/**
* Author: code4crafter@gmail.com
* Date: 13-4-21
@ -50,15 +55,23 @@ public class HttpClientPool {
schemeRegistry.register(new Scheme("http", 80, PlainSocketFactory.getSocketFactory()));
PoolingClientConnectionManager connectionManager = new PoolingClientConnectionManager(schemeRegistry);
connectionManager.setMaxTotal(100);
connectionManager.setMaxTotal(poolSize);
connectionManager.setDefaultMaxPerRoute(100);
HttpClient httpClient = new DefaultHttpClient(connectionManager, params);
DefaultHttpClient httpClient = new DefaultHttpClient(connectionManager, params);
generateCookie(httpClient, site);
httpClient.getParams().setIntParameter("http.socket.timeout", 60000);
httpClient.getParams().setParameter(ClientPNames.COOKIE_POLICY, CookiePolicy.BEST_MATCH);
return httpClient;
}
public void pushBack(HttpClient httpClient) {
/**
 * Installs a new cookie store on the client, holding one cookie per entry of
 * {@code site.getCookies()}; every cookie gets {@code site.getDomain()} as its domain.
 *
 * @param httpClient client whose cookie store is replaced
 * @param site       site supplying the cookie name/value pairs and the domain
 */
private void generateCookie(DefaultHttpClient httpClient, Site site) {
    final CookieStore store = new BasicCookieStore();
    final Map<String, String> configured = site.getCookies();
    for (String name : configured.keySet()) {
        final BasicClientCookie clientCookie = new BasicClientCookie(name, configured.get(name));
        clientCookie.setDomain(site.getDomain());
        store.addCookie(clientCookie);
    }
    httpClient.setCookieStore(store);
}
}

View File

@ -0,0 +1,23 @@
package us.codecraft.webmagic.downloader;
import org.junit.Assert;
import org.junit.Test;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
/**
 * Checks that a cookie configured on a Site is used when HttpClientDownloader downloads a page.
 * Author: code4crafter@gmail.com
 * Date: 13-6-18
 * Time: 8:22
 */
public class HttpClientDownloaderTest {

    /**
     * Downloads http://www.diandian.com with cookie "t" set on the site and expects
     * the returned html to contain "flashsword30".
     */
    @Test
    public void testCookie() {
        final Site cookieSite = Site.me()
                .setDomain("www.diandian.com")
                .setCookie("t", "yct7q7e6v319wpg4cpxqduu5m77lcgix");
        final HttpClientDownloader downloader = new HttpClientDownloader();
        final Request homePage = new Request("http://www.diandian.com");
        final Page fetched = downloader.download(homePage, cookieSite);
        final String html = fetched.getHtml().toString();
        Assert.assertTrue(html.contains("flashsword30"));
    }
}

View File

@ -14,7 +14,7 @@ public class KaichibaProcessor implements PageProcessor {
/**
* Processes one kaichiba.com shop page: queues the url of the next shop id as a target
* request and stores the "title" and "items" fields on the page.
*
* @param page the page being processed; its url is expected to contain "shop/" followed by digits
*/
public void process(Page page) {
//http://progressdaily.diandian.com/post/2013-01-24/40046867275
// Next shop id: the number read from the page url plus one.
// NOTE(review): r(...) presumably applies the regex and yields the captured digits - confirm.
int i = Integer.valueOf(page.getUrl().r("shop/(\\d+)").toString()) + 1;
page.addTargetRequests("http://kaichiba.com/shop/"+i);
page.addTargetRequest("http://kaichiba.com/shop/" + i);
page.putField("title",page.getHtml().x("//Title"));
// NOTE(review): xs(...) looks like an xpath selection and rp(...) like a regex replace
// (strip leading/trailing whitespace, drop <span> content) - confirm against Selectable.
page.putField("items", page.getHtml().xs("//li[@class=\"foodTitle\"]").rp("^\\s+", "").rp("\\s+$", "").rp("<span>.*?</span>", ""));
}