diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java
index 63b1b53..eee1a8a 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java
@@ -10,6 +10,7 @@ import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
/**
+ * Page保存了抓取的结果,并可定义下一次抓取的链接内容。
* Author: code4crafter@gmail.com
* Date: 13-4-21
* Time: 上午11:22
@@ -65,7 +66,7 @@ public class Page {
}
}
- public void addTargetRequests(String requestString) {
+ public void addTargetRequest(String requestString) {
if (StringUtils.isBlank(requestString) || requestString.equals("#")) {
return;
}
@@ -75,6 +76,12 @@ public class Page {
}
}
+ /**
+ * Adds the given request to targetRequests. The insertion is performed while holding
+ * the monitor of targetRequests.
+ *
+ * @param request request to add
+ */
+ public void addTargetRequest(Request request) {
+ synchronized (targetRequests) {
+ targetRequests.add(request);
+ }
+ }
+
+ /** Returns the url of this page as a Selectable. */
public Selectable getUrl() {
return url;
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java
index ce7870b..ecb8b4e 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java
@@ -1,7 +1,22 @@
package us.codecraft.webmagic;
/**
- * Request对象是
+ * Request对象封装了待抓取的url信息。
+ * 在PageProcessor中,Request对象可以通过{@link us.codecraft.webmagic.Page#getRequest()} 获取。
+ * Request对象包含一个extra属性,可以写入一些必须的上下文,这个特性在某些场合会有用。
+ *
+ * Example:
+ * 抓取<a href="${link}">${linktext}</a>时,希望提取链接link,并保存linktext的信息。
+ * 在上一个页面:
+ * public void process(Page page){
+ * Request request = new Request(link,linktext);
+ * page.addTargetRequest(request);
+ * }
+ * 在下一个页面:
+ * public void process(Page page){
+ * String linktext = (String)page.getRequest().getExtra()[0];
+ * }
+ *
* Author: code4crafter@gmail.com
* Date: 13-4-21
* Time: 上午11:37
@@ -12,15 +27,28 @@ public class Request {
private Object[] extra;
+ /**
+ * Creates a request object.
+ * @param url required; the url to be crawled
+ * @param extra additional arguments that can hold any context information needed later
+ */
public Request(String url, Object... extra) {
this.url = url;
this.extra = extra;
}
+ /**
+ * Returns the objects stored when the request was created.
+ * @return object[] the array of stored objects (the internal array itself, not a copy)
+ */
public Object[] getExtra() {
return extra;
}
+ /**
+ * Returns the url to be crawled.
+ * @return url the url to be crawled
+ */
public String getUrl() {
return url;
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java
index 05117f0..413d8d8 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java
@@ -1,9 +1,9 @@
package us.codecraft.webmagic;
-import java.util.HashSet;
-import java.util.Set;
+import java.util.*;
/**
+ * Site定义一个待抓取的站点的各种信息。
* Author: code4crafter@gmail.com
* Date: 13-4-21
* Time: 下午12:13
@@ -14,11 +14,11 @@ public class Site {
private String userAgent;
- private String cookie;
+ // Cookie name -> value pairs for this site, kept in insertion order.
+ private Map<String, String> cookies = new LinkedHashMap<String, String>();
private String encoding;
- private String startUrl;
+ // Start urls of the crawl. Initialized eagerly so that setStartUrl() can append to it
+ // and callers can iterate getStartUrls() even when no start url was configured.
+ private List<String> startUrls = new ArrayList<String>();
private int sleepTime = 3000;
@@ -34,8 +34,8 @@ public class Site {
return new Site();
}
- public Site setCookie(String cookie) {
- this.cookie = cookie;
+ /**
+ * Stores a cookie for this site. A later call with the same name replaces the
+ * previously stored value.
+ *
+ * @param name cookie name
+ * @param value cookie value
+ * @return this site, so calls can be chained
+ */
+ public Site setCookie(String name,String value) {
+ cookies.put(name,value);
return this;
}
@@ -44,8 +44,8 @@ public class Site {
return this;
}
- public String getCookie() {
- return cookie;
+ /**
+ * Returns the cookies configured through setCookie.
+ *
+ * @return the internal cookie map (name -> value); it is not a copy, so changes made
+ * through the returned map are visible to this site
+ */
+ public Map<String, String> getCookies() {
+ return cookies;
}
public String getUserAgent() {
@@ -79,12 +79,12 @@ public class Site {
return this;
}
- public String getStartUrl() {
- return startUrl;
+ /**
+ * Returns the start urls of this site.
+ *
+ * @return the internal list of start urls; it is not a copy
+ */
+ public List<String> getStartUrls() {
+ return startUrls;
}
+ /**
+ * Appends a start url. Despite the "set" name, every call adds one more entry to the
+ * list returned by getStartUrls(); earlier entries are kept.
+ *
+ * @param startUrl url to add to the start urls
+ * @return this site, so calls can be chained
+ */
public Site setStartUrl(String startUrl) {
- this.startUrl = startUrl;
+ this.startUrls.add(startUrl);
return this;
}
@@ -106,8 +106,8 @@ public class Site {
if (acceptStatCode != null ? !acceptStatCode.equals(site.acceptStatCode) : site.acceptStatCode != null)
return false;
- if (cookie != null ? !cookie.equals(site.cookie) : site.cookie != null) return false;
if (!domain.equals(site.domain)) return false;
+ if (!startUrls.equals(site.startUrls)) return false;
if (encoding != null ? !encoding.equals(site.encoding) : site.encoding != null) return false;
if (userAgent != null ? !userAgent.equals(site.userAgent) : site.userAgent != null) return false;
@@ -117,8 +117,8 @@ public class Site {
@Override
public int hashCode() {
int result = domain.hashCode();
+ result = 31 * result + (startUrls != null ? startUrls.hashCode() : 0);
result = 31 * result + (userAgent != null ? userAgent.hashCode() : 0);
- result = 31 * result + (cookie != null ? cookie.hashCode() : 0);
result = 31 * result + (encoding != null ? encoding.hashCode() : 0);
result = 31 * result + (acceptStatCode != null ? acceptStatCode.hashCode() : 0);
return result;
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
index bbab1a5..180d752 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
@@ -36,7 +36,9 @@ public class Spider implements Runnable {
+ /**
+ * Sets the page processor and pushes one Request per start url of the processor's Site
+ * onto the scheduler.
+ *
+ * @param pageProcessor processor whose Site supplies the start urls
+ * @return this spider, so calls can be chained
+ */
public Spider processor(PageProcessor pageProcessor) {
this.pageProcessor = pageProcessor;
- schedular.push(new Request(pageProcessor.getSite().getStartUrl()), pageProcessor.getSite());
+ for (String startUrl : pageProcessor.getSite().getStartUrls()) {
+ schedular.push(new Request(startUrl), pageProcessor.getSite());
+ }
return this;
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java
index 2711ba4..f276fde 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java
@@ -5,7 +5,7 @@ import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
/**
- * Downloader是webmagic抓取页面的核心接口。
+ * Downloader是webmagic下载页面的接口。webmagic默认使用了HttpComponent作为下载器,一般情况,你无需自己实现这个接口。
* Author: code4crafter@gmail.com
* Date: 13-4-21
* Time: 下午12:14
@@ -13,10 +13,11 @@ import us.codecraft.webmagic.Site;
public interface Downloader {
/**
+ * Downloads the page and stores the resulting information in a Page object.
*
* @param request
* @param site
* @return
*/
- public Page download(Request request,Site site);
+ public Page download(Request request, Site site);
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java
index 066a24e..891ff18 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java
@@ -1,17 +1,22 @@
package us.codecraft.webmagic.downloader;
import org.apache.http.HttpVersion;
+import org.apache.http.client.CookieStore;
import org.apache.http.client.HttpClient;
import org.apache.http.client.params.ClientPNames;
import org.apache.http.client.params.CookiePolicy;
import org.apache.http.conn.scheme.PlainSocketFactory;
import org.apache.http.conn.scheme.Scheme;
import org.apache.http.conn.scheme.SchemeRegistry;
+import org.apache.http.impl.client.BasicCookieStore;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.conn.PoolingClientConnectionManager;
+import org.apache.http.impl.cookie.BasicClientCookie;
import org.apache.http.params.*;
import us.codecraft.webmagic.Site;
+import java.util.Map;
+
/**
* Author: code4crafter@gmail.com
* Date: 13-4-21
@@ -50,15 +55,23 @@ public class HttpClientPool {
schemeRegistry.register(new Scheme("http", 80, PlainSocketFactory.getSocketFactory()));
PoolingClientConnectionManager connectionManager = new PoolingClientConnectionManager(schemeRegistry);
- connectionManager.setMaxTotal(100);
+ connectionManager.setMaxTotal(poolSize);
connectionManager.setDefaultMaxPerRoute(100);
- HttpClient httpClient = new DefaultHttpClient(connectionManager, params);
+ DefaultHttpClient httpClient = new DefaultHttpClient(connectionManager, params);
+ generateCookie(httpClient, site);
httpClient.getParams().setIntParameter("http.socket.timeout", 60000);
httpClient.getParams().setParameter(ClientPNames.COOKIE_POLICY, CookiePolicy.BEST_MATCH);
return httpClient;
}
- public void pushBack(HttpClient httpClient) {
-
+ /**
+ * Builds a fresh cookie store from the cookies configured on the site, assigns the
+ * site's domain to each cookie, and installs the store on the given client.
+ *
+ * @param httpClient client that receives the new cookie store
+ * @param site source of the cookie name/value pairs and of the cookie domain
+ */
+ private void generateCookie(DefaultHttpClient httpClient, Site site) {
+ CookieStore cookieStore = new BasicCookieStore();
+ // Typed entries are required: BasicClientCookie takes (String name, String value).
+ for (Map.Entry<String, String> cookieEntry : site.getCookies().entrySet()) {
+ BasicClientCookie cookie = new BasicClientCookie(cookieEntry.getKey(), cookieEntry.getValue());
+ cookie.setDomain(site.getDomain());
+ cookieStore.addCookie(cookie);
+ }
+ httpClient.setCookieStore(cookieStore);
}
+
}
diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java
new file mode 100644
index 0000000..7f00e17
--- /dev/null
+++ b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java
@@ -0,0 +1,23 @@
+package us.codecraft.webmagic.downloader;
+
+import org.junit.Assert;
+import org.junit.Test;
+import us.codecraft.webmagic.Page;
+import us.codecraft.webmagic.Request;
+import us.codecraft.webmagic.Site;
+
+/**
+ * Test of cookie handling in HttpClientDownloader.
+ * Author: code4crafter@gmail.com
+ * Date: 13-6-18
+ * Time: 8:22 AM
+ */
+public class HttpClientDownloaderTest {
+
+ // NOTE(review): this test appears to request the live site www.diandian.com and relies on
+ // a hard-coded cookie value and on the response containing "flashsword30"; it will
+ // presumably fail offline or once that cookie stops being valid - confirm before
+ // running it in CI.
+ @Test
+ public void testCookie() {
+ Site site = Site.me().setDomain("www.diandian.com").setCookie("t", "yct7q7e6v319wpg4cpxqduu5m77lcgix");
+ HttpClientDownloader httpClientDownloader = new HttpClientDownloader();
+ Page download = httpClientDownloader.download(new Request("http://www.diandian.com"), site);
+ Assert.assertTrue(download.getHtml().toString().contains("flashsword30"));
+ }
+}
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java
index 866c903..f2d4050 100644
--- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java
@@ -14,7 +14,7 @@ public class KaichibaProcessor implements PageProcessor {
public void process(Page page) {
//http://progressdaily.diandian.com/post/2013-01-24/40046867275
int i = Integer.valueOf(page.getUrl().r("shop/(\\d+)").toString()) + 1;
- page.addTargetRequests("http://kaichiba.com/shop/"+i);
+ // Queue the shop page whose id is one greater than the id matched in the current url.
+ page.addTargetRequest("http://kaichiba.com/shop/" + i);
page.putField("title",page.getHtml().x("//Title"));
page.putField("items", page.getHtml().xs("//li[@class=\"foodTitle\"]").rp("^\\s+", "").rp("\\s+$", "").rp(".*?", ""));
}