diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java index 63b1b53..eee1a8a 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java @@ -10,6 +10,7 @@ import java.util.Map; import java.util.concurrent.ConcurrentHashMap; /** + * Page保存了抓取的结果,并可定义下一次抓取的链接内容。 * Author: code4crafter@gmail.com * Date: 13-4-21 * Time: 上午11:22 @@ -65,7 +66,7 @@ public class Page { } } - public void addTargetRequests(String requestString) { + public void addTargetRequest(String requestString) { if (StringUtils.isBlank(requestString) || requestString.equals("#")) { return; } @@ -75,6 +76,12 @@ public class Page { } } + public void addTargetRequest(Request request) { + synchronized (targetRequests) { + targetRequests.add(request); + } + } + public Selectable getUrl() { return url; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java index ce7870b..ecb8b4e 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java @@ -1,7 +1,22 @@ package us.codecraft.webmagic; /** - * Request对象是 + * Request对象封装了待抓取的url信息。
+ * 在PageProcessor中,Request对象可以通过{@link us.codecraft.webmagic.Page#getRequest()} 获取。
+ * Request对象包含一个extra属性,可以写入一些必须的上下文,这个特性在某些场合会有用。
+ * <pre>
+ *      Example:
+ *          抓取&lt;a href="${link}"&gt;${linktext}&lt;/a&gt;时,希望提取链接link,并保存linktext的信息。
+ *      在上一个页面:
+ *      public void process(Page page){
+ *          Request request = new Request(link,linktext);
+ *          page.addTargetRequest(request);
+ *      }
+ *      在下一个页面:
+ *      public void process(Page page){
+ *          String linktext = (String)page.getRequest().getExtra()[0];
+ *      }
+ * </pre>
* Author: code4crafter@gmail.com * Date: 13-4-21 * Time: 上午11:37 @@ -12,15 +27,28 @@ public class Request { private Object[] extra; + /** + * Creates a request for the given url. + * @param url required; the url to be crawled + * @param extra optional extra objects that carry context needed later + */ public Request(String url, Object... extra) { this.url = url; this.extra = extra; } + /** + * Returns the objects stored with this request. + * @return object[] the array of stored objects + */ public Object[] getExtra() { return extra; } + /** + * Returns the url to be crawled. + * @return url the url to be crawled + */ public String getUrl() { return url; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java index 05117f0..413d8d8 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java @@ -1,9 +1,9 @@ package us.codecraft.webmagic; -import java.util.HashSet; -import java.util.Set; +import java.util.*; /** + * Site describes the settings of a site to be crawled. * Author: code4crafter@gmail.com * Date: 13-4-21 * Time: 下午12:13 @@ -14,11 +14,11 @@ public class Site { private String userAgent; - private String cookie; + private Map<String, String> cookies = new LinkedHashMap<String, String>(); private String encoding; - private String startUrl; + private List<String> startUrls = new ArrayList<String>(); /* created eagerly: setStartUrl() appends to it */ private int sleepTime = 3000; @@ -34,8 +34,8 @@ public class Site { return new Site(); } - public Site setCookie(String cookie) { - this.cookie = cookie; + public Site setCookie(String name,String value) { + cookies.put(name,value); return this; } @@ -44,8 +44,8 @@ public class Site { return this; } - public String getCookie() { - return cookie; + public Map<String, String> getCookies() { + return cookies; } public String getUserAgent() { @@ -79,12 +79,12 @@ public class Site { return this; } - public String getStartUrl() { - return startUrl; + public List<String> getStartUrls() { + return startUrls; } public Site setStartUrl(String startUrl) { - this.startUrl = startUrl; + this.startUrls.add(startUrl); return this; } @@ -106,8 +106,8 @@ public class Site { if (acceptStatCode != null ? 
!acceptStatCode.equals(site.acceptStatCode) : site.acceptStatCode != null) return false; - if (cookie != null ? !cookie.equals(site.cookie) : site.cookie != null) return false; if (!domain.equals(site.domain)) return false; + if (!startUrls.equals(site.startUrls)) return false; if (encoding != null ? !encoding.equals(site.encoding) : site.encoding != null) return false; if (userAgent != null ? !userAgent.equals(site.userAgent) : site.userAgent != null) return false; @@ -117,8 +117,8 @@ public class Site { @Override public int hashCode() { int result = domain.hashCode(); + result = 31 * result + (startUrls != null ? startUrls.hashCode() : 0); result = 31 * result + (userAgent != null ? userAgent.hashCode() : 0); - result = 31 * result + (cookie != null ? cookie.hashCode() : 0); result = 31 * result + (encoding != null ? encoding.hashCode() : 0); result = 31 * result + (acceptStatCode != null ? acceptStatCode.hashCode() : 0); return result; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index bbab1a5..180d752 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -36,7 +36,9 @@ public class Spider implements Runnable { public Spider processor(PageProcessor pageProcessor) { this.pageProcessor = pageProcessor; - schedular.push(new Request(pageProcessor.getSite().getStartUrl()), pageProcessor.getSite()); + for (String startUrl : pageProcessor.getSite().getStartUrls()) { + schedular.push(new Request(startUrl), pageProcessor.getSite()); + } return this; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java index 2711ba4..f276fde 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java +++ 
b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java @@ -5,7 +5,7 @@ import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; /** - * Downloader是webmagic抓取页面的核心接口。 + * Downloader是webmagic下载页面的接口。webmagic默认使用了HttpComponent作为下载器,一般情况,你无需自己实现这个接口。 * Author: code4crafter@gmail.com * Date: 13-4-21 * Time: 下午12:14 @@ -13,10 +13,11 @@ import us.codecraft.webmagic.Site; public interface Downloader { /** + * 下载页面,并保存信息到Page对象中。 * * @param request * @param site * @return */ - public Page download(Request request,Site site); + public Page download(Request request, Site site); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java index 066a24e..891ff18 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java @@ -1,17 +1,22 @@ package us.codecraft.webmagic.downloader; import org.apache.http.HttpVersion; +import org.apache.http.client.CookieStore; import org.apache.http.client.HttpClient; import org.apache.http.client.params.ClientPNames; import org.apache.http.client.params.CookiePolicy; import org.apache.http.conn.scheme.PlainSocketFactory; import org.apache.http.conn.scheme.Scheme; import org.apache.http.conn.scheme.SchemeRegistry; +import org.apache.http.impl.client.BasicCookieStore; import org.apache.http.impl.client.DefaultHttpClient; import org.apache.http.impl.conn.PoolingClientConnectionManager; +import org.apache.http.impl.cookie.BasicClientCookie; import org.apache.http.params.*; import us.codecraft.webmagic.Site; +import java.util.Map; + /** * Author: code4crafter@gmail.com * Date: 13-4-21 @@ -50,15 +55,23 @@ public class HttpClientPool { schemeRegistry.register(new Scheme("http", 80, PlainSocketFactory.getSocketFactory())); PoolingClientConnectionManager connectionManager = new 
PoolingClientConnectionManager(schemeRegistry); - connectionManager.setMaxTotal(100); + connectionManager.setMaxTotal(poolSize); connectionManager.setDefaultMaxPerRoute(100); - HttpClient httpClient = new DefaultHttpClient(connectionManager, params); + DefaultHttpClient httpClient = new DefaultHttpClient(connectionManager, params); + generateCookie(httpClient, site); httpClient.getParams().setIntParameter("http.socket.timeout", 60000); httpClient.getParams().setParameter(ClientPNames.COOKIE_POLICY, CookiePolicy.BEST_MATCH); return httpClient; } - public void pushBack(HttpClient httpClient) { - + private void generateCookie(DefaultHttpClient httpClient, Site site) { + CookieStore cookieStore = new BasicCookieStore(); + for (Map.Entry cookieEntry : site.getCookies().entrySet()) { + BasicClientCookie cookie = new BasicClientCookie(cookieEntry.getKey(), cookieEntry.getValue()); + cookie.setDomain(site.getDomain()); + cookieStore.addCookie(cookie); + } + httpClient.setCookieStore(cookieStore); } + } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java new file mode 100644 index 0000000..7f00e17 --- /dev/null +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java @@ -0,0 +1,23 @@ +package us.codecraft.webmagic.downloader; + +import org.junit.Assert; +import org.junit.Test; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Site; + +/** + * Author: code4crafer@gmail.com + * Date: 13-6-18 + * Time: 上午8:22 + */ +public class HttpClientDownloaderTest { + + @Test + public void testCookie() { + Site site = Site.me().setDomain("www.diandian.com").setCookie("t", "yct7q7e6v319wpg4cpxqduu5m77lcgix"); + HttpClientDownloader httpClientDownloader = new HttpClientDownloader(); + Page download = httpClientDownloader.download(new 
Request("http://www.diandian.com"), site); + Assert.assertTrue(download.getHtml().toString().contains("flashsword30")); + } +} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java index 866c903..f2d4050 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java @@ -14,7 +14,7 @@ public class KaichibaProcessor implements PageProcessor { public void process(Page page) { //http://progressdaily.diandian.com/post/2013-01-24/40046867275 int i = Integer.valueOf(page.getUrl().r("shop/(\\d+)").toString()) + 1; - page.addTargetRequests("http://kaichiba.com/shop/"+i); + page.addTargetRequest("http://kaichiba.com/shop/" + i); page.putField("title",page.getHtml().x("//Title")); page.putField("items", page.getHtml().xs("//li[@class=\"foodTitle\"]").rp("^\\s+", "").rp("\\s+$", "").rp(".*?", "")); }