diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java index a84ba48..b5f8865 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java @@ -24,7 +24,7 @@ public class Site { /** * startUrls is the urls the crawler to start with. */ - private List startUrls = new ArrayList(); + private List startRequests = new ArrayList(); private int sleepTime = 3000; @@ -38,7 +38,7 @@ public class Site { private Set acceptStatCode = DEFAULT_STATUS_CODE_SET; - private Map headers = new HashMap(); + private Map headers = new HashMap(); public static interface HeaderConst { @@ -182,9 +182,16 @@ public class Site { * get start urls * * @return start urls + * @see #getStartRequests + * @deprecated */ + @Deprecated public List getStartUrls() { - return startUrls; + return UrlUtils.convertToUrls(startRequests); + } + + public List getStartRequests() { + return startRequests; } /** @@ -194,11 +201,19 @@ public class Site { * @return this */ public Site addStartUrl(String startUrl) { - this.startUrls.add(startUrl); - if (domain == null) { - if (startUrls.size() > 0) { - domain = UrlUtils.getDomain(startUrls.get(0)); - } + return addStartRequest(new Request(startUrl)); + } + + /** + * Add a url to start url.
+ * + * @param startRequest the request to add to the start requests + * @return this + */ + public Site addStartRequest(Request startRequest) { + this.startRequests.add(startRequest); + if (domain == null && startRequest.getUrl() != null) { + domain = UrlUtils.getDomain(startRequest.getUrl()); } return this; } @@ -241,12 +256,13 @@ public class Site { /** * Put an Http header for downloader.
* Use {@link #addCookie(String, String)} for cookie and {@link #setUserAgent(String)} for user-agent.
- * @param key key of http header, there are some keys constant in {@link HeaderConst} + * + * @param key key of http header, there are some keys constant in {@link HeaderConst} * @param value value of header * @return */ - public Site addHeader(String key, String value){ - headers.put(key,value); + public Site addHeader(String key, String value) { + headers.put(key, value); return this; } @@ -279,23 +295,6 @@ public class Site { return this; } - @Override - public boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - - Site site = (Site) o; - - if (acceptStatCode != null ? !acceptStatCode.equals(site.acceptStatCode) : site.acceptStatCode != null) - return false; - if (!domain.equals(site.domain)) return false; - if (!startUrls.equals(site.startUrls)) return false; - if (charset != null ? !charset.equals(site.charset) : site.charset != null) return false; - if (userAgent != null ? !userAgent.equals(site.userAgent) : site.userAgent != null) return false; - - return true; - } - public Task toTask() { return new Task() { @Override @@ -310,13 +309,60 @@ public class Site { }; } + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + Site site = (Site) o; + + if (cycleRetryTimes != site.cycleRetryTimes) return false; + if (retryTimes != site.retryTimes) return false; + if (sleepTime != site.sleepTime) return false; + if (timeOut != site.timeOut) return false; + if (acceptStatCode != null ? !acceptStatCode.equals(site.acceptStatCode) : site.acceptStatCode != null) + return false; + if (charset != null ? !charset.equals(site.charset) : site.charset != null) return false; + if (cookies != null ? !cookies.equals(site.cookies) : site.cookies != null) return false; + if (domain != null ? !domain.equals(site.domain) : site.domain != null) return false; + if (headers != null ? 
!headers.equals(site.headers) : site.headers != null) return false; + if (startRequests != null ? !startRequests.equals(site.startRequests) : site.startRequests != null) + return false; + if (userAgent != null ? !userAgent.equals(site.userAgent) : site.userAgent != null) return false; + + return true; + } + @Override public int hashCode() { - int result = domain.hashCode(); - result = 31 * result + (startUrls != null ? startUrls.hashCode() : 0); + int result = domain != null ? domain.hashCode() : 0; result = 31 * result + (userAgent != null ? userAgent.hashCode() : 0); + result = 31 * result + (cookies != null ? cookies.hashCode() : 0); result = 31 * result + (charset != null ? charset.hashCode() : 0); + result = 31 * result + (startRequests != null ? startRequests.hashCode() : 0); + result = 31 * result + sleepTime; + result = 31 * result + retryTimes; + result = 31 * result + cycleRetryTimes; + result = 31 * result + timeOut; result = 31 * result + (acceptStatCode != null ? acceptStatCode.hashCode() : 0); + result = 31 * result + (headers != null ? 
headers.hashCode() : 0); return result; } + + @Override + public String toString() { + return "Site{" + + "domain='" + domain + '\'' + + ", userAgent='" + userAgent + '\'' + + ", cookies=" + cookies + + ", charset='" + charset + '\'' + + ", startRequests=" + startRequests + + ", sleepTime=" + sleepTime + + ", retryTimes=" + retryTimes + + ", cycleRetryTimes=" + cycleRetryTimes + + ", timeOut=" + timeOut + + ", acceptStatCode=" + acceptStatCode + + ", headers=" + headers + + '}'; + } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index da0d98a..54f51d9 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -11,6 +11,7 @@ import us.codecraft.webmagic.scheduler.QueueScheduler; import us.codecraft.webmagic.scheduler.Scheduler; import us.codecraft.webmagic.utils.EnvironmentUtil; import us.codecraft.webmagic.utils.ThreadUtils; +import us.codecraft.webmagic.utils.UrlUtils; import java.io.Closeable; import java.io.IOException; @@ -60,7 +61,7 @@ public class Spider implements Runnable, Task { protected PageProcessor pageProcessor; - protected List startUrls; + protected List startRequests; protected Site site; @@ -107,7 +108,7 @@ public class Spider implements Runnable, Task { public Spider(PageProcessor pageProcessor) { this.pageProcessor = pageProcessor; this.site = pageProcessor.getSite(); - this.startUrls = pageProcessor.getSite().getStartUrls(); + this.startRequests = pageProcessor.getSite().getStartRequests(); } /** @@ -119,7 +120,20 @@ public class Spider implements Runnable, Task { */ public Spider startUrls(List startUrls) { checkIfRunning(); - this.startUrls = startUrls; + this.startRequests = UrlUtils.convertToRequests(startUrls); + return this; + } + + /** + * Set startUrls of Spider.
+ * Takes priority over the startUrls of Site. + * + * @param startRequests the start requests of the Spider + * @return this + */ + public Spider startRequest(List startRequests) { + checkIfRunning(); + this.startRequests = startRequests; return this; } @@ -231,11 +245,11 @@ public class Spider implements Runnable, Task { } downloader.setThread(threadNum); executorService = ThreadUtils.newFixedThreadPool(threadNum); - if (startUrls != null) { - for (String startUrl : startUrls) { - scheduler.push(new Request(startUrl), this); + if (startRequests != null) { + for (Request request : startRequests) { + scheduler.push(request, this); } - startUrls.clear(); + startRequests.clear(); } } @@ -390,6 +404,20 @@ public class Spider implements Runnable, Task { return this; } + /** + * Add urls with information to crawl.
+ * + * @param requests the requests to add + * @return this + */ + public Spider addRequest(Request... requests) { + for (Request request : requests) { + addRequest(request); + } + signalNewUrl(); + return this; + } + private void waitNewUrl() { try { newUrlLock.lock(); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/OschinaBlogPageProcesser.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/OschinaBlogPageProcesser.java index 2c53b2d..fa8dab6 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/OschinaBlogPageProcesser.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/OschinaBlogPageProcesser.java @@ -34,6 +34,6 @@ public class OschinaBlogPageProcesser implements PageProcessor { } public static void main(String[] args) { - Spider.create(new OschinaBlogPageProcesser()).thread(10).run(); + Spider.create(new OschinaBlogPageProcesser()).thread(2).run(); } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java index 9ca776d..e45f948 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java @@ -1,10 +1,13 @@ package us.codecraft.webmagic.utils; import org.apache.commons.lang3.StringUtils; +import us.codecraft.webmagic.Request; import java.net.MalformedURLException; import java.net.URL; import java.nio.charset.Charset; +import java.util.ArrayList; +import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -18,7 +21,7 @@ public class UrlUtils { /** * canonicalizeUrl - * + *

* Borrowed from Jsoup. * * @param url @@ -85,6 +88,22 @@ public class UrlUtils { return stringBuilder.toString(); } + public static List convertToRequests(List urls) { + List requestList = new ArrayList(urls.size()); + for (String url : urls) { + requestList.add(new Request(url)); + } + return requestList; + } + + public static List convertToUrls(List requests) { + List urlList = new ArrayList(requests.size()); + for (Request request : requests) { + urlList.add(request.getUrl()); + } + return urlList; + } + private static final Pattern patternForCharset = Pattern.compile("charset\\s*=\\s*['\"]*([^\\s;'\"]*)"); public static String getCharset(String contentType) {