#29 seed urls with more information
parent
1446ada732
commit
6fa82a418b
|
@ -24,7 +24,7 @@ public class Site {
|
||||||
/**
|
/**
|
||||||
* startUrls are the urls the crawler starts with.
|
* startUrls are the urls the crawler starts with.
|
||||||
*/
|
*/
|
||||||
private List<String> startUrls = new ArrayList<String>();
|
private List<Request> startRequests = new ArrayList<Request>();
|
||||||
|
|
||||||
private int sleepTime = 3000;
|
private int sleepTime = 3000;
|
||||||
|
|
||||||
|
@ -38,7 +38,7 @@ public class Site {
|
||||||
|
|
||||||
private Set<Integer> acceptStatCode = DEFAULT_STATUS_CODE_SET;
|
private Set<Integer> acceptStatCode = DEFAULT_STATUS_CODE_SET;
|
||||||
|
|
||||||
private Map<String,String> headers = new HashMap<String, String>();
|
private Map<String, String> headers = new HashMap<String, String>();
|
||||||
|
|
||||||
public static interface HeaderConst {
|
public static interface HeaderConst {
|
||||||
|
|
||||||
|
@ -182,9 +182,16 @@ public class Site {
|
||||||
* get start urls
|
* get start urls
|
||||||
*
|
*
|
||||||
* @return start urls
|
* @return start urls
|
||||||
|
* @see #getStartRequests
|
||||||
|
* @deprecated use {@link #getStartRequests()} instead
|
||||||
*/
|
*/
|
||||||
|
@Deprecated
|
||||||
public List<String> getStartUrls() {
|
public List<String> getStartUrls() {
|
||||||
return startUrls;
|
return UrlUtils.convertToUrls(startRequests);
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<Request> getStartRequests() {
|
||||||
|
return startRequests;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -194,11 +201,19 @@ public class Site {
|
||||||
* @return this
|
* @return this
|
||||||
*/
|
*/
|
||||||
public Site addStartUrl(String startUrl) {
|
public Site addStartUrl(String startUrl) {
|
||||||
this.startUrls.add(startUrl);
|
return addStartRequest(new Request(startUrl));
|
||||||
if (domain == null) {
|
}
|
||||||
if (startUrls.size() > 0) {
|
|
||||||
domain = UrlUtils.getDomain(startUrls.get(0));
|
/**
|
||||||
}
|
* Add a request to the start requests.<br>
|
||||||
|
*
|
||||||
|
* @param startRequest
|
||||||
|
* @return this
|
||||||
|
*/
|
||||||
|
public Site addStartRequest(Request startRequest) {
|
||||||
|
this.startRequests.add(startRequest);
|
||||||
|
if (domain == null && startRequest.getUrl() != null) {
|
||||||
|
domain = UrlUtils.getDomain(startRequest.getUrl());
|
||||||
}
|
}
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
@ -241,12 +256,13 @@ public class Site {
|
||||||
/**
|
/**
|
||||||
* Put an Http header for downloader. <br/>
|
* Put an Http header for downloader. <br/>
|
||||||
* Use {@link #addCookie(String, String)} for cookie and {@link #setUserAgent(String)} for user-agent. <br/>
|
* Use {@link #addCookie(String, String)} for cookie and {@link #setUserAgent(String)} for user-agent. <br/>
|
||||||
* @param key key of http header, there are some keys constant in {@link HeaderConst}
|
*
|
||||||
|
* @param key key of http header, there are some keys constant in {@link HeaderConst}
|
||||||
* @param value value of header
|
* @param value value of header
|
||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
public Site addHeader(String key, String value){
|
public Site addHeader(String key, String value) {
|
||||||
headers.put(key,value);
|
headers.put(key, value);
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -279,23 +295,6 @@ public class Site {
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
|
||||||
public boolean equals(Object o) {
|
|
||||||
if (this == o) return true;
|
|
||||||
if (o == null || getClass() != o.getClass()) return false;
|
|
||||||
|
|
||||||
Site site = (Site) o;
|
|
||||||
|
|
||||||
if (acceptStatCode != null ? !acceptStatCode.equals(site.acceptStatCode) : site.acceptStatCode != null)
|
|
||||||
return false;
|
|
||||||
if (!domain.equals(site.domain)) return false;
|
|
||||||
if (!startUrls.equals(site.startUrls)) return false;
|
|
||||||
if (charset != null ? !charset.equals(site.charset) : site.charset != null) return false;
|
|
||||||
if (userAgent != null ? !userAgent.equals(site.userAgent) : site.userAgent != null) return false;
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
public Task toTask() {
|
public Task toTask() {
|
||||||
return new Task() {
|
return new Task() {
|
||||||
@Override
|
@Override
|
||||||
|
@ -310,13 +309,60 @@ public class Site {
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean equals(Object o) {
|
||||||
|
if (this == o) return true;
|
||||||
|
if (o == null || getClass() != o.getClass()) return false;
|
||||||
|
|
||||||
|
Site site = (Site) o;
|
||||||
|
|
||||||
|
if (cycleRetryTimes != site.cycleRetryTimes) return false;
|
||||||
|
if (retryTimes != site.retryTimes) return false;
|
||||||
|
if (sleepTime != site.sleepTime) return false;
|
||||||
|
if (timeOut != site.timeOut) return false;
|
||||||
|
if (acceptStatCode != null ? !acceptStatCode.equals(site.acceptStatCode) : site.acceptStatCode != null)
|
||||||
|
return false;
|
||||||
|
if (charset != null ? !charset.equals(site.charset) : site.charset != null) return false;
|
||||||
|
if (cookies != null ? !cookies.equals(site.cookies) : site.cookies != null) return false;
|
||||||
|
if (domain != null ? !domain.equals(site.domain) : site.domain != null) return false;
|
||||||
|
if (headers != null ? !headers.equals(site.headers) : site.headers != null) return false;
|
||||||
|
if (startRequests != null ? !startRequests.equals(site.startRequests) : site.startRequests != null)
|
||||||
|
return false;
|
||||||
|
if (userAgent != null ? !userAgent.equals(site.userAgent) : site.userAgent != null) return false;
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public int hashCode() {
|
public int hashCode() {
|
||||||
int result = domain.hashCode();
|
int result = domain != null ? domain.hashCode() : 0;
|
||||||
result = 31 * result + (startUrls != null ? startUrls.hashCode() : 0);
|
|
||||||
result = 31 * result + (userAgent != null ? userAgent.hashCode() : 0);
|
result = 31 * result + (userAgent != null ? userAgent.hashCode() : 0);
|
||||||
|
result = 31 * result + (cookies != null ? cookies.hashCode() : 0);
|
||||||
result = 31 * result + (charset != null ? charset.hashCode() : 0);
|
result = 31 * result + (charset != null ? charset.hashCode() : 0);
|
||||||
|
result = 31 * result + (startRequests != null ? startRequests.hashCode() : 0);
|
||||||
|
result = 31 * result + sleepTime;
|
||||||
|
result = 31 * result + retryTimes;
|
||||||
|
result = 31 * result + cycleRetryTimes;
|
||||||
|
result = 31 * result + timeOut;
|
||||||
result = 31 * result + (acceptStatCode != null ? acceptStatCode.hashCode() : 0);
|
result = 31 * result + (acceptStatCode != null ? acceptStatCode.hashCode() : 0);
|
||||||
|
result = 31 * result + (headers != null ? headers.hashCode() : 0);
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
return "Site{" +
|
||||||
|
"domain='" + domain + '\'' +
|
||||||
|
", userAgent='" + userAgent + '\'' +
|
||||||
|
", cookies=" + cookies +
|
||||||
|
", charset='" + charset + '\'' +
|
||||||
|
", startRequests=" + startRequests +
|
||||||
|
", sleepTime=" + sleepTime +
|
||||||
|
", retryTimes=" + retryTimes +
|
||||||
|
", cycleRetryTimes=" + cycleRetryTimes +
|
||||||
|
", timeOut=" + timeOut +
|
||||||
|
", acceptStatCode=" + acceptStatCode +
|
||||||
|
", headers=" + headers +
|
||||||
|
'}';
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -11,6 +11,7 @@ import us.codecraft.webmagic.scheduler.QueueScheduler;
|
||||||
import us.codecraft.webmagic.scheduler.Scheduler;
|
import us.codecraft.webmagic.scheduler.Scheduler;
|
||||||
import us.codecraft.webmagic.utils.EnvironmentUtil;
|
import us.codecraft.webmagic.utils.EnvironmentUtil;
|
||||||
import us.codecraft.webmagic.utils.ThreadUtils;
|
import us.codecraft.webmagic.utils.ThreadUtils;
|
||||||
|
import us.codecraft.webmagic.utils.UrlUtils;
|
||||||
|
|
||||||
import java.io.Closeable;
|
import java.io.Closeable;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
@ -60,7 +61,7 @@ public class Spider implements Runnable, Task {
|
||||||
|
|
||||||
protected PageProcessor pageProcessor;
|
protected PageProcessor pageProcessor;
|
||||||
|
|
||||||
protected List<String> startUrls;
|
protected List<Request> startRequests;
|
||||||
|
|
||||||
protected Site site;
|
protected Site site;
|
||||||
|
|
||||||
|
@ -107,7 +108,7 @@ public class Spider implements Runnable, Task {
|
||||||
public Spider(PageProcessor pageProcessor) {
|
public Spider(PageProcessor pageProcessor) {
|
||||||
this.pageProcessor = pageProcessor;
|
this.pageProcessor = pageProcessor;
|
||||||
this.site = pageProcessor.getSite();
|
this.site = pageProcessor.getSite();
|
||||||
this.startUrls = pageProcessor.getSite().getStartUrls();
|
this.startRequests = pageProcessor.getSite().getStartRequests();
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -119,7 +120,20 @@ public class Spider implements Runnable, Task {
|
||||||
*/
|
*/
|
||||||
public Spider startUrls(List<String> startUrls) {
|
public Spider startUrls(List<String> startUrls) {
|
||||||
checkIfRunning();
|
checkIfRunning();
|
||||||
this.startUrls = startUrls;
|
this.startRequests = UrlUtils.convertToRequests(startUrls);
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Set startRequests of Spider.<br>
|
||||||
|
* Overrides the start requests taken from Site.
|
||||||
|
*
|
||||||
|
* @param startRequests
|
||||||
|
* @return this
|
||||||
|
*/
|
||||||
|
public Spider startRequest(List<Request> startRequests) {
|
||||||
|
checkIfRunning();
|
||||||
|
this.startRequests = startRequests;
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -231,11 +245,11 @@ public class Spider implements Runnable, Task {
|
||||||
}
|
}
|
||||||
downloader.setThread(threadNum);
|
downloader.setThread(threadNum);
|
||||||
executorService = ThreadUtils.newFixedThreadPool(threadNum);
|
executorService = ThreadUtils.newFixedThreadPool(threadNum);
|
||||||
if (startUrls != null) {
|
if (startRequests != null) {
|
||||||
for (String startUrl : startUrls) {
|
for (Request request : startRequests) {
|
||||||
scheduler.push(new Request(startUrl), this);
|
scheduler.push(request, this);
|
||||||
}
|
}
|
||||||
startUrls.clear();
|
startRequests.clear();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -390,6 +404,20 @@ public class Spider implements Runnable, Task {
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Add requests (urls with extra information) to crawl.<br/>
|
||||||
|
*
|
||||||
|
* @param requests
|
||||||
|
* @return this
|
||||||
|
*/
|
||||||
|
public Spider addRequest(Request... requests) {
|
||||||
|
for (Request request : requests) {
|
||||||
|
addRequest(request);
|
||||||
|
}
|
||||||
|
signalNewUrl();
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
private void waitNewUrl() {
|
private void waitNewUrl() {
|
||||||
try {
|
try {
|
||||||
newUrlLock.lock();
|
newUrlLock.lock();
|
||||||
|
|
|
@ -34,6 +34,6 @@ public class OschinaBlogPageProcesser implements PageProcessor {
|
||||||
}
|
}
|
||||||
|
|
||||||
public static void main(String[] args) {
|
public static void main(String[] args) {
|
||||||
Spider.create(new OschinaBlogPageProcesser()).thread(10).run();
|
Spider.create(new OschinaBlogPageProcesser()).thread(2).run();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,10 +1,13 @@
|
||||||
package us.codecraft.webmagic.utils;
|
package us.codecraft.webmagic.utils;
|
||||||
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
import us.codecraft.webmagic.Request;
|
||||||
|
|
||||||
import java.net.MalformedURLException;
|
import java.net.MalformedURLException;
|
||||||
import java.net.URL;
|
import java.net.URL;
|
||||||
import java.nio.charset.Charset;
|
import java.nio.charset.Charset;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
import java.util.regex.Matcher;
|
import java.util.regex.Matcher;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
|
@ -18,7 +21,7 @@ public class UrlUtils {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* canonicalizeUrl
|
* canonicalizeUrl
|
||||||
*
|
* <p/>
|
||||||
* Borrowed from Jsoup.
|
* Borrowed from Jsoup.
|
||||||
*
|
*
|
||||||
* @param url
|
* @param url
|
||||||
|
@ -85,6 +88,22 @@ public class UrlUtils {
|
||||||
return stringBuilder.toString();
|
return stringBuilder.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static List<Request> convertToRequests(List<String> urls) {
|
||||||
|
List<Request> requestList = new ArrayList<Request>(urls.size());
|
||||||
|
for (String url : urls) {
|
||||||
|
requestList.add(new Request(url));
|
||||||
|
}
|
||||||
|
return requestList;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static List<String> convertToUrls(List<Request> requests) {
|
||||||
|
List<String> urlList = new ArrayList<String>(requests.size());
|
||||||
|
for (Request request : requests) {
|
||||||
|
urlList.add(request.getUrl());
|
||||||
|
}
|
||||||
|
return urlList;
|
||||||
|
}
|
||||||
|
|
||||||
private static final Pattern patternForCharset = Pattern.compile("charset\\s*=\\s*['\"]*([^\\s;'\"]*)");
|
private static final Pattern patternForCharset = Pattern.compile("charset\\s*=\\s*['\"]*([^\\s;'\"]*)");
|
||||||
|
|
||||||
public static String getCharset(String contentType) {
|
public static String getCharset(String contentType) {
|
||||||
|
|
Loading…
Reference in New Issue