为downloader增加了一个新方法,可设置线程数
parent
6a87a778fd
commit
e87aabf8fd
|
@ -58,6 +58,8 @@ public class Spider implements Runnable, Task {
|
|||
|
||||
private ExecutorService executorService;
|
||||
|
||||
private int threadNum = 1;
|
||||
|
||||
private AtomicInteger stat = new AtomicInteger(STAT_INIT);
|
||||
|
||||
private final static int STAT_INIT = 0;
|
||||
|
@ -144,6 +146,10 @@ public class Spider implements Runnable, Task {
|
|||
if (downloader == null) {
|
||||
this.downloader = new HttpClientDownloader();
|
||||
}
|
||||
if (pipelines.isEmpty()) {
|
||||
pipelines.add(new ConsolePipeline());
|
||||
}
|
||||
downloader.setThread(threadNum);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -158,9 +164,6 @@ public class Spider implements Runnable, Task {
|
|||
}
|
||||
}
|
||||
Request request = scheduler.poll(this);
|
||||
if (pipelines.isEmpty()) {
|
||||
pipelines.add(new ConsolePipeline());
|
||||
}
|
||||
//single thread
|
||||
if (executorService == null) {
|
||||
while (request != null) {
|
||||
|
@ -211,9 +214,9 @@ public class Spider implements Runnable, Task {
|
|||
}
|
||||
}
|
||||
|
||||
private void destroyEach(Object object){
|
||||
private void destroyEach(Object object) {
|
||||
if (object instanceof Destroyable) {
|
||||
((Destroyable)object).destroy();
|
||||
((Destroyable) object).destroy();
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -267,12 +270,10 @@ public class Spider implements Runnable, Task {
|
|||
*/
|
||||
public Spider thread(int threadNum) {
|
||||
checkIfNotRunning();
|
||||
this.threadNum = threadNum;
|
||||
if (threadNum <= 0) {
|
||||
throw new IllegalArgumentException("threadNum should be more than one!");
|
||||
}
|
||||
if (downloader==null || downloader instanceof HttpClientDownloader){
|
||||
downloader = new HttpClientDownloader(threadNum);
|
||||
}
|
||||
if (threadNum == 1) {
|
||||
return this;
|
||||
}
|
||||
|
|
|
@ -6,9 +6,10 @@ import us.codecraft.webmagic.Task;
|
|||
|
||||
/**
|
||||
* Downloader是webmagic下载页面的接口。webmagic默认使用了HttpComponent作为下载器,一般情况,你无需自己实现这个接口。<br>
|
||||
*
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-4-21
|
||||
* Time: 下午12:14
|
||||
* Date: 13-4-21
|
||||
* Time: 下午12:14
|
||||
*/
|
||||
public interface Downloader {
|
||||
|
||||
|
@ -20,4 +21,12 @@ public interface Downloader {
|
|||
* @return page
|
||||
*/
|
||||
public Page download(Request request, Task task);
|
||||
|
||||
/**
 * Sets the number of threads. Multi-threaded crawling generally needs support from the Downloader.<br>
 * Implementations that do not care about multi-threading may leave this method empty.<br>
 *
 * @param thread the number of threads
 */
|
||||
public void setThread(int thread);
|
||||
}
|
||||
|
|
|
@ -67,6 +67,11 @@ public class FileDownloader implements Downloader {
|
|||
return page;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setThread(int thread) {
|
||||
|
||||
}
|
||||
|
||||
private String getHtml(BufferedReader bufferedReader) throws IOException {
|
||||
String line;
|
||||
StringBuilder htmlBuilder= new StringBuilder();
|
||||
|
|
|
@ -32,14 +32,6 @@ public class HttpClientDownloader implements Downloader {
|
|||
|
||||
private int poolSize;
|
||||
|
||||
public HttpClientDownloader(int poolSize) {
|
||||
this.poolSize = poolSize;
|
||||
}
|
||||
|
||||
public HttpClientDownloader() {
|
||||
this(5);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Page download(Request request, Task task) {
|
||||
Site site = task.getSite();
|
||||
|
@ -90,6 +82,11 @@ public class HttpClientDownloader implements Downloader {
|
|||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setThread(int thread) {
|
||||
poolSize=thread;
|
||||
}
|
||||
|
||||
private void handleGzip(HttpResponse httpResponse) {
|
||||
Header ceheader = httpResponse.getEntity().getContentEncoding();
|
||||
if (ceheader != null) {
|
||||
|
|
|
@ -27,12 +27,14 @@ import java.util.Map;
|
|||
*/
|
||||
public class SeleniumDownloader implements Downloader, Destroyable {
|
||||
|
||||
private WebDriverPool webDriverPool;
|
||||
private volatile WebDriverPool webDriverPool;
|
||||
|
||||
private Logger logger = Logger.getLogger(getClass());
|
||||
|
||||
private int sleepTime = 0;
|
||||
|
||||
private int poolSize = 1;
|
||||
|
||||
/**
|
||||
* 新建
|
||||
*
|
||||
|
@ -40,16 +42,11 @@ public class SeleniumDownloader implements Downloader, Destroyable {
|
|||
*/
|
||||
public SeleniumDownloader(String chromeDriverPath) {
|
||||
System.getProperties().setProperty("webdriver.chrome.driver", chromeDriverPath);
|
||||
webDriverPool = new WebDriverPool();
|
||||
}
|
||||
|
||||
public SeleniumDownloader(String chromeDriverPath, int poolSize) {
|
||||
System.getProperties().setProperty("webdriver.chrome.driver", chromeDriverPath);
|
||||
webDriverPool = new WebDriverPool(poolSize);
|
||||
}
|
||||
|
||||
/**
|
||||
* set sleep time to wait until load success
|
||||
*
|
||||
* @param sleepTime
|
||||
* @return this
|
||||
*/
|
||||
|
@ -60,6 +57,7 @@ public class SeleniumDownloader implements Downloader, Destroyable {
|
|||
|
||||
@Override
|
||||
public Page download(Request request, Task task) {
|
||||
checkInit();
|
||||
WebDriver webDriver;
|
||||
try {
|
||||
webDriver = webDriverPool.get();
|
||||
|
@ -93,6 +91,19 @@ public class SeleniumDownloader implements Downloader, Destroyable {
|
|||
return page;
|
||||
}
|
||||
|
||||
private void checkInit() {
|
||||
if (webDriverPool == null) {
|
||||
synchronized (this){
|
||||
webDriverPool = new WebDriverPool(poolSize);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setThread(int thread) {
|
||||
this.poolSize = thread;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void destroy() {
|
||||
webDriverPool.closeAll();
|
||||
|
|
Loading…
Reference in New Issue