为downloader增加了一个新方法,可设置线程数
parent
6a87a778fd
commit
e87aabf8fd
|
@ -58,6 +58,8 @@ public class Spider implements Runnable, Task {
|
||||||
|
|
||||||
private ExecutorService executorService;
|
private ExecutorService executorService;
|
||||||
|
|
||||||
|
private int threadNum = 1;
|
||||||
|
|
||||||
private AtomicInteger stat = new AtomicInteger(STAT_INIT);
|
private AtomicInteger stat = new AtomicInteger(STAT_INIT);
|
||||||
|
|
||||||
private final static int STAT_INIT = 0;
|
private final static int STAT_INIT = 0;
|
||||||
|
@ -144,6 +146,10 @@ public class Spider implements Runnable, Task {
|
||||||
if (downloader == null) {
|
if (downloader == null) {
|
||||||
this.downloader = new HttpClientDownloader();
|
this.downloader = new HttpClientDownloader();
|
||||||
}
|
}
|
||||||
|
if (pipelines.isEmpty()) {
|
||||||
|
pipelines.add(new ConsolePipeline());
|
||||||
|
}
|
||||||
|
downloader.setThread(threadNum);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -158,9 +164,6 @@ public class Spider implements Runnable, Task {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Request request = scheduler.poll(this);
|
Request request = scheduler.poll(this);
|
||||||
if (pipelines.isEmpty()) {
|
|
||||||
pipelines.add(new ConsolePipeline());
|
|
||||||
}
|
|
||||||
//singel thread
|
//singel thread
|
||||||
if (executorService == null) {
|
if (executorService == null) {
|
||||||
while (request != null) {
|
while (request != null) {
|
||||||
|
@ -211,9 +214,9 @@ public class Spider implements Runnable, Task {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private void destroyEach(Object object){
|
private void destroyEach(Object object) {
|
||||||
if (object instanceof Destroyable) {
|
if (object instanceof Destroyable) {
|
||||||
((Destroyable)object).destroy();
|
((Destroyable) object).destroy();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -267,12 +270,10 @@ public class Spider implements Runnable, Task {
|
||||||
*/
|
*/
|
||||||
public Spider thread(int threadNum) {
|
public Spider thread(int threadNum) {
|
||||||
checkIfNotRunning();
|
checkIfNotRunning();
|
||||||
|
this.threadNum = threadNum;
|
||||||
if (threadNum <= 0) {
|
if (threadNum <= 0) {
|
||||||
throw new IllegalArgumentException("threadNum should be more than one!");
|
throw new IllegalArgumentException("threadNum should be more than one!");
|
||||||
}
|
}
|
||||||
if (downloader==null || downloader instanceof HttpClientDownloader){
|
|
||||||
downloader = new HttpClientDownloader(threadNum);
|
|
||||||
}
|
|
||||||
if (threadNum == 1) {
|
if (threadNum == 1) {
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
|
@ -6,9 +6,10 @@ import us.codecraft.webmagic.Task;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Downloader是webmagic下载页面的接口。webmagic默认使用了HttpComponent作为下载器,一般情况,你无需自己实现这个接口。<br>
|
* Downloader是webmagic下载页面的接口。webmagic默认使用了HttpComponent作为下载器,一般情况,你无需自己实现这个接口。<br>
|
||||||
|
*
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
* Date: 13-4-21
|
* Date: 13-4-21
|
||||||
* Time: 下午12:14
|
* Time: 下午12:14
|
||||||
*/
|
*/
|
||||||
public interface Downloader {
|
public interface Downloader {
|
||||||
|
|
||||||
|
@ -20,4 +21,12 @@ public interface Downloader {
|
||||||
* @return page
|
* @return page
|
||||||
*/
|
*/
|
||||||
public Page download(Request request, Task task);
|
public Page download(Request request, Task task);
|
||||||
|
|
||||||
|
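/**
 * Sets the number of threads. A multi-threaded crawl generally needs support from the Downloader.<br>
 * Implementations that do not care about multi-threading may implement this method as a no-op.<br>
 *
 * @param thread the number of threads
 */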
|
||||||
|
public void setThread(int thread);
|
||||||
}
|
}
|
||||||
|
|
|
@ -67,6 +67,11 @@ public class FileDownloader implements Downloader {
|
||||||
return page;
|
return page;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void setThread(int thread) {
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
private String getHtml(BufferedReader bufferedReader) throws IOException {
|
private String getHtml(BufferedReader bufferedReader) throws IOException {
|
||||||
String line;
|
String line;
|
||||||
StringBuilder htmlBuilder= new StringBuilder();
|
StringBuilder htmlBuilder= new StringBuilder();
|
||||||
|
|
|
@ -32,14 +32,6 @@ public class HttpClientDownloader implements Downloader {
|
||||||
|
|
||||||
private int poolSize;
|
private int poolSize;
|
||||||
|
|
||||||
public HttpClientDownloader(int poolSize) {
|
|
||||||
this.poolSize = poolSize;
|
|
||||||
}
|
|
||||||
|
|
||||||
public HttpClientDownloader() {
|
|
||||||
this(5);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Page download(Request request, Task task) {
|
public Page download(Request request, Task task) {
|
||||||
Site site = task.getSite();
|
Site site = task.getSite();
|
||||||
|
@ -90,6 +82,11 @@ public class HttpClientDownloader implements Downloader {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void setThread(int thread) {
|
||||||
|
poolSize=thread;
|
||||||
|
}
|
||||||
|
|
||||||
private void handleGzip(HttpResponse httpResponse) {
|
private void handleGzip(HttpResponse httpResponse) {
|
||||||
Header ceheader = httpResponse.getEntity().getContentEncoding();
|
Header ceheader = httpResponse.getEntity().getContentEncoding();
|
||||||
if (ceheader != null) {
|
if (ceheader != null) {
|
||||||
|
|
|
@ -27,12 +27,14 @@ import java.util.Map;
|
||||||
*/
|
*/
|
||||||
public class SeleniumDownloader implements Downloader, Destroyable {
|
public class SeleniumDownloader implements Downloader, Destroyable {
|
||||||
|
|
||||||
private WebDriverPool webDriverPool;
|
private volatile WebDriverPool webDriverPool;
|
||||||
|
|
||||||
private Logger logger = Logger.getLogger(getClass());
|
private Logger logger = Logger.getLogger(getClass());
|
||||||
|
|
||||||
private int sleepTime = 0;
|
private int sleepTime = 0;
|
||||||
|
|
||||||
|
private int poolSize = 1;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 新建
|
* 新建
|
||||||
*
|
*
|
||||||
|
@ -40,16 +42,11 @@ public class SeleniumDownloader implements Downloader, Destroyable {
|
||||||
*/
|
*/
|
||||||
public SeleniumDownloader(String chromeDriverPath) {
|
public SeleniumDownloader(String chromeDriverPath) {
|
||||||
System.getProperties().setProperty("webdriver.chrome.driver", chromeDriverPath);
|
System.getProperties().setProperty("webdriver.chrome.driver", chromeDriverPath);
|
||||||
webDriverPool = new WebDriverPool();
|
|
||||||
}
|
|
||||||
|
|
||||||
public SeleniumDownloader(String chromeDriverPath, int poolSize) {
|
|
||||||
System.getProperties().setProperty("webdriver.chrome.driver", chromeDriverPath);
|
|
||||||
webDriverPool = new WebDriverPool(poolSize);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* set sleep time to wait until load success
|
* set sleep time to wait until load success
|
||||||
|
*
|
||||||
* @param sleepTime
|
* @param sleepTime
|
||||||
* @return this
|
* @return this
|
||||||
*/
|
*/
|
||||||
|
@ -60,6 +57,7 @@ public class SeleniumDownloader implements Downloader, Destroyable {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Page download(Request request, Task task) {
|
public Page download(Request request, Task task) {
|
||||||
|
checkInit();
|
||||||
WebDriver webDriver;
|
WebDriver webDriver;
|
||||||
try {
|
try {
|
||||||
webDriver = webDriverPool.get();
|
webDriver = webDriverPool.get();
|
||||||
|
@ -93,6 +91,19 @@ public class SeleniumDownloader implements Downloader, Destroyable {
|
||||||
return page;
|
return page;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private void checkInit() {
|
||||||
|
if (webDriverPool == null) {
|
||||||
|
synchronized (this){
|
||||||
|
webDriverPool = new WebDriverPool(poolSize);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void setThread(int thread) {
|
||||||
|
this.poolSize = thread;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void destroy() {
|
public void destroy() {
|
||||||
webDriverPool.closeAll();
|
webDriverPool.closeAll();
|
||||||
|
|
Loading…
Reference in New Issue