add multithread support
parent
5a6a68a318
commit
cad2594a08
|
@ -9,9 +9,12 @@ import us.codecraft.webmagic.pipeline.Pipeline;
|
||||||
import us.codecraft.webmagic.processor.PageProcessor;
|
import us.codecraft.webmagic.processor.PageProcessor;
|
||||||
import us.codecraft.webmagic.schedular.QueueScheduler;
|
import us.codecraft.webmagic.schedular.QueueScheduler;
|
||||||
import us.codecraft.webmagic.schedular.Scheduler;
|
import us.codecraft.webmagic.schedular.Scheduler;
|
||||||
|
import us.codecraft.webmagic.utils.ThreadUtils;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.concurrent.ExecutorService;
|
||||||
|
import java.util.concurrent.atomic.AtomicInteger;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* <pre>
|
* <pre>
|
||||||
|
@ -51,6 +54,16 @@ public class Spider implements Runnable, Task {
|
||||||
|
|
||||||
private Logger logger = Logger.getLogger(getClass());
|
private Logger logger = Logger.getLogger(getClass());
|
||||||
|
|
||||||
|
private ExecutorService executorService;
|
||||||
|
|
||||||
|
private AtomicInteger stat = new AtomicInteger(STAT_INIT);
|
||||||
|
|
||||||
|
private final static int STAT_INIT = 0;
|
||||||
|
|
||||||
|
private final static int STAT_RUNNING = 1;
|
||||||
|
|
||||||
|
private final static int STAT_STOPPED = 2;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 使用已定义的抽取规则新建一个Spider。
|
* 使用已定义的抽取规则新建一个Spider。
|
||||||
* @param pageProcessor 已定义的抽取规则
|
* @param pageProcessor 已定义的抽取规则
|
||||||
|
@ -76,6 +89,7 @@ public class Spider implements Runnable, Task {
|
||||||
* @return this
|
* @return this
|
||||||
*/
|
*/
|
||||||
public Spider startUrls(List<String> startUrls) {
|
public Spider startUrls(List<String> startUrls) {
|
||||||
|
checkIfNotRunning();
|
||||||
this.startUrls = startUrls;
|
this.startUrls = startUrls;
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
@ -96,6 +110,7 @@ public class Spider implements Runnable, Task {
|
||||||
* @return this
|
* @return this
|
||||||
*/
|
*/
|
||||||
public Spider scheduler(Scheduler scheduler) {
|
public Spider scheduler(Scheduler scheduler) {
|
||||||
|
checkIfNotRunning();
|
||||||
this.scheduler = scheduler;
|
this.scheduler = scheduler;
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
@ -106,6 +121,7 @@ public class Spider implements Runnable, Task {
|
||||||
* @return this
|
* @return this
|
||||||
*/
|
*/
|
||||||
public Spider pipeline(Pipeline pipeline) {
|
public Spider pipeline(Pipeline pipeline) {
|
||||||
|
checkIfNotRunning();
|
||||||
this.pipelines.add(pipeline);
|
this.pipelines.add(pipeline);
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
@ -113,6 +129,9 @@ public class Spider implements Runnable, Task {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void run() {
|
public void run() {
|
||||||
|
if (!stat.compareAndSet(STAT_INIT, STAT_RUNNING)) {
|
||||||
|
throw new IllegalStateException("Spider is already running!");
|
||||||
|
}
|
||||||
if (startUrls != null) {
|
if (startUrls != null) {
|
||||||
for (String startUrl : startUrls) {
|
for (String startUrl : startUrls) {
|
||||||
scheduler.push(new Request(startUrl), this);
|
scheduler.push(new Request(startUrl), this);
|
||||||
|
@ -122,20 +141,56 @@ public class Spider implements Runnable, Task {
|
||||||
if (pipelines.isEmpty()) {
|
if (pipelines.isEmpty()) {
|
||||||
pipelines.add(new ConsolePipeline());
|
pipelines.add(new ConsolePipeline());
|
||||||
}
|
}
|
||||||
while (request != null) {
|
//single thread
|
||||||
Page page = downloader.download(request, site);
|
if (executorService==null){
|
||||||
if (page == null) {
|
while (request != null) {
|
||||||
sleep(site.getSleepTime());
|
processRequest(request);
|
||||||
continue;
|
request = scheduler.poll(this);
|
||||||
}
|
}
|
||||||
pageProcessor.process(page);
|
} else {
|
||||||
addRequest(page);
|
final AtomicInteger threadAlive = new AtomicInteger(0);
|
||||||
for (Pipeline pipeline : pipelines) {
|
while (true) {
|
||||||
pipeline.process(page, this);
|
if (request == null) {
|
||||||
|
try {
|
||||||
|
Thread.sleep(100);
|
||||||
|
} catch (InterruptedException e) {
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
final Request requestFinal = request;
|
||||||
|
threadAlive.incrementAndGet();
|
||||||
|
executorService.execute(new Runnable() {
|
||||||
|
@Override
|
||||||
|
public void run() {
|
||||||
|
processRequest(requestFinal);
|
||||||
|
threadAlive.decrementAndGet();
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
request = scheduler.poll(this);
|
||||||
|
if (threadAlive.get() == 0) {
|
||||||
|
request = scheduler.poll(this);
|
||||||
|
if (request == null) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
sleep(site.getSleepTime());
|
executorService.shutdown();
|
||||||
request = scheduler.poll(this);
|
|
||||||
}
|
}
|
||||||
|
stat.compareAndSet(STAT_RUNNING, STAT_STOPPED);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void processRequest(Request request) {
|
||||||
|
Page page = downloader.download(request, site);
|
||||||
|
if (page == null) {
|
||||||
|
sleep(site.getSleepTime());
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
pageProcessor.process(page);
|
||||||
|
addRequest(page);
|
||||||
|
for (Pipeline pipeline : pipelines) {
|
||||||
|
pipeline.process(page, this);
|
||||||
|
}
|
||||||
|
sleep(site.getSleepTime());
|
||||||
}
|
}
|
||||||
|
|
||||||
private void sleep(int time) {
|
private void sleep(int time) {
|
||||||
|
@ -154,6 +209,28 @@ public class Spider implements Runnable, Task {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private void checkIfNotRunning(){
|
||||||
|
if (!stat.compareAndSet(STAT_INIT,STAT_INIT)){
|
||||||
|
throw new IllegalStateException("Spider is already running!");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 建立多个线程下载
|
||||||
|
* @param threadNum 线程数
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public Spider thread(int threadNum) {
|
||||||
|
checkIfNotRunning();
|
||||||
|
if (threadNum <= 1) {
|
||||||
|
throw new IllegalArgumentException("threadNum should be more than one!");
|
||||||
|
}
|
||||||
|
synchronized (this){
|
||||||
|
this.executorService = ThreadUtils.newFixedThreadPool(threadNum);
|
||||||
|
}
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String getUUID() {
|
public String getUUID() {
|
||||||
if (uuid != null) {
|
if (uuid != null) {
|
||||||
|
|
|
@ -0,0 +1,33 @@
|
||||||
|
package us.codecraft.webmagic.utils;
|
||||||
|
|
||||||
|
import java.util.concurrent.ExecutorService;
|
||||||
|
import java.util.concurrent.LinkedBlockingQueue;
|
||||||
|
import java.util.concurrent.ThreadPoolExecutor;
|
||||||
|
import java.util.concurrent.TimeUnit;
|
||||||
|
|
||||||
|
/**
 * Factory for the thread pool used when downloading with multiple threads.
 *
 * @author code4crafer@gmail.com
 * Date: 13-6-23
 * Time: 7:11 PM
 */
public class ThreadUtils {

    /**
     * Creates a pool with exactly {@code threadSize} worker threads and a
     * single-slot work queue whose {@code offer} blocks instead of failing.
     * As a result, submitting a task while every worker is busy and the slot
     * is taken makes the submitting thread wait for room rather than having
     * the task rejected.
     *
     * @param threadSize number of worker threads; must be positive
     * @return a new executor service backed by the blocking queue
     * @throws IllegalArgumentException if {@code threadSize} is zero or negative
     */
    public static ExecutorService newFixedThreadPool(int threadSize) {
        // Fail early with a descriptive message; ThreadPoolExecutor would throw the same type without one.
        if (threadSize <= 0) {
            throw new IllegalArgumentException("threadSize must be positive, but was " + threadSize);
        }
        return new ThreadPoolExecutor(threadSize, threadSize, 0L, TimeUnit.MILLISECONDS,
                new BlockingOfferQueue());
    }

    /**
     * Single-slot queue that turns the non-blocking {@code offer} into a
     * blocking {@code put}, which is what makes task submission wait.
     */
    private static final class BlockingOfferQueue extends LinkedBlockingQueue<Runnable> {

        private static final long serialVersionUID = -9028058603126367678L;

        BlockingOfferQueue() {
            super(1);
        }

        /**
         * Waits for free space and enqueues the task.
         *
         * @param e the task to enqueue
         * @return {@code true} once enqueued; {@code false} if the wait was
         *         interrupted, in which case the interrupt flag is restored
         */
        @Override
        public boolean offer(Runnable e) {
            try {
                put(e);
                return true;
            } catch (InterruptedException ie) {
                // Preserve the interrupt for the caller instead of swallowing it.
                Thread.currentThread().interrupt();
            }
            return false;
        }
    }
}
|
Loading…
Reference in New Issue