Refactor multi-threaded code in Spider

master
yihua.huang 2013-10-31 21:52:43 +08:00
parent dbfb6b5803
commit a3f9ad198f
5 changed files with 360 additions and 344 deletions

View File

@ -63,6 +63,11 @@
<artifactId>httpclient</artifactId> <artifactId>httpclient</artifactId>
<version>4.2.4</version> <version>4.2.4</version>
</dependency> </dependency>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>15.0</version>
</dependency>
<dependency> <dependency>
<groupId>us.codecraft</groupId> <groupId>us.codecraft</groupId>
<artifactId>xsoup</artifactId> <artifactId>xsoup</artifactId>

View File

@ -20,6 +20,12 @@
<artifactId>junit</artifactId> <artifactId>junit</artifactId>
</dependency> </dependency>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>15.0</version>
</dependency>
<dependency> <dependency>
<groupId>org.apache.commons</groupId> <groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId> <artifactId>commons-lang3</artifactId>

View File

@ -214,7 +214,7 @@ public class Spider implements Runnable, Task {
return this; return this;
} }
protected void checkComponent() { protected void initComponent() {
if (downloader == null) { if (downloader == null) {
this.downloader = new HttpClientDownloader(); this.downloader = new HttpClientDownloader();
} }
@ -222,36 +222,27 @@ public class Spider implements Runnable, Task {
pipelines.add(new ConsolePipeline()); pipelines.add(new ConsolePipeline());
} }
downloader.setThread(threadNum); downloader.setThread(threadNum);
} executorService = ThreadUtils.newFixedThreadPool(threadNum);
@Override
public void run() {
if (!stat.compareAndSet(STAT_INIT, STAT_RUNNING) && !stat.compareAndSet(STAT_STOPPED, STAT_RUNNING)) {
throw new IllegalStateException("Spider is already running!");
}
checkComponent();
if (startUrls != null) { if (startUrls != null) {
for (String startUrl : startUrls) { for (String startUrl : startUrls) {
scheduler.push(new Request(startUrl), this); scheduler.push(new Request(startUrl), this);
} }
startUrls.clear(); startUrls.clear();
} }
Request request = scheduler.poll(this); }
@Override
public void run() {
checkRunningStat();
initComponent();
logger.info("Spider " + getUUID() + " started!"); logger.info("Spider " + getUUID() + " started!");
// single thread
if (threadNum <= 1) {
while (request != null && stat.compareAndSet(STAT_RUNNING, STAT_RUNNING)) {
processRequest(request);
request = scheduler.poll(this);
}
} else {
synchronized (this) {
this.executorService = ThreadUtils.newFixedThreadPool(threadNum);
}
// multi thread
final AtomicInteger threadAlive = new AtomicInteger(0); final AtomicInteger threadAlive = new AtomicInteger(0);
while (true && stat.compareAndSet(STAT_RUNNING, STAT_RUNNING)) { while (!Thread.currentThread().isInterrupted() && stat.get() == STAT_RUNNING) {
Request request = scheduler.poll(this);
if (request == null) { if (request == null) {
if (threadAlive.get() == 0) {
break;
}
// when no request found but some thread is alive, sleep a // when no request found but some thread is alive, sleep a
// while. // while.
try { try {
@ -264,25 +255,34 @@ public class Spider implements Runnable, Task {
executorService.execute(new Runnable() { executorService.execute(new Runnable() {
@Override @Override
public void run() { public void run() {
try {
processRequest(requestFinal); processRequest(requestFinal);
} catch (Exception e) {
logger.error("download "+requestFinal+" error",e);
} finally {
threadAlive.decrementAndGet(); threadAlive.decrementAndGet();
} }
}
}); });
} }
request = scheduler.poll(this); }
if (threadAlive.get() == 0) { executorService.shutdown();
request = scheduler.poll(this); stat.set(STAT_STOPPED);
if (request == null) { // release some resources
destroy();
}
private void checkRunningStat() {
while (true) {
int statNow = stat.get();
if (statNow == STAT_RUNNING) {
throw new IllegalStateException("Spider is already running!");
}
if (stat.compareAndSet(statNow, STAT_RUNNING)) {
break; break;
} }
} }
} }
executorService.shutdown();
}
stat.compareAndSet(STAT_RUNNING, STAT_STOPPED);
// release some resources
destroy();
}
protected void destroy() { protected void destroy() {
destroyEach(downloader); destroyEach(downloader);
@ -305,11 +305,10 @@ public class Spider implements Runnable, Task {
/** /**
* Process specific urls without url discovering. * Process specific urls without url discovering.
* *
* @param urls * @param urls urls to process
* urls to process
*/ */
public void test(String... urls) { public void test(String... urls) {
checkComponent(); initComponent();
if (urls.length > 0) { if (urls.length > 0) {
for (String url : urls) { for (String url : urls) {
processRequest(new Request(url)); processRequest(new Request(url));
@ -356,7 +355,7 @@ public class Spider implements Runnable, Task {
} }
protected void checkIfRunning() { protected void checkIfRunning() {
if (!stat.compareAndSet(STAT_INIT, STAT_INIT) && !stat.compareAndSet(STAT_STOPPED, STAT_STOPPED)) { if (stat.get() == STAT_RUNNING) {
throw new IllegalStateException("Spider is already running!"); throw new IllegalStateException("Spider is already running!");
} }
} }

View File

@ -1,5 +1,7 @@
package us.codecraft.webmagic.utils; package us.codecraft.webmagic.utils;
import com.google.common.util.concurrent.MoreExecutors;
import java.util.concurrent.ExecutorService; import java.util.concurrent.ExecutorService;
import java.util.concurrent.SynchronousQueue; import java.util.concurrent.SynchronousQueue;
import java.util.concurrent.ThreadPoolExecutor; import java.util.concurrent.ThreadPoolExecutor;
@ -12,8 +14,12 @@ import java.util.concurrent.TimeUnit;
public class ThreadUtils { public class ThreadUtils {
public static ExecutorService newFixedThreadPool(int threadSize) { public static ExecutorService newFixedThreadPool(int threadSize) {
if (threadSize <= 1) { if (threadSize <= 0) {
throw new IllegalArgumentException("ThreadSize must be greater than 1!"); throw new IllegalArgumentException("ThreadSize must be greater than 0!");
}
if (threadSize == 1) {
return MoreExecutors.sameThreadExecutor();
} }
return new ThreadPoolExecutor(threadSize - 1, threadSize - 1, 0L, TimeUnit.MILLISECONDS, return new ThreadPoolExecutor(threadSize - 1, threadSize - 1, 0L, TimeUnit.MILLISECONDS,
new SynchronousQueue<Runnable>(), new ThreadPoolExecutor.CallerRunsPolicy()); new SynchronousQueue<Runnable>(), new ThreadPoolExecutor.CallerRunsPolicy());

View File

@ -18,7 +18,7 @@ public class SpiderTest {
public void process(ResultItems resultItems, Task task) { public void process(ResultItems resultItems, Task task) {
System.out.println(1); System.out.println(1);
} }
}).thread(2); }).thread(1);
spider.start(); spider.start();
Thread.sleep(10000); Thread.sleep(10000);
spider.stop(); spider.stop();