refactor multi thread code in Spider
parent
dbfb6b5803
commit
a3f9ad198f
5
pom.xml
5
pom.xml
|
@ -63,6 +63,11 @@
|
|||
<artifactId>httpclient</artifactId>
|
||||
<version>4.2.4</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.google.guava</groupId>
|
||||
<artifactId>guava</artifactId>
|
||||
<version>15.0</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>xsoup</artifactId>
|
||||
|
|
|
@ -20,6 +20,12 @@
|
|||
<artifactId>junit</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>com.google.guava</groupId>
|
||||
<artifactId>guava</artifactId>
|
||||
<version>15.0</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.commons</groupId>
|
||||
<artifactId>commons-lang3</artifactId>
|
||||
|
|
|
@ -214,7 +214,7 @@ public class Spider implements Runnable, Task {
|
|||
return this;
|
||||
}
|
||||
|
||||
protected void checkComponent() {
|
||||
protected void initComponent() {
|
||||
if (downloader == null) {
|
||||
this.downloader = new HttpClientDownloader();
|
||||
}
|
||||
|
@ -222,36 +222,27 @@ public class Spider implements Runnable, Task {
|
|||
pipelines.add(new ConsolePipeline());
|
||||
}
|
||||
downloader.setThread(threadNum);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void run() {
|
||||
if (!stat.compareAndSet(STAT_INIT, STAT_RUNNING) && !stat.compareAndSet(STAT_STOPPED, STAT_RUNNING)) {
|
||||
throw new IllegalStateException("Spider is already running!");
|
||||
}
|
||||
checkComponent();
|
||||
executorService = ThreadUtils.newFixedThreadPool(threadNum);
|
||||
if (startUrls != null) {
|
||||
for (String startUrl : startUrls) {
|
||||
scheduler.push(new Request(startUrl), this);
|
||||
}
|
||||
startUrls.clear();
|
||||
}
|
||||
Request request = scheduler.poll(this);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void run() {
|
||||
checkRunningStat();
|
||||
initComponent();
|
||||
logger.info("Spider " + getUUID() + " started!");
|
||||
// single thread
|
||||
if (threadNum <= 1) {
|
||||
while (request != null && stat.compareAndSet(STAT_RUNNING, STAT_RUNNING)) {
|
||||
processRequest(request);
|
||||
request = scheduler.poll(this);
|
||||
}
|
||||
} else {
|
||||
synchronized (this) {
|
||||
this.executorService = ThreadUtils.newFixedThreadPool(threadNum);
|
||||
}
|
||||
// multi thread
|
||||
final AtomicInteger threadAlive = new AtomicInteger(0);
|
||||
while (true && stat.compareAndSet(STAT_RUNNING, STAT_RUNNING)) {
|
||||
while (!Thread.currentThread().isInterrupted() && stat.get() == STAT_RUNNING) {
|
||||
Request request = scheduler.poll(this);
|
||||
if (request == null) {
|
||||
if (threadAlive.get() == 0) {
|
||||
break;
|
||||
}
|
||||
// when no request found but some thread is alive, sleep a
|
||||
// while.
|
||||
try {
|
||||
|
@ -264,25 +255,34 @@ public class Spider implements Runnable, Task {
|
|||
executorService.execute(new Runnable() {
|
||||
@Override
|
||||
public void run() {
|
||||
try {
|
||||
processRequest(requestFinal);
|
||||
} catch (Exception e) {
|
||||
logger.error("download "+requestFinal+" error",e);
|
||||
} finally {
|
||||
threadAlive.decrementAndGet();
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
request = scheduler.poll(this);
|
||||
if (threadAlive.get() == 0) {
|
||||
request = scheduler.poll(this);
|
||||
if (request == null) {
|
||||
}
|
||||
executorService.shutdown();
|
||||
stat.set(STAT_STOPPED);
|
||||
// release some resources
|
||||
destroy();
|
||||
}
|
||||
|
||||
/**
 * Switches {@code stat} to {@code STAT_RUNNING}, refusing to do so when it is
 * already in that state.
 *
 * <p>The method loops: it reads the current value of {@code stat}, throws if that
 * value is {@code STAT_RUNNING}, and otherwise tries to replace the value it just
 * read with {@code STAT_RUNNING} via {@code compareAndSet}. If the compare-and-set
 * fails, the loop starts over with a fresh read.
 *
 * @throws IllegalStateException if {@code stat} is observed to be {@code STAT_RUNNING}
 */
private void checkRunningStat() {
|
||||
// Retry until either the transition succeeds or the running state is observed.
while (true) {
|
||||
int statNow = stat.get();
|
||||
if (statNow == STAT_RUNNING) {
|
||||
throw new IllegalStateException("Spider is already running!");
|
||||
}
|
||||
// Succeeds only if stat still holds the value read above; otherwise loop and re-read.
if (stat.compareAndSet(statNow, STAT_RUNNING)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
executorService.shutdown();
|
||||
}
|
||||
stat.compareAndSet(STAT_RUNNING, STAT_STOPPED);
|
||||
// release some resources
|
||||
destroy();
|
||||
}
|
||||
|
||||
protected void destroy() {
|
||||
destroyEach(downloader);
|
||||
|
@ -305,11 +305,10 @@ public class Spider implements Runnable, Task {
|
|||
/**
|
||||
* Process specific urls without url discovering.
|
||||
*
|
||||
* @param urls
|
||||
* urls to process
|
||||
* @param urls urls to process
|
||||
*/
|
||||
public void test(String... urls) {
|
||||
checkComponent();
|
||||
initComponent();
|
||||
if (urls.length > 0) {
|
||||
for (String url : urls) {
|
||||
processRequest(new Request(url));
|
||||
|
@ -356,7 +355,7 @@ public class Spider implements Runnable, Task {
|
|||
}
|
||||
|
||||
protected void checkIfRunning() {
|
||||
if (!stat.compareAndSet(STAT_INIT, STAT_INIT) && !stat.compareAndSet(STAT_STOPPED, STAT_STOPPED)) {
|
||||
if (stat.get() == STAT_RUNNING) {
|
||||
throw new IllegalStateException("Spider is already running!");
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,5 +1,7 @@
|
|||
package us.codecraft.webmagic.utils;
|
||||
|
||||
import com.google.common.util.concurrent.MoreExecutors;
|
||||
|
||||
import java.util.concurrent.ExecutorService;
|
||||
import java.util.concurrent.SynchronousQueue;
|
||||
import java.util.concurrent.ThreadPoolExecutor;
|
||||
|
@ -12,8 +14,12 @@ import java.util.concurrent.TimeUnit;
|
|||
public class ThreadUtils {
|
||||
|
||||
public static ExecutorService newFixedThreadPool(int threadSize) {
|
||||
if (threadSize <= 1) {
|
||||
throw new IllegalArgumentException("ThreadSize must be greater than 1!");
|
||||
if (threadSize <= 0) {
|
||||
throw new IllegalArgumentException("ThreadSize must be greater than 0!");
|
||||
}
|
||||
if (threadSize == 1) {
|
||||
return MoreExecutors.sameThreadExecutor();
|
||||
|
||||
}
|
||||
return new ThreadPoolExecutor(threadSize - 1, threadSize - 1, 0L, TimeUnit.MILLISECONDS,
|
||||
new SynchronousQueue<Runnable>(), new ThreadPoolExecutor.CallerRunsPolicy());
|
||||
|
|
|
@ -18,7 +18,7 @@ public class SpiderTest {
|
|||
public void process(ResultItems resultItems, Task task) {
|
||||
System.out.println(1);
|
||||
}
|
||||
}).thread(2);
|
||||
}).thread(1);
|
||||
spider.start();
|
||||
Thread.sleep(10000);
|
||||
spider.stop();
|
||||
|
|
Loading…
Reference in New Issue