add exit when complete option
parent
352887870c
commit
b4fcf41168
|
@ -18,6 +18,8 @@ import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.concurrent.ExecutorService;
|
import java.util.concurrent.ExecutorService;
|
||||||
import java.util.concurrent.atomic.AtomicInteger;
|
import java.util.concurrent.atomic.AtomicInteger;
|
||||||
|
import java.util.concurrent.locks.Condition;
|
||||||
|
import java.util.concurrent.locks.ReentrantLock;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Entrance of a crawler.<br>
|
* Entrance of a crawler.<br>
|
||||||
|
@ -74,7 +76,7 @@ public class Spider implements Runnable, Task {
|
||||||
|
|
||||||
protected AtomicInteger stat = new AtomicInteger(STAT_INIT);
|
protected AtomicInteger stat = new AtomicInteger(STAT_INIT);
|
||||||
|
|
||||||
protected boolean exitWhenComplete = false;
|
protected boolean exitWhenComplete = true;
|
||||||
|
|
||||||
protected final static int STAT_INIT = 0;
|
protected final static int STAT_INIT = 0;
|
||||||
|
|
||||||
|
@ -82,6 +84,10 @@ public class Spider implements Runnable, Task {
|
||||||
|
|
||||||
protected final static int STAT_STOPPED = 2;
|
protected final static int STAT_STOPPED = 2;
|
||||||
|
|
||||||
|
private ReentrantLock newUrlLock = new ReentrantLock();
|
||||||
|
|
||||||
|
private Condition newUrlCondition = newUrlLock.newCondition();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* create a spider with pageProcessor.
|
* create a spider with pageProcessor.
|
||||||
*
|
*
|
||||||
|
@ -245,11 +251,15 @@ public class Spider implements Runnable, Task {
|
||||||
if (threadAlive.get() == 0 && exitWhenComplete) {
|
if (threadAlive.get() == 0 && exitWhenComplete) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
// when no request found but some thread is alive, sleep a
|
// wait until new url added
|
||||||
// while.
|
|
||||||
try {
|
try {
|
||||||
Thread.sleep(100);
|
newUrlLock.lock();
|
||||||
} catch (InterruptedException e) {
|
try {
|
||||||
|
newUrlCondition.await();
|
||||||
|
} catch (InterruptedException e) {
|
||||||
|
}
|
||||||
|
} finally {
|
||||||
|
newUrlLock.unlock();
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
final Request requestFinal = request;
|
final Request requestFinal = request;
|
||||||
|
@ -263,6 +273,7 @@ public class Spider implements Runnable, Task {
|
||||||
logger.error("download " + requestFinal + " error", e);
|
logger.error("download " + requestFinal + " error", e);
|
||||||
} finally {
|
} finally {
|
||||||
threadAlive.decrementAndGet();
|
threadAlive.decrementAndGet();
|
||||||
|
signalNewUrl();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
@ -351,11 +362,16 @@ public class Spider implements Runnable, Task {
|
||||||
protected void addRequest(Page page) {
|
protected void addRequest(Page page) {
|
||||||
if (CollectionUtils.isNotEmpty(page.getTargetRequests())) {
|
if (CollectionUtils.isNotEmpty(page.getTargetRequests())) {
|
||||||
for (Request request : page.getTargetRequests()) {
|
for (Request request : page.getTargetRequests()) {
|
||||||
scheduler.push(request, this);
|
addRequest(request);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private void addRequest(Request request) {
|
||||||
|
scheduler.push(request, this);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
protected void checkIfRunning() {
|
protected void checkIfRunning() {
|
||||||
if (stat.get() == STAT_RUNNING) {
|
if (stat.get() == STAT_RUNNING) {
|
||||||
throw new IllegalStateException("Spider is already running!");
|
throw new IllegalStateException("Spider is already running!");
|
||||||
|
@ -368,6 +384,29 @@ public class Spider implements Runnable, Task {
|
||||||
thread.start();
|
thread.start();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Add urls to crawl.<br/>
|
||||||
|
*
|
||||||
|
* @param urls
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public Spider addUrl(String... urls) {
|
||||||
|
for (String url : urls) {
|
||||||
|
addRequest(new Request(url));
|
||||||
|
}
|
||||||
|
signalNewUrl();
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void signalNewUrl() {
|
||||||
|
try {
|
||||||
|
newUrlLock.lock();
|
||||||
|
newUrlCondition.signalAll();
|
||||||
|
} finally {
|
||||||
|
newUrlLock.unlock();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
public void start() {
|
public void start() {
|
||||||
runAsync();
|
runAsync();
|
||||||
}
|
}
|
||||||
|
|
|
@ -34,6 +34,6 @@ public class OschinaBlogPageProcesser implements PageProcessor {
|
||||||
}
|
}
|
||||||
|
|
||||||
public static void main(String[] args) {
|
public static void main(String[] args) {
|
||||||
Spider.create(new OschinaBlogPageProcesser()).thread(2).run();
|
Spider.create(new OschinaBlogPageProcesser()).thread(10).run();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue