#36 Spider does not exit on success
parent
a01312930a
commit
cf62d707e0
|
@ -186,8 +186,9 @@ public class Page {
|
|||
return rawText;
|
||||
}
|
||||
|
||||
public void setRawText(String rawText) {
|
||||
public Page setRawText(String rawText) {
|
||||
this.rawText = rawText;
|
||||
return this;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -98,6 +98,8 @@ public class Spider implements Runnable, Task {
|
|||
|
||||
private Condition newUrlCondition = newUrlLock.newCondition();
|
||||
|
||||
private final AtomicInteger threadAlive = new AtomicInteger(0);
|
||||
|
||||
/**
|
||||
* create a spider with pageProcessor.
|
||||
*
|
||||
|
@ -276,6 +278,7 @@ public class Spider implements Runnable, Task {
|
|||
}
|
||||
startRequests.clear();
|
||||
}
|
||||
threadAlive.set(0);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -283,7 +286,6 @@ public class Spider implements Runnable, Task {
|
|||
checkRunningStat();
|
||||
initComponent();
|
||||
logger.info("Spider " + getUUID() + " started!");
|
||||
final AtomicInteger threadAlive = new AtomicInteger(0);
|
||||
while (!Thread.currentThread().isInterrupted() && stat.get() == STAT_RUNNING) {
|
||||
Request request = scheduler.poll(this);
|
||||
if (request == null) {
|
||||
|
@ -369,7 +371,7 @@ public class Spider implements Runnable, Task {
|
|||
return;
|
||||
}
|
||||
// for cycle retry
|
||||
if (page.getHtml() == null) {
|
||||
if (page.getRawText() == null) {
|
||||
extractAndAddRequests(page);
|
||||
sleep(site.getSleepTime());
|
||||
return;
|
||||
|
@ -485,6 +487,10 @@ public class Spider implements Runnable, Task {
|
|||
private void waitNewUrl() {
|
||||
try {
|
||||
newUrlLock.lock();
|
||||
//double check
|
||||
if (threadAlive.get() == 0 && exitWhenComplete) {
|
||||
return;
|
||||
}
|
||||
try {
|
||||
newUrlCondition.await();
|
||||
} catch (InterruptedException e) {
|
||||
|
|
|
@ -2,8 +2,14 @@ package us.codecraft.webmagic;
|
|||
|
||||
import org.junit.Ignore;
|
||||
import org.junit.Test;
|
||||
import us.codecraft.webmagic.downloader.Downloader;
|
||||
import us.codecraft.webmagic.pipeline.Pipeline;
|
||||
import us.codecraft.webmagic.processor.PageProcessor;
|
||||
import us.codecraft.webmagic.processor.SimplePageProcessor;
|
||||
import us.codecraft.webmagic.scheduler.Scheduler;
|
||||
|
||||
import java.util.Random;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com
|
||||
|
@ -26,4 +32,62 @@ public class SpiderTest {
|
|||
spider.start();
|
||||
Thread.sleep(10000);
|
||||
}
|
||||
|
||||
@Ignore("long time")
|
||||
@Test
|
||||
public void testWaitAndNotify() throws InterruptedException {
|
||||
for (int i = 0; i < 10000; i++) {
|
||||
System.out.println("round" + i);
|
||||
testRound();
|
||||
}
|
||||
}
|
||||
|
||||
private void testRound() {
|
||||
Spider spider = Spider.create(new PageProcessor() {
|
||||
|
||||
private AtomicInteger count = new AtomicInteger();
|
||||
|
||||
@Override
|
||||
public void process(Page page) {
|
||||
page.setSkip(true);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Site getSite() {
|
||||
return Site.me().setSleepTime(0);
|
||||
}
|
||||
}).setDownloader(new Downloader() {
|
||||
@Override
|
||||
public Page download(Request request, Task task) {
|
||||
return new Page().setRawText("");
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setThread(int threadNum) {
|
||||
|
||||
}
|
||||
}).setScheduler(new Scheduler() {
|
||||
|
||||
private AtomicInteger count = new AtomicInteger();
|
||||
|
||||
private Random random = new Random();
|
||||
|
||||
@Override
|
||||
public void push(Request request, Task task) {
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public synchronized Request poll(Task task) {
|
||||
if (count.incrementAndGet() > 1000) {
|
||||
return null;
|
||||
}
|
||||
if (random.nextInt(100)>90){
|
||||
return null;
|
||||
}
|
||||
return new Request("test");
|
||||
}
|
||||
}).thread(10);
|
||||
spider.run();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,13 +1,15 @@
|
|||
package us.codecraft.webmagic;
|
||||
package us.codecraft.webmagic.downloader;
|
||||
|
||||
import us.codecraft.webmagic.downloader.Downloader;
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.Request;
|
||||
import us.codecraft.webmagic.Task;
|
||||
import us.codecraft.webmagic.selector.Html;
|
||||
import us.codecraft.webmagic.selector.PlainText;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com
|
||||
*/
|
||||
public class MockDownloader implements Downloader{
|
||||
public class MockGithubDownloader implements Downloader{
|
||||
|
||||
private String html = "\n" +
|
||||
"\n" +
|
|
@ -2,7 +2,7 @@ package us.codecraft.webmagic.model;
|
|||
|
||||
import junit.framework.Assert;
|
||||
import org.junit.Test;
|
||||
import us.codecraft.webmagic.MockDownloader;
|
||||
import us.codecraft.webmagic.downloader.MockGithubDownloader;
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.Task;
|
||||
import us.codecraft.webmagic.example.GithubRepo;
|
||||
|
@ -22,6 +22,6 @@ public class GithubRepoTest {
|
|||
Assert.assertEquals(86, o.getStar());
|
||||
Assert.assertEquals(70, o.getFork());
|
||||
}
|
||||
}, GithubRepo.class).setDownloader(new MockDownloader()).test("https://github.com/code4craft/webmagic");
|
||||
}, GithubRepo.class).setDownloader(new MockGithubDownloader()).test("https://github.com/code4craft/webmagic");
|
||||
}
|
||||
}
|
||||
|
|
|
@ -3,6 +3,7 @@ package us.codecraft.webmagic.processor;
|
|||
import junit.framework.Assert;
|
||||
import org.junit.Test;
|
||||
import us.codecraft.webmagic.*;
|
||||
import us.codecraft.webmagic.downloader.MockGithubDownloader;
|
||||
import us.codecraft.webmagic.model.OOSpider;
|
||||
import us.codecraft.webmagic.pipeline.Pipeline;
|
||||
|
||||
|
@ -29,7 +30,7 @@ public class GithubRepoProcessor implements PageProcessor {
|
|||
Assert.assertEquals("78",((String)resultItems.get("star")).trim());
|
||||
Assert.assertEquals("65",((String)resultItems.get("fork")).trim());
|
||||
}
|
||||
}).setDownloader(new MockDownloader()).test("https://github.com/code4craft/webmagic");
|
||||
}).setDownloader(new MockGithubDownloader()).test("https://github.com/code4craft/webmagic");
|
||||
}
|
||||
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue