#36 Spider does not exit when success
parent
a01312930a
commit
cf62d707e0
|
@ -186,8 +186,9 @@ public class Page {
|
||||||
return rawText;
|
return rawText;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setRawText(String rawText) {
|
public Page setRawText(String rawText) {
|
||||||
this.rawText = rawText;
|
this.rawText = rawText;
|
||||||
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -98,6 +98,8 @@ public class Spider implements Runnable, Task {
|
||||||
|
|
||||||
private Condition newUrlCondition = newUrlLock.newCondition();
|
private Condition newUrlCondition = newUrlLock.newCondition();
|
||||||
|
|
||||||
|
private final AtomicInteger threadAlive = new AtomicInteger(0);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* create a spider with pageProcessor.
|
* create a spider with pageProcessor.
|
||||||
*
|
*
|
||||||
|
@ -276,6 +278,7 @@ public class Spider implements Runnable, Task {
|
||||||
}
|
}
|
||||||
startRequests.clear();
|
startRequests.clear();
|
||||||
}
|
}
|
||||||
|
threadAlive.set(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -283,7 +286,6 @@ public class Spider implements Runnable, Task {
|
||||||
checkRunningStat();
|
checkRunningStat();
|
||||||
initComponent();
|
initComponent();
|
||||||
logger.info("Spider " + getUUID() + " started!");
|
logger.info("Spider " + getUUID() + " started!");
|
||||||
final AtomicInteger threadAlive = new AtomicInteger(0);
|
|
||||||
while (!Thread.currentThread().isInterrupted() && stat.get() == STAT_RUNNING) {
|
while (!Thread.currentThread().isInterrupted() && stat.get() == STAT_RUNNING) {
|
||||||
Request request = scheduler.poll(this);
|
Request request = scheduler.poll(this);
|
||||||
if (request == null) {
|
if (request == null) {
|
||||||
|
@ -369,7 +371,7 @@ public class Spider implements Runnable, Task {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
// for cycle retry
|
// for cycle retry
|
||||||
if (page.getHtml() == null) {
|
if (page.getRawText() == null) {
|
||||||
extractAndAddRequests(page);
|
extractAndAddRequests(page);
|
||||||
sleep(site.getSleepTime());
|
sleep(site.getSleepTime());
|
||||||
return;
|
return;
|
||||||
|
@ -485,6 +487,10 @@ public class Spider implements Runnable, Task {
|
||||||
private void waitNewUrl() {
|
private void waitNewUrl() {
|
||||||
try {
|
try {
|
||||||
newUrlLock.lock();
|
newUrlLock.lock();
|
||||||
|
//double check
|
||||||
|
if (threadAlive.get() == 0 && exitWhenComplete) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
try {
|
try {
|
||||||
newUrlCondition.await();
|
newUrlCondition.await();
|
||||||
} catch (InterruptedException e) {
|
} catch (InterruptedException e) {
|
||||||
|
|
|
@ -2,8 +2,14 @@ package us.codecraft.webmagic;
|
||||||
|
|
||||||
import org.junit.Ignore;
|
import org.junit.Ignore;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
import us.codecraft.webmagic.downloader.Downloader;
|
||||||
import us.codecraft.webmagic.pipeline.Pipeline;
|
import us.codecraft.webmagic.pipeline.Pipeline;
|
||||||
|
import us.codecraft.webmagic.processor.PageProcessor;
|
||||||
import us.codecraft.webmagic.processor.SimplePageProcessor;
|
import us.codecraft.webmagic.processor.SimplePageProcessor;
|
||||||
|
import us.codecraft.webmagic.scheduler.Scheduler;
|
||||||
|
|
||||||
|
import java.util.Random;
|
||||||
|
import java.util.concurrent.atomic.AtomicInteger;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @author code4crafter@gmail.com
|
* @author code4crafter@gmail.com
|
||||||
|
@ -26,4 +32,62 @@ public class SpiderTest {
|
||||||
spider.start();
|
spider.start();
|
||||||
Thread.sleep(10000);
|
Thread.sleep(10000);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Ignore("long time")
|
||||||
|
@Test
|
||||||
|
public void testWaitAndNotify() throws InterruptedException {
|
||||||
|
for (int i = 0; i < 10000; i++) {
|
||||||
|
System.out.println("round" + i);
|
||||||
|
testRound();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void testRound() {
|
||||||
|
Spider spider = Spider.create(new PageProcessor() {
|
||||||
|
|
||||||
|
private AtomicInteger count = new AtomicInteger();
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void process(Page page) {
|
||||||
|
page.setSkip(true);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Site getSite() {
|
||||||
|
return Site.me().setSleepTime(0);
|
||||||
|
}
|
||||||
|
}).setDownloader(new Downloader() {
|
||||||
|
@Override
|
||||||
|
public Page download(Request request, Task task) {
|
||||||
|
return new Page().setRawText("");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void setThread(int threadNum) {
|
||||||
|
|
||||||
|
}
|
||||||
|
}).setScheduler(new Scheduler() {
|
||||||
|
|
||||||
|
private AtomicInteger count = new AtomicInteger();
|
||||||
|
|
||||||
|
private Random random = new Random();
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void push(Request request, Task task) {
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public synchronized Request poll(Task task) {
|
||||||
|
if (count.incrementAndGet() > 1000) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
if (random.nextInt(100)>90){
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
return new Request("test");
|
||||||
|
}
|
||||||
|
}).thread(10);
|
||||||
|
spider.run();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,13 +1,15 @@
|
||||||
package us.codecraft.webmagic;
|
package us.codecraft.webmagic.downloader;
|
||||||
|
|
||||||
import us.codecraft.webmagic.downloader.Downloader;
|
import us.codecraft.webmagic.Page;
|
||||||
|
import us.codecraft.webmagic.Request;
|
||||||
|
import us.codecraft.webmagic.Task;
|
||||||
import us.codecraft.webmagic.selector.Html;
|
import us.codecraft.webmagic.selector.Html;
|
||||||
import us.codecraft.webmagic.selector.PlainText;
|
import us.codecraft.webmagic.selector.PlainText;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @author code4crafter@gmail.com
|
* @author code4crafter@gmail.com
|
||||||
*/
|
*/
|
||||||
public class MockDownloader implements Downloader{
|
public class MockGithubDownloader implements Downloader{
|
||||||
|
|
||||||
private String html = "\n" +
|
private String html = "\n" +
|
||||||
"\n" +
|
"\n" +
|
|
@ -2,7 +2,7 @@ package us.codecraft.webmagic.model;
|
||||||
|
|
||||||
import junit.framework.Assert;
|
import junit.framework.Assert;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
import us.codecraft.webmagic.MockDownloader;
|
import us.codecraft.webmagic.downloader.MockGithubDownloader;
|
||||||
import us.codecraft.webmagic.Site;
|
import us.codecraft.webmagic.Site;
|
||||||
import us.codecraft.webmagic.Task;
|
import us.codecraft.webmagic.Task;
|
||||||
import us.codecraft.webmagic.example.GithubRepo;
|
import us.codecraft.webmagic.example.GithubRepo;
|
||||||
|
@ -22,6 +22,6 @@ public class GithubRepoTest {
|
||||||
Assert.assertEquals(86, o.getStar());
|
Assert.assertEquals(86, o.getStar());
|
||||||
Assert.assertEquals(70, o.getFork());
|
Assert.assertEquals(70, o.getFork());
|
||||||
}
|
}
|
||||||
}, GithubRepo.class).setDownloader(new MockDownloader()).test("https://github.com/code4craft/webmagic");
|
}, GithubRepo.class).setDownloader(new MockGithubDownloader()).test("https://github.com/code4craft/webmagic");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -3,6 +3,7 @@ package us.codecraft.webmagic.processor;
|
||||||
import junit.framework.Assert;
|
import junit.framework.Assert;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
import us.codecraft.webmagic.*;
|
import us.codecraft.webmagic.*;
|
||||||
|
import us.codecraft.webmagic.downloader.MockGithubDownloader;
|
||||||
import us.codecraft.webmagic.model.OOSpider;
|
import us.codecraft.webmagic.model.OOSpider;
|
||||||
import us.codecraft.webmagic.pipeline.Pipeline;
|
import us.codecraft.webmagic.pipeline.Pipeline;
|
||||||
|
|
||||||
|
@ -29,7 +30,7 @@ public class GithubRepoProcessor implements PageProcessor {
|
||||||
Assert.assertEquals("78",((String)resultItems.get("star")).trim());
|
Assert.assertEquals("78",((String)resultItems.get("star")).trim());
|
||||||
Assert.assertEquals("65",((String)resultItems.get("fork")).trim());
|
Assert.assertEquals("65",((String)resultItems.get("fork")).trim());
|
||||||
}
|
}
|
||||||
}).setDownloader(new MockDownloader()).test("https://github.com/code4craft/webmagic");
|
}).setDownloader(new MockGithubDownloader()).test("https://github.com/code4craft/webmagic");
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue