From cf62d707e0690c10e0ee50a1acf9ce416185c549 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 27 Nov 2013 23:33:18 +0800 Subject: [PATCH] #36 Spider does not exit when success --- .../main/java/us/codecraft/webmagic/Page.java | 3 +- .../java/us/codecraft/webmagic/Spider.java | 10 ++- .../us/codecraft/webmagic/SpiderTest.java | 64 +++++++++++++++++++ .../MockGithubDownloader.java} | 8 ++- .../webmagic/model/GithubRepoTest.java | 4 +- .../processor/GithubRepoProcessor.java | 3 +- 6 files changed, 83 insertions(+), 9 deletions(-) rename webmagic-extension/src/test/java/us/codecraft/webmagic/{MockDownloader.java => downloader/MockGithubDownloader.java} (99%) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java index ab2b544..aeccb5b 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java @@ -186,8 +186,9 @@ public class Page { return rawText; } - public void setRawText(String rawText) { + public Page setRawText(String rawText) { this.rawText = rawText; + return this; } @Override diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 22ca276..9c25ed5 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -98,6 +98,8 @@ public class Spider implements Runnable, Task { private Condition newUrlCondition = newUrlLock.newCondition(); + private final AtomicInteger threadAlive = new AtomicInteger(0); + /** * create a spider with pageProcessor. * @@ -276,6 +278,7 @@ public class Spider implements Runnable, Task { } startRequests.clear(); } + threadAlive.set(0); } @Override @@ -283,7 +286,6 @@ public class Spider implements Runnable, Task { checkRunningStat(); initComponent(); logger.info("Spider " + getUUID() + " started!"); - final AtomicInteger threadAlive = new AtomicInteger(0); while (!Thread.currentThread().isInterrupted() && stat.get() == STAT_RUNNING) { Request request = scheduler.poll(this); if (request == null) { @@ -369,7 +371,7 @@ public class Spider implements Runnable, Task { return; } // for cycle retry - if (page.getHtml() == null) { + if (page.getRawText() == null) { extractAndAddRequests(page); sleep(site.getSleepTime()); return; @@ -485,6 +487,10 @@ public class Spider implements Runnable, Task { private void waitNewUrl() { try { newUrlLock.lock(); + //double check + if (threadAlive.get() == 0 && exitWhenComplete) { + return; + } try { newUrlCondition.await(); } catch (InterruptedException e) { diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/SpiderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/SpiderTest.java index 3add86c..9d950ae 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/SpiderTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/SpiderTest.java @@ -2,8 +2,14 @@ package us.codecraft.webmagic; import org.junit.Ignore; import org.junit.Test; +import us.codecraft.webmagic.downloader.Downloader; import us.codecraft.webmagic.pipeline.Pipeline; +import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.processor.SimplePageProcessor; +import us.codecraft.webmagic.scheduler.Scheduler; + +import java.util.Random; +import java.util.concurrent.atomic.AtomicInteger; /** * @author code4crafter@gmail.com @@ -26,4 +32,62 @@ public class SpiderTest { spider.start(); Thread.sleep(10000); } + + @Ignore("long time") + @Test + public void testWaitAndNotify() throws InterruptedException { + for (int i = 0; i < 10000; i++) { + System.out.println("round" + i); + testRound(); + } + } + + private void testRound() { + Spider spider = Spider.create(new PageProcessor() { + + private AtomicInteger count = new AtomicInteger(); + + @Override + public void process(Page page) { + page.setSkip(true); + } + + @Override + public Site getSite() { + return Site.me().setSleepTime(0); + } + }).setDownloader(new Downloader() { + @Override + public Page download(Request request, Task task) { + return new Page().setRawText(""); + } + + @Override + public void setThread(int threadNum) { + + } + }).setScheduler(new Scheduler() { + + private AtomicInteger count = new AtomicInteger(); + + private Random random = new Random(); + + @Override + public void push(Request request, Task task) { + + } + + @Override + public synchronized Request poll(Task task) { + if (count.incrementAndGet() > 1000) { + return null; + } + if (random.nextInt(100)>90){ + return null; + } + return new Request("test"); + } + }).thread(10); + spider.run(); + } } diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/MockDownloader.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java similarity index 99% rename from webmagic-extension/src/test/java/us/codecraft/webmagic/MockDownloader.java rename to webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java index aa62e9e..49774f1 100644 --- a/webmagic-extension/src/test/java/us/codecraft/webmagic/MockDownloader.java +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java @@ -1,13 +1,15 @@ -package us.codecraft.webmagic; +package us.codecraft.webmagic.downloader; -import us.codecraft.webmagic.downloader.Downloader; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Task; import us.codecraft.webmagic.selector.Html; import us.codecraft.webmagic.selector.PlainText; /** * @author code4crafter@gmail.com */ -public class MockDownloader implements Downloader{ +public class MockGithubDownloader implements Downloader{ private String html = "\n" + "\n" + diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepoTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepoTest.java index b719bf0..85b6858 100644 --- a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepoTest.java +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepoTest.java @@ -2,7 +2,7 @@ package us.codecraft.webmagic.model; import junit.framework.Assert; import org.junit.Test; -import us.codecraft.webmagic.MockDownloader; +import us.codecraft.webmagic.downloader.MockGithubDownloader; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.example.GithubRepo; @@ -22,6 +22,6 @@ public class GithubRepoTest { Assert.assertEquals(86, o.getStar()); Assert.assertEquals(70, o.getFork()); } - }, GithubRepo.class).setDownloader(new MockDownloader()).test("https://github.com/code4craft/webmagic"); + }, GithubRepo.class).setDownloader(new MockGithubDownloader()).test("https://github.com/code4craft/webmagic"); } } diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/processor/GithubRepoProcessor.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/processor/GithubRepoProcessor.java index 02b2ac1..bf9e381 100644 --- a/webmagic-extension/src/test/java/us/codecraft/webmagic/processor/GithubRepoProcessor.java +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/processor/GithubRepoProcessor.java @@ -3,6 +3,7 @@ package us.codecraft.webmagic.processor; import junit.framework.Assert; import org.junit.Test; import us.codecraft.webmagic.*; +import us.codecraft.webmagic.downloader.MockGithubDownloader; import us.codecraft.webmagic.model.OOSpider; import us.codecraft.webmagic.pipeline.Pipeline; @@ -29,7 +30,7 @@ public class GithubRepoProcessor implements PageProcessor { Assert.assertEquals("78",((String)resultItems.get("star")).trim()); Assert.assertEquals("65",((String)resultItems.get("fork")).trim()); } - }).setDownloader(new MockDownloader()).test("https://github.com/code4craft/webmagic"); + }).setDownloader(new MockGithubDownloader()).test("https://github.com/code4craft/webmagic"); } }