From 2c97dd90c7d60624864d5d2c2b2e93f725940e76 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 26 Jul 2013 19:02:41 +0800 Subject: [PATCH] fix redisScheduler thread problem --- .../webmagic/scheduler/RedisScheduler.java | 42 +++++++++++++++---- .../webmagic/selenium/SeleniumTest.java | 16 ++++++- 2 files changed, 50 insertions(+), 8 deletions(-) diff --git a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java index 481981d..575beef 100644 --- a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java +++ b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java @@ -7,13 +7,17 @@ import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.schedular.Scheduler; +import java.util.concurrent.locks.Condition; +import java.util.concurrent.locks.ReentrantLock; + /** * 使用redis管理url,构建一个分布式的爬虫。
+ * * @author yihua.huang@dianping.com
* @date: 13-7-25
* Time: 上午7:07
*/ -public class RedisScheduler implements Scheduler{ +public class RedisScheduler implements Scheduler { private JedisPool pool; @@ -21,7 +25,11 @@ public class RedisScheduler implements Scheduler{ private static final String SET_PREFIX = "set_"; - public RedisScheduler(String host){ + private ReentrantLock lock = new ReentrantLock(); + + private Condition condition = lock.newCondition(); + + public RedisScheduler(String host) { pool = new JedisPool(new JedisPoolConfig(), host); } @@ -29,10 +37,16 @@ public class RedisScheduler implements Scheduler{ public synchronized void push(Request request, Task task) { Jedis jedis = pool.getResource(); //使用SortedSet进行url去重 - if (jedis.zrank(SET_PREFIX+task.getUUID(),request.getUrl())==null){ - //使用List保存队列 - jedis.rpush(QUEUE_PREFIX+task.getUUID(),request.getUrl()); - jedis.zadd(SET_PREFIX+task.getUUID(),System.currentTimeMillis(),request.getUrl()); + if (jedis.zrank(SET_PREFIX + task.getUUID(), request.getUrl()) == null) { + try { + lock.lock(); + //使用List保存队列 + jedis.rpush(QUEUE_PREFIX + task.getUUID(), request.getUrl()); + jedis.zadd(SET_PREFIX + task.getUUID(), System.currentTimeMillis(), request.getUrl()); + condition.signal(); + } finally { + lock.unlock(); + } } pool.returnResource(jedis); } @@ -40,7 +54,21 @@ public class RedisScheduler implements Scheduler{ @Override public synchronized Request poll(Task task) { Jedis jedis = pool.getResource(); - String url = jedis.lpop(QUEUE_PREFIX+task.getUUID()); + String url = jedis.lpop(QUEUE_PREFIX + task.getUUID()); + if (url == null) { + try { + lock.lock(); + while (url == null) { + try { + condition.await(); + url = jedis.lpop(QUEUE_PREFIX + task.getUUID()); + } catch (InterruptedException e) { + } + } + } finally { + lock.unlock(); + } + } pool.returnResource(jedis); return new Request(url); } diff --git a/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/SeleniumTest.java b/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/SeleniumTest.java index 6cf50c3..a403b91 100644 --- a/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/SeleniumTest.java +++ b/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/SeleniumTest.java @@ -6,6 +6,11 @@ import org.openqa.selenium.By; import org.openqa.selenium.WebDriver; import org.openqa.selenium.WebElement; import org.openqa.selenium.chrome.ChromeDriver; +import org.openqa.selenium.remote.DesiredCapabilities; + +import java.util.Arrays; +import java.util.HashMap; +import java.util.Map; /** * @author yihua.huang@dianping.com
@@ -18,7 +23,16 @@ public class SeleniumTest { @Test public void testSelenium() { System.getProperties().setProperty("webdriver.chrome.driver", "/Users/yihua/Downloads/chromedriver"); - WebDriver webDriver = new ChromeDriver(); + Map contentSettings = new HashMap(); + contentSettings.put("images", 2); + + Map preferences = new HashMap(); + preferences.put("profile.default_content_settings", contentSettings); + + DesiredCapabilities caps = DesiredCapabilities.chrome(); + caps.setCapability("chrome.prefs", preferences); + caps.setCapability("chrome.switches", Arrays.asList("--user-data-dir=/Users/yihua/temp/chrome")); + WebDriver webDriver = new ChromeDriver(caps); webDriver.get("http://huaban.com/"); WebElement webElement = webDriver.findElement(By.xpath("/html")); System.out.println(webElement.getAttribute("outerHTML"));