diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java index 77db2c1..6cf344f 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java @@ -1,5 +1,6 @@ package us.codecraft.webmagic; +import java.io.Serializable; import java.util.HashMap; import java.util.Map; @@ -26,7 +27,9 @@ import java.util.Map; * Date: 13-4-21 * Time: 上午11:37 */ -public class Request { +public class Request implements Serializable { + + private static final long serialVersionUID = 2062192774891352043L; private String url; @@ -40,7 +43,7 @@ public class Request { /** * 构建一个request对象 * - * @param url 必须参数,待抓取的url + * @param url 必须参数,待抓取的url */ public Request(String url) { this.url = url; @@ -56,17 +59,17 @@ public class Request { } public Object getExtra(String key) { - if (extras==null){ + if (extras == null) { return null; } return extras.get(key); } - public Request putExtra(String key,Object value) { - if (extras==null){ + public Request putExtra(String key, Object value) { + if (extras == null) { extras = new HashMap(); } - extras.put(key,value); + extras.put(key, value); return this; } @@ -91,6 +94,10 @@ public class Request { return true; } + public Map getExtras() { + return extras; + } + @Override public int hashCode() { return url.hashCode(); diff --git a/webmagic-plugin/webmagic-misc/pom.xml b/webmagic-plugin/webmagic-misc/pom.xml index c545615..4d8776c 100644 --- a/webmagic-plugin/webmagic-misc/pom.xml +++ b/webmagic-plugin/webmagic-misc/pom.xml @@ -17,6 +17,11 @@ freemarker 2.3.15 + + org.resthub + hessian + 4.0.8 + redis.clients jedis diff --git a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler/HessianSerializer.java b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler/HessianSerializer.java new file mode 100644 index 0000000..68cb5bb --- /dev/null +++ b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler/HessianSerializer.java @@ -0,0 +1,33 @@ +package us.codecraft.webmagic.scheduler; + +import com.caucho.hessian.io.Hessian2Input; +import com.caucho.hessian.io.Hessian2Output; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; + +/** + * @author yihua.huang@dianping.com
+ * @date: 13-7-14
+ * Time: 下午9:20
+ */ +public enum HessianSerializer { + INSTANCE; + public byte[] serialize(T v) throws IOException { + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + Hessian2Output hessian2Output = new Hessian2Output(baos); + hessian2Output.writeObject(v); + hessian2Output.close(); + return baos.toByteArray(); + } + + @SuppressWarnings("unchecked") + public T deSerialize(byte[] bytes) throws IOException { + ByteArrayInputStream bais = new ByteArrayInputStream(bytes); + Hessian2Input hessian2Input = new Hessian2Input(bais); + T t = (T) hessian2Input.readObject(); + hessian2Input.close(); + return t; + } +} diff --git a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java index 8109ad1..c00c12f 100644 --- a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java +++ b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java @@ -1,5 +1,6 @@ package us.codecraft.webmagic.scheduler; +import org.apache.commons.codec.digest.DigestUtils; import redis.clients.jedis.Jedis; import redis.clients.jedis.JedisPool; import redis.clients.jedis.JedisPoolConfig; @@ -7,6 +8,8 @@ import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.schedular.Scheduler; +import java.io.IOException; + /** * 使用redis管理url,构建一个分布式的爬虫。
* @@ -22,6 +25,8 @@ public class RedisScheduler implements Scheduler { private static final String SET_PREFIX = "set_"; + private static final String ITEM_PREFIX = "item_"; + public RedisScheduler(String host) { pool = new JedisPool(new JedisPoolConfig(), host); } @@ -34,6 +39,15 @@ public class RedisScheduler implements Scheduler { //使用List保存队列 jedis.rpush(QUEUE_PREFIX + task.getUUID(), request.getUrl()); jedis.zadd(SET_PREFIX + task.getUUID(), request.getPriority(), request.getUrl()); + if (request.getExtras() != null) { + String key = ITEM_PREFIX + DigestUtils.shaHex(request.getUrl()); + try { + byte[] serialize = HessianSerializer.INSTANCE.serialize(request); + jedis.set(key.getBytes(), serialize); + } catch (IOException e) { + e.printStackTrace(); + } + } } pool.returnResource(jedis); } @@ -42,8 +56,16 @@ public class RedisScheduler implements Scheduler { public synchronized Request poll(Task task) { Jedis jedis = pool.getResource(); String url = jedis.lpop(QUEUE_PREFIX + task.getUUID()); + String key = ITEM_PREFIX + DigestUtils.shaHex(url); + byte[] bytes = jedis.get(key.getBytes()); + try { + Object o = HessianSerializer.INSTANCE.deSerialize(bytes); + return (Request)o; + } catch (Exception e) { + e.printStackTrace(); + } pool.returnResource(jedis); - if (url==null){ + if (url == null) { return null; } return new Request(url);