complete redis support
parent
f3a29d9315
commit
b0af45f4bb
|
@ -1,5 +1,6 @@
|
|||
package us.codecraft.webmagic;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
|
@ -26,7 +27,9 @@ import java.util.Map;
|
|||
* Date: 13-4-21
|
||||
* Time: 上午11:37
|
||||
*/
|
||||
public class Request {
|
||||
public class Request implements Serializable {
|
||||
|
||||
private static final long serialVersionUID = 2062192774891352043L;
|
||||
|
||||
private String url;
|
||||
|
||||
|
@ -40,7 +43,7 @@ public class Request {
|
|||
/**
|
||||
* 构建一个request对象
|
||||
*
|
||||
* @param url 必须参数,待抓取的url
|
||||
* @param url 必须参数,待抓取的url
|
||||
*/
|
||||
public Request(String url) {
|
||||
this.url = url;
|
||||
|
@ -56,17 +59,17 @@ public class Request {
|
|||
}
|
||||
|
||||
public Object getExtra(String key) {
|
||||
if (extras==null){
|
||||
if (extras == null) {
|
||||
return null;
|
||||
}
|
||||
return extras.get(key);
|
||||
}
|
||||
|
||||
public Request putExtra(String key,Object value) {
|
||||
if (extras==null){
|
||||
public Request putExtra(String key, Object value) {
|
||||
if (extras == null) {
|
||||
extras = new HashMap<String, Object>();
|
||||
}
|
||||
extras.put(key,value);
|
||||
extras.put(key, value);
|
||||
return this;
|
||||
}
|
||||
|
||||
|
@ -91,6 +94,10 @@ public class Request {
|
|||
return true;
|
||||
}
|
||||
|
||||
public Map<String, Object> getExtras() {
|
||||
return extras;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return url.hashCode();
|
||||
|
|
|
@ -17,6 +17,11 @@
|
|||
<artifactId>freemarker</artifactId>
|
||||
<version>2.3.15</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.resthub</groupId>
|
||||
<artifactId>hessian</artifactId>
|
||||
<version>4.0.8</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>redis.clients</groupId>
|
||||
<artifactId>jedis</artifactId>
|
||||
|
|
|
@ -0,0 +1,33 @@
|
|||
package us.codecraft.webmagic.scheduler;
|
||||
|
||||
import com.caucho.hessian.io.Hessian2Input;
|
||||
import com.caucho.hessian.io.Hessian2Output;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* @author yihua.huang@dianping.com <br>
|
||||
* @date: 13-7-14 <br>
|
||||
* Time: 下午9:20 <br>
|
||||
*/
|
||||
public enum HessianSerializer {
|
||||
INSTANCE;
|
||||
public <T> byte[] serialize(T v) throws IOException {
|
||||
ByteArrayOutputStream baos = new ByteArrayOutputStream();
|
||||
Hessian2Output hessian2Output = new Hessian2Output(baos);
|
||||
hessian2Output.writeObject(v);
|
||||
hessian2Output.close();
|
||||
return baos.toByteArray();
|
||||
}
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
public <T> T deSerialize(byte[] bytes) throws IOException {
|
||||
ByteArrayInputStream bais = new ByteArrayInputStream(bytes);
|
||||
Hessian2Input hessian2Input = new Hessian2Input(bais);
|
||||
T t = (T) hessian2Input.readObject();
|
||||
hessian2Input.close();
|
||||
return t;
|
||||
}
|
||||
}
|
|
@ -1,5 +1,6 @@
|
|||
package us.codecraft.webmagic.scheduler;
|
||||
|
||||
import org.apache.commons.codec.digest.DigestUtils;
|
||||
import redis.clients.jedis.Jedis;
|
||||
import redis.clients.jedis.JedisPool;
|
||||
import redis.clients.jedis.JedisPoolConfig;
|
||||
|
@ -7,6 +8,8 @@ import us.codecraft.webmagic.Request;
|
|||
import us.codecraft.webmagic.Task;
|
||||
import us.codecraft.webmagic.schedular.Scheduler;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* 使用redis管理url,构建一个分布式的爬虫。<br>
|
||||
*
|
||||
|
@ -22,6 +25,8 @@ public class RedisScheduler implements Scheduler {
|
|||
|
||||
private static final String SET_PREFIX = "set_";
|
||||
|
||||
private static final String ITEM_PREFIX = "item_";
|
||||
|
||||
public RedisScheduler(String host) {
|
||||
pool = new JedisPool(new JedisPoolConfig(), host);
|
||||
}
|
||||
|
@ -34,6 +39,15 @@ public class RedisScheduler implements Scheduler {
|
|||
//使用List保存队列
|
||||
jedis.rpush(QUEUE_PREFIX + task.getUUID(), request.getUrl());
|
||||
jedis.zadd(SET_PREFIX + task.getUUID(), request.getPriority(), request.getUrl());
|
||||
if (request.getExtras() != null) {
|
||||
String key = ITEM_PREFIX + DigestUtils.shaHex(request.getUrl());
|
||||
try {
|
||||
byte[] serialize = HessianSerializer.INSTANCE.serialize(request);
|
||||
jedis.set(key.getBytes(), serialize);
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
}
|
||||
pool.returnResource(jedis);
|
||||
}
|
||||
|
@ -42,8 +56,16 @@ public class RedisScheduler implements Scheduler {
|
|||
public synchronized Request poll(Task task) {
|
||||
Jedis jedis = pool.getResource();
|
||||
String url = jedis.lpop(QUEUE_PREFIX + task.getUUID());
|
||||
String key = ITEM_PREFIX + DigestUtils.shaHex(url);
|
||||
byte[] bytes = jedis.get(key.getBytes());
|
||||
try {
|
||||
Object o = HessianSerializer.INSTANCE.deSerialize(bytes);
|
||||
return (Request)o;
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
pool.returnResource(jedis);
|
||||
if (url==null){
|
||||
if (url == null) {
|
||||
return null;
|
||||
}
|
||||
return new Request(url);
|
||||
|
|
Loading…
Reference in New Issue