change serializer to fastjson
parent
5436ecbb7b
commit
a2d830f7b0
|
@ -13,9 +13,9 @@
|
||||||
|
|
||||||
<dependencies>
|
<dependencies>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.resthub</groupId>
|
<groupId>com.alibaba</groupId>
|
||||||
<artifactId>hessian</artifactId>
|
<artifactId>fastjson</artifactId>
|
||||||
<version>4.0.8</version>
|
<version>1.1.35</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>redis.clients</groupId>
|
<groupId>redis.clients</groupId>
|
||||||
|
|
|
@ -0,0 +1,58 @@
|
||||||
|
package us.codecraft.webmagic.pipeline;
|
||||||
|
|
||||||
|
import com.alibaba.fastjson.JSON;
|
||||||
|
import org.apache.commons.codec.digest.DigestUtils;
|
||||||
|
import org.apache.log4j.Logger;
|
||||||
|
import us.codecraft.webmagic.ResultItems;
|
||||||
|
import us.codecraft.webmagic.Task;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.FileWriter;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.PrintWriter;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Pipeline that persists results to files: the contents of each ResultItems are serialized to JSON with fastjson and written to a separate file.
|
||||||
|
* Files are placed under [path]/[task UUID]/ and named [md5 hex of the request URL].json.
|
||||||
|
* @author code4crafter@gmail.com <br>
|
||||||
|
* Date: 13-4-21
|
||||||
|
* Time: 6:28 PM
|
||||||
|
*/
|
||||||
|
public class JsonFilePipeline implements Pipeline {
|
||||||
|
|
||||||
|
private String path = "/data/temp/webmagic/";
|
||||||
|
|
||||||
|
private Logger logger = Logger.getLogger(getClass());
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a JsonFilePipeline that uses the default save path "/data/temp/webmagic/".
|
||||||
|
*/
|
||||||
|
public JsonFilePipeline() {
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a JsonFilePipeline that saves under the given path.
|
||||||
|
*
|
||||||
|
* @param path base directory under which the files are saved
|
||||||
|
*/
|
||||||
|
public JsonFilePipeline(String path) {
|
||||||
|
this.path = path;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void process(ResultItems resultItems, Task task) { // writes resultItems.getAll() as one JSON file per request URL
|
||||||
|
String path = this.path + "/" + task.getUUID() + "/"; // per-task output directory; this local shadows the field
|
||||||
|
File file = new File(path);
|
||||||
|
if (!file.exists()) {
|
||||||
|
file.mkdirs(); // NOTE(review): the boolean result is ignored, so a failed mkdirs only surfaces when the file is opened below
|
||||||
|
}
|
||||||
|
try {
|
||||||
|
PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".json")); // NOTE(review): no charset is passed, so FileWriter picks the JVM default - confirm that is intended
|
||||||
|
printWriter.write(JSON.toJSONString(resultItems.getAll()));
|
||||||
|
printWriter.close(); // NOTE(review): not in a finally block, so this is skipped if the line above throws
|
||||||
|
} catch (IOException e) {
|
||||||
|
logger.warn("write file error", e); // the failure is logged and not rethrown
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -1,33 +0,0 @@
|
||||||
package us.codecraft.webmagic.scheduler;
|
|
||||||
|
|
||||||
import com.caucho.hessian.io.Hessian2Input;
|
|
||||||
import com.caucho.hessian.io.Hessian2Output;
|
|
||||||
|
|
||||||
import java.io.ByteArrayInputStream;
|
|
||||||
import java.io.ByteArrayOutputStream;
|
|
||||||
import java.io.IOException;
|
|
||||||
|
|
||||||
/** Enum with a single INSTANCE that converts objects to and from byte arrays using Hessian2Output / Hessian2Input.
|
|
||||||
* @author code4crafter@gmail.com <br>
|
|
||||||
* @date: 13-7-14 <br>
|
|
||||||
* Time: 9:20 PM <br>
|
|
||||||
*/
|
|
||||||
public enum HessianSerializer {
|
|
||||||
INSTANCE;
|
|
||||||
public <T> byte[] serialize(T v) throws IOException { // writes v with Hessian2Output into an in-memory buffer and returns its bytes
|
|
||||||
ByteArrayOutputStream baos = new ByteArrayOutputStream();
|
|
||||||
Hessian2Output hessian2Output = new Hessian2Output(baos);
|
|
||||||
hessian2Output.writeObject(v);
|
|
||||||
hessian2Output.close(); // closed before the buffer is read; NOTE(review): not in a finally block
|
|
||||||
return baos.toByteArray();
|
|
||||||
}
|
|
||||||
|
|
||||||
@SuppressWarnings("unchecked") // covers the unchecked cast to T in the method below
|
|
||||||
public <T> T deSerialize(byte[] bytes) throws IOException { // reads one object from bytes with Hessian2Input and returns it cast to T
|
|
||||||
ByteArrayInputStream bais = new ByteArrayInputStream(bytes);
|
|
||||||
Hessian2Input hessian2Input = new Hessian2Input(bais);
|
|
||||||
T t = (T) hessian2Input.readObject(); // T is chosen by the caller; the cast is not checked here
|
|
||||||
hessian2Input.close();
|
|
||||||
return t;
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,5 +1,6 @@
|
||||||
package us.codecraft.webmagic.scheduler;
|
package us.codecraft.webmagic.scheduler;
|
||||||
|
|
||||||
|
import com.alibaba.fastjson.JSON;
|
||||||
import org.apache.commons.codec.digest.DigestUtils;
|
import org.apache.commons.codec.digest.DigestUtils;
|
||||||
import redis.clients.jedis.Jedis;
|
import redis.clients.jedis.Jedis;
|
||||||
import redis.clients.jedis.JedisPool;
|
import redis.clients.jedis.JedisPool;
|
||||||
|
@ -8,8 +9,6 @@ import us.codecraft.webmagic.Request;
|
||||||
import us.codecraft.webmagic.Task;
|
import us.codecraft.webmagic.Task;
|
||||||
import us.codecraft.webmagic.schedular.Scheduler;
|
import us.codecraft.webmagic.schedular.Scheduler;
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 使用redis管理url,构建一个分布式的爬虫。<br>
|
* 使用redis管理url,构建一个分布式的爬虫。<br>
|
||||||
*
|
*
|
||||||
|
@ -41,12 +40,8 @@ public class RedisScheduler implements Scheduler {
|
||||||
jedis.zadd(SET_PREFIX + task.getUUID(), request.getPriority(), request.getUrl());
|
jedis.zadd(SET_PREFIX + task.getUUID(), request.getPriority(), request.getUrl());
|
||||||
if (request.getExtras() != null) {
|
if (request.getExtras() != null) {
|
||||||
String key = ITEM_PREFIX + DigestUtils.shaHex(request.getUrl());
|
String key = ITEM_PREFIX + DigestUtils.shaHex(request.getUrl());
|
||||||
try {
|
byte[] serialize = JSON.toJSONBytes(request);
|
||||||
byte[] serialize = HessianSerializer.INSTANCE.serialize(request);
|
jedis.set(key.getBytes(), serialize);
|
||||||
jedis.set(key.getBytes(), serialize);
|
|
||||||
} catch (IOException e) {
|
|
||||||
e.printStackTrace();
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
pool.returnResource(jedis);
|
pool.returnResource(jedis);
|
||||||
|
@ -61,13 +56,9 @@ public class RedisScheduler implements Scheduler {
|
||||||
}
|
}
|
||||||
String key = ITEM_PREFIX + DigestUtils.shaHex(url);
|
String key = ITEM_PREFIX + DigestUtils.shaHex(url);
|
||||||
byte[] bytes = jedis.get(key.getBytes());
|
byte[] bytes = jedis.get(key.getBytes());
|
||||||
if (bytes!=null){
|
if (bytes != null) {
|
||||||
try {
|
Object o = JSON.parse(bytes);
|
||||||
Object o = HessianSerializer.INSTANCE.deSerialize(bytes);
|
return (Request) o;
|
||||||
return (Request)o;
|
|
||||||
} catch (Exception e) {
|
|
||||||
e.printStackTrace();
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
pool.returnResource(jedis);
|
pool.returnResource(jedis);
|
||||||
return new Request(url);
|
return new Request(url);
|
||||||
|
|
Loading…
Reference in New Issue