From a2d830f7b068d6f4ea2c0def34ddab2c71b7e049 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Tue, 6 Aug 2013 22:25:27 +0800 Subject: [PATCH] change serilizer to fastjson --- webmagic-plugin/webmagic-misc/pom.xml | 6 +- .../webmagic/pipeline/JsonFilePipeline.java | 58 +++++++++++++++++++ .../webmagic/scheduler/HessianSerializer.java | 33 ----------- .../webmagic/scheduler/RedisScheduler.java | 21 ++----- 4 files changed, 67 insertions(+), 51 deletions(-) create mode 100644 webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePipeline.java delete mode 100644 webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler/HessianSerializer.java diff --git a/webmagic-plugin/webmagic-misc/pom.xml b/webmagic-plugin/webmagic-misc/pom.xml index 98b7c77..5e24173 100644 --- a/webmagic-plugin/webmagic-misc/pom.xml +++ b/webmagic-plugin/webmagic-misc/pom.xml @@ -13,9 +13,9 @@ - org.resthub - hessian - 4.0.8 + com.alibaba + fastjson + 1.1.35 redis.clients diff --git a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePipeline.java b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePipeline.java new file mode 100644 index 0000000..1500409 --- /dev/null +++ b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePipeline.java @@ -0,0 +1,58 @@ +package us.codecraft.webmagic.pipeline; + +import com.alibaba.fastjson.JSON; +import org.apache.commons.codec.digest.DigestUtils; +import org.apache.log4j.Logger; +import us.codecraft.webmagic.ResultItems; +import us.codecraft.webmagic.Task; + +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.io.PrintWriter; + +/** + * 持久化到文件的接口。 + * + * @author code4crafter@gmail.com
+ * Date: 13-4-21
+ * Time: 6:28 PM
+ */
+public class JsonFilePipeline implements Pipeline {
+
+    private String path = "/data/temp/webmagic/";
+    private Logger logger = Logger.getLogger(getClass());
+
+    /** Creates a pipeline that uses the default save path "/data/temp/webmagic/". */
+    public JsonFilePipeline() {
+    }
+
+    /**
+     * Creates a pipeline that saves under the given directory.
+     *
+     * @param path base directory under which the JSON files are written
+     */
+    public JsonFilePipeline(String path) {
+        this.path = path;
+    }
+
+    /**
+     * Serializes {@code resultItems.getAll()} to JSON and writes it to
+     * {@code path/taskUUID/md5(requestUrl).json}; an IOException is logged, not rethrown.
+     */
+    @Override
+    public void process(ResultItems resultItems, Task task) {
+        String dir = this.path + "/" + task.getUUID() + "/";
+        new File(dir).mkdirs(); // no-op when the directory already exists
+        try {
+            PrintWriter printWriter = new PrintWriter(new FileWriter(dir + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".json"));
+            try {
+                printWriter.write(JSON.toJSONString(resultItems.getAll()));
+            } finally {
+                printWriter.close(); // always release the file handle, even if serialization throws
+            }
+        } catch (IOException e) {
+            logger.warn("write file error", e);
+        }
+    }
+}
diff --git a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler/HessianSerializer.java b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler/HessianSerializer.java
deleted file mode 100644
index c137bfb..0000000
--- a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler/HessianSerializer.java
+++ /dev/null
@@ -1,33 +0,0 @@
-package us.codecraft.webmagic.scheduler;
-
-import com.caucho.hessian.io.Hessian2Input;
-import com.caucho.hessian.io.Hessian2Output;
-
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
-
-/**
- * @author code4crafter@gmail.com
- * @date: 13-7-14
- * Time: 下午9:20
- */ -public enum HessianSerializer { - INSTANCE; - public byte[] serialize(T v) throws IOException { - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - Hessian2Output hessian2Output = new Hessian2Output(baos); - hessian2Output.writeObject(v); - hessian2Output.close(); - return baos.toByteArray(); - } - - @SuppressWarnings("unchecked") - public T deSerialize(byte[] bytes) throws IOException { - ByteArrayInputStream bais = new ByteArrayInputStream(bytes); - Hessian2Input hessian2Input = new Hessian2Input(bais); - T t = (T) hessian2Input.readObject(); - hessian2Input.close(); - return t; - } -} diff --git a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java index fb82a69..8233698 100644 --- a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java +++ b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java @@ -1,5 +1,6 @@ package us.codecraft.webmagic.scheduler; +import com.alibaba.fastjson.JSON; import org.apache.commons.codec.digest.DigestUtils; import redis.clients.jedis.Jedis; import redis.clients.jedis.JedisPool; @@ -8,8 +9,6 @@ import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.schedular.Scheduler; -import java.io.IOException; - /** * 使用redis管理url,构建一个分布式的爬虫。
* @@ -41,12 +40,8 @@ public class RedisScheduler implements Scheduler { jedis.zadd(SET_PREFIX + task.getUUID(), request.getPriority(), request.getUrl()); if (request.getExtras() != null) { String key = ITEM_PREFIX + DigestUtils.shaHex(request.getUrl()); - try { - byte[] serialize = HessianSerializer.INSTANCE.serialize(request); - jedis.set(key.getBytes(), serialize); - } catch (IOException e) { - e.printStackTrace(); - } + byte[] serialize = JSON.toJSONBytes(request); + jedis.set(key.getBytes(), serialize); } } pool.returnResource(jedis); @@ -61,13 +56,9 @@ public class RedisScheduler implements Scheduler { } String key = ITEM_PREFIX + DigestUtils.shaHex(url); byte[] bytes = jedis.get(key.getBytes()); - if (bytes!=null){ - try { - Object o = HessianSerializer.INSTANCE.deSerialize(bytes); - return (Request)o; - } catch (Exception e) { - e.printStackTrace(); - } + if (bytes != null) { + Object o = JSON.parse(bytes); + return (Request) o; } pool.returnResource(jedis); return new Request(url);