From 26d38851b5d712377789c8e4ec8e9c86225b84b5 Mon Sep 17 00:00:00 2001 From: ywooer Date: Tue, 6 May 2014 18:28:50 +0800 Subject: [PATCH 1/2] add charset to Writer --- .../webmagic/pipeline/FilePipeline.java | 6 ++- .../webmagic/pipeline/FilePipelineTest.java | 44 +++++++++++++++++++ 2 files changed, 48 insertions(+), 2 deletions(-) create mode 100644 webmagic-core/src/test/java/us/codecraft/webmagic/pipeline/FilePipelineTest.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java index 8eab426..014c881 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java @@ -4,12 +4,14 @@ import org.apache.commons.codec.digest.DigestUtils; import org.apache.http.annotation.ThreadSafe; import org.slf4j.Logger; import org.slf4j.LoggerFactory; + import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.utils.FilePersistentBase; -import java.io.FileWriter; +import java.io.FileOutputStream; import java.io.IOException; +import java.io.OutputStreamWriter; import java.io.PrintWriter; import java.util.Map; @@ -39,7 +41,7 @@ public class FilePipeline extends FilePersistentBase implements Pipeline { public void process(ResultItems resultItems, Task task) { String path = this.path + PATH_SEPERATOR + task.getUUID() + PATH_SEPERATOR; try { - PrintWriter printWriter = new PrintWriter(new FileWriter(getFile(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".html"))); + PrintWriter printWriter = new PrintWriter(new OutputStreamWriter(new FileOutputStream(getFile(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".html")),"UTF-8")); printWriter.println("url:\t" + resultItems.getRequest().getUrl()); for (Map.Entry entry : resultItems.getAll().entrySet()) { if (entry.getValue() instanceof Iterable) { diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/pipeline/FilePipelineTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/pipeline/FilePipelineTest.java new file mode 100644 index 0000000..e420588 --- /dev/null +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/pipeline/FilePipelineTest.java @@ -0,0 +1,44 @@ +package us.codecraft.webmagic.pipeline; + +import org.junit.BeforeClass; +import org.junit.Test; +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.ResultItems; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Task; + +import java.util.UUID; + +/** + * Created by ywooer on 2014/5/6 0006. + */ +public class FilePipelineTest { + + private static ResultItems resultItems; + private static Task task; + + @BeforeClass + public static void before() { + resultItems = new ResultItems(); + resultItems.put("content", "webmagic 爬虫工具"); + Request request = new Request("http://www.baidu.com"); + resultItems.setRequest(request); + + task = new Task() { + @Override + public String getUUID() { + return UUID.randomUUID().toString(); + } + + @Override + public Site getSite() { + return null; + } + }; + } + @Test + public void testProcess() { + FilePipeline filePipeline = new FilePipeline(); + filePipeline.process(resultItems, task); + } +} From 259f0a16c5920a4a0493c860210dc9c34b3a6474 Mon Sep 17 00:00:00 2001 From: ywooer Date: Tue, 6 May 2014 18:33:00 +0800 Subject: [PATCH 2/2] Update FilePipeline.java --- .../main/java/us/codecraft/webmagic/pipeline/FilePipeline.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java index 014c881..57d6eea 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java @@ -41,7 +41,7 @@ public class FilePipeline extends FilePersistentBase implements Pipeline { public void process(ResultItems resultItems, Task task) { String path = this.path + PATH_SEPERATOR + task.getUUID() + PATH_SEPERATOR; try { - PrintWriter printWriter = new PrintWriter(new OutputStreamWriter(new FileOutputStream(getFile(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".html")),"UTF-8")); + PrintWriter printWriter = new PrintWriter(new OutputStreamWriter(new FileOutputStream(getFile(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".html")),"UTF-8")); printWriter.println("url:\t" + resultItems.getRequest().getUrl()); for (Map.Entry entry : resultItems.getAll().entrySet()) { if (entry.getValue() instanceof Iterable) {