diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java index 4879b28..2303377 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java @@ -28,6 +28,8 @@ public class Site { private String charset; + private String defaultCharset; + private int sleepTime = 5000; private int retryTimes = 0; @@ -168,6 +170,30 @@ public class Site { return charset; } + /** + * Set the default charset of the page. + * + * When charset detection fails, this default charset is used. + * + * @param defaultCharset the default charset + * @return this + * @since 0.9.0 + */ + public Site setDefaultCharset(String defaultCharset) { + this.defaultCharset = defaultCharset; + return this; + } + + /** + * The default charset used if charset detection fails. + * + * @return the default charset + * @since 0.9.0 + */ + public String getDefaultCharset() { + return defaultCharset; + } + public int getTimeOut() { return timeOut; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 72821f3..bfd24f0 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -4,6 +4,7 @@ import java.io.IOException; import java.nio.charset.Charset; import java.util.HashMap; import java.util.Map; +import java.util.Optional; import org.apache.commons.io.IOUtils; import org.apache.http.HttpResponse; @@ -116,7 +117,7 @@ public class HttpClientDownloader extends AbstractDownloader { page.setBytes(bytes); if (!request.isBinaryContent()) { if (charset == null) { - charset = getHtmlCharset(contentType, bytes); + charset = getHtmlCharset(contentType, bytes, task); } page.setCharset(charset); page.setRawText(new 
String(bytes, charset)); @@ -131,11 +132,11 @@ public class HttpClientDownloader extends AbstractDownloader { return page; } - private String getHtmlCharset(String contentType, byte[] contentBytes) throws IOException { + private String getHtmlCharset(String contentType, byte[] contentBytes, Task task) throws IOException { String charset = CharsetUtils.detectCharset(contentType, contentBytes); if (charset == null) { - charset = Charset.defaultCharset().name(); - logger.warn("Charset autodetect failed, use {} as charset. Please specify charset in Site.setCharset()", Charset.defaultCharset()); + charset = Optional.ofNullable(task.getSite().getDefaultCharset()).orElseGet(Charset.defaultCharset()::name); + logger.info("Charset autodetect failed, use {} as charset.", charset); } return charset; } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/SiteTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/SiteTest.java new file mode 100644 index 0000000..783b82d --- /dev/null +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/SiteTest.java @@ -0,0 +1,17 @@ +package us.codecraft.webmagic; + +import static org.junit.Assert.assertEquals; + +import java.nio.charset.StandardCharsets; + +import org.junit.Test; + +public class SiteTest { + + @Test + public void test() { + Site site = Site.me().setDefaultCharset(StandardCharsets.UTF_8.name()); + assertEquals(StandardCharsets.UTF_8.name(), site.getDefaultCharset()); + } + +}