Add Site.defaultCharset. closes #1101.

master
Joe Zhou 2022-12-20 23:41:31 +08:00
parent 80424b0bd7
commit a266df406f
3 changed files with 48 additions and 4 deletions

View File

@ -28,6 +28,8 @@ public class Site {
private String charset; private String charset;
private String defaultCharset;
private int sleepTime = 5000; private int sleepTime = 5000;
private int retryTimes = 0; private int retryTimes = 0;
@ -168,6 +170,30 @@ public class Site {
return charset; return charset;
} }
/**
* Sets the fallback charset used to decode a page.
* <p>
* {@code HttpClientDownloader} uses this value when charset auto-detection fails.
* If it is left unset ({@code null}), the JVM platform charset is used instead.
*
* @param defaultCharset the fallback charset name, e.g. {@code "UTF-8"}
* @return this
* @since 0.9.0
*/
public Site setDefaultCharset(String defaultCharset) {
this.defaultCharset = defaultCharset;
return this;
}
/**
* Returns the fallback charset used when charset auto-detection fails.
*
* @return the default charset, or {@code null} if none has been set
* @since 0.9.0
*/
public String getDefaultCharset() {
return defaultCharset;
}
public int getTimeOut() { public int getTimeOut() {
return timeOut; return timeOut;
} }

View File

@ -4,6 +4,7 @@ import java.io.IOException;
import java.nio.charset.Charset; import java.nio.charset.Charset;
import java.util.HashMap; import java.util.HashMap;
import java.util.Map; import java.util.Map;
import java.util.Optional;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.http.HttpResponse; import org.apache.http.HttpResponse;
@ -116,7 +117,7 @@ public class HttpClientDownloader extends AbstractDownloader {
page.setBytes(bytes); page.setBytes(bytes);
if (!request.isBinaryContent()) { if (!request.isBinaryContent()) {
if (charset == null) { if (charset == null) {
charset = getHtmlCharset(contentType, bytes); charset = getHtmlCharset(contentType, bytes, task);
} }
page.setCharset(charset); page.setCharset(charset);
page.setRawText(new String(bytes, charset)); page.setRawText(new String(bytes, charset));
@ -131,11 +132,11 @@ public class HttpClientDownloader extends AbstractDownloader {
return page; return page;
} }
private String getHtmlCharset(String contentType, byte[] contentBytes) throws IOException { private String getHtmlCharset(String contentType, byte[] contentBytes, Task task) throws IOException {
String charset = CharsetUtils.detectCharset(contentType, contentBytes); String charset = CharsetUtils.detectCharset(contentType, contentBytes);
if (charset == null) { if (charset == null) {
charset = Charset.defaultCharset().name(); charset = Optional.ofNullable(task.getSite().getDefaultCharset()).orElseGet(Charset.defaultCharset()::name);
logger.warn("Charset autodetect failed, use {} as charset. Please specify charset in Site.setCharset()", Charset.defaultCharset()); logger.info("Charset autodetect failed, use {} as charset.", task.getSite().getDefaultCharset());
} }
return charset; return charset;
} }

View File

@ -0,0 +1,17 @@
package us.codecraft.webmagic;
import static org.junit.Assert.assertEquals;
import java.nio.charset.StandardCharsets;
import org.junit.Test;
/**
 * Checks that the default charset configured on a Site can be read back.
 */
public class SiteTest {

    @Test
    public void test() {
        // The value set through the fluent setter must be returned unchanged by the getter.
        String expectedCharset = StandardCharsets.UTF_8.name();
        Site configured = Site.me().setDefaultCharset(expectedCharset);
        assertEquals(expectedCharset, configured.getDefaultCharset());
    }
}