Fix issue #21: charset detection error
parent
e1b6b54097
commit
692de76f86
|
@ -2,6 +2,7 @@ package us.codecraft.webmagic.utils;
|
||||||
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
|
||||||
|
import java.nio.charset.Charset;
|
||||||
import java.util.regex.Matcher;
|
import java.util.regex.Matcher;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
|
@ -98,15 +99,17 @@ public class UrlUtils {
|
||||||
return stringBuilder.toString();
|
return stringBuilder.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
private static final Pattern patternForCharset = Pattern.compile("charset=([^\\s;]*)");
|
private static final Pattern patternForCharset = Pattern.compile("charset\\s*=\\s*['\"]*([^\\s;'\"]*)");
|
||||||
|
|
||||||
public static String getCharset(String contentType) {
|
public static String getCharset(String contentType) {
|
||||||
Matcher matcher = patternForCharset.matcher(contentType);
|
Matcher matcher = patternForCharset.matcher(contentType);
|
||||||
if (matcher.find()) {
|
if (matcher.find()) {
|
||||||
return matcher.group(1);
|
String charset = matcher.group(1);
|
||||||
} else {
|
if (Charset.isSupported(charset)) {
|
||||||
return null;
|
return charset;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue