From 692de76f869312dc22b479b81b7f0f8b809f1c2f Mon Sep 17 00:00:00 2001
From: "yihua.huang"
Date: Wed, 4 Sep 2013 15:27:51 +0800
Subject: [PATCH] fix issue #21 charset detect error

---
Reviewer revision: the captured charset value is now restricted to the
characters that are legal in a charset name, because Charset.isSupported
throws IllegalCharsetNameException for anything else (for example the empty
value in "charset=" or the trailing '>' in "<meta charset=utf-8>"). The
parameter name is matched case-insensitively and a null contentType now
returns null. The blob ids on the "index" line below were not recomputed
for the revised post-image.

 .../java/us/codecraft/webmagic/utils/UrlUtils.java | 23 +++++++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java
index 7dae1f2..4e1140b 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java
@@ -2,6 +2,7 @@ package us.codecraft.webmagic.utils;
 
 import org.apache.commons.lang3.StringUtils;
 
+import java.nio.charset.Charset;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
@@ -98,15 +99,29 @@ public class UrlUtils {
         return stringBuilder.toString();
     }
 
-    private static final Pattern patternForCharset = Pattern.compile("charset=([^\\s;]*)");
+    // The parameter name is matched case-insensitively, and the captured value is limited to
+    // characters legal in a java.nio.charset.Charset name, so Charset.isSupported cannot throw.
+    private static final Pattern patternForCharset = Pattern.compile(
+            "charset\\s*=\\s*['\"]*([A-Za-z0-9][A-Za-z0-9\\-+.:_]*)", Pattern.CASE_INSENSITIVE);
 
+    /**
+     * Extracts the charset name from a Content-Type style string.
+     *
+     * @param contentType text such as {@code text/html; charset=utf-8}; may be null
+     * @return the charset name if one is found and supported by this JVM, otherwise null
+     */
     public static String getCharset(String contentType) {
+        if (contentType == null) {
+            return null;
+        }
         Matcher matcher = patternForCharset.matcher(contentType);
         if (matcher.find()) {
-            return matcher.group(1);
-        } else {
-            return null;
+            String charset = matcher.group(1);
+            if (Charset.isSupported(charset)) {
+                return charset;
+            }
         }
+        return null;
     }
 
 }