mirror of
https://gitlab.freedesktop.org/uchardet/uchardet.git
synced 2025-12-06 08:46:40 +08:00
Stating the endianness of UTF-16 and UTF-32 was an error when a BOM is present.
According to RFC 2781, section 3.3: "Systems labelling UTF-16BE/LE text MUST NOT prepend a BOM to the text." Since uchardet cannot modify the input text (and obviously should not, as that is not its role), when a BOM is present we should always label the encoding as just "UTF-16". Stating the endianness also broke unit tests in programs using uchardet, since a conversion from UTF-8 to UTF-16LE/BE creates a text without a BOM, and a conversion from UTF-16LE/BE to UTF-8 creates a UTF-8 text with a BOM, which changed existing behaviours. The same goes for UTF-32. See also the Unicode 5.0.0 standard, section 3.10 (tables 3.8 and 3.9 in particular).
This commit is contained in:
parent
2856e68aac
commit
e5234d6b61
@ -121,7 +121,7 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
|
||||
case '\xFE':
|
||||
if ('\xFF' == aBuf[1])
|
||||
/* FE FF: UTF-16, big endian BOM. */
|
||||
mDetectedCharset = "UTF-16BE";
|
||||
mDetectedCharset = "UTF-16";
|
||||
break;
|
||||
case '\xFF':
|
||||
if ('\xFE' == aBuf[1])
|
||||
@ -131,12 +131,12 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
|
||||
aBuf[3] == '\x00')
|
||||
{
|
||||
/* FF FE 00 00: UTF-32 (LE). */
|
||||
mDetectedCharset = "UTF-32LE";
|
||||
mDetectedCharset = "UTF-32";
|
||||
}
|
||||
else
|
||||
{
|
||||
/* FF FE: UTF-16, little endian BOM. */
|
||||
mDetectedCharset = "UTF-16LE";
|
||||
mDetectedCharset = "UTF-16";
|
||||
}
|
||||
}
|
||||
break;
|
||||
@ -147,7 +147,7 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
|
||||
aBuf[3] == '\xFF')
|
||||
{
|
||||
/* 00 00 FE FF: UTF-32 (BE). */
|
||||
mDetectedCharset = "UTF-32BE";
|
||||
mDetectedCharset = "UTF-32";
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user