mirror of
https://gitlab.freedesktop.org/uchardet/uchardet.git
synced 2025-12-06 16:56:40 +08:00
Stating the endianness of UTF-16 and UTF-32 was an error when a BOM is present.
According to RFC 2781, section 3.3: "Systems labelling UTF-16BE/LE text MUST NOT prepend a BOM to the text." Since uchardet cannot modify the input text (and obviously should not, as that is not its role), when a BOM is present we should always label the encoding as just "UTF-16". Labelling with an explicit endianness also broke unit tests in programs that use uchardet, since a conversion from UTF-8 to UTF-16LE/BE creates a text without a BOM, and a conversion from UTF-16LE/BE to UTF-8 creates a UTF-8 text with a BOM, which changed existing behaviours. The same goes for UTF-32. See also the Unicode 5.0.0 standard, section 3.10 (tables 3.8 and 3.9 in particular).
This commit is contained in:
parent
2856e68aac
commit
e5234d6b61
@ -121,7 +121,7 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
|
|||||||
case '\xFE':
|
case '\xFE':
|
||||||
if ('\xFF' == aBuf[1])
|
if ('\xFF' == aBuf[1])
|
||||||
/* FE FF: UTF-16, big endian BOM. */
|
/* FE FF: UTF-16, big endian BOM. */
|
||||||
mDetectedCharset = "UTF-16BE";
|
mDetectedCharset = "UTF-16";
|
||||||
break;
|
break;
|
||||||
case '\xFF':
|
case '\xFF':
|
||||||
if ('\xFE' == aBuf[1])
|
if ('\xFE' == aBuf[1])
|
||||||
@ -131,12 +131,12 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
|
|||||||
aBuf[3] == '\x00')
|
aBuf[3] == '\x00')
|
||||||
{
|
{
|
||||||
/* FF FE 00 00: UTF-32 (LE). */
|
/* FF FE 00 00: UTF-32 (LE). */
|
||||||
mDetectedCharset = "UTF-32LE";
|
mDetectedCharset = "UTF-32";
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
/* FF FE: UTF-16, little endian BOM. */
|
/* FF FE: UTF-16, little endian BOM. */
|
||||||
mDetectedCharset = "UTF-16LE";
|
mDetectedCharset = "UTF-16";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
@ -147,7 +147,7 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
|
|||||||
aBuf[3] == '\xFF')
|
aBuf[3] == '\xFF')
|
||||||
{
|
{
|
||||||
/* 00 00 FE FF: UTF-32 (BE). */
|
/* 00 00 FE FF: UTF-32 (BE). */
|
||||||
mDetectedCharset = "UTF-32BE";
|
mDetectedCharset = "UTF-32";
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user