From e5234d6b6181bb3bd022c2a67064a290011d9c14 Mon Sep 17 00:00:00 2001 From: Jehan Date: Fri, 4 Dec 2015 19:19:39 +0100 Subject: [PATCH] Stating endianness of UTF-16 and UTF-32 was an error when BOM present. According to RFC 2781, section 3.3: "Systems labelling UTF-16BE/LE text MUST NOT prepend a BOM to the text." Since uchardet cannot (and should not, obviously, it's not its role) modify input text, when a BOM is present, we should always label the encoding as "UTF-16" only. Also it broke unit tests in programs using uchardet, since a conversion from UTF-8 to UTF-16LE/BE would create a text without a BOM, and a conversion from UTF-16LE/BE to UTF-8 creates a UTF-8 text with a BOM, which changed existing behaviours. Same goes for UTF-32. See also Unicode 5.0.0 standard, section 3.10 (tables 3.8 and 3.9 in particular). --- src/nsUniversalDetector.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/nsUniversalDetector.cpp b/src/nsUniversalDetector.cpp index 5e13b81..ab8bae0 100644 --- a/src/nsUniversalDetector.cpp +++ b/src/nsUniversalDetector.cpp @@ -121,7 +121,7 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen) case '\xFE': if ('\xFF' == aBuf[1]) /* FE FF: UTF-16, big endian BOM. */ - mDetectedCharset = "UTF-16BE"; + mDetectedCharset = "UTF-16"; break; case '\xFF': if ('\xFE' == aBuf[1]) @@ -131,12 +131,12 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen) aBuf[3] == '\x00') { /* FF FE 00 00: UTF-32 (LE). */ - mDetectedCharset = "UTF-32LE"; + mDetectedCharset = "UTF-32"; } else { /* FF FE: UTF-16, little endian BOM. */ - mDetectedCharset = "UTF-16LE"; + mDetectedCharset = "UTF-16"; } } break; @@ -147,7 +147,7 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen) aBuf[3] == '\xFF') { /* 00 00 FE FF: UTF-32 (BE). */ - mDetectedCharset = "UTF-32BE"; + mDetectedCharset = "UTF-32"; } break; }