From e5234d6b6181bb3bd022c2a67064a290011d9c14 Mon Sep 17 00:00:00 2001
From: Jehan <jehan@girinstud.io>
Date: Fri, 4 Dec 2015 19:19:39 +0100
Subject: [PATCH] Stating endianness of UTF-16 and UTF-32 was an error when a
 BOM is present.

According to RFC 2781, section 3.3: "Systems labelling UTF-16BE/LE text
MUST NOT prepend a BOM to the text."
Since uchardet cannot modify the input text (and should not: that is not
its role), whenever a BOM is present we should label the encoding as
plain "UTF-16", without an endianness suffix.
Reporting the endianness also broke unit tests in programs using
uchardet, since a conversion from UTF-8 to UTF-16LE/BE creates a text
without a BOM, and a conversion from UTF-16LE/BE to UTF-8 creates a
UTF-8 text with a BOM, which changed existing behaviours.
Same goes for UTF-32.
See also Unicode 5.0.0 standard, section 3.10 (tables 3.8 and 3.9 in
particular).
---
 src/nsUniversalDetector.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/nsUniversalDetector.cpp b/src/nsUniversalDetector.cpp
index 5e13b81..ab8bae0 100644
--- a/src/nsUniversalDetector.cpp
+++ b/src/nsUniversalDetector.cpp
@@ -121,7 +121,7 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
         case '\xFE':
           if ('\xFF' == aBuf[1])
             /* FE FF: UTF-16, big endian BOM. */
-            mDetectedCharset = "UTF-16BE";
+            mDetectedCharset = "UTF-16";
         break;
         case '\xFF':
           if ('\xFE' == aBuf[1])
@@ -131,12 +131,12 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
                 aBuf[3] == '\x00')
             {
                 /* FF FE 00 00: UTF-32 (LE). */
-                mDetectedCharset = "UTF-32LE";
+                mDetectedCharset = "UTF-32";
             }
             else
             {
                 /* FF FE: UTF-16, little endian BOM. */
-                mDetectedCharset = "UTF-16LE";
+                mDetectedCharset = "UTF-16";
             }
           }
           break;
@@ -147,7 +147,7 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
               aBuf[3] == '\xFF')
           {
               /* 00 00 FE FF: UTF-32 (BE). */
-              mDetectedCharset = "UTF-32BE";
+              mDetectedCharset = "UTF-32";
           }
           break;
         }