From e8dd55995a0f18dbeb0a3f0e4bfd9982c40e6723 Mon Sep 17 00:00:00 2001
From: Jehan <jehan@girinstud.io>
Date: Tue, 24 Nov 2015 18:50:23 +0100
Subject: [PATCH] Add "LE/BE" suffix to "UTF-16" result for Little/Big Endian
 info...

... and add UTF-32 BOM detection.
---
 src/nsUniversalDetector.cpp | 46 ++++++++++++++++++++++++++++---------
 1 file changed, 35 insertions(+), 11 deletions(-)

diff --git a/src/nsUniversalDetector.cpp b/src/nsUniversalDetector.cpp
index 68f2914..f95244e 100644
--- a/src/nsUniversalDetector.cpp
+++ b/src/nsUniversalDetector.cpp
@@ -106,7 +106,7 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
   if (aLen > 0)
     mGotData = PR_TRUE;
 
-  //If the data starts with BOM, we know it is UTF
+  /* If the data starts with BOM, we know it is UTF. */
   if (mStart)
   {
     mStart = PR_FALSE;
@@ -115,20 +115,42 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
         {
         case '\xEF':
           if (('\xBB' == aBuf[1]) && ('\xBF' == aBuf[2]))
-            // EF BB BF  UTF-8 encoded BOM
+            /* EF BB BF: UTF-8 encoded BOM. */
             mDetectedCharset = "UTF-8";
         break;
         case '\xFE':
           if ('\xFF' == aBuf[1])
-            // FE FF  UTF-16, big endian BOM
-            mDetectedCharset = "UTF-16";
+            /* FE FF: UTF-16, big endian BOM. */
+            mDetectedCharset = "UTF-16BE";
         break;
         case '\xFF':
           if ('\xFE' == aBuf[1])
-            // FF FE  UTF-16, little endian BOM
-            mDetectedCharset = "UTF-16";
-        break;
-      }  // switch
+          {
+            if (aLen > 3          &&
+                aBuf[2] == '\x00' &&
+                aBuf[3] == '\x00')
+            {
+                /* FF FE 00 00: UTF-32 (LE). */
+                mDetectedCharset = "UTF-32LE";
+            }
+            else
+            {
+                /* FF FE: UTF-16, little endian BOM. */
+                mDetectedCharset = "UTF-16LE";
+            }
+          }
+          break;
+        case '\x00':
+          if (aLen > 3           &&
+              aBuf[1] == '\x00' &&
+              aBuf[2] == '\xFE' &&
+              aBuf[3] == '\xFF')
+          {
+              /* 00 00 FE FF: UTF-32 (BE). */
+              mDetectedCharset = "UTF-32BE";
+          }
+          break;
+        }
 
       if (mDetectedCharset)
       {
@@ -140,10 +162,12 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
   PRUint32 i;
   for (i = 0; i < aLen; i++)
   {
-    //other than 0xa0, if every othe character is ascii, the page is ascii
-    if (aBuf[i] & '\x80' && aBuf[i] != '\xA0')  //Since many Ascii only page contains NBSP
+    /* Other than 0xA0, if every other character is ASCII, the page is ASCII.
+     * 0xA0 (NBSP in a few charsets) is apparently a rare exception
+     * of a non-ASCII character contained in ASCII text. */
+    if (aBuf[i] & '\x80' && aBuf[i] != '\xA0')
     {
-      //we got a non-ascii byte (high-byte)
+      /* We got a non-ASCII byte (high byte). */
       if (mInputState != eHighbyte)
       {
         //adjust state