From 98bf4d73fdc1400a16209cb55840fd7dd46632ab Mon Sep 17 00:00:00 2001
From: Jehan <jehan@girinstud.io>
Date: Sun, 28 May 2017 14:06:53 +0200
Subject: [PATCH] Bug 101204 - different results with different chunk sizes.

ASCII and ISO-8859-1 should not be detected in
nsUniversalDetector::HandleData() but in nsUniversalDetector::DataEnd()
instead. Otherwise the first call to uchardet_handle_data() creates an
unwanted shortcut when the input is split into several chunks and the
first chunk happens to be ASCII (or ASCII + NBSP), so the result depends
on the chunk sizes.
---
 src/nsUniversalDetector.cpp | 44 +++++++++++++++++++------------------
 1 file changed, 23 insertions(+), 21 deletions(-)

diff --git a/src/nsUniversalDetector.cpp b/src/nsUniversalDetector.cpp
index 9711618..75474e0 100644
--- a/src/nsUniversalDetector.cpp
+++ b/src/nsUniversalDetector.cpp
@@ -242,16 +242,6 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
       mDone = PR_TRUE;
       mDetectedCharset = mEscCharSetProber->GetCharSetName();
     }
-    else if (mNbspFound)
-    {
-      mDetectedCharset = "ISO-8859-1";
-    }
-    else
-    {
-      /* ASCII with the ESC character (or the sequence "~{") is still
-       * ASCII until proven otherwise. */
-      mDetectedCharset = "ASCII";
-    }
     break;
   case eHighbyte:
     for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
@@ -270,17 +260,6 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
     break;
 
   default:
-    if (mNbspFound)
-    {
-      /* ISO-8859-1 is a good result candidate for ASCII + NBSP.
-       * (though it could have been any ISO-8859 encoding). */
-      mDetectedCharset = "ISO-8859-1";
-    }
-    else
-    {
-      /* Pure ASCII */
-      mDetectedCharset = "ASCII";
-    }
     break;
   }
   return NS_OK;
@@ -297,6 +276,29 @@ void nsUniversalDetector::DataEnd()
     return;
   }
 
+  if (! mDetectedCharset)
+  {
+    switch (mInputState)
+    {
+    case eEscAscii:
+    case ePureAscii:
+      if (mNbspFound)
+      {
+          /* ISO-8859-1 is a good result candidate for ASCII + NBSP.
+           * (though it could have been any ISO-8859 encoding). */
+          mDetectedCharset = "ISO-8859-1";
+      }
+      else
+      {
+          /* Pure ASCII. ASCII with the ESC character (or the sequence
+           * "~{") is still ASCII until proven otherwise. */
+          mDetectedCharset = "ASCII";
+      }
+    default:
+      break;
+    }
+  }
+
   if (mDetectedCharset)
   {
     mDone = PR_TRUE;