Bug 101204 - different results with different chunk sizes.

ASCII and ISO-8859-1 should not be detected in nsUniversalDetector::HandleData() but in nsUniversalDetector::DataEnd() instead. Otherwise the first call to uchardet_handle_data() creates an unwanted shortcut when the input is fed in several chunks and the first chunk happens to be pure ASCII (or ASCII + NBSP), so the detected charset depends on the chunk size.
2025-12-06 16:56:40 +08:00 · 2017-05-28 14:06:53 +02:00 · 2017-05-28 14:06:53 +02:00 · 98bf4d73fd
commit 98bf4d73fd
parent 50743e16f8
1 changed files with 23 additions and 21 deletions
--- a/src/nsUniversalDetector.cpp
+++ b/src/nsUniversalDetector.cpp
@@ -242,16 +242,6 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
      mDone = PR_TRUE;
      mDetectedCharset = mEscCharSetProber->GetCharSetName();
    }
-    else if (mNbspFound)
-    {
-      mDetectedCharset = "ISO-8859-1";
-    }
-    else
-    {
-      /* ASCII with the ESC character (or the sequence "~{") is still
-       * ASCII until proven otherwise. */
-      mDetectedCharset = "ASCII";
-    }
    break;
  case eHighbyte:
    for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
@@ -270,17 +260,6 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
    break;

  default:
-    if (mNbspFound)
-    {
-      /* ISO-8859-1 is a good result candidate for ASCII + NBSP.
-       * (though it could have been any ISO-8859 encoding). */
-      mDetectedCharset = "ISO-8859-1";
-    }
-    else
-    {
-      /* Pure ASCII */
-      mDetectedCharset = "ASCII";
-    }
    break;
  }
  return NS_OK;
@@ -297,6 +276,29 @@ void nsUniversalDetector::DataEnd()
    return;
  }

+  if (! mDetectedCharset)
+  {
+    switch (mInputState)
+    {
+    case eEscAscii:
+    case ePureAscii:
+      if (mNbspFound)
+      {
+          /* ISO-8859-1 is a good result candidate for ASCII + NBSP.
+           * (though it could have been any ISO-8859 encoding). */
+          mDetectedCharset = "ISO-8859-1";
+      }
+      else
+      {
+          /* ASCII with the ESC character (or the sequence "~{") is still
+           * ASCII until proven otherwise. */
+          mDetectedCharset = "ASCII";
+      }
+    default:
+      break;
+    }
+  }
+
  if (mDetectedCharset)
  {
    mDone = PR_TRUE;