From b00c85a6a652260caa3eb479c64386e4a75acd7c Mon Sep 17 00:00:00 2001
From: Jehan <jehan@girinstud.io>
Date: Wed, 17 Mar 2021 21:26:31 +0100
Subject: [PATCH] src: do not shortcut UTF-8 detection too early.

I hit this with the Czech test, which was detected as Irish because
detection was shortcut far too early, after only 16 characters. The
confidence value was just barely above 0.5 for Irish (and barely below
0.5 for Czech).

By adding a threshold (more than 256 characters counted in
mNumOfMBChar), we give the engine enough relevant data to actually make
an informed decision. By then, the Czech confidence was above 0.7,
whereas the Irish one was at 0.6.
---
 src/nsUTF8Prober.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/nsUTF8Prober.cpp b/src/nsUTF8Prober.cpp
index 744c66d..21f885e 100644
--- a/src/nsUTF8Prober.cpp
+++ b/src/nsUTF8Prober.cpp
@@ -45,6 +45,8 @@ void  nsUTF8Prober::Reset(void)
   currentCodePoint = 0;
 }
 
+#define ENOUGH_CHAR_THRESHOLD 256
+
 nsProbingState nsUTF8Prober::HandleData(const char* aBuf, PRUint32 aLen,
                                         int** codePointBuffer,
                                         int*  codePointBufferIdx)
@@ -88,7 +90,7 @@ nsProbingState nsUTF8Prober::HandleData(const char* aBuf, PRUint32 aLen,
   }
 
   if (mState == eDetecting)
-    if (GetConfidence(0) > SHORTCUT_THRESHOLD)
+    if (mNumOfMBChar > ENOUGH_CHAR_THRESHOLD && GetConfidence(0) > SHORTCUT_THRESHOLD)
       mState = eFoundIt;
   return mState;
 }