From 41fc0f235ba2d05e148e33c8c8fe37a74e1482b3 Mon Sep 17 00:00:00 2001
From: Jehan <jehan@girinstud.io>
Date: Wed, 17 Mar 2021 13:09:10 +0100
Subject: [PATCH] src: nsMBCSGroupProber confidence weighted by language
 confidence.

Since our whole charset detection logic is based on text having meaning
(using actual language statistics), a text being valid UTF-8 does not
mean that UTF-8 is necessarily the right encoding. The text may also fit
another encoding, possibly with very high statistical confidence, which
would then be the better candidate.
Therefore, instead of just returning 0.99 or another high value, let's
weight our encoding confidence by the best language confidence.
---
 src/nsMBCSGroupProber.cpp | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/src/nsMBCSGroupProber.cpp b/src/nsMBCSGroupProber.cpp
index a47906a..ea2f88f 100644
--- a/src/nsMBCSGroupProber.cpp
+++ b/src/nsMBCSGroupProber.cpp
@@ -306,16 +306,30 @@ float nsMBCSGroupProber::GetConfidence(void)
 
   switch (mState)
   {
-  case eFoundIt:
-    return (float)0.99;
   case eNotMe:
     return (float)0.01;
+  case eFoundIt:
   default:
     for (i = 0; i < NUM_OF_PROBERS; i++)
     {
+      float bestLangConf = 0.0;
+
       if (!mIsActive[i])
         continue;
       cf = mProbers[i]->GetConfidence();
+
+      if (mProbers[i]->DecodeToUnicode())
+      {
+        for (int j = 0; j < NUM_OF_LANGUAGES; j++)
+        {
+            float langConf = langDetectors[i][j]->GetConfidence();
+
+            if (bestLangConf < langConf)
+              bestLangConf = langConf;
+        }
+        cf *= bestLangConf;
+      }
+
       if (bestConf < cf)
       {
         bestConf = cf;