src: nsMBCSGroupProber confidence weighted by language confidence.

Since our whole charset detection logic is based on text having meaning (using actual language statistics), the fact that a text is valid UTF-8 does not mean UTF-8 is necessarily the right encoding. The text may also fit another encoding, possibly with very high statistical confidence (which would make that encoding a better candidate). Therefore, instead of just returning 0.99 or another high value, let's weight our encoding confidence by the best language confidence.
2025-12-06 16:56:40 +08:00 · 2021-03-17 13:09:10 +01:00 · 2021-03-17 13:09:10 +01:00 · ea32980273
commit ea32980273
parent 25d2890676
1 changed files with 16 additions and 2 deletions
--- a/src/nsMBCSGroupProber.cpp
+++ b/src/nsMBCSGroupProber.cpp
@ -306,16 +306,30 @@ float nsMBCSGroupProber::GetConfidence(void)

  switch (mState)
  {
-  case eFoundIt:
-    return (float)0.99;
  case eNotMe:
    return (float)0.01;
+  case eFoundIt:
  default:
    for (i = 0; i < NUM_OF_PROBERS; i++)
    {
+      float bestLangConf = 0.0;
+
      if (!mIsActive[i])
        continue;
      cf = mProbers[i]->GetConfidence();
+
+      if (mProbers[i]->DecodeToUnicode())
+      {
+        for (int j = 0; j < NUM_OF_LANGUAGES; j++)
+        {
+            float langConf = langDetectors[i][j]->GetConfidence();
+
+            if (bestLangConf < langConf)
+              bestLangConf = langConf;
+        }
+        cf *= bestLangConf;
+      }
+
      if (bestConf < cf)
      {
        bestConf = cf;