src: nsMBCSGroupProber confidence weighted by language confidence.

Since our whole charset detection logic is based on text having meaning (using actual language statistics), the fact that a text is valid UTF-8 does not mean that UTF-8 is necessarily the right encoding. The text may also fit another encoding, possibly with very high statistical confidence (which would make that encoding a better candidate). Therefore, instead of just returning 0.99 or another high value, let's weight our encoding confidence by the best language confidence.
2026-02-16 23:30:00 +08:00 · 2021-03-17 13:09:10 +01:00 · 2021-03-17 13:09:10 +01:00 · 41fc0f235b
commit 41fc0f235b
parent 714ae9ca29
1 changed files with 16 additions and 2 deletions
--- a/src/nsMBCSGroupProber.cpp
+++ b/src/nsMBCSGroupProber.cpp
@ -306,16 +306,30 @@ float nsMBCSGroupProber::GetConfidence(void)
  switch (mState)
  {
  case eFoundIt:
    return (float)0.99;
  case eNotMe:
    return (float)0.01;
  case eFoundIt:
  default:
    for (i = 0; i < NUM_OF_PROBERS; i++)
    {
      float bestLangConf = 0.0;
      if (!mIsActive[i])
        continue;
      cf = mProbers[i]->GetConfidence();
      if (mProbers[i]->DecodeToUnicode())
      {
        for (int j = 0; j < NUM_OF_LANGUAGES; j++)
        {
            float langConf = langDetectors[i][j]->GetConfidence();
            if (bestLangConf < langConf)
              bestLangConf = langConf;
        }
        cf *= bestLangConf;
      }
      if (bestConf < cf)
      {
        bestConf = cf;