From 41fc0f235ba2d05e148e33c8c8fe37a74e1482b3 Mon Sep 17 00:00:00 2001 From: Jehan Date: Wed, 17 Mar 2021 13:09:10 +0100 Subject: [PATCH] src: nsMBCSGroupProber confidence weighted by language confidence. Since our whole charset detection logic is based on text having meaning (using actual language statistics), just because a text is valid UTF-8 does not mean it is absolutely the right encoding. It may also fit another encoding, possibly with very high statistical confidence (which would therefore be a better candidate). Therefore, instead of just returning 0.99 or other high values, let's weight our encoding confidence by the best language confidence. --- src/nsMBCSGroupProber.cpp | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/src/nsMBCSGroupProber.cpp b/src/nsMBCSGroupProber.cpp index a47906a..ea2f88f 100644 --- a/src/nsMBCSGroupProber.cpp +++ b/src/nsMBCSGroupProber.cpp @@ -306,16 +306,30 @@ float nsMBCSGroupProber::GetConfidence(void) switch (mState) { - case eFoundIt: - return (float)0.99; case eNotMe: return (float)0.01; + case eFoundIt: default: for (i = 0; i < NUM_OF_PROBERS; i++) { + float bestLangConf = 0.0; + if (!mIsActive[i]) continue; cf = mProbers[i]->GetConfidence(); + + if (mProbers[i]->DecodeToUnicode()) + { + for (int j = 0; j < NUM_OF_LANGUAGES; j++) + { + float langConf = langDetectors[i][j]->GetConfidence(); + + if (bestLangConf < langConf) + bestLangConf = langConf; + } + cf *= bestLangConf; + } + if (bestConf < cf) { bestConf = cf;