mirror of
https://gitlab.freedesktop.org/uchardet/uchardet.git
synced 2025-12-06 16:56:40 +08:00
src: nsMBCSGroupProber confidence weighted by language confidence.
Since our whole charset detection logic is based on text having meaning (using actual language statistics), the fact that a text is valid UTF-8 does not mean that UTF-8 is necessarily the right encoding. The text may also fit another encoding, possibly with very high statistical confidence (making that encoding a better candidate). Therefore, instead of just returning 0.99 or other high values, let's weight our encoding confidence by the best language confidence.
This commit is contained in:
parent
25d2890676
commit
ea32980273
@ -306,16 +306,30 @@ float nsMBCSGroupProber::GetConfidence(void)
|
||||
|
||||
switch (mState)
|
||||
{
|
||||
case eFoundIt:
|
||||
return (float)0.99;
|
||||
case eNotMe:
|
||||
return (float)0.01;
|
||||
case eFoundIt:
|
||||
default:
|
||||
for (i = 0; i < NUM_OF_PROBERS; i++)
|
||||
{
|
||||
float bestLangConf = 0.0;
|
||||
|
||||
if (!mIsActive[i])
|
||||
continue;
|
||||
cf = mProbers[i]->GetConfidence();
|
||||
|
||||
if (mProbers[i]->DecodeToUnicode())
|
||||
{
|
||||
for (int j = 0; j < NUM_OF_LANGUAGES; j++)
|
||||
{
|
||||
float langConf = langDetectors[i][j]->GetConfidence();
|
||||
|
||||
if (bestLangConf < langConf)
|
||||
bestLangConf = langConf;
|
||||
}
|
||||
cf *= bestLangConf;
|
||||
}
|
||||
|
||||
if (bestConf < cf)
|
||||
{
|
||||
bestConf = cf;
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user