mirror of
https://gitlab.freedesktop.org/uchardet/uchardet.git
synced 2026-02-16 23:30:00 +08:00
src: nsMBCSGroupProber confidence weighted by language confidence.
Since our whole charset detection logic is based on text having meaning (using actual language statistics), the fact that a text is valid UTF-8 does not mean that UTF-8 is definitely the right encoding. The text may also fit another encoding, possibly with very high statistical confidence (which would make that encoding a better candidate). Therefore, instead of just returning 0.99 or another high value, let's weight our encoding confidence by the best language confidence.
This commit is contained in:
parent
714ae9ca29
commit
41fc0f235b
@ -306,16 +306,30 @@ float nsMBCSGroupProber::GetConfidence(void)
|
|||||||
|
|
||||||
switch (mState)
|
switch (mState)
|
||||||
{
|
{
|
||||||
case eFoundIt:
|
|
||||||
return (float)0.99;
|
|
||||||
case eNotMe:
|
case eNotMe:
|
||||||
return (float)0.01;
|
return (float)0.01;
|
||||||
|
case eFoundIt:
|
||||||
default:
|
default:
|
||||||
for (i = 0; i < NUM_OF_PROBERS; i++)
|
for (i = 0; i < NUM_OF_PROBERS; i++)
|
||||||
{
|
{
|
||||||
|
float bestLangConf = 0.0;
|
||||||
|
|
||||||
if (!mIsActive[i])
|
if (!mIsActive[i])
|
||||||
continue;
|
continue;
|
||||||
cf = mProbers[i]->GetConfidence();
|
cf = mProbers[i]->GetConfidence();
|
||||||
|
|
||||||
|
if (mProbers[i]->DecodeToUnicode())
|
||||||
|
{
|
||||||
|
for (int j = 0; j < NUM_OF_LANGUAGES; j++)
|
||||||
|
{
|
||||||
|
float langConf = langDetectors[i][j]->GetConfidence();
|
||||||
|
|
||||||
|
if (bestLangConf < langConf)
|
||||||
|
bestLangConf = langConf;
|
||||||
|
}
|
||||||
|
cf *= bestLangConf;
|
||||||
|
}
|
||||||
|
|
||||||
if (bestConf < cf)
|
if (bestConf < cf)
|
||||||
{
|
{
|
||||||
bestConf = cf;
|
bestConf = cf;
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user