src: improve confidence computation (generic and single-byte charset).

Nearly the same algorithm is used in both pieces of code now. I reintroduced
mTypicalPositiveRatio now that our models actually give the right ratio
(not the meaningless "first 512" stuff anymore).
Among the remaining differences, the last computation is the ratio of
frequent characters over all characters. For the generic detector, we use
the frequent+out sum instead. It works much better. I think that Unicode
text is much more prone to have characters outside the expected range,
while still being meaningful characters. Even control characters are
much more meaningful in Unicode.
So a ratio of frequent characters alone would make the confidence much too low.

Anyway this confidence algorithm is already better. We seem to approach
much nicer confidence at each iteration, very satisfying!
This commit is contained in:
Jehan 2021-03-22 18:03:02 +01:00
parent 8e2cf7b81b
commit 6436e1dd47
3 changed files with 31 additions and 26 deletions

View File

@ -212,7 +212,7 @@ float nsLanguageDetector::GetConfidence(void)
//float neutralSeqs = mSeqCounters[LANG_NEUTRAL_CAT];
float negativeSeqs = mSeqCounters[LANG_NEGATIVE_CAT];
r = (positiveSeqs + probableSeqs / 4 - negativeSeqs * 2) / mTotalSeqs;
r = (positiveSeqs + probableSeqs / 4 - negativeSeqs * 4) / mTotalSeqs / mModel->mTypicalPositiveRatio;
/* The more characters outside the expected characters
* (proportionally to the size of the text), the less confident we
* become in the current language.

View File

@ -48,11 +48,8 @@ nsProbingState nsSingleByteCharSetProber::HandleData(const char* aBuf, PRUint32
{
order = mModel->charToOrderMap[(unsigned char)aBuf[i]];
if (order < SYMBOL_CAT_ORDER)
{
mTotalChar++;
}
else if (order == ILL)
mTotalChar++;
if (order == ILL)
{
/* When encountering an illegal codepoint, no need
* to continue analyzing data. */
@ -63,7 +60,7 @@ nsProbingState nsSingleByteCharSetProber::HandleData(const char* aBuf, PRUint32
{
mCtrlChar++;
}
if (order < mModel->freqCharCount)
else if (order < mModel->freqCharCount)
{
mFreqChar++;
@ -75,6 +72,21 @@ nsProbingState nsSingleByteCharSetProber::HandleData(const char* aBuf, PRUint32
else // reverse the order of the letters in the lookup
++(mSeqCounters[mModel->precedenceMatrix[order*mModel->freqCharCount+mLastOrder]]);
}
else if (mLastOrder < SYMBOL_CAT_ORDER)
{
mSeqCounters[NEGATIVE_CAT]++;
mTotalSeqs++;
}
}
else if (order < SYMBOL_CAT_ORDER)
{
mOutChar++;
if (mLastOrder < SYMBOL_CAT_ORDER)
{
mTotalSeqs++;
mSeqCounters[NEGATIVE_CAT]++;
}
}
mLastOrder = order;
}
@ -92,7 +104,7 @@ nsProbingState nsSingleByteCharSetProber::HandleData(const char* aBuf, PRUint32
return mState;
}
void nsSingleByteCharSetProber::Reset(void)
void nsSingleByteCharSetProber::Reset(void)
{
mState = eDetecting;
mLastOrder = 255;
@ -101,7 +113,8 @@ void nsSingleByteCharSetProber::Reset(void)
mTotalSeqs = 0;
mTotalChar = 0;
mCtrlChar = 0;
mFreqChar = 0;
mFreqChar = 0;
mOutChar = 0;
}
//#define NEGATIVE_APPROACH 1
@ -117,23 +130,14 @@ float nsSingleByteCharSetProber::GetConfidence(int candidate)
float r;
if (mTotalSeqs > 0) {
r = ((float)1.0) * mSeqCounters[POSITIVE_CAT] / mTotalSeqs / mModel->mTypicalPositiveRatio;
/* Multiply by a ratio of positive sequences per characters.
* This would help in particular to distinguish close winners.
* Indeed if you add a letter, you'd expect the positive sequence count
* to increase as well. If it doesn't, it may mean that this new codepoint
* may not have been a letter, but instead a symbol (or some other
* character). This could make the difference between very closely related
* charsets used for the same language.
*/
r = r * (mSeqCounters[POSITIVE_CAT] + (float) mSeqCounters[PROBABLE_CAT] / 4) / mTotalChar;
/* The more control characters (proportionnaly to the size of the text), the
* less confident we become in the current charset.
*/
r = r * ((float) mTotalChar - mCtrlChar) / mTotalChar;
r = r*mFreqChar/mTotalChar;
if (r >= (float)1.00)
r = (float)0.99;
float positiveSeqs = mSeqCounters[POSITIVE_CAT];
float probableSeqs = mSeqCounters[PROBABLE_CAT];
float negativeSeqs = mSeqCounters[NEGATIVE_CAT];
r = (positiveSeqs + probableSeqs / 4 - negativeSeqs * 4) / mTotalSeqs / mModel->mTypicalPositiveRatio;
r = r * (mTotalChar - mOutChar - mCtrlChar) / mTotalChar;
r = r * mFreqChar / mTotalChar;
return r;
}
return (float)0.01;

View File

@ -124,6 +124,7 @@ protected:
PRUint32 mCtrlChar;
//characters that fall in our sampling range
PRUint32 mFreqChar;
PRUint32 mOutChar;
// Optional auxiliary prober for name decision. created and destroyed by the GroupProber
nsCharSetProber* mNameProber;