Single Byte charsets: high ctrl character ratio lowers confidence.

Control characters are not an error per se. Nevertheless, they are clearly not
frequent in single-byte charset texts, so it is only normal for them to lower
confidence in a charset. In particular, a higher ctrl-per-letter ratio means
lower confidence.
This fixes, for instance, our Windows-1252 German test (which was otherwise
detected as ISO-8859-1).
This commit is contained in:
Jehan 2015-12-04 00:00:33 +01:00
parent aa587a64bd
commit 55b4f23971
2 changed files with 15 additions and 3 deletions

View File

@ -46,15 +46,21 @@ nsProbingState nsSingleByteCharSetProber::HandleData(const char* aBuf, PRUint32
{
order = mModel->charToOrderMap[(unsigned char)aBuf[i]];
if (order == ILL)
if (order < SYMBOL_CAT_ORDER)
{
mTotalChar++;
}
else if (order == ILL)
{
/* When encountering an illegal codepoint, no need
* to continue analyzing data. */
mState = eNotMe;
break;
}
if (order < SYMBOL_CAT_ORDER)
mTotalChar++;
else if (order == CTR)
{
mCtrlChar++;
}
if (order < mModel->freqCharCount)
{
mFreqChar++;
@ -92,6 +98,7 @@ void nsSingleByteCharSetProber::Reset(void)
mSeqCounters[i] = 0;
mTotalSeqs = 0;
mTotalChar = 0;
mCtrlChar = 0;
mFreqChar = 0;
}
@ -118,6 +125,10 @@ float nsSingleByteCharSetProber::GetConfidence(void)
* charsets used for the same language.
*/
r = r*mSeqCounters[POSITIVE_CAT] / mTotalChar;
/* The more control characters (proportionally to the size of the text), the
 * less confident we become in the current charset.
 */
r = r * (mTotalChar - mCtrlChar) / mTotalChar;
r = r*mFreqChar/mTotalChar;
if (r >= (float)1.00)
r = (float)0.99;

View File

@ -113,6 +113,7 @@ protected:
PRUint32 mSeqCounters[NUMBER_OF_SEQ_CAT];
PRUint32 mTotalChar;
PRUint32 mCtrlChar;
//characters that fall in our sampling range
PRUint32 mFreqChar;