From 55b4f23971db61c9ed93be6630c79a50bda9bd24 Mon Sep 17 00:00:00 2001 From: Jehan Date: Fri, 4 Dec 2015 00:00:33 +0100 Subject: [PATCH] Single Byte charsets: high ctrl character ratio lowers confidence. Control characters are not an error per se. Nevertheless they are clearly not frequent in single-byte charset texts. It is only normal for them to lower confidence in a charset. In particular a higher ctrl-per-letter ratio means a lower confidence. This fixes for instance our Windows-1252 German test (otherwise detected as ISO-8859-1). --- src/nsSBCharSetProber.cpp | 17 ++++++++++++++--- src/nsSBCharSetProber.h | 1 + 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/src/nsSBCharSetProber.cpp b/src/nsSBCharSetProber.cpp index fed3f03..fda1b9f 100644 --- a/src/nsSBCharSetProber.cpp +++ b/src/nsSBCharSetProber.cpp @@ -46,15 +46,21 @@ nsProbingState nsSingleByteCharSetProber::HandleData(const char* aBuf, PRUint32 { order = mModel->charToOrderMap[(unsigned char)aBuf[i]]; - if (order == ILL) + if (order < SYMBOL_CAT_ORDER) + { + mTotalChar++; + } + else if (order == ILL) { /* When encountering an illegal codepoint, no need * to continue analyzing data. */ mState = eNotMe; break; } - if (order < SYMBOL_CAT_ORDER) - mTotalChar++; + else if (order == CTR) + { + mCtrlChar++; + } if (order < mModel->freqCharCount) { mFreqChar++; @@ -92,6 +98,7 @@ void nsSingleByteCharSetProber::Reset(void) mSeqCounters[i] = 0; mTotalSeqs = 0; mTotalChar = 0; + mCtrlChar = 0; mFreqChar = 0; } @@ -118,6 +125,10 @@ float nsSingleByteCharSetProber::GetConfidence(void) * charsets used for the same language. */ r = r*mSeqCounters[POSITIVE_CAT] / mTotalChar; + /* The more control characters (proportionally to the size of the text), the + * less confident we become in the current charset. 
+ */ + r = r * (mTotalChar - mCtrlChar) / mTotalChar; r = r*mFreqChar/mTotalChar; if (r >= (float)1.00) r = (float)0.99; diff --git a/src/nsSBCharSetProber.h b/src/nsSBCharSetProber.h index 23c6cbb..63da429 100644 --- a/src/nsSBCharSetProber.h +++ b/src/nsSBCharSetProber.h @@ -113,6 +113,7 @@ protected: PRUint32 mSeqCounters[NUMBER_OF_SEQ_CAT]; PRUint32 mTotalChar; + PRUint32 mCtrlChar; //characters that fall in our sampling range PRUint32 mFreqChar;