mirror of
https://gitlab.freedesktop.org/uchardet/uchardet.git
synced 2025-12-08 01:36:41 +08:00
Single Byte charsets: high ctrl character ratio lowers confidence.
Control characters are not an error per se. Nevertheless, they are clearly not frequent in single-byte charset texts, so it is only normal for them to lower confidence in a charset. In particular, a higher ctrl-per-letter ratio means a lower confidence. This fixes, for instance, our Windows-1252 German test (otherwise detected as ISO-8859-1).
This commit is contained in:
parent
aa587a64bd
commit
55b4f23971
@ -46,15 +46,21 @@ nsProbingState nsSingleByteCharSetProber::HandleData(const char* aBuf, PRUint32
|
||||
{
|
||||
order = mModel->charToOrderMap[(unsigned char)aBuf[i]];
|
||||
|
||||
if (order == ILL)
|
||||
if (order < SYMBOL_CAT_ORDER)
|
||||
{
|
||||
mTotalChar++;
|
||||
}
|
||||
else if (order == ILL)
|
||||
{
|
||||
/* When encountering an illegal codepoint, no need
|
||||
* to continue analyzing data. */
|
||||
mState = eNotMe;
|
||||
break;
|
||||
}
|
||||
if (order < SYMBOL_CAT_ORDER)
|
||||
mTotalChar++;
|
||||
else if (order == CTR)
|
||||
{
|
||||
mCtrlChar++;
|
||||
}
|
||||
if (order < mModel->freqCharCount)
|
||||
{
|
||||
mFreqChar++;
|
||||
@ -92,6 +98,7 @@ void nsSingleByteCharSetProber::Reset(void)
|
||||
mSeqCounters[i] = 0;
|
||||
mTotalSeqs = 0;
|
||||
mTotalChar = 0;
|
||||
mCtrlChar = 0;
|
||||
mFreqChar = 0;
|
||||
}
|
||||
|
||||
@ -118,6 +125,10 @@ float nsSingleByteCharSetProber::GetConfidence(void)
|
||||
* charsets used for the same language.
|
||||
*/
|
||||
r = r*mSeqCounters[POSITIVE_CAT] / mTotalChar;
|
||||
/* The more control characters (proportionally to the size of the text), the
|
||||
* less confident we become in the current charset.
|
||||
*/
|
||||
r = r * (mTotalChar - mCtrlChar) / mTotalChar;
|
||||
r = r*mFreqChar/mTotalChar;
|
||||
if (r >= (float)1.00)
|
||||
r = (float)0.99;
|
||||
|
||||
@ -113,6 +113,7 @@ protected:
|
||||
PRUint32 mSeqCounters[NUMBER_OF_SEQ_CAT];
|
||||
|
||||
PRUint32 mTotalChar;
|
||||
PRUint32 mCtrlChar;
|
||||
//characters that fall in our sampling range
|
||||
PRUint32 mFreqChar;
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user