Single Byte charsets: high ctrl character ratio lowers confidence.

Control characters are not an error per se. Nevertheless, they are clearly infrequent in single-byte charset texts, so it is only natural for them to lower our confidence in a charset: the higher the ctrl-per-letter ratio, the lower the confidence. This fixes, for instance, our Windows-1252 German test (which was otherwise detected as ISO-8859-1).
2026-02-07 18:26:51 +08:00 · 2015-12-04 00:00:33 +01:00 · 2015-12-04 00:00:33 +01:00 · 55b4f23971
commit 55b4f23971
parent aa587a64bd
2 changed files with 15 additions and 3 deletions
--- a/src/nsSBCharSetProber.cpp
+++ b/src/nsSBCharSetProber.cpp
@@ -46,15 +46,21 @@ nsProbingState nsSingleByteCharSetProber::HandleData(const char* aBuf, PRUint32
  {
    order = mModel->charToOrderMap[(unsigned char)aBuf[i]];

-    if (order == ILL)
+    if (order < SYMBOL_CAT_ORDER)
+    {
+      mTotalChar++;
+    }
+    else if (order == ILL)
    {
      /* When encountering an illegal codepoint, no need
       * to continue analyzing data. */
      mState = eNotMe;
      break;
    }
-    if (order < SYMBOL_CAT_ORDER)
-      mTotalChar++;
+    else if (order == CTR)
+    {
+      mCtrlChar++;
+    }
    if (order < mModel->freqCharCount)
    {
        mFreqChar++;
@@ -92,6 +98,7 @@ void  nsSingleByteCharSetProber::Reset(void)
    mSeqCounters[i] = 0;
  mTotalSeqs = 0;
  mTotalChar = 0;
+  mCtrlChar  = 0;
  mFreqChar = 0;
 }

@@ -118,6 +125,10 @@ float nsSingleByteCharSetProber::GetConfidence(void)
     * charsets used for the same language.
     */
    r = r*mSeqCounters[POSITIVE_CAT] / mTotalChar;
+    /* The more control characters (proportionally to the size of the text), the
+     * less confident we become in the current charset.
+     */
+    r = r * (mTotalChar - mCtrlChar) / mTotalChar;
    r = r*mFreqChar/mTotalChar;
    if (r >= (float)1.00)
      r = (float)0.99;
--- a/src/nsSBCharSetProber.h
+++ b/src/nsSBCharSetProber.h
@@ -113,6 +113,7 @@ protected:
  PRUint32 mSeqCounters[NUMBER_OF_SEQ_CAT];

  PRUint32 mTotalChar;
+  PRUint32 mCtrlChar;
  //characters that fall in our sampling range
  PRUint32 mFreqChar;