src: consider any combination with a non-frequent character as a sequence.

Basically, since we exclude non-letters (control characters, punctuation, spaces, separators, emoticons and whatnot), we consider any remaining character as an off-script letter (we may have forgotten some cases, but so far, it looks promising). Hence it is normal to consider a combination with these (i.e. 2 off-script letters, or 1 frequent letter + 1 off-script letter, in any order) as a sequence too. Doing so will drop even further the confidence of any text having too many of these. As a consequence, it widens again the gap between the first and second contenders, which seems to really show it works.
2025-12-06 16:56:40 +08:00 · 2021-03-19 22:37:27 +01:00 · 2021-03-19 22:37:27 +01:00 · 8113f604de
commit 8113f604de
parent a1b186fa8b
1 changed files with 10 additions and 0 deletions
--- a/src/nsLanguageDetector.cpp
+++ b/src/nsLanguageDetector.cpp
@ -143,6 +143,11 @@ nsDetectState nsLanguageDetector::HandleData(const int* codePoints, PRUint32 cpL
         * but they will drop a bit the confidence.
         */
        mOutChar++;
+        order = -2;
+
+        if (mLastOrder == -2 || mLastOrder >= 0)
+          /* Adding a non frequent sequence. */
+          mTotalSeqs++;
      }
    }
    else if (order < mModel->freqCharCount)
@ -154,6 +159,11 @@ nsDetectState nsLanguageDetector::HandleData(const int* codePoints, PRUint32 cpL
        mTotalSeqs++;
        ++(mSeqCounters[mModel->precedenceMatrix[mLastOrder*mModel->freqCharCount+order]]);
      }
+      else if (mLastOrder == -2)
+      {
+        /* Adding a non frequent sequence. */
+        mTotalSeqs++;
+      }
    }
    mLastOrder = order;
  }