From 8113f604de9124b7710304909d926573d4800296 Mon Sep 17 00:00:00 2001
From: Jehan <jehan@girinstud.io>
Date: Fri, 19 Mar 2021 22:37:27 +0100
Subject: [PATCH] src: consider any combination with a non-frequent character
 as sequence.

Basically, since we exclude non-letters (control characters, punctuation,
spaces, separators, emoticons and whatnot), we consider any remaining
character to be an off-script letter (we may have forgotten some cases,
but so far it looks promising). Hence it is normal to consider a
combination with these (i.e. 2 off-script letters, or 1 frequent letter +
1 off-script letter, in any order) as a sequence too. Doing so lowers
even further the confidence of any text containing too many of these. As
a consequence, it once again widens the gap between the first and second
contenders, which seems to show that it really works.
---
 src/nsLanguageDetector.cpp | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/src/nsLanguageDetector.cpp b/src/nsLanguageDetector.cpp
index 2cb28ab..0b5baa1 100644
--- a/src/nsLanguageDetector.cpp
+++ b/src/nsLanguageDetector.cpp
@@ -143,6 +143,11 @@ nsDetectState nsLanguageDetector::HandleData(const int* codePoints, PRUint32 cpL
          * but they will drop a bit the confidence.
          */
         mOutChar++;
+        order = -2;
+
+        if (mLastOrder == -2 || mLastOrder >= 0)
+          /* Adding a non frequent sequence. */
+          mTotalSeqs++;
       }
     }
     else if (order < mModel->freqCharCount)
@@ -154,6 +159,11 @@ nsDetectState nsLanguageDetector::HandleData(const int* codePoints, PRUint32 cpL
         mTotalSeqs++;
         ++(mSeqCounters[mModel->precedenceMatrix[mLastOrder*mModel->freqCharCount+order]]);
       }
+      else if (mLastOrder == -2)
+      {
+        /* Adding a non frequent sequence. */
+        mTotalSeqs++;
+      }
     }
     mLastOrder = order;
   }