From 8113f604de9124b7710304909d926573d4800296 Mon Sep 17 00:00:00 2001 From: Jehan Date: Fri, 19 Mar 2021 22:37:27 +0100 Subject: [PATCH] src: consider any combination with a non-frequent character as sequence. Basically since we exclude non-letters (Control chars, punctuations, spaces, separators, emoticons and whatnot), we consider any remaining character as an off-script letter (we may have forgotten some cases, but so far, it looks promising). Hence it is normal to consider a combination with these (i.e. 2 off-script letters or 1 frequent letter + 1 off-script, in any order) as a sequence too. Doing so will drop the confidence even more of any text having too many of these. As a consequence, it expands again the gap between the first and second contender, which seems to really show it works. --- src/nsLanguageDetector.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/nsLanguageDetector.cpp b/src/nsLanguageDetector.cpp index 2cb28ab..0b5baa1 100644 --- a/src/nsLanguageDetector.cpp +++ b/src/nsLanguageDetector.cpp @@ -143,6 +143,11 @@ nsDetectState nsLanguageDetector::HandleData(const int* codePoints, PRUint32 cpL * but they will drop a bit the confidence. */ mOutChar++; + order = -2; + + if (mLastOrder == -2 || mLastOrder >= 0) + /* Adding a non frequent sequence. */ + mTotalSeqs++; } } else if (order < mModel->freqCharCount) @@ -154,6 +159,11 @@ nsDetectState nsLanguageDetector::HandleData(const int* codePoints, PRUint32 cpL mTotalSeqs++; ++(mSeqCounters[mModel->precedenceMatrix[mLastOrder*mModel->freqCharCount+order]]); } + else if (mLastOrder == -2) + { + /* Adding a non frequent sequence. */ + mTotalSeqs++; + } } mLastOrder = order; }