From 401eb55dfc792dad1dd34f259e414ff3bf2cd38e Mon Sep 17 00:00:00 2001
From: Jehan <jehan@girinstud.io>
Date: Wed, 14 Dec 2022 20:02:59 +0100
Subject: [PATCH] src: improve algorithm for confidence computation.

In addition to the "frequent characters" concept, we add 2
sub-categories: the "very frequent characters" and the "rare
characters". The former are usually just a few characters which are used
most of the time (like 3 or 4 characters used 40% of the time!), whereas
the latter are often a dozen or more characters which, all together, are
used barely a few percent of the time.

We use this additional concept to help distinguish very similar
languages, or languages whose frequent characters are a subset of
the ones from another language (typically English, whose alphabet is a
subset of many other European languages).

The mTypicalPositiveRatio field is removed, as it was barely of any use
anyway (it was 0.99-something for nearly all languages!). Instead we
get these 2 new ratios: veryFreqRatio and lowFreqRatio, and of course
the associated order counts to know which characters are in these sets.
---
 src/nsLanguageDetector.cpp | 19 ++++++++++++++++---
 src/nsLanguageDetector.h   | 17 +++++++++++++++--
 2 files changed, 31 insertions(+), 5 deletions(-)
diff --git a/src/nsLanguageDetector.cpp b/src/nsLanguageDetector.cpp
index 8097028..1cce160 100644
--- a/src/nsLanguageDetector.cpp
+++ b/src/nsLanguageDetector.cpp
@@ -37,6 +37,7 @@
  *
  * ***** END LICENSE BLOCK ***** */
 
+#include <math.h>
 #include "nsLanguageDetector.h"
 
 nsDetectState nsLanguageDetector::HandleData(const int* codePoints, PRUint32 cpLen)
@@ -164,6 +165,11 @@ nsDetectState nsLanguageDetector::HandleData(const int* codePoints, PRUint32 cpL
         /* Adding a non frequent sequence. */
         mTotalSeqs++;
       }
+
+      if (order < mModel->veryFreqCharCount)
+        mVeryFreqChar++;
+      if (order > mModel->lowFreqOrder)
+        mLowFreqChar++;
     }
     mLastOrder = order;
   }
@@ -192,8 +198,10 @@ void nsLanguageDetector::Reset(void)
   //mCtrlChar  = 0;
   //mEmoticons  = 0;
   //mVariousBetween  = 0;
-  mFreqChar  = 0;
-  mOutChar   = 0;
+  mFreqChar     = 0;
+  mVeryFreqChar = 0;
+  mLowFreqChar  = 0;
+  mOutChar      = 0;
 }
 
 #include <cstdio>
@@ -212,7 +220,7 @@ float nsLanguageDetector::GetConfidence(void)
     //float neutralSeqs  = mSeqCounters[LANG_NEUTRAL_CAT];
     float negativeSeqs = mSeqCounters[LANG_NEGATIVE_CAT];
 
-    r = (positiveSeqs + probableSeqs / 4 - negativeSeqs * 4) / mTotalSeqs / mModel->mTypicalPositiveRatio;
+    r = (positiveSeqs + probableSeqs / 4 - negativeSeqs * 4) / mTotalSeqs;
     /* The more characters outside the expected characters
      * (proportionnaly to the size of the text), the less confident we
      * become in the current language.
@@ -222,6 +230,11 @@ float nsLanguageDetector::GetConfidence(void)
     r = r * (mTotalChar - mOutChar) / mTotalChar;
     r = r * mFreqChar / (mFreqChar + mOutChar);
 
+    /* How similar are the very frequent character ratio. */
+    r = r * (1.0 - fabs((float) mVeryFreqChar / mFreqChar - mModel->veryFreqRatio) / 4.0);
+    /* How similar are the very rare character ratio. */
+    r = r * (1.0 - fabs((float) mLowFreqChar / mFreqChar - mModel->lowFreqRatio) / 4.0);
+
     return r;
   }
   return (float)0.01;
diff --git a/src/nsLanguageDetector.h b/src/nsLanguageDetector.h
index 339d4e2..5feb726 100644
--- a/src/nsLanguageDetector.h
+++ b/src/nsLanguageDetector.h
@@ -67,8 +67,17 @@ typedef struct
    * characters. Yet it maps to this range of orders.
    */
   int                  freqCharCount;
-
-  float                mTypicalPositiveRatio;
+  /* Most languages have 3 or 4 characters which are used more than 40% of the
+   * times. We count how many they are and what ratio they are used.
+   */
+  int                  veryFreqCharCount;
+  float                veryFreqRatio;
+  /* Most languages will have a whole range of characters which in cumulated
+   * total are barely used a few percents of the times. We count how many they
+   * are and what ratio they are used.
+   */
+  int                  lowFreqOrder;
+  float                lowFreqRatio;
 } LanguageModel;
 
 typedef enum {
@@ -105,6 +114,10 @@ protected:
   /*PRUint32 mVariousBetween;*/
   /* Characters that fall in our sampling range */
   PRUint32 mFreqChar;
+  /* Most common characters from our sampling range */
+  PRUint32 mVeryFreqChar;
+  /* Most rare characters from our sampling range */
+  PRUint32 mLowFreqChar;
   PRUint32 mOutChar;
 
 private: