From ba6b46a68c61a723bd84b3774f3a433d6162f347 Mon Sep 17 00:00:00 2001
From: Jehan <jehan@girinstud.io>
Date: Wed, 17 Mar 2021 16:34:26 +0100
Subject: [PATCH] src: make nsMBCSGroupProber report all valid candidates.

Returning only the best candidate is limiting, as it does not allow
callers to compare candidates whose confidences are very close. In
particular, the UTF-8 prober now returns a ("UTF-8", lang) candidate
for every language with a probable statistical fit.
---
 src/nsMBCSGroupProber.cpp   | 224 ++++++++++++++++++++++++++----------
 src/nsMBCSGroupProber.h     |   8 +-
 src/nsUniversalDetector.cpp |  65 ++++++-----
 src/nsUniversalDetector.h   |   6 +-
 4 files changed, 203 insertions(+), 100 deletions(-)

diff --git a/src/nsMBCSGroupProber.cpp b/src/nsMBCSGroupProber.cpp
index 3b21530..790d099 100644
--- a/src/nsMBCSGroupProber.cpp
+++ b/src/nsMBCSGroupProber.cpp
@@ -138,45 +138,87 @@ nsMBCSGroupProber::~nsMBCSGroupProber()
   }
 }
 
+#define CANDIDATE_THRESHOLD 0.3f
+
+/* Refresh the candidates table with CheckCandidates(), then return the
+ * number of (prober, language) couples currently flagged in it. */
+int nsMBCSGroupProber::GetCandidates()
+{
+  int count = 0;
+
+  CheckCandidates();
+  for (PRUint32 p = 0; p < NUM_OF_PROBERS; p++)
+    for (PRUint32 l = 0; l < NUM_OF_LANGUAGES; l++)
+      count += candidates[p][l] ? 1 : 0;
+
+  return count;
+}
+
 const char* nsMBCSGroupProber::GetCharSetName(int candidate)
 {
-  if (mBestGuess == -1)
-  {
-    GetConfidence(0);
-    if (mBestGuess == -1)
-      mBestGuess = 0;
-  }
-  return mProbers[mBestGuess]->GetCharSetName(0);
+  /* Charset name of the candidate-th flagged (prober, language) couple,
+   * in table order; NULL when there is no candidate at all. */
+  const int total = GetCandidates();
+  int       seen  = 0;
+
+  if (total == 0)
+    return NULL;
+
+  /* An index past the end falls back to the first candidate. */
+  if (candidate >= total)
+    candidate = 0;
+
+  for (PRUint32 p = 0; p < NUM_OF_PROBERS; p++)
+  {
+    for (PRUint32 l = 0; l < NUM_OF_LANGUAGES; l++)
+    {
+      if (! candidates[p][l] || candidate != seen++)
+        continue;
+
+      /* Sub-probers are assumed to expose a single candidate. */
+      return mProbers[p]->GetCharSetName(0);
+    }
+  }
+
+  return NULL; /* Not reached for a valid index. */
 }
 
 const char* nsMBCSGroupProber::GetLanguage(int candidate)
 {
-  const char* maxLang       = NULL;
-  int         maxLangIdx    = -1;
-  float       maxConfidence = 0.0;
+  const char* lang   = NULL;
+  int num_candidates = GetCandidates();
+  int candidate_it   = 0;
 
-  if (mBestGuess == -1)
+  if (num_candidates == 0)
     return NULL;
-  else
-    maxLang = mProbers[mBestGuess]->GetLanguage(0);
+  else if (candidate >= num_candidates)
+    /* Out-of-range index: fall back to the first candidate. */
+    candidate = 0;
 
-  if (maxLang == NULL && mProbers[mBestGuess]->DecodeToUnicode())
-  {
+  for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
     for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++)
-    {
-      float conf = langDetectors[mBestGuess][j]->GetConfidence();
-
-      if (conf > maxConfidence)
+      if (candidates[i][j])
       {
-        maxLangIdx = j;
-        maxConfidence = conf;
-      }
-    }
-    if (maxLangIdx != -1)
-      maxLang = langDetectors[mBestGuess][maxLangIdx]->GetLanguage();
-  }
+        if (candidate == candidate_it)
+        {
+          /* This is the candidate-th flagged (prober, language) couple.
+           * Each prober of this group is assumed to expose a single
+           * candidate, hence the fixed index 0. */
+          lang = mProbers[i]->GetLanguage(0);
 
-  return maxLang;
+          if (! lang)
+          {
+            /* The prober reported no language: use detector j's, if any. */
+            if (langDetectors[i][j])
+              lang = langDetectors[i][j]->GetLanguage();
+          }
+
+          return lang;
+        }
+        candidate_it++;
+      }
+
+  return lang;
 }
 
 void nsMBCSGroupProber::Reset(void)
@@ -196,17 +238,18 @@ void nsMBCSGroupProber::Reset(void)
         codePointBuffer[i] = new int[codePointBufferSize[i]];
       }
       codePointBufferIdx[i] = 0;
-
-      for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++)
-      {
-        if (langDetectors[i][j])
-          langDetectors[i][j]->Reset();
-      }
     }
     else
       mIsActive[i] = PR_FALSE;
+
+    for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++)
+    {
+      if (langDetectors[i][j])
+        langDetectors[i][j]->Reset();
+
+      candidates[i][j] = false;
+    }
   }
-  mBestGuess = -1;
   mState = eDetecting;
   mKeepNext = 0;
 }
@@ -252,9 +295,21 @@ nsProbingState nsMBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen,
 
           if (st == eFoundIt)
           {
-            mBestGuess = i;
-            mState = eFoundIt;
-            return mState;
+            float cf = mProbers[i]->GetConfidence(0);
+
+            for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++)
+            {
+              /* langDetectors[i][j] may be NULL (GetConfidence() guards
+               * against it too): count it as a neutral 1.0 factor. */
+              float langConf = langDetectors[i][j] ?
+                               langDetectors[i][j]->GetConfidence() : 1.0f;
+              if (cf * langConf > CANDIDATE_THRESHOLD)
+              {
+                /* At least one (charset, lang) couple is confident enough. */
+                mState = eFoundIt;
+                return mState;
+              }
+            }
           }
         }
       }
@@ -288,9 +343,21 @@ nsProbingState nsMBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen,
 
       if (st == eFoundIt)
       {
-        mBestGuess = i;
-        mState = eFoundIt;
-        return mState;
+        float cf = mProbers[i]->GetConfidence(0);
+
+        for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++)
+        {
+          /* langDetectors[i][j] may be NULL (GetConfidence() guards
+           * against it too): count it as a neutral 1.0 factor. */
+          float langConf = langDetectors[i][j] ?
+                           langDetectors[i][j]->GetConfidence() : 1.0f;
+          if (cf * langConf > CANDIDATE_THRESHOLD)
+          {
+            /* At least one (charset, lang) couple is confident enough. */
+            mState = eFoundIt;
+            return mState;
+          }
+        }
       }
     }
   }
@@ -299,10 +366,49 @@ nsProbingState nsMBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen,
   return mState;
 }
 
+/* Recompute the candidates table: candidates[i][j] is set when prober i
+ * is active and its confidence, weighed by language detector j, exceeds
+ * CANDIDATE_THRESHOLD. Rows of inactive probers are cleared. */
+void nsMBCSGroupProber::CheckCandidates()
+{
+  for (int i = 0; i < NUM_OF_PROBERS; i++)
+  {
+    if (! mIsActive[i])
+    {
+      for (int j = 0; j < NUM_OF_LANGUAGES; j++)
+        candidates[i][j] = false;
+      continue;
+    }
+
+    float cf     = mProbers[i]->GetConfidence(0);
+    bool  decode = mProbers[i]->DecodeToUnicode();
+
+    for (int j = 0; j < NUM_OF_LANGUAGES; j++)
+    {
+      /* Without a usable language detector (prober not decoding to
+       * Unicode, or NULL slot as guarded against in GetConfidence()),
+       * only the charset confidence is compared to the threshold. */
+      float langConf = 1.0f;
+
+      if (decode && langDetectors[i][j])
+        langConf = langDetectors[i][j]->GetConfidence();
+      candidates[i][j] = (cf * langConf > CANDIDATE_THRESHOLD);
+    }
+  }
+}
+
 float nsMBCSGroupProber::GetConfidence(int candidate)
 {
+  int num_candidates = GetCandidates();
+  int candidate_it   = 0;
+
   PRUint32 i;
-  float bestConf = 0.0, cf;
+
+  if (num_candidates == 0)
+    return 0.0;
+  else if (candidate >= num_candidates)
+    /* Just show the first candidate. */
+    candidate = 0;
 
   switch (mState)
   {
@@ -312,32 +418,26 @@ float nsMBCSGroupProber::GetConfidence(int candidate)
   default:
     for (i = 0; i < NUM_OF_PROBERS; i++)
     {
-      float bestLangConf = 0.0;
-
-      if (!mIsActive[i])
-        continue;
-      cf = mProbers[i]->GetConfidence(0);
-
-      if (mProbers[i]->DecodeToUnicode())
-      {
-        for (int j = 0; j < NUM_OF_LANGUAGES; j++)
+      for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++)
+        if (candidates[i][j])
         {
-            float langConf = langDetectors[i][j]->GetConfidence();
+          if (candidate == candidate_it)
+          {
+            float cf       = mProbers[i]->GetConfidence(0);
+            float langConf = 1.0;
 
-            if (bestLangConf < langConf)
-              bestLangConf = langConf;
+            if (langDetectors[i][j])
+              langConf = langDetectors[i][j]->GetConfidence();
+
+            return cf * langConf;
+          }
+          candidate_it++;
         }
-        cf *= bestLangConf;
-      }
-
-      if (bestConf < cf)
-      {
-        bestConf = cf;
-        mBestGuess = i;
-      }
     }
   }
-  return bestConf;
+
+  /* Should not happen. */
+  return 0.0;
 }
 
 #ifdef DEBUG_chardet
diff --git a/src/nsMBCSGroupProber.h b/src/nsMBCSGroupProber.h
index 2da8d79..190ef65 100644
--- a/src/nsMBCSGroupProber.h
+++ b/src/nsMBCSGroupProber.h
@@ -57,7 +57,7 @@ public:
   nsProbingState HandleData(const char* aBuf, PRUint32 aLen,
                             int** codePointBuffer,
                             int*  codePointBufferIdx);
-  int GetCandidates() { return 1; }
+  int         GetCandidates();
   const char* GetCharSetName(int candidate);
   const char* GetLanguage(int candidate);
   nsProbingState GetState(void) {return mState;}
@@ -76,15 +76,19 @@ protected:
   nsProbingState mState;
   nsCharSetProber* mProbers[NUM_OF_PROBERS];
   PRBool          mIsActive[NUM_OF_PROBERS];
-  PRInt32 mBestGuess;
   PRUint32 mActiveNum;
   PRUint32 mKeepNext;
 
+  PRBool   candidates[NUM_OF_PROBERS][NUM_OF_LANGUAGES];
+
   int *codePointBuffer[NUM_OF_PROBERS];
   int  codePointBufferSize[NUM_OF_PROBERS];
   int  codePointBufferIdx[NUM_OF_PROBERS];
 
   nsLanguageDetector *langDetectors[NUM_OF_PROBERS][NUM_OF_LANGUAGES];
+
+private:
+  void CheckCandidates();
 };
 
 #endif /* nsMBCSGroupProber_h__ */
diff --git a/src/nsUniversalDetector.cpp b/src/nsUniversalDetector.cpp
index 184a114..6695aff 100644
--- a/src/nsUniversalDetector.cpp
+++ b/src/nsUniversalDetector.cpp
@@ -54,9 +54,9 @@ nsUniversalDetector::nsUniversalDetector(PRUint32 aLanguageFilter)
   mEscCharSetProber = nsnull;
 
   mStart = PR_TRUE;
-  mDetectedCharset = nsnull;
-  mDetectedLanguage = nsnull;
-  mDetectedConfidence = 0.0;
+  /* HandleData() and DataEnd() test shortcutCharset: it must start NULL. */
+  shortcutCharset = nsnull;
+  shortcutConfidence = 0.0;
   mGotData = PR_FALSE;
   mInputState = ePureAscii;
   mLastChar = '\0';
@@ -84,9 +81,9 @@ nsUniversalDetector::Reset()
   mInTag = PR_FALSE;
 
   mStart = PR_TRUE;
-  mDetectedCharset = nsnull;
-  mDetectedLanguage = nsnull;
-  mDetectedConfidence = 0.0;
+  /* Forget any shortcut result (BOM, escape sequence...) of a previous run. */
+  shortcutCharset = nsnull;
+  shortcutConfidence = 0.0;
   mGotData = PR_FALSE;
   mInputState = ePureAscii;
   mLastChar = '\0';
@@ -124,16 +118,16 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
           if (('\xBB' == aBuf[1]) && ('\xBF' == aBuf[2]))
           {
             /* EF BB BF: UTF-8 encoded BOM. */
-            mDetectedCharset = "UTF-8";
-            mDetectedConfidence = 0.99;
+            shortcutCharset = "UTF-8";
+            shortcutConfidence = 0.99;
           }
         break;
         case '\xFE':
           if ('\xFF' == aBuf[1])
           {
             /* FE FF: UTF-16, big endian BOM. */
-            mDetectedCharset = "UTF-16";
-            mDetectedConfidence = 0.99;
+            shortcutCharset = "UTF-16";
+            shortcutConfidence = 0.99;
           }
         break;
         case '\xFF':
@@ -144,14 +138,14 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
                 aBuf[3] == '\x00')
             {
                 /* FF FE 00 00: UTF-32 (LE). */
-                mDetectedCharset = "UTF-32";
-                mDetectedConfidence = 0.99;
+                shortcutCharset = "UTF-32";
+                shortcutConfidence = 0.99;
             }
             else
             {
                 /* FF FE: UTF-16, little endian BOM. */
-                mDetectedCharset = "UTF-16";
-                mDetectedConfidence = 0.99;
+                shortcutCharset = "UTF-16";
+                shortcutConfidence = 0.99;
             }
           }
           break;
@@ -162,14 +156,14 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
               aBuf[3] == '\xFF')
           {
               /* 00 00 FE FF: UTF-32 (BE). */
-              mDetectedCharset = "UTF-32";
-              mDetectedConfidence = 0.99;
+              shortcutCharset = "UTF-32";
+              shortcutConfidence = 0.99;
           }
           break;
         }
     }
 
-    if (mDetectedCharset)
+    if (shortcutCharset)
     {
         mDone = PR_TRUE;
         return NS_OK;
@@ -252,9 +246,9 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
     st = mEscCharSetProber->HandleData(aBuf, aLen, NULL, NULL);
     if (st == eFoundIt)
     {
+      shortcutCharset = mEscCharSetProber->GetCharSetName(0);
+      shortcutConfidence = mEscCharSetProber->GetConfidence(0);
       mDone = PR_TRUE;
-      mDetectedCharset = mEscCharSetProber->GetCharSetName(0);
-      mDetectedConfidence = mEscCharSetProber->GetConfidence(0);
     }
     break;
   case eHighbyte:
@@ -266,9 +260,6 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
         if (st == eFoundIt)
         {
           mDone = PR_TRUE;
-          mDetectedCharset = mCharSetProbers[i]->GetCharSetName(0);
-          mDetectedLanguage = mCharSetProbers[i]->GetLanguage(0);
-          mDetectedConfidence = mCharSetProbers[i]->GetConfidence(0);
           return NS_OK;
         }
       }
@@ -292,7 +283,7 @@ void nsUniversalDetector::DataEnd()
     return;
   }
 
-  if (! mDetectedCharset)
+  if (! shortcutCharset)
   {
     switch (mInputState)
     {
@@ -302,26 +293,27 @@ void nsUniversalDetector::DataEnd()
       {
           /* ISO-8859-1 is a good result candidate for ASCII + NBSP.
            * (though it could have been any ISO-8859 encoding). */
-          mDetectedCharset = "ISO-8859-1";
+          shortcutCharset = "ISO-8859-1";
       }
       else
       {
           /* ASCII with the ESC character (or the sequence "~{") is still
            * ASCII until proven otherwise. */
-          mDetectedCharset = "ASCII";
+          shortcutCharset = "ASCII";
       }
+      shortcutConfidence = 0.99;
     default:
       break;
     }
   }
 
-  if (mDetectedCharset)
+  if (shortcutCharset)
   {
       /* These cases are limited enough that we are always confident
        * when finding them.
        */
       mDone = PR_TRUE;
-      Report(mDetectedCharset, mDetectedLanguage, mDetectedConfidence);
+      Report(shortcutCharset, NULL, shortcutConfidence);
       return;
   }
 
@@ -335,13 +327,20 @@ void nsUniversalDetector::DataEnd()
       {
         if (mCharSetProbers[i])
         {
-          proberConfidence = mCharSetProbers[i]->GetConfidence(0);
+          int n_candidates = mCharSetProbers[i]->GetCandidates();
 
-          if (proberConfidence > MINIMUM_THRESHOLD)
-              /* Only report what we are confident in. */
-              Report(mCharSetProbers[i]->GetCharSetName(0),
-                     mCharSetProbers[i]->GetLanguage(0),
-                     proberConfidence);
+          for (int c = 0; c < n_candidates; c++)
+          {
+            proberConfidence = mCharSetProbers[i]->GetConfidence(c);
+
+            if (proberConfidence > MINIMUM_THRESHOLD)
+            {
+                /* Only report what we are confident in. */
+                Report(mCharSetProbers[i]->GetCharSetName(c),
+                       mCharSetProbers[i]->GetLanguage(c),
+                       proberConfidence);
+            }
+          }
         }
       }
     }
diff --git a/src/nsUniversalDetector.h b/src/nsUniversalDetector.h
index 521e424..a286ed9 100644
--- a/src/nsUniversalDetector.h
+++ b/src/nsUniversalDetector.h
@@ -80,9 +80,9 @@ protected:
    PRBool  mStart;
    PRBool  mGotData;
    char    mLastChar;
-   const char *  mDetectedCharset;
-   const char *  mDetectedLanguage;
-   float         mDetectedConfidence;
+   const char *  shortcutCharset;
+   float         shortcutConfidence;
+
    PRInt32 mBestGuess;
    PRUint32 mLanguageFilter;