src: make nsMBCSGroupProber report all valid candidates.

Returning only the best candidate is limiting, as it does not allow comparing candidates whose confidences are very close. In particular, the UTF-8 prober will now return a ("UTF-8", lang) candidate for every language with a probable statistical fit.
2026-02-07 10:19:59 +08:00 · 2021-03-17 16:34:26 +01:00 · 2021-03-17 16:34:26 +01:00 · 6138d9e0f0
commit 6138d9e0f0
parent 2127f4fc0d
4 changed files with 203 additions and 100 deletions
--- a/src/nsMBCSGroupProber.cpp
+++ b/src/nsMBCSGroupProber.cpp
@ -138,45 +138,87 @@ nsMBCSGroupProber::~nsMBCSGroupProber()
  }
 }

+#define CANDIDATE_THRESHOLD 0.3f
+
+int nsMBCSGroupProber::GetCandidates()
+{
+  int num_candidates = 0;
+
+  CheckCandidates();
+
+  for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
+    for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++)
+      if (candidates[i][j])
+        num_candidates++;
+
+  return num_candidates;
+}
+
 const char* nsMBCSGroupProber::GetCharSetName(int candidate)
 {
-  if (mBestGuess == -1)
-  {
-    GetConfidence(0);
-    if (mBestGuess == -1)
-      mBestGuess = 0;
-  }
-  return mProbers[mBestGuess]->GetCharSetName(0);
+  int num_candidates = GetCandidates();
+  int candidate_it   = 0;
+
+  if (num_candidates == 0)
+    return NULL;
+  else if (candidate >= num_candidates)
+    /* Just show the first candidate. */
+    candidate = 0;
+
+  for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
+    for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++)
+      if (candidates[i][j])
+      {
+        if (candidate == candidate_it)
+        {
+          /* We assume that probers included in the nsMBCSGroupProber
+           * return only one candidate themselves.
+           * */
+          return mProbers[i]->GetCharSetName(0);
+        }
+        candidate_it++;
+      }
+
+  /* Should not happen. */
+  return NULL;
 }

 const char* nsMBCSGroupProber::GetLanguage(int candidate)
 {
-  const char* maxLang       = NULL;
-  int         maxLangIdx    = -1;
-  float       maxConfidence = 0.0;
+  const char* lang   = NULL;
+  int num_candidates = GetCandidates();
+  int candidate_it   = 0;

-  if (mBestGuess == -1)
+  if (num_candidates == 0)
    return NULL;
-  else
-    maxLang = mProbers[mBestGuess]->GetLanguage(0);
+  else if (candidate >= num_candidates)
+    /* Just show the first candidate. */
+    candidate = 0;

-  if (maxLang == NULL && mProbers[mBestGuess]->DecodeToUnicode())
-  {
+  for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
    for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++)
-    {
-      float conf = langDetectors[mBestGuess][j]->GetConfidence();
-
-      if (conf > maxConfidence)
+      if (candidates[i][j])
      {
-        maxLangIdx = j;
-        maxConfidence = conf;
-      }
-    }
-    if (maxLangIdx != -1)
-      maxLang = langDetectors[mBestGuess][maxLangIdx]->GetLanguage();
-  }
+        if (candidate == candidate_it)
+        {
+          /* We assume that probers included in the nsMBCSGroupProber
+           * return only one candidate themselves.
+           * */
+          lang = mProbers[i]->GetLanguage(0);

-  return maxLang;
+          if (! lang)
+          {
+            /* The prober does not come with its own language. */
+            if (langDetectors[i][j])
+              lang = langDetectors[i][j]->GetLanguage();
+          }
+
+          return lang;
+        }
+        candidate_it++;
+      }
+
+  return lang;
 }

 void nsMBCSGroupProber::Reset(void)
@ -196,17 +238,18 @@ void nsMBCSGroupProber::Reset(void)
        codePointBuffer[i] = new int[codePointBufferSize[i]];
      }
      codePointBufferIdx[i] = 0;
-
-      for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++)
-      {
-        if (langDetectors[i][j])
-          langDetectors[i][j]->Reset();
-      }
    }
    else
      mIsActive[i] = PR_FALSE;
+
+    for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++)
+    {
+      if (langDetectors[i][j])
+        langDetectors[i][j]->Reset();
+
+      candidates[i][j] = false;
+    }
  }
-  mBestGuess = -1;
  mState = eDetecting;
  mKeepNext = 0;
 }
@ -252,9 +295,21 @@ nsProbingState nsMBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen,

          if (st == eFoundIt)
          {
-            mBestGuess = i;
-            mState = eFoundIt;
-            return mState;
+            float cf = mProbers[i]->GetConfidence(0);
+
+            for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++)
+            {
+              float langConf = langDetectors[i][j]->GetConfidence();
+
+              if (cf * langConf > CANDIDATE_THRESHOLD)
+              {
+                /* There is at least one (charset, lang) couple for
+                 * which the confidence is high enough.
+                 */
+                mState = eFoundIt;
+                return mState;
+              }
+            }
          }
        }
      }
@ -288,9 +343,21 @@ nsProbingState nsMBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen,

      if (st == eFoundIt)
      {
-        mBestGuess = i;
-        mState = eFoundIt;
-        return mState;
+        float cf = mProbers[i]->GetConfidence(0);
+
+        for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++)
+        {
+          float langConf = langDetectors[i][j]->GetConfidence();
+
+          if (cf * langConf > CANDIDATE_THRESHOLD)
+          {
+            /* There is at least one (charset, lang) couple for
+             * which the confidence is high enough.
+             */
+            mState = eFoundIt;
+            return mState;
+          }
+        }
      }
    }
  }
@ -299,10 +366,49 @@ nsProbingState nsMBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen,
  return mState;
 }

+void nsMBCSGroupProber::CheckCandidates()
+{
+  for (int i = 0; i < NUM_OF_PROBERS; i++)
+  {
+    if (! mIsActive[i])
+    {
+      for (int j = 0; j < NUM_OF_LANGUAGES; j++)
+        candidates[i][j] = false;
+    }
+    else
+    {
+      float cf = mProbers[i]->GetConfidence(0);
+
+      if (mProbers[i]->DecodeToUnicode())
+      {
+        for (int j = 0; j < NUM_OF_LANGUAGES; j++)
+        {
+          float langConf = langDetectors[i][j]->GetConfidence();
+
+          candidates[i][j] = (cf * langConf > CANDIDATE_THRESHOLD);
+        }
+      }
+      else
+      {
+        for (int j = 0; j < NUM_OF_LANGUAGES; j++)
+          candidates[i][j] = (cf > CANDIDATE_THRESHOLD);
+      }
+    }
+  }
+}
+
 float nsMBCSGroupProber::GetConfidence(int candidate)
 {
+  int num_candidates = GetCandidates();
+  int candidate_it   = 0;
+
  PRUint32 i;
-  float bestConf = 0.0, cf;
+
+  if (num_candidates == 0)
+    return 0.0;
+  else if (candidate >= num_candidates)
+    /* Just show the first candidate. */
+    candidate = 0;

  switch (mState)
  {
@ -312,32 +418,26 @@ float nsMBCSGroupProber::GetConfidence(int candidate)
  default:
    for (i = 0; i < NUM_OF_PROBERS; i++)
    {
-      float bestLangConf = 0.0;
-
-      if (!mIsActive[i])
-        continue;
-      cf = mProbers[i]->GetConfidence(0);
-
-      if (mProbers[i]->DecodeToUnicode())
-      {
-        for (int j = 0; j < NUM_OF_LANGUAGES; j++)
+      for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++)
+        if (candidates[i][j])
        {
-            float langConf = langDetectors[i][j]->GetConfidence();
+          if (candidate == candidate_it)
+          {
+            float cf       = mProbers[i]->GetConfidence(0);
+            float langConf = 1.0;

-            if (bestLangConf < langConf)
-              bestLangConf = langConf;
+            if (langDetectors[i][j])
+              langConf = langDetectors[i][j]->GetConfidence();
+
+            return cf * langConf;
+          }
+          candidate_it++;
        }
-        cf *= bestLangConf;
-      }
-
-      if (bestConf < cf)
-      {
-        bestConf = cf;
-        mBestGuess = i;
-      }
    }
  }
-  return bestConf;
+
+  /* Should not happen. */
+  return 0.0;
 }

 #ifdef DEBUG_chardet
--- a/src/nsMBCSGroupProber.h
+++ b/src/nsMBCSGroupProber.h
@ -57,7 +57,7 @@ public:
  nsProbingState HandleData(const char* aBuf, PRUint32 aLen,
                            int** codePointBuffer,
                            int*  codePointBufferIdx);
-  int GetCandidates() { return 1; }
+  int         GetCandidates();
  const char* GetCharSetName(int candidate);
  const char* GetLanguage(int candidate);
  nsProbingState GetState(void) {return mState;}
@ -76,15 +76,19 @@ protected:
  nsProbingState mState;
  nsCharSetProber* mProbers[NUM_OF_PROBERS];
  PRBool          mIsActive[NUM_OF_PROBERS];
-  PRInt32 mBestGuess;
  PRUint32 mActiveNum;
  PRUint32 mKeepNext;

+  PRBool   candidates[NUM_OF_PROBERS][NUM_OF_LANGUAGES];
+
  int *codePointBuffer[NUM_OF_PROBERS];
  int  codePointBufferSize[NUM_OF_PROBERS];
  int  codePointBufferIdx[NUM_OF_PROBERS];

  nsLanguageDetector *langDetectors[NUM_OF_PROBERS][NUM_OF_LANGUAGES];
+
+private:
+  void CheckCandidates();
 };

 #endif /* nsMBCSGroupProber_h__ */
--- a/src/nsUniversalDetector.cpp
+++ b/src/nsUniversalDetector.cpp
@ -54,9 +54,6 @@ nsUniversalDetector::nsUniversalDetector(PRUint32 aLanguageFilter)
  mEscCharSetProber = nsnull;

  mStart = PR_TRUE;
-  mDetectedCharset = nsnull;
-  mDetectedLanguage = nsnull;
-  mDetectedConfidence = 0.0;
  mGotData = PR_FALSE;
  mInputState = ePureAscii;
  mLastChar = '\0';
@ -84,9 +81,6 @@ nsUniversalDetector::Reset()
  mInTag = PR_FALSE;

  mStart = PR_TRUE;
-  mDetectedCharset = nsnull;
-  mDetectedLanguage = nsnull;
-  mDetectedConfidence = 0.0;
  mGotData = PR_FALSE;
  mInputState = ePureAscii;
  mLastChar = '\0';
@ -124,16 +118,16 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
          if (('\xBB' == aBuf[1]) && ('\xBF' == aBuf[2]))
          {
            /* EF BB BF: UTF-8 encoded BOM. */
-            mDetectedCharset = "UTF-8";
-            mDetectedConfidence = 0.99;
+            shortcutCharset = "UTF-8";
+            shortcutConfidence = 0.99;
          }
        break;
        case '\xFE':
          if ('\xFF' == aBuf[1])
          {
            /* FE FF: UTF-16, big endian BOM. */
-            mDetectedCharset = "UTF-16";
-            mDetectedConfidence = 0.99;
+            shortcutCharset = "UTF-16";
+            shortcutConfidence = 0.99;
          }
        break;
        case '\xFF':
@ -144,14 +138,14 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
                aBuf[3] == '\x00')
            {
                /* FF FE 00 00: UTF-32 (LE). */
-                mDetectedCharset = "UTF-32";
-                mDetectedConfidence = 0.99;
+                shortcutCharset = "UTF-32";
+                shortcutConfidence = 0.99;
            }
            else
            {
                /* FF FE: UTF-16, little endian BOM. */
-                mDetectedCharset = "UTF-16";
-                mDetectedConfidence = 0.99;
+                shortcutCharset = "UTF-16";
+                shortcutConfidence = 0.99;
            }
          }
          break;
@ -162,14 +156,14 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
              aBuf[3] == '\xFF')
          {
              /* 00 00 FE FF: UTF-32 (BE). */
-              mDetectedCharset = "UTF-32";
-              mDetectedConfidence = 0.99;
+              shortcutCharset = "UTF-32";
+              shortcutConfidence = 0.99;
          }
          break;
        }
    }

-    if (mDetectedCharset)
+    if (shortcutCharset)
    {
        mDone = PR_TRUE;
        return NS_OK;
@ -252,9 +246,9 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
    st = mEscCharSetProber->HandleData(aBuf, aLen, NULL, NULL);
    if (st == eFoundIt)
    {
+      shortcutCharset = mEscCharSetProber->GetCharSetName(0);
+      shortcutConfidence = mEscCharSetProber->GetConfidence(0);
      mDone = PR_TRUE;
-      mDetectedCharset = mEscCharSetProber->GetCharSetName(0);
-      mDetectedConfidence = mEscCharSetProber->GetConfidence(0);
    }
    break;
  case eHighbyte:
@ -266,9 +260,6 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
        if (st == eFoundIt)
        {
          mDone = PR_TRUE;
-          mDetectedCharset = mCharSetProbers[i]->GetCharSetName(0);
-          mDetectedLanguage = mCharSetProbers[i]->GetLanguage(0);
-          mDetectedConfidence = mCharSetProbers[i]->GetConfidence(0);
          return NS_OK;
        }
      }
@ -292,7 +283,7 @@ void nsUniversalDetector::DataEnd()
    return;
  }

-  if (! mDetectedCharset)
+  if (! shortcutCharset)
  {
    switch (mInputState)
    {
@ -302,26 +293,27 @@ void nsUniversalDetector::DataEnd()
      {
          /* ISO-8859-1 is a good result candidate for ASCII + NBSP.
           * (though it could have been any ISO-8859 encoding). */
-          mDetectedCharset = "ISO-8859-1";
+          shortcutCharset = "ISO-8859-1";
      }
      else
      {
          /* ASCII with the ESC character (or the sequence "~{") is still
           * ASCII until proven otherwise. */
-          mDetectedCharset = "ASCII";
+          shortcutCharset = "ASCII";
      }
+      shortcutConfidence = 0.99;
    default:
      break;
    }
  }

-  if (mDetectedCharset)
+  if (shortcutCharset)
  {
      /* These cases are limited enough that we are always confident
       * when finding them.
       */
      mDone = PR_TRUE;
-      Report(mDetectedCharset, mDetectedLanguage, mDetectedConfidence);
+      Report(shortcutCharset, NULL, shortcutConfidence);
      return;
  }

@ -335,13 +327,20 @@ void nsUniversalDetector::DataEnd()
      {
        if (mCharSetProbers[i])
        {
-          proberConfidence = mCharSetProbers[i]->GetConfidence(0);
+          int n_candidates = mCharSetProbers[i]->GetCandidates();

-          if (proberConfidence > MINIMUM_THRESHOLD)
-              /* Only report what we are confident in. */
-              Report(mCharSetProbers[i]->GetCharSetName(0),
-                     mCharSetProbers[i]->GetLanguage(0),
-                     proberConfidence);
+          for (int c = 0; c < n_candidates; c++)
+          {
+            proberConfidence = mCharSetProbers[i]->GetConfidence(c);
+
+            if (proberConfidence > MINIMUM_THRESHOLD)
+            {
+                /* Only report what we are confident in. */
+                Report(mCharSetProbers[i]->GetCharSetName(c),
+                       mCharSetProbers[i]->GetLanguage(c),
+                       proberConfidence);
+            }
+          }
        }
      }
    }
--- a/src/nsUniversalDetector.h
+++ b/src/nsUniversalDetector.h
@ -80,9 +80,9 @@ protected:
   PRBool  mStart;
   PRBool  mGotData;
   char    mLastChar;
-   const char *  mDetectedCharset;
-   const char *  mDetectedLanguage;
-   float         mDetectedConfidence;
+   const char *  shortcutCharset;
+   float         shortcutConfidence;
+
   PRInt32 mBestGuess;
   PRUint32 mLanguageFilter;