src: improve confidence computation (generic and single-byte charset).

Nearly the same algorithm on both pieces of code now. I reintroduced the mTypicalPositiveRatio now that our models actually gives the right ratio (not the "first 512" meaningless stuff anymore). In remaining differences, the last computation is the ratio of frequent characters on the whole characters. For the generic detector, we use the frequent+out sum instead. It works much better. I think that Unicode text is much more prone to have characters outside your expected range, while still being meaningful characters. Even control characters are much more meaningful in Unicode. So a ratio off it would make much too low confidence. Anyway this confidence algorithm is already better. We seem to approach much nicer confidence at each iteration, very satisfying!
2025-12-08 01:36:41 +08:00 · 2021-03-22 18:03:02 +01:00 · 2021-03-22 18:03:02 +01:00 · 6436e1dd47
commit 6436e1dd47
parent 8e2cf7b81b
3 changed files with 31 additions and 26 deletions
--- a/src/nsLanguageDetector.cpp
+++ b/src/nsLanguageDetector.cpp
@ -212,7 +212,7 @@ float nsLanguageDetector::GetConfidence(void)
    //float neutralSeqs  = mSeqCounters[LANG_NEUTRAL_CAT];
    float negativeSeqs = mSeqCounters[LANG_NEGATIVE_CAT];

-    r = (positiveSeqs + probableSeqs / 4 - negativeSeqs * 2) / mTotalSeqs;
+    r = (positiveSeqs + probableSeqs / 4 - negativeSeqs * 4) / mTotalSeqs / mModel->mTypicalPositiveRatio;
    /* The more characters outside the expected characters
     * (proportionnaly to the size of the text), the less confident we
     * become in the current language.
--- a/src/nsSBCharSetProber.cpp
+++ b/src/nsSBCharSetProber.cpp
@ -48,11 +48,8 @@ nsProbingState nsSingleByteCharSetProber::HandleData(const char* aBuf, PRUint32
  {
    order = mModel->charToOrderMap[(unsigned char)aBuf[i]];

-    if (order < SYMBOL_CAT_ORDER)
-    {
-      mTotalChar++;
-    }
-    else if (order == ILL)
+    mTotalChar++;
+    if (order == ILL)
    {
      /* When encountering an illegal codepoint, no need
       * to continue analyzing data. */
@ -63,7 +60,7 @@ nsProbingState nsSingleByteCharSetProber::HandleData(const char* aBuf, PRUint32
    {
      mCtrlChar++;
    }
-    if (order < mModel->freqCharCount)
+    else if (order < mModel->freqCharCount)
    {
      mFreqChar++;

@ -75,6 +72,21 @@ nsProbingState nsSingleByteCharSetProber::HandleData(const char* aBuf, PRUint32
        else // reverse the order of the letters in the lookup
          ++(mSeqCounters[mModel->precedenceMatrix[order*mModel->freqCharCount+mLastOrder]]);
      }
+      else if (mLastOrder < SYMBOL_CAT_ORDER)
+      {
+        mSeqCounters[NEGATIVE_CAT]++;
+        mTotalSeqs++;
+      }
+    }
+    else if (order < SYMBOL_CAT_ORDER)
+    {
+      mOutChar++;
+
+      if (mLastOrder < SYMBOL_CAT_ORDER)
+      {
+        mTotalSeqs++;
+        mSeqCounters[NEGATIVE_CAT]++;
+      }
    }
    mLastOrder = order;
  }
@ -92,7 +104,7 @@ nsProbingState nsSingleByteCharSetProber::HandleData(const char* aBuf, PRUint32
  return mState;
 }

-void  nsSingleByteCharSetProber::Reset(void)
+void nsSingleByteCharSetProber::Reset(void)
 {
  mState = eDetecting;
  mLastOrder = 255;
@ -101,7 +113,8 @@ void  nsSingleByteCharSetProber::Reset(void)
  mTotalSeqs = 0;
  mTotalChar = 0;
  mCtrlChar  = 0;
-  mFreqChar = 0;
+  mFreqChar  = 0;
+  mOutChar   = 0;
 }

 //#define NEGATIVE_APPROACH 1
@ -117,23 +130,14 @@ float nsSingleByteCharSetProber::GetConfidence(int candidate)
  float r;

  if (mTotalSeqs > 0) {
-    r = ((float)1.0) * mSeqCounters[POSITIVE_CAT] / mTotalSeqs / mModel->mTypicalPositiveRatio;
-    /* Multiply by a ratio of positive sequences per characters.
-     * This would help in particular to distinguish close winners.
-     * Indeed if you add a letter, you'd expect the positive sequence count
-     * to increase as well. If it doesn't, it may mean that this new codepoint
-     * may not have been a letter, but instead a symbol (or some other
-     * character). This could make the difference between very closely related
-     * charsets used for the same language.
-     */
-    r = r * (mSeqCounters[POSITIVE_CAT] + (float) mSeqCounters[PROBABLE_CAT] / 4) / mTotalChar;
-    /* The more control characters (proportionnaly to the size of the text), the
-     * less confident we become in the current charset.
-     */
-    r = r * ((float) mTotalChar - mCtrlChar) / mTotalChar;
-    r = r*mFreqChar/mTotalChar;
-    if (r >= (float)1.00)
-      r = (float)0.99;
+    float positiveSeqs = mSeqCounters[POSITIVE_CAT];
+    float probableSeqs = mSeqCounters[PROBABLE_CAT];
+    float negativeSeqs = mSeqCounters[NEGATIVE_CAT];
+
+    r = (positiveSeqs + probableSeqs / 4 - negativeSeqs * 4) / mTotalSeqs / mModel->mTypicalPositiveRatio;
+    r = r * (mTotalChar - mOutChar - mCtrlChar) / mTotalChar;
+    r = r * mFreqChar / mTotalChar;
+
    return r;
  }
  return (float)0.01;
--- a/src/nsSBCharSetProber.h
+++ b/src/nsSBCharSetProber.h
@ -124,6 +124,7 @@ protected:
  PRUint32 mCtrlChar;
  //characters that fall in our sampling range
  PRUint32 mFreqChar;
+  PRUint32 mOutChar;
  
  // Optional auxiliary prober for name decision. created and destroyed by the GroupProber
  nsCharSetProber* mNameProber;