From 401eb55dfc792dad1dd34f259e414ff3bf2cd38e Mon Sep 17 00:00:00 2001 From: Jehan Date: Wed, 14 Dec 2022 20:02:59 +0100 Subject: [PATCH] src: improve algorithm for confidence computation. In addition to the "frequent characters" concept, we add 2 sub-categories, which are the "very frequent characters" and "rare characters". The former are usually just a few characters which are used most of the time (like 3 or 4 characters used 40% of the time!), whereas the latter are often a dozen or more characters which, all together, are used barely a few percent of the time. We use this additional concept to help distinguish very similar languages, or languages whose frequent characters are a subset of the ones from another language (typically English, whose alphabet is a subset of many other European languages). The mTypicalPositiveRatio field is removed, as it was barely of any use anyway (it was 0.99-something for nearly all languages!). Instead we get these 2 new ratios: veryFreqRatio and lowFreqRatio, and of course the associated order counts to know which characters are in these sets. --- src/nsLanguageDetector.cpp | 19 ++++++++++++++++--- src/nsLanguageDetector.h | 17 +++++++++++++++-- 2 files changed, 31 insertions(+), 5 deletions(-) diff --git a/src/nsLanguageDetector.cpp b/src/nsLanguageDetector.cpp index 8097028..1cce160 100644 --- a/src/nsLanguageDetector.cpp +++ b/src/nsLanguageDetector.cpp @@ -37,6 +37,7 @@ * * ***** END LICENSE BLOCK ***** */ +#include #include "nsLanguageDetector.h" nsDetectState nsLanguageDetector::HandleData(const int* codePoints, PRUint32 cpLen) @@ -164,6 +165,11 @@ nsDetectState nsLanguageDetector::HandleData(const int* codePoints, PRUint32 cpL /* Adding a non frequent sequence. 
*/ mTotalSeqs++; } + + if (order < mModel->veryFreqCharCount) + mVeryFreqChar++; + if (order > mModel->lowFreqOrder) + mLowFreqChar++; } mLastOrder = order; } @@ -192,8 +198,10 @@ void nsLanguageDetector::Reset(void) //mCtrlChar = 0; //mEmoticons = 0; //mVariousBetween = 0; - mFreqChar = 0; - mOutChar = 0; + mFreqChar = 0; + mVeryFreqChar = 0; + mLowFreqChar = 0; + mOutChar = 0; } #include @@ -212,7 +220,7 @@ float nsLanguageDetector::GetConfidence(void) //float neutralSeqs = mSeqCounters[LANG_NEUTRAL_CAT]; float negativeSeqs = mSeqCounters[LANG_NEGATIVE_CAT]; - r = (positiveSeqs + probableSeqs / 4 - negativeSeqs * 4) / mTotalSeqs / mModel->mTypicalPositiveRatio; + r = (positiveSeqs + probableSeqs / 4 - negativeSeqs * 4) / mTotalSeqs; /* The more characters outside the expected characters * (proportionnaly to the size of the text), the less confident we * become in the current language. @@ -222,6 +230,11 @@ float nsLanguageDetector::GetConfidence(void) r = r * (mTotalChar - mOutChar) / mTotalChar; r = r * mFreqChar / (mFreqChar + mOutChar); + /* How similar are the very frequent character ratio. */ + r = r * (1.0 - fabs((float) mVeryFreqChar / mFreqChar - mModel->veryFreqRatio) / 4.0); + /* How similar are the very rare character ratio. */ + r = r * (1.0 - fabs((float) mLowFreqChar / mFreqChar - mModel->lowFreqRatio) / 4.0); + return r; } return (float)0.01; diff --git a/src/nsLanguageDetector.h b/src/nsLanguageDetector.h index 339d4e2..5feb726 100644 --- a/src/nsLanguageDetector.h +++ b/src/nsLanguageDetector.h @@ -67,8 +67,17 @@ typedef struct * characters. Yet it maps to this range of orders. */ int freqCharCount; - - float mTypicalPositiveRatio; + /* Most languages have 3 or 4 characters which are used more than 40% of the + * times. We count how many they are and what ratio they are used. 
+ */ + int veryFreqCharCount; + float veryFreqRatio; + /* Most languages will have a whole range of characters which in cumulated + * total are barely used a few percents of the times. We count how many they + * are and what ratio they are used. + */ + int lowFreqOrder; + float lowFreqRatio; } LanguageModel; typedef enum { @@ -105,6 +114,10 @@ protected: /*PRUint32 mVariousBetween;*/ /* Characters that fall in our sampling range */ PRUint32 mFreqChar; + /* Most common characters from our sampling range */ + PRUint32 mVeryFreqChar; + /* Most rare characters from our sampling range */ + PRUint32 mLowFreqChar; PRUint32 mOutChar; private: