mirror of
https://gitlab.freedesktop.org/uchardet/uchardet.git
synced 2025-12-06 08:46:40 +08:00
src: improve algorithm for confidence computation.
In addition to the "frequent characters" concept, we add 2 sub-categories: the "very frequent characters" and the "rare characters". The former are usually just a few characters which are used most of the time (like 3 or 4 characters used 40% of the time!), whereas the latter are often a dozen or more characters which, all together, account for barely a few percent of usage. We use this additional concept to help distinguish very similar languages, or languages whose frequent characters are a subset of those of another language (typically English, whose alphabet is a subset of the alphabets of many other European languages). mTypicalPositiveRatio is removed, as it was barely of any use anyway (it was 0.99-something for nearly all languages!). Instead we get these 2 new ratios: veryFreqRatio and lowFreqRatio, and of course the associated order counts to know which characters are in these sets.
This commit is contained in:
parent
4f35cd4416
commit
401eb55dfc
@ -37,6 +37,7 @@
|
||||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
#include <math.h>
|
||||
#include "nsLanguageDetector.h"
|
||||
|
||||
nsDetectState nsLanguageDetector::HandleData(const int* codePoints, PRUint32 cpLen)
|
||||
@ -164,6 +165,11 @@ nsDetectState nsLanguageDetector::HandleData(const int* codePoints, PRUint32 cpL
|
||||
/* Adding a non frequent sequence. */
|
||||
mTotalSeqs++;
|
||||
}
|
||||
|
||||
if (order < mModel->veryFreqCharCount)
|
||||
mVeryFreqChar++;
|
||||
if (order > mModel->lowFreqOrder)
|
||||
mLowFreqChar++;
|
||||
}
|
||||
mLastOrder = order;
|
||||
}
|
||||
@ -192,8 +198,10 @@ void nsLanguageDetector::Reset(void)
|
||||
//mCtrlChar = 0;
|
||||
//mEmoticons = 0;
|
||||
//mVariousBetween = 0;
|
||||
mFreqChar = 0;
|
||||
mOutChar = 0;
|
||||
mFreqChar = 0;
|
||||
mVeryFreqChar = 0;
|
||||
mLowFreqChar = 0;
|
||||
mOutChar = 0;
|
||||
}
|
||||
|
||||
#include <cstdio>
|
||||
@ -212,7 +220,7 @@ float nsLanguageDetector::GetConfidence(void)
|
||||
//float neutralSeqs = mSeqCounters[LANG_NEUTRAL_CAT];
|
||||
float negativeSeqs = mSeqCounters[LANG_NEGATIVE_CAT];
|
||||
|
||||
r = (positiveSeqs + probableSeqs / 4 - negativeSeqs * 4) / mTotalSeqs / mModel->mTypicalPositiveRatio;
|
||||
r = (positiveSeqs + probableSeqs / 4 - negativeSeqs * 4) / mTotalSeqs;
|
||||
/* The more characters outside the expected characters
|
||||
* (proportionally to the size of the text), the less confident we
|
||||
* become in the current language.
|
||||
@ -222,6 +230,11 @@ float nsLanguageDetector::GetConfidence(void)
|
||||
r = r * (mTotalChar - mOutChar) / mTotalChar;
|
||||
r = r * mFreqChar / (mFreqChar + mOutChar);
|
||||
|
||||
/* How close the observed very frequent character ratio is to the model's. */
|
||||
r = r * (1.0 - fabs((float) mVeryFreqChar / mFreqChar - mModel->veryFreqRatio) / 4.0);
|
||||
/* How close the observed rare character ratio is to the model's. */
|
||||
r = r * (1.0 - fabs((float) mLowFreqChar / mFreqChar - mModel->lowFreqRatio) / 4.0);
|
||||
|
||||
return r;
|
||||
}
|
||||
return (float)0.01;
|
||||
|
||||
@ -67,8 +67,17 @@ typedef struct
|
||||
* characters. Yet it maps to this range of orders.
|
||||
*/
|
||||
int freqCharCount;
|
||||
|
||||
float mTypicalPositiveRatio;
|
||||
/* Most languages have 3 or 4 characters which are used more than 40% of the
|
||||
* time. We count how many there are and at what ratio they are used.
|
||||
*/
|
||||
int veryFreqCharCount;
|
||||
float veryFreqRatio;
|
||||
/* Most languages will have a whole range of characters which in cumulated
|
||||
* total are barely used a few percent of the time. We count how many they
|
||||
* are and what ratio they are used.
|
||||
*/
|
||||
int lowFreqOrder;
|
||||
float lowFreqRatio;
|
||||
} LanguageModel;
|
||||
|
||||
typedef enum {
|
||||
@ -105,6 +114,10 @@ protected:
|
||||
/*PRUint32 mVariousBetween;*/
|
||||
/* Characters that fall in our sampling range */
|
||||
PRUint32 mFreqChar;
|
||||
/* Most common characters from our sampling range */
|
||||
PRUint32 mVeryFreqChar;
|
||||
/* Most rare characters from our sampling range */
|
||||
PRUint32 mLowFreqChar;
|
||||
PRUint32 mOutChar;
|
||||
|
||||
private:
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user