From ba6b46a68c61a723bd84b3774f3a433d6162f347 Mon Sep 17 00:00:00 2001 From: Jehan Date: Wed, 17 Mar 2021 16:34:26 +0100 Subject: [PATCH] src: make nsMBCSGroupProber report all valid candidates. Returning only the best one has limits, as it doesn't allow checking candidates whose confidence values are very close. Now in particular, the UTF-8 prober will return all ("UTF-8", lang) candidates for every language with a probable statistical fit. --- src/nsMBCSGroupProber.cpp | 224 ++++++++++++++++++++++++++---------- src/nsMBCSGroupProber.h | 8 +- src/nsUniversalDetector.cpp | 65 ++++++----- src/nsUniversalDetector.h | 6 +- 4 files changed, 203 insertions(+), 100 deletions(-) diff --git a/src/nsMBCSGroupProber.cpp b/src/nsMBCSGroupProber.cpp index 3b21530..790d099 100644 --- a/src/nsMBCSGroupProber.cpp +++ b/src/nsMBCSGroupProber.cpp @@ -138,45 +138,87 @@ nsMBCSGroupProber::~nsMBCSGroupProber() } } +#define CANDIDATE_THRESHOLD 0.3f + +int nsMBCSGroupProber::GetCandidates() +{ + int num_candidates = 0; + + CheckCandidates(); + + for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++) + for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++) + if (candidates[i][j]) + num_candidates++; + + return num_candidates; +} + const char* nsMBCSGroupProber::GetCharSetName(int candidate) { - if (mBestGuess == -1) - { - GetConfidence(0); - if (mBestGuess == -1) - mBestGuess = 0; - } - return mProbers[mBestGuess]->GetCharSetName(0); + int num_candidates = GetCandidates(); + int candidate_it = 0; + + if (num_candidates == 0) + return NULL; + else if (candidate >= num_candidates) + /* Just show the first candidate. */ + candidate = 0; + + for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++) + for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++) + if (candidates[i][j]) + { + if (candidate == candidate_it) + { + /* We assume that probers included in the nsMBCSGroupProber + * return only one candidate themselves. + * */ + return mProbers[i]->GetCharSetName(0); + } + candidate_it++; + } + + /* Should not happen. 
*/ + return NULL; } const char* nsMBCSGroupProber::GetLanguage(int candidate) { - const char* maxLang = NULL; - int maxLangIdx = -1; - float maxConfidence = 0.0; + const char* lang = NULL; + int num_candidates = GetCandidates(); + int candidate_it = 0; - if (mBestGuess == -1) + if (num_candidates == 0) return NULL; - else - maxLang = mProbers[mBestGuess]->GetLanguage(0); + else if (candidate >= num_candidates) + /* Just show the first candidate. */ + candidate = 0; - if (maxLang == NULL && mProbers[mBestGuess]->DecodeToUnicode()) - { + for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++) for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++) - { - float conf = langDetectors[mBestGuess][j]->GetConfidence(); - - if (conf > maxConfidence) + if (candidates[i][j]) { - maxLangIdx = j; - maxConfidence = conf; - } - } - if (maxLangIdx != -1) - maxLang = langDetectors[mBestGuess][maxLangIdx]->GetLanguage(); - } + if (candidate == candidate_it) + { + /* We assume that probers included in the nsMBCSGroupProber + * return only one candidate themselves. + * */ + lang = mProbers[i]->GetLanguage(0); - return maxLang; + if (! lang) + { + /* The prober does not come with its own language. 
*/ + if (langDetectors[i][j]) + lang = langDetectors[i][j]->GetLanguage(); + } + + return lang; + } + candidate_it++; + } + + return lang; } void nsMBCSGroupProber::Reset(void) @@ -196,17 +238,18 @@ void nsMBCSGroupProber::Reset(void) codePointBuffer[i] = new int[codePointBufferSize[i]]; } codePointBufferIdx[i] = 0; - - for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++) - { - if (langDetectors[i][j]) - langDetectors[i][j]->Reset(); - } } else mIsActive[i] = PR_FALSE; + + for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++) + { + if (langDetectors[i][j]) + langDetectors[i][j]->Reset(); + + candidates[i][j] = false; + } } - mBestGuess = -1; mState = eDetecting; mKeepNext = 0; } @@ -252,9 +295,21 @@ nsProbingState nsMBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen, if (st == eFoundIt) { - mBestGuess = i; - mState = eFoundIt; - return mState; + float cf = mProbers[i]->GetConfidence(0); + + for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++) + { + float langConf = langDetectors[i][j]->GetConfidence(); + + if (cf * langConf > CANDIDATE_THRESHOLD) + { + /* There is at least one (charset, lang) couple for + * which the confidence is high enough. + */ + mState = eFoundIt; + return mState; + } + } } } } @@ -288,9 +343,21 @@ nsProbingState nsMBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen, if (st == eFoundIt) { - mBestGuess = i; - mState = eFoundIt; - return mState; + float cf = mProbers[i]->GetConfidence(0); + + for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++) + { + float langConf = langDetectors[i][j]->GetConfidence(); + + if (cf * langConf > CANDIDATE_THRESHOLD) + { + /* There is at least one (charset, lang) couple for + * which the confidence is high enough. + */ + mState = eFoundIt; + return mState; + } + } } } } @@ -299,10 +366,49 @@ nsProbingState nsMBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen, return mState; } +void nsMBCSGroupProber::CheckCandidates() +{ + for (int i = 0; i < NUM_OF_PROBERS; i++) + { + if (! 
mIsActive[i]) + { + for (int j = 0; j < NUM_OF_LANGUAGES; j++) + candidates[i][j] = false; + } + else + { + float cf = mProbers[i]->GetConfidence(0); + + if (mProbers[i]->DecodeToUnicode()) + { + for (int j = 0; j < NUM_OF_LANGUAGES; j++) + { + float langConf = langDetectors[i][j]->GetConfidence(); + + candidates[i][j] = (cf * langConf > CANDIDATE_THRESHOLD); + } + } + else + { + for (int j = 0; j < NUM_OF_LANGUAGES; j++) + candidates[i][j] = (cf > CANDIDATE_THRESHOLD); + } + } + } +} + float nsMBCSGroupProber::GetConfidence(int candidate) { + int num_candidates = GetCandidates(); + int candidate_it = 0; + PRUint32 i; - float bestConf = 0.0, cf; + + if (num_candidates == 0) + return 0.0; + else if (candidate >= num_candidates) + /* Just show the first candidate. */ + candidate = 0; switch (mState) { @@ -312,32 +418,26 @@ float nsMBCSGroupProber::GetConfidence(int candidate) default: for (i = 0; i < NUM_OF_PROBERS; i++) { - float bestLangConf = 0.0; - - if (!mIsActive[i]) - continue; - cf = mProbers[i]->GetConfidence(0); - - if (mProbers[i]->DecodeToUnicode()) - { - for (int j = 0; j < NUM_OF_LANGUAGES; j++) + for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++) + if (candidates[i][j]) { - float langConf = langDetectors[i][j]->GetConfidence(); + if (candidate == candidate_it) + { + float cf = mProbers[i]->GetConfidence(0); + float langConf = 1.0; - if (bestLangConf < langConf) - bestLangConf = langConf; + if (langDetectors[i][j]) + langConf = langDetectors[i][j]->GetConfidence(); + + return cf * langConf; + } + candidate_it++; } - cf *= bestLangConf; - } - - if (bestConf < cf) - { - bestConf = cf; - mBestGuess = i; - } } } - return bestConf; + + /* Should not happen. 
*/ + return 0.0; } #ifdef DEBUG_chardet diff --git a/src/nsMBCSGroupProber.h b/src/nsMBCSGroupProber.h index 2da8d79..190ef65 100644 --- a/src/nsMBCSGroupProber.h +++ b/src/nsMBCSGroupProber.h @@ -57,7 +57,7 @@ public: nsProbingState HandleData(const char* aBuf, PRUint32 aLen, int** codePointBuffer, int* codePointBufferIdx); - int GetCandidates() { return 1; } + int GetCandidates(); const char* GetCharSetName(int candidate); const char* GetLanguage(int candidate); nsProbingState GetState(void) {return mState;} @@ -76,15 +76,19 @@ protected: nsProbingState mState; nsCharSetProber* mProbers[NUM_OF_PROBERS]; PRBool mIsActive[NUM_OF_PROBERS]; - PRInt32 mBestGuess; PRUint32 mActiveNum; PRUint32 mKeepNext; + PRBool candidates[NUM_OF_PROBERS][NUM_OF_LANGUAGES]; + int *codePointBuffer[NUM_OF_PROBERS]; int codePointBufferSize[NUM_OF_PROBERS]; int codePointBufferIdx[NUM_OF_PROBERS]; nsLanguageDetector *langDetectors[NUM_OF_PROBERS][NUM_OF_LANGUAGES]; + +private: + void CheckCandidates(); }; #endif /* nsMBCSGroupProber_h__ */ diff --git a/src/nsUniversalDetector.cpp b/src/nsUniversalDetector.cpp index 184a114..6695aff 100644 --- a/src/nsUniversalDetector.cpp +++ b/src/nsUniversalDetector.cpp @@ -54,9 +54,6 @@ nsUniversalDetector::nsUniversalDetector(PRUint32 aLanguageFilter) mEscCharSetProber = nsnull; mStart = PR_TRUE; - mDetectedCharset = nsnull; - mDetectedLanguage = nsnull; - mDetectedConfidence = 0.0; mGotData = PR_FALSE; mInputState = ePureAscii; mLastChar = '\0'; @@ -84,9 +81,6 @@ nsUniversalDetector::Reset() mInTag = PR_FALSE; mStart = PR_TRUE; - mDetectedCharset = nsnull; - mDetectedLanguage = nsnull; - mDetectedConfidence = 0.0; mGotData = PR_FALSE; mInputState = ePureAscii; mLastChar = '\0'; @@ -124,16 +118,16 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen) if (('\xBB' == aBuf[1]) && ('\xBF' == aBuf[2])) { /* EF BB BF: UTF-8 encoded BOM. 
*/ - mDetectedCharset = "UTF-8"; - mDetectedConfidence = 0.99; + shortcutCharset = "UTF-8"; + shortcutConfidence = 0.99; } break; case '\xFE': if ('\xFF' == aBuf[1]) { /* FE FF: UTF-16, big endian BOM. */ - mDetectedCharset = "UTF-16"; - mDetectedConfidence = 0.99; + shortcutCharset = "UTF-16"; + shortcutConfidence = 0.99; } break; case '\xFF': @@ -144,14 +138,14 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen) aBuf[3] == '\x00') { /* FF FE 00 00: UTF-32 (LE). */ - mDetectedCharset = "UTF-32"; - mDetectedConfidence = 0.99; + shortcutCharset = "UTF-32"; + shortcutConfidence = 0.99; } else { /* FF FE: UTF-16, little endian BOM. */ - mDetectedCharset = "UTF-16"; - mDetectedConfidence = 0.99; + shortcutCharset = "UTF-16"; + shortcutConfidence = 0.99; } } break; @@ -162,14 +156,14 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen) aBuf[3] == '\xFF') { /* 00 00 FE FF: UTF-32 (BE). */ - mDetectedCharset = "UTF-32"; - mDetectedConfidence = 0.99; + shortcutCharset = "UTF-32"; + shortcutConfidence = 0.99; } break; } } - if (mDetectedCharset) + if (shortcutCharset) { mDone = PR_TRUE; return NS_OK; @@ -252,9 +246,9 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen) st = mEscCharSetProber->HandleData(aBuf, aLen, NULL, NULL); if (st == eFoundIt) { + shortcutCharset = mEscCharSetProber->GetCharSetName(0); + shortcutConfidence = mEscCharSetProber->GetConfidence(0); mDone = PR_TRUE; - mDetectedCharset = mEscCharSetProber->GetCharSetName(0); - mDetectedConfidence = mEscCharSetProber->GetConfidence(0); } break; case eHighbyte: @@ -266,9 +260,6 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen) if (st == eFoundIt) { mDone = PR_TRUE; - mDetectedCharset = mCharSetProbers[i]->GetCharSetName(0); - mDetectedLanguage = mCharSetProbers[i]->GetLanguage(0); - mDetectedConfidence = mCharSetProbers[i]->GetConfidence(0); return NS_OK; } } @@ -292,7 +283,7 @@ void 
nsUniversalDetector::DataEnd() return; } - if (! mDetectedCharset) + if (! shortcutCharset) { switch (mInputState) { @@ -302,26 +293,27 @@ void nsUniversalDetector::DataEnd() { /* ISO-8859-1 is a good result candidate for ASCII + NBSP. * (though it could have been any ISO-8859 encoding). */ - mDetectedCharset = "ISO-8859-1"; + shortcutCharset = "ISO-8859-1"; } else { /* ASCII with the ESC character (or the sequence "~{") is still * ASCII until proven otherwise. */ - mDetectedCharset = "ASCII"; + shortcutCharset = "ASCII"; } + shortcutConfidence = 0.99; default: break; } } - if (mDetectedCharset) + if (shortcutCharset) { /* These cases are limited enough that we are always confident * when finding them. */ mDone = PR_TRUE; - Report(mDetectedCharset, mDetectedLanguage, mDetectedConfidence); + Report(shortcutCharset, NULL, shortcutConfidence); return; } @@ -335,13 +327,20 @@ void nsUniversalDetector::DataEnd() { if (mCharSetProbers[i]) { - proberConfidence = mCharSetProbers[i]->GetConfidence(0); + int n_candidates = mCharSetProbers[i]->GetCandidates(); - if (proberConfidence > MINIMUM_THRESHOLD) - /* Only report what we are confident in. */ - Report(mCharSetProbers[i]->GetCharSetName(0), - mCharSetProbers[i]->GetLanguage(0), - proberConfidence); + for (int c = 0; c < n_candidates; c++) + { + proberConfidence = mCharSetProbers[i]->GetConfidence(c); + + if (proberConfidence > MINIMUM_THRESHOLD) + { + /* Only report what we are confident in. 
*/ + Report(mCharSetProbers[i]->GetCharSetName(c), + mCharSetProbers[i]->GetLanguage(c), + proberConfidence); + } + } } } } diff --git a/src/nsUniversalDetector.h b/src/nsUniversalDetector.h index 521e424..a286ed9 100644 --- a/src/nsUniversalDetector.h +++ b/src/nsUniversalDetector.h @@ -80,9 +80,9 @@ protected: PRBool mStart; PRBool mGotData; char mLastChar; - const char * mDetectedCharset; - const char * mDetectedLanguage; - float mDetectedConfidence; + const char * shortcutCharset; + float shortcutConfidence; + PRInt32 mBestGuess; PRUint32 mLanguageFilter;