src: make nsMBCSGroupProber report all valid candidates.

Returning only the best candidate is limiting, as it does not allow
comparing candidates whose confidence values are very close. In
particular, the UTF-8 prober will now return all ("UTF-8", lang)
candidates for every language with a probable statistical fit.
This commit is contained in:
Jehan 2021-03-17 16:34:26 +01:00
parent 2127f4fc0d
commit 6138d9e0f0
4 changed files with 203 additions and 100 deletions

View File

@ -138,45 +138,87 @@ nsMBCSGroupProber::~nsMBCSGroupProber()
}
}
#define CANDIDATE_THRESHOLD 0.3f
int nsMBCSGroupProber::GetCandidates()
{
int num_candidates = 0;
CheckCandidates();
for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++)
if (candidates[i][j])
num_candidates++;
return num_candidates;
}
const char* nsMBCSGroupProber::GetCharSetName(int candidate)
{
if (mBestGuess == -1)
{
GetConfidence(0);
if (mBestGuess == -1)
mBestGuess = 0;
}
return mProbers[mBestGuess]->GetCharSetName(0);
int num_candidates = GetCandidates();
int candidate_it = 0;
if (num_candidates == 0)
return NULL;
else if (candidate >= num_candidates)
/* Just show the first candidate. */
candidate = 0;
for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++)
if (candidates[i][j])
{
if (candidate == candidate_it)
{
/* We assume that probers included in the nsMBCSGroupProber
* return only one candidate themselves.
* */
return mProbers[i]->GetCharSetName(0);
}
candidate_it++;
}
/* Should not happen. */
return NULL;
}
const char* nsMBCSGroupProber::GetLanguage(int candidate)
{
const char* maxLang = NULL;
int maxLangIdx = -1;
float maxConfidence = 0.0;
const char* lang = NULL;
int num_candidates = GetCandidates();
int candidate_it = 0;
if (mBestGuess == -1)
if (num_candidates == 0)
return NULL;
else
maxLang = mProbers[mBestGuess]->GetLanguage(0);
else if (candidate >= num_candidates)
/* Just show the first candidate. */
candidate = 0;
if (maxLang == NULL && mProbers[mBestGuess]->DecodeToUnicode())
{
for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++)
{
float conf = langDetectors[mBestGuess][j]->GetConfidence();
if (conf > maxConfidence)
if (candidates[i][j])
{
maxLangIdx = j;
maxConfidence = conf;
}
}
if (maxLangIdx != -1)
maxLang = langDetectors[mBestGuess][maxLangIdx]->GetLanguage();
}
if (candidate == candidate_it)
{
/* We assume that probers included in the nsMBCSGroupProber
* return only one candidate themselves.
* */
lang = mProbers[i]->GetLanguage(0);
return maxLang;
if (! lang)
{
/* The prober does not come with its own language. */
if (langDetectors[i][j])
lang = langDetectors[i][j]->GetLanguage();
}
return lang;
}
candidate_it++;
}
return lang;
}
void nsMBCSGroupProber::Reset(void)
@ -196,17 +238,18 @@ void nsMBCSGroupProber::Reset(void)
codePointBuffer[i] = new int[codePointBufferSize[i]];
}
codePointBufferIdx[i] = 0;
for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++)
{
if (langDetectors[i][j])
langDetectors[i][j]->Reset();
}
}
else
mIsActive[i] = PR_FALSE;
for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++)
{
if (langDetectors[i][j])
langDetectors[i][j]->Reset();
candidates[i][j] = false;
}
}
mBestGuess = -1;
mState = eDetecting;
mKeepNext = 0;
}
@ -252,9 +295,21 @@ nsProbingState nsMBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen,
if (st == eFoundIt)
{
mBestGuess = i;
mState = eFoundIt;
return mState;
float cf = mProbers[i]->GetConfidence(0);
for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++)
{
float langConf = langDetectors[i][j]->GetConfidence();
if (cf * langConf > CANDIDATE_THRESHOLD)
{
/* There is at least one (charset, lang) couple for
* which the confidence is high enough.
*/
mState = eFoundIt;
return mState;
}
}
}
}
}
@ -288,9 +343,21 @@ nsProbingState nsMBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen,
if (st == eFoundIt)
{
mBestGuess = i;
mState = eFoundIt;
return mState;
float cf = mProbers[i]->GetConfidence(0);
for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++)
{
float langConf = langDetectors[i][j]->GetConfidence();
if (cf * langConf > CANDIDATE_THRESHOLD)
{
/* There is at least one (charset, lang) couple for
* which the confidence is high enough.
*/
mState = eFoundIt;
return mState;
}
}
}
}
}
@ -299,10 +366,49 @@ nsProbingState nsMBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen,
return mState;
}
/* Recompute the candidates[][] table from the current prober state.
 *
 * For every prober i and language slot j, candidates[i][j] is set to:
 *  - false when the prober is inactive;
 *  - (prober confidence * language detector confidence > CANDIDATE_THRESHOLD)
 *    when the prober decodes to Unicode;
 *  - (prober confidence > CANDIDATE_THRESHOLD) otherwise.
 */
void nsMBCSGroupProber::CheckCandidates()
{
  for (int i = 0; i < NUM_OF_PROBERS; i++)
  {
    if (! mIsActive[i])
    {
      for (int j = 0; j < NUM_OF_LANGUAGES; j++)
        candidates[i][j] = false;
      continue;
    }

    float cf = mProbers[i]->GetConfidence(0);

    if (mProbers[i]->DecodeToUnicode())
    {
      for (int j = 0; j < NUM_OF_LANGUAGES; j++)
      {
        /* Other users of langDetectors[][] in this file guard against
         * empty slots; do the same here rather than dereferencing a
         * null pointer. A slot with no detector provides no language
         * evidence, so it is not reported as a candidate.
         */
        if (! langDetectors[i][j])
        {
          candidates[i][j] = false;
          continue;
        }

        float langConf = langDetectors[i][j]->GetConfidence();

        candidates[i][j] = (cf * langConf > CANDIDATE_THRESHOLD);
      }
    }
    else
    {
      /* NOTE(review): every language slot gets the same value here, so a
       * confident prober of this kind is counted once per language slot
       * by GetCandidates() -- confirm this is intended.
       */
      for (int j = 0; j < NUM_OF_LANGUAGES; j++)
        candidates[i][j] = (cf > CANDIDATE_THRESHOLD);
    }
  }
}
float nsMBCSGroupProber::GetConfidence(int candidate)
{
int num_candidates = GetCandidates();
int candidate_it = 0;
PRUint32 i;
float bestConf = 0.0, cf;
if (num_candidates == 0)
return 0.0;
else if (candidate >= num_candidates)
/* Just show the first candidate. */
candidate = 0;
switch (mState)
{
@ -312,32 +418,26 @@ float nsMBCSGroupProber::GetConfidence(int candidate)
default:
for (i = 0; i < NUM_OF_PROBERS; i++)
{
float bestLangConf = 0.0;
if (!mIsActive[i])
continue;
cf = mProbers[i]->GetConfidence(0);
if (mProbers[i]->DecodeToUnicode())
{
for (int j = 0; j < NUM_OF_LANGUAGES; j++)
for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++)
if (candidates[i][j])
{
float langConf = langDetectors[i][j]->GetConfidence();
if (candidate == candidate_it)
{
float cf = mProbers[i]->GetConfidence(0);
float langConf = 1.0;
if (bestLangConf < langConf)
bestLangConf = langConf;
if (langDetectors[i][j])
langConf = langDetectors[i][j]->GetConfidence();
return cf * langConf;
}
candidate_it++;
}
cf *= bestLangConf;
}
if (bestConf < cf)
{
bestConf = cf;
mBestGuess = i;
}
}
}
return bestConf;
/* Should not happen. */
return 0.0;
}
#ifdef DEBUG_chardet

View File

@ -57,7 +57,7 @@ public:
nsProbingState HandleData(const char* aBuf, PRUint32 aLen,
int** codePointBuffer,
int* codePointBufferIdx);
int GetCandidates() { return 1; }
int GetCandidates();
const char* GetCharSetName(int candidate);
const char* GetLanguage(int candidate);
nsProbingState GetState(void) {return mState;}
@ -76,15 +76,19 @@ protected:
nsProbingState mState;
nsCharSetProber* mProbers[NUM_OF_PROBERS];
PRBool mIsActive[NUM_OF_PROBERS];
PRInt32 mBestGuess;
PRUint32 mActiveNum;
PRUint32 mKeepNext;
PRBool candidates[NUM_OF_PROBERS][NUM_OF_LANGUAGES];
int *codePointBuffer[NUM_OF_PROBERS];
int codePointBufferSize[NUM_OF_PROBERS];
int codePointBufferIdx[NUM_OF_PROBERS];
nsLanguageDetector *langDetectors[NUM_OF_PROBERS][NUM_OF_LANGUAGES];
private:
void CheckCandidates();
};
#endif /* nsMBCSGroupProber_h__ */

View File

@ -54,9 +54,6 @@ nsUniversalDetector::nsUniversalDetector(PRUint32 aLanguageFilter)
mEscCharSetProber = nsnull;
mStart = PR_TRUE;
mDetectedCharset = nsnull;
mDetectedLanguage = nsnull;
mDetectedConfidence = 0.0;
mGotData = PR_FALSE;
mInputState = ePureAscii;
mLastChar = '\0';
@ -84,9 +81,6 @@ nsUniversalDetector::Reset()
mInTag = PR_FALSE;
mStart = PR_TRUE;
mDetectedCharset = nsnull;
mDetectedLanguage = nsnull;
mDetectedConfidence = 0.0;
mGotData = PR_FALSE;
mInputState = ePureAscii;
mLastChar = '\0';
@ -124,16 +118,16 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
if (('\xBB' == aBuf[1]) && ('\xBF' == aBuf[2]))
{
/* EF BB BF: UTF-8 encoded BOM. */
mDetectedCharset = "UTF-8";
mDetectedConfidence = 0.99;
shortcutCharset = "UTF-8";
shortcutConfidence = 0.99;
}
break;
case '\xFE':
if ('\xFF' == aBuf[1])
{
/* FE FF: UTF-16, big endian BOM. */
mDetectedCharset = "UTF-16";
mDetectedConfidence = 0.99;
shortcutCharset = "UTF-16";
shortcutConfidence = 0.99;
}
break;
case '\xFF':
@ -144,14 +138,14 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
aBuf[3] == '\x00')
{
/* FF FE 00 00: UTF-32 (LE). */
mDetectedCharset = "UTF-32";
mDetectedConfidence = 0.99;
shortcutCharset = "UTF-32";
shortcutConfidence = 0.99;
}
else
{
/* FF FE: UTF-16, little endian BOM. */
mDetectedCharset = "UTF-16";
mDetectedConfidence = 0.99;
shortcutCharset = "UTF-16";
shortcutConfidence = 0.99;
}
}
break;
@ -162,14 +156,14 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
aBuf[3] == '\xFF')
{
/* 00 00 FE FF: UTF-32 (BE). */
mDetectedCharset = "UTF-32";
mDetectedConfidence = 0.99;
shortcutCharset = "UTF-32";
shortcutConfidence = 0.99;
}
break;
}
}
if (mDetectedCharset)
if (shortcutCharset)
{
mDone = PR_TRUE;
return NS_OK;
@ -252,9 +246,9 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
st = mEscCharSetProber->HandleData(aBuf, aLen, NULL, NULL);
if (st == eFoundIt)
{
shortcutCharset = mEscCharSetProber->GetCharSetName(0);
shortcutConfidence = mEscCharSetProber->GetConfidence(0);
mDone = PR_TRUE;
mDetectedCharset = mEscCharSetProber->GetCharSetName(0);
mDetectedConfidence = mEscCharSetProber->GetConfidence(0);
}
break;
case eHighbyte:
@ -266,9 +260,6 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
if (st == eFoundIt)
{
mDone = PR_TRUE;
mDetectedCharset = mCharSetProbers[i]->GetCharSetName(0);
mDetectedLanguage = mCharSetProbers[i]->GetLanguage(0);
mDetectedConfidence = mCharSetProbers[i]->GetConfidence(0);
return NS_OK;
}
}
@ -292,7 +283,7 @@ void nsUniversalDetector::DataEnd()
return;
}
if (! mDetectedCharset)
if (! shortcutCharset)
{
switch (mInputState)
{
@ -302,26 +293,27 @@ void nsUniversalDetector::DataEnd()
{
/* ISO-8859-1 is a good result candidate for ASCII + NBSP.
* (though it could have been any ISO-8859 encoding). */
mDetectedCharset = "ISO-8859-1";
shortcutCharset = "ISO-8859-1";
}
else
{
/* ASCII with the ESC character (or the sequence "~{") is still
* ASCII until proven otherwise. */
mDetectedCharset = "ASCII";
shortcutCharset = "ASCII";
}
shortcutConfidence = 0.99;
default:
break;
}
}
if (mDetectedCharset)
if (shortcutCharset)
{
/* These cases are limited enough that we are always confident
* when finding them.
*/
mDone = PR_TRUE;
Report(mDetectedCharset, mDetectedLanguage, mDetectedConfidence);
Report(shortcutCharset, NULL, shortcutConfidence);
return;
}
@ -335,13 +327,20 @@ void nsUniversalDetector::DataEnd()
{
if (mCharSetProbers[i])
{
proberConfidence = mCharSetProbers[i]->GetConfidence(0);
int n_candidates = mCharSetProbers[i]->GetCandidates();
if (proberConfidence > MINIMUM_THRESHOLD)
/* Only report what we are confident in. */
Report(mCharSetProbers[i]->GetCharSetName(0),
mCharSetProbers[i]->GetLanguage(0),
proberConfidence);
for (int c = 0; c < n_candidates; c++)
{
proberConfidence = mCharSetProbers[i]->GetConfidence(c);
if (proberConfidence > MINIMUM_THRESHOLD)
{
/* Only report what we are confident in. */
Report(mCharSetProbers[i]->GetCharSetName(c),
mCharSetProbers[i]->GetLanguage(c),
proberConfidence);
}
}
}
}
}

View File

@ -80,9 +80,9 @@ protected:
PRBool mStart;
PRBool mGotData;
char mLastChar;
const char * mDetectedCharset;
const char * mDetectedLanguage;
float mDetectedConfidence;
const char * shortcutCharset;
float shortcutConfidence;
PRInt32 mBestGuess;
PRUint32 mLanguageFilter;