mirror of
https://gitlab.freedesktop.org/uchardet/uchardet.git
synced 2025-12-08 01:36:41 +08:00
src: make nsMBCSGroupProber report all valid candidates.
Returning only the best one has limits, as it doesn't allow checking
candidates whose confidences are very close. Now in particular, the UTF-8
prober will return all ("UTF-8", lang) candidates for every language with
a probable statistical fit.
This commit is contained in:
parent
2127f4fc0d
commit
6138d9e0f0
@ -138,45 +138,87 @@ nsMBCSGroupProber::~nsMBCSGroupProber()
|
||||
}
|
||||
}
|
||||
|
||||
#define CANDIDATE_THRESHOLD 0.3f
|
||||
|
||||
int nsMBCSGroupProber::GetCandidates()
|
||||
{
|
||||
int num_candidates = 0;
|
||||
|
||||
CheckCandidates();
|
||||
|
||||
for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
|
||||
for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++)
|
||||
if (candidates[i][j])
|
||||
num_candidates++;
|
||||
|
||||
return num_candidates;
|
||||
}
|
||||
|
||||
const char* nsMBCSGroupProber::GetCharSetName(int candidate)
|
||||
{
|
||||
if (mBestGuess == -1)
|
||||
{
|
||||
GetConfidence(0);
|
||||
if (mBestGuess == -1)
|
||||
mBestGuess = 0;
|
||||
}
|
||||
return mProbers[mBestGuess]->GetCharSetName(0);
|
||||
int num_candidates = GetCandidates();
|
||||
int candidate_it = 0;
|
||||
|
||||
if (num_candidates == 0)
|
||||
return NULL;
|
||||
else if (candidate >= num_candidates)
|
||||
/* Just show the first candidate. */
|
||||
candidate = 0;
|
||||
|
||||
for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
|
||||
for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++)
|
||||
if (candidates[i][j])
|
||||
{
|
||||
if (candidate == candidate_it)
|
||||
{
|
||||
/* We assume that probers included in the nsMBCSGroupProber
|
||||
* return only one candidate themselves.
|
||||
* */
|
||||
return mProbers[i]->GetCharSetName(0);
|
||||
}
|
||||
candidate_it++;
|
||||
}
|
||||
|
||||
/* Should not happen. */
|
||||
return NULL;
|
||||
}
|
||||
|
||||
const char* nsMBCSGroupProber::GetLanguage(int candidate)
|
||||
{
|
||||
const char* maxLang = NULL;
|
||||
int maxLangIdx = -1;
|
||||
float maxConfidence = 0.0;
|
||||
const char* lang = NULL;
|
||||
int num_candidates = GetCandidates();
|
||||
int candidate_it = 0;
|
||||
|
||||
if (mBestGuess == -1)
|
||||
if (num_candidates == 0)
|
||||
return NULL;
|
||||
else
|
||||
maxLang = mProbers[mBestGuess]->GetLanguage(0);
|
||||
else if (candidate >= num_candidates)
|
||||
/* Just show the first candidate. */
|
||||
candidate = 0;
|
||||
|
||||
if (maxLang == NULL && mProbers[mBestGuess]->DecodeToUnicode())
|
||||
{
|
||||
for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
|
||||
for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++)
|
||||
{
|
||||
float conf = langDetectors[mBestGuess][j]->GetConfidence();
|
||||
|
||||
if (conf > maxConfidence)
|
||||
if (candidates[i][j])
|
||||
{
|
||||
maxLangIdx = j;
|
||||
maxConfidence = conf;
|
||||
}
|
||||
}
|
||||
if (maxLangIdx != -1)
|
||||
maxLang = langDetectors[mBestGuess][maxLangIdx]->GetLanguage();
|
||||
}
|
||||
if (candidate == candidate_it)
|
||||
{
|
||||
/* We assume that probers included in the nsMBCSGroupProber
|
||||
* return only one candidate themselves.
|
||||
* */
|
||||
lang = mProbers[i]->GetLanguage(0);
|
||||
|
||||
return maxLang;
|
||||
if (! lang)
|
||||
{
|
||||
/* The prober does not come with its own language. */
|
||||
if (langDetectors[i][j])
|
||||
lang = langDetectors[i][j]->GetLanguage();
|
||||
}
|
||||
|
||||
return lang;
|
||||
}
|
||||
candidate_it++;
|
||||
}
|
||||
|
||||
return lang;
|
||||
}
|
||||
|
||||
void nsMBCSGroupProber::Reset(void)
|
||||
@ -196,17 +238,18 @@ void nsMBCSGroupProber::Reset(void)
|
||||
codePointBuffer[i] = new int[codePointBufferSize[i]];
|
||||
}
|
||||
codePointBufferIdx[i] = 0;
|
||||
|
||||
for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++)
|
||||
{
|
||||
if (langDetectors[i][j])
|
||||
langDetectors[i][j]->Reset();
|
||||
}
|
||||
}
|
||||
else
|
||||
mIsActive[i] = PR_FALSE;
|
||||
|
||||
for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++)
|
||||
{
|
||||
if (langDetectors[i][j])
|
||||
langDetectors[i][j]->Reset();
|
||||
|
||||
candidates[i][j] = false;
|
||||
}
|
||||
}
|
||||
mBestGuess = -1;
|
||||
mState = eDetecting;
|
||||
mKeepNext = 0;
|
||||
}
|
||||
@ -252,9 +295,21 @@ nsProbingState nsMBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen,
|
||||
|
||||
if (st == eFoundIt)
|
||||
{
|
||||
mBestGuess = i;
|
||||
mState = eFoundIt;
|
||||
return mState;
|
||||
float cf = mProbers[i]->GetConfidence(0);
|
||||
|
||||
for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++)
|
||||
{
|
||||
float langConf = langDetectors[i][j]->GetConfidence();
|
||||
|
||||
if (cf * langConf > CANDIDATE_THRESHOLD)
|
||||
{
|
||||
/* There is at least one (charset, lang) couple for
|
||||
* which the confidence is high enough.
|
||||
*/
|
||||
mState = eFoundIt;
|
||||
return mState;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -288,9 +343,21 @@ nsProbingState nsMBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen,
|
||||
|
||||
if (st == eFoundIt)
|
||||
{
|
||||
mBestGuess = i;
|
||||
mState = eFoundIt;
|
||||
return mState;
|
||||
float cf = mProbers[i]->GetConfidence(0);
|
||||
|
||||
for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++)
|
||||
{
|
||||
float langConf = langDetectors[i][j]->GetConfidence();
|
||||
|
||||
if (cf * langConf > CANDIDATE_THRESHOLD)
|
||||
{
|
||||
/* There is at least one (charset, lang) couple for
|
||||
* which the confidence is high enough.
|
||||
*/
|
||||
mState = eFoundIt;
|
||||
return mState;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -299,10 +366,49 @@ nsProbingState nsMBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen,
|
||||
return mState;
|
||||
}
|
||||
|
||||
void nsMBCSGroupProber::CheckCandidates()
|
||||
{
|
||||
for (int i = 0; i < NUM_OF_PROBERS; i++)
|
||||
{
|
||||
if (! mIsActive[i])
|
||||
{
|
||||
for (int j = 0; j < NUM_OF_LANGUAGES; j++)
|
||||
candidates[i][j] = false;
|
||||
}
|
||||
else
|
||||
{
|
||||
float cf = mProbers[i]->GetConfidence(0);
|
||||
|
||||
if (mProbers[i]->DecodeToUnicode())
|
||||
{
|
||||
for (int j = 0; j < NUM_OF_LANGUAGES; j++)
|
||||
{
|
||||
float langConf = langDetectors[i][j]->GetConfidence();
|
||||
|
||||
candidates[i][j] = (cf * langConf > CANDIDATE_THRESHOLD);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for (int j = 0; j < NUM_OF_LANGUAGES; j++)
|
||||
candidates[i][j] = (cf > CANDIDATE_THRESHOLD);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
float nsMBCSGroupProber::GetConfidence(int candidate)
|
||||
{
|
||||
int num_candidates = GetCandidates();
|
||||
int candidate_it = 0;
|
||||
|
||||
PRUint32 i;
|
||||
float bestConf = 0.0, cf;
|
||||
|
||||
if (num_candidates == 0)
|
||||
return 0.0;
|
||||
else if (candidate >= num_candidates)
|
||||
/* Just show the first candidate. */
|
||||
candidate = 0;
|
||||
|
||||
switch (mState)
|
||||
{
|
||||
@ -312,32 +418,26 @@ float nsMBCSGroupProber::GetConfidence(int candidate)
|
||||
default:
|
||||
for (i = 0; i < NUM_OF_PROBERS; i++)
|
||||
{
|
||||
float bestLangConf = 0.0;
|
||||
|
||||
if (!mIsActive[i])
|
||||
continue;
|
||||
cf = mProbers[i]->GetConfidence(0);
|
||||
|
||||
if (mProbers[i]->DecodeToUnicode())
|
||||
{
|
||||
for (int j = 0; j < NUM_OF_LANGUAGES; j++)
|
||||
for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++)
|
||||
if (candidates[i][j])
|
||||
{
|
||||
float langConf = langDetectors[i][j]->GetConfidence();
|
||||
if (candidate == candidate_it)
|
||||
{
|
||||
float cf = mProbers[i]->GetConfidence(0);
|
||||
float langConf = 1.0;
|
||||
|
||||
if (bestLangConf < langConf)
|
||||
bestLangConf = langConf;
|
||||
if (langDetectors[i][j])
|
||||
langConf = langDetectors[i][j]->GetConfidence();
|
||||
|
||||
return cf * langConf;
|
||||
}
|
||||
candidate_it++;
|
||||
}
|
||||
cf *= bestLangConf;
|
||||
}
|
||||
|
||||
if (bestConf < cf)
|
||||
{
|
||||
bestConf = cf;
|
||||
mBestGuess = i;
|
||||
}
|
||||
}
|
||||
}
|
||||
return bestConf;
|
||||
|
||||
/* Should not happen. */
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
#ifdef DEBUG_chardet
|
||||
|
||||
@ -57,7 +57,7 @@ public:
|
||||
nsProbingState HandleData(const char* aBuf, PRUint32 aLen,
|
||||
int** codePointBuffer,
|
||||
int* codePointBufferIdx);
|
||||
int GetCandidates() { return 1; }
|
||||
int GetCandidates();
|
||||
const char* GetCharSetName(int candidate);
|
||||
const char* GetLanguage(int candidate);
|
||||
nsProbingState GetState(void) {return mState;}
|
||||
@ -76,15 +76,19 @@ protected:
|
||||
nsProbingState mState;
|
||||
nsCharSetProber* mProbers[NUM_OF_PROBERS];
|
||||
PRBool mIsActive[NUM_OF_PROBERS];
|
||||
PRInt32 mBestGuess;
|
||||
PRUint32 mActiveNum;
|
||||
PRUint32 mKeepNext;
|
||||
|
||||
PRBool candidates[NUM_OF_PROBERS][NUM_OF_LANGUAGES];
|
||||
|
||||
int *codePointBuffer[NUM_OF_PROBERS];
|
||||
int codePointBufferSize[NUM_OF_PROBERS];
|
||||
int codePointBufferIdx[NUM_OF_PROBERS];
|
||||
|
||||
nsLanguageDetector *langDetectors[NUM_OF_PROBERS][NUM_OF_LANGUAGES];
|
||||
|
||||
private:
|
||||
void CheckCandidates();
|
||||
};
|
||||
|
||||
#endif /* nsMBCSGroupProber_h__ */
|
||||
|
||||
@ -54,9 +54,6 @@ nsUniversalDetector::nsUniversalDetector(PRUint32 aLanguageFilter)
|
||||
mEscCharSetProber = nsnull;
|
||||
|
||||
mStart = PR_TRUE;
|
||||
mDetectedCharset = nsnull;
|
||||
mDetectedLanguage = nsnull;
|
||||
mDetectedConfidence = 0.0;
|
||||
mGotData = PR_FALSE;
|
||||
mInputState = ePureAscii;
|
||||
mLastChar = '\0';
|
||||
@ -84,9 +81,6 @@ nsUniversalDetector::Reset()
|
||||
mInTag = PR_FALSE;
|
||||
|
||||
mStart = PR_TRUE;
|
||||
mDetectedCharset = nsnull;
|
||||
mDetectedLanguage = nsnull;
|
||||
mDetectedConfidence = 0.0;
|
||||
mGotData = PR_FALSE;
|
||||
mInputState = ePureAscii;
|
||||
mLastChar = '\0';
|
||||
@ -124,16 +118,16 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
|
||||
if (('\xBB' == aBuf[1]) && ('\xBF' == aBuf[2]))
|
||||
{
|
||||
/* EF BB BF: UTF-8 encoded BOM. */
|
||||
mDetectedCharset = "UTF-8";
|
||||
mDetectedConfidence = 0.99;
|
||||
shortcutCharset = "UTF-8";
|
||||
shortcutConfidence = 0.99;
|
||||
}
|
||||
break;
|
||||
case '\xFE':
|
||||
if ('\xFF' == aBuf[1])
|
||||
{
|
||||
/* FE FF: UTF-16, big endian BOM. */
|
||||
mDetectedCharset = "UTF-16";
|
||||
mDetectedConfidence = 0.99;
|
||||
shortcutCharset = "UTF-16";
|
||||
shortcutConfidence = 0.99;
|
||||
}
|
||||
break;
|
||||
case '\xFF':
|
||||
@ -144,14 +138,14 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
|
||||
aBuf[3] == '\x00')
|
||||
{
|
||||
/* FF FE 00 00: UTF-32 (LE). */
|
||||
mDetectedCharset = "UTF-32";
|
||||
mDetectedConfidence = 0.99;
|
||||
shortcutCharset = "UTF-32";
|
||||
shortcutConfidence = 0.99;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* FF FE: UTF-16, little endian BOM. */
|
||||
mDetectedCharset = "UTF-16";
|
||||
mDetectedConfidence = 0.99;
|
||||
shortcutCharset = "UTF-16";
|
||||
shortcutConfidence = 0.99;
|
||||
}
|
||||
}
|
||||
break;
|
||||
@ -162,14 +156,14 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
|
||||
aBuf[3] == '\xFF')
|
||||
{
|
||||
/* 00 00 FE FF: UTF-32 (BE). */
|
||||
mDetectedCharset = "UTF-32";
|
||||
mDetectedConfidence = 0.99;
|
||||
shortcutCharset = "UTF-32";
|
||||
shortcutConfidence = 0.99;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (mDetectedCharset)
|
||||
if (shortcutCharset)
|
||||
{
|
||||
mDone = PR_TRUE;
|
||||
return NS_OK;
|
||||
@ -252,9 +246,9 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
|
||||
st = mEscCharSetProber->HandleData(aBuf, aLen, NULL, NULL);
|
||||
if (st == eFoundIt)
|
||||
{
|
||||
shortcutCharset = mEscCharSetProber->GetCharSetName(0);
|
||||
shortcutConfidence = mEscCharSetProber->GetConfidence(0);
|
||||
mDone = PR_TRUE;
|
||||
mDetectedCharset = mEscCharSetProber->GetCharSetName(0);
|
||||
mDetectedConfidence = mEscCharSetProber->GetConfidence(0);
|
||||
}
|
||||
break;
|
||||
case eHighbyte:
|
||||
@ -266,9 +260,6 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
|
||||
if (st == eFoundIt)
|
||||
{
|
||||
mDone = PR_TRUE;
|
||||
mDetectedCharset = mCharSetProbers[i]->GetCharSetName(0);
|
||||
mDetectedLanguage = mCharSetProbers[i]->GetLanguage(0);
|
||||
mDetectedConfidence = mCharSetProbers[i]->GetConfidence(0);
|
||||
return NS_OK;
|
||||
}
|
||||
}
|
||||
@ -292,7 +283,7 @@ void nsUniversalDetector::DataEnd()
|
||||
return;
|
||||
}
|
||||
|
||||
if (! mDetectedCharset)
|
||||
if (! shortcutCharset)
|
||||
{
|
||||
switch (mInputState)
|
||||
{
|
||||
@ -302,26 +293,27 @@ void nsUniversalDetector::DataEnd()
|
||||
{
|
||||
/* ISO-8859-1 is a good result candidate for ASCII + NBSP.
|
||||
* (though it could have been any ISO-8859 encoding). */
|
||||
mDetectedCharset = "ISO-8859-1";
|
||||
shortcutCharset = "ISO-8859-1";
|
||||
}
|
||||
else
|
||||
{
|
||||
/* ASCII with the ESC character (or the sequence "~{") is still
|
||||
* ASCII until proven otherwise. */
|
||||
mDetectedCharset = "ASCII";
|
||||
shortcutCharset = "ASCII";
|
||||
}
|
||||
shortcutConfidence = 0.99;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (mDetectedCharset)
|
||||
if (shortcutCharset)
|
||||
{
|
||||
/* These cases are limited enough that we are always confident
|
||||
* when finding them.
|
||||
*/
|
||||
mDone = PR_TRUE;
|
||||
Report(mDetectedCharset, mDetectedLanguage, mDetectedConfidence);
|
||||
Report(shortcutCharset, NULL, shortcutConfidence);
|
||||
return;
|
||||
}
|
||||
|
||||
@ -335,13 +327,20 @@ void nsUniversalDetector::DataEnd()
|
||||
{
|
||||
if (mCharSetProbers[i])
|
||||
{
|
||||
proberConfidence = mCharSetProbers[i]->GetConfidence(0);
|
||||
int n_candidates = mCharSetProbers[i]->GetCandidates();
|
||||
|
||||
if (proberConfidence > MINIMUM_THRESHOLD)
|
||||
/* Only report what we are confident in. */
|
||||
Report(mCharSetProbers[i]->GetCharSetName(0),
|
||||
mCharSetProbers[i]->GetLanguage(0),
|
||||
proberConfidence);
|
||||
for (int c = 0; c < n_candidates; c++)
|
||||
{
|
||||
proberConfidence = mCharSetProbers[i]->GetConfidence(c);
|
||||
|
||||
if (proberConfidence > MINIMUM_THRESHOLD)
|
||||
{
|
||||
/* Only report what we are confident in. */
|
||||
Report(mCharSetProbers[i]->GetCharSetName(c),
|
||||
mCharSetProbers[i]->GetLanguage(c),
|
||||
proberConfidence);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -80,9 +80,9 @@ protected:
|
||||
PRBool mStart;
|
||||
PRBool mGotData;
|
||||
char mLastChar;
|
||||
const char * mDetectedCharset;
|
||||
const char * mDetectedLanguage;
|
||||
float mDetectedConfidence;
|
||||
const char * shortcutCharset;
|
||||
float shortcutConfidence;
|
||||
|
||||
PRInt32 mBestGuess;
|
||||
PRUint32 mLanguageFilter;
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user