Using the generic language detector in UTF-8 detection.

The UTF-8 prober now not only detects valid UTF-8, but also detects
the most probable language. Using the data generated two commits ago,
this works very well.

This is still basic and will require even more improvements. In
particular, nsUTF8Prober should return an array of ("UTF-8", language)
candidate couples, and nsMBCSGroupProber should itself forward these
candidates as well as the candidates from the other multi-byte
detectors. This way, the public-facing API would get more probable
candidates, in case the algorithm is slightly wrong.

Also, the UTF-8 confidence is currently stupidly high as soon as we
consider it to be right. We should likely weight it by language
detection (in particular, if no language is detected, this should
severely weigh down UTF-8 detection; not to 0, but high enough to be a
fallback in case no other encoding+lang is valid, and low enough to
give a chance to other good candidate couples).
This commit is contained in:
Jehan 2021-03-15 12:01:35 +01:00
parent dac7cbd30f
commit 5257fc1abf
29 changed files with 235 additions and 43 deletions

View File

@ -44,7 +44,9 @@ void nsBig5Prober::Reset(void)
mDistributionAnalyser.Reset(mIsPreferredLanguage);
}
nsProbingState nsBig5Prober::HandleData(const char* aBuf, PRUint32 aLen)
nsProbingState nsBig5Prober::HandleData(const char* aBuf, PRUint32 aLen,
int** codePointBuffer,
int* codePointBufferIdx)
{
PRUint32 codingState;

View File

@ -49,7 +49,9 @@ public:
{mCodingSM = new nsCodingStateMachine(&Big5SMModel);
Reset();}
virtual ~nsBig5Prober(void){delete mCodingSM;}
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
nsProbingState HandleData(const char* aBuf, PRUint32 aLen,
int** codePointBuffer,
int* codePointBufferIdx);
const char* GetCharSetName() {return "BIG5";}
const char* GetLanguage() {return "zh";}
nsProbingState GetState(void) {return mState;}

View File

@ -55,7 +55,10 @@ public:
virtual ~nsCharSetProber() {}
virtual const char* GetCharSetName() = 0;
virtual const char* GetLanguage() = 0;
virtual nsProbingState HandleData(const char* aBuf, PRUint32 aLen) = 0;
virtual nsProbingState HandleData(const char* aBuf, PRUint32 aLen,
int** codePointBuffer,
int* codePointBufferIdx) = 0;
virtual bool DecodeToUnicode() {return false;}
virtual nsProbingState GetState(void) = 0;
virtual void Reset(void) = 0;
virtual float GetConfidence(void) = 0;

View File

@ -50,7 +50,9 @@ void nsEUCJPProber::Reset(void)
mDistributionAnalyser.Reset(mIsPreferredLanguage);
}
nsProbingState nsEUCJPProber::HandleData(const char* aBuf, PRUint32 aLen)
nsProbingState nsEUCJPProber::HandleData(const char* aBuf, PRUint32 aLen,
int** codePointBuffer,
int* codePointBufferIdx)
{
PRUint32 codingState;

View File

@ -55,7 +55,9 @@ public:
{mCodingSM = new nsCodingStateMachine(&EUCJPSMModel);
Reset();}
virtual ~nsEUCJPProber(void){delete mCodingSM;}
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
nsProbingState HandleData(const char* aBuf, PRUint32 aLen,
int** codePointBuffer,
int* codePointBufferIdx);
const char* GetCharSetName() {return "EUC-JP";}
const char* GetLanguage() {return "ja";}
nsProbingState GetState(void) {return mState;}

View File

@ -45,7 +45,9 @@ void nsEUCKRProber::Reset(void)
//mContextAnalyser.Reset();
}
nsProbingState nsEUCKRProber::HandleData(const char* aBuf, PRUint32 aLen)
nsProbingState nsEUCKRProber::HandleData(const char* aBuf, PRUint32 aLen,
int** codePointBuffer,
int* codePointBufferIdx)
{
PRUint32 codingState;

View File

@ -50,7 +50,9 @@ public:
Reset();
}
virtual ~nsEUCKRProber(void){delete mCodingSM;}
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
nsProbingState HandleData(const char* aBuf, PRUint32 aLen,
int** codePointBuffer,
int* codePointBufferIdx);
/* "Unified Hangul Code", also called "CP949" or "Windows-949" is a
* superset of EUC-KR. Though not fully ok to return UHC here (a
* separate prober would be better), it is acceptable, since many

View File

@ -45,7 +45,9 @@ void nsEUCTWProber::Reset(void)
//mContextAnalyser.Reset();
}
nsProbingState nsEUCTWProber::HandleData(const char* aBuf, PRUint32 aLen)
nsProbingState nsEUCTWProber::HandleData(const char* aBuf, PRUint32 aLen,
int** codePointBuffer,
int* codePointBufferIdx)
{
PRUint32 codingState;

View File

@ -49,7 +49,9 @@ public:
{mCodingSM = new nsCodingStateMachine(&EUCTWSMModel);
Reset();}
virtual ~nsEUCTWProber(void){delete mCodingSM;}
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
nsProbingState HandleData(const char* aBuf, PRUint32 aLen,
int** codePointBuffer,
int* codePointBufferIdx);
const char* GetCharSetName() {return "EUC-TW";}
const char* GetLanguage() {return "zh";}
nsProbingState GetState(void) {return mState;}

View File

@ -73,7 +73,9 @@ void nsEscCharSetProber::Reset(void)
mDetectedCharset = nsnull;
}
nsProbingState nsEscCharSetProber::HandleData(const char* aBuf, PRUint32 aLen)
nsProbingState nsEscCharSetProber::HandleData(const char* aBuf, PRUint32 aLen,
int** codePointBuffer,
int* codePointBufferIdx)
{
PRUint32 codingState;
PRInt32 j;

View File

@ -49,7 +49,9 @@ class nsEscCharSetProber: public nsCharSetProber {
public:
nsEscCharSetProber(PRUint32 aLanguageFilter);
virtual ~nsEscCharSetProber(void);
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
nsProbingState HandleData(const char* aBuf, PRUint32 aLen,
int** codePointBuffer,
int* codePointBufferIdx);
const char* GetCharSetName() {return mDetectedCharset;}
const char* GetLanguage() {return NULL;}
nsProbingState GetState(void) {return mState;}

View File

@ -50,7 +50,9 @@ void nsGB18030Prober::Reset(void)
//mContextAnalyser.Reset();
}
nsProbingState nsGB18030Prober::HandleData(const char* aBuf, PRUint32 aLen)
nsProbingState nsGB18030Prober::HandleData(const char* aBuf, PRUint32 aLen,
int** codePointBuffer,
int* codePointBufferIdx)
{
PRUint32 codingState;

View File

@ -51,7 +51,9 @@ public:
{mCodingSM = new nsCodingStateMachine(&GB18030SMModel);
Reset();}
virtual ~nsGB18030Prober(void){delete mCodingSM;}
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
nsProbingState HandleData(const char* aBuf, PRUint32 aLen,
int** codePointBuffer,
int* codePointBufferIdx);
const char* GetCharSetName() {return "GB18030";}
const char* GetLanguage() {return "zh";}
nsProbingState GetState(void) {return mState;}

View File

@ -106,7 +106,9 @@ PRBool nsHebrewProber::isNonFinal(char c)
* The input buffer should not contain any white spaces that are not (' ')
* or any low-ascii punctuation marks.
*/
nsProbingState nsHebrewProber::HandleData(const char* aBuf, PRUint32 aLen)
nsProbingState nsHebrewProber::HandleData(const char* aBuf, PRUint32 aLen,
int** codePointBuffer,
int* codePointBufferIdx)
{
// Both model probers say it's not them. No reason to continue.
if (GetState() == eNotMe)

View File

@ -48,7 +48,9 @@ public:
nsHebrewProber(void) :mLogicalProb(0), mVisualProb(0) { Reset(); }
virtual ~nsHebrewProber(void) {}
virtual nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
virtual nsProbingState HandleData(const char* aBuf, PRUint32 aLen,
int** codePointBuffer,
int* codePointBufferIdx);
virtual const char *GetCharSetName();
virtual const char *GetLanguage(void) { return "he"; }
virtual void Reset(void);

View File

@ -114,7 +114,9 @@ void nsLatin1Prober::Reset(void)
}
nsProbingState nsLatin1Prober::HandleData(const char* aBuf, PRUint32 aLen)
nsProbingState nsLatin1Prober::HandleData(const char* aBuf, PRUint32 aLen,
int** codePointBuffer,
int* codePointBufferIdx)
{
char *newBuf1 = 0;
PRUint32 newLen1 = 0;

View File

@ -49,7 +49,9 @@ class nsLatin1Prober: public nsCharSetProber {
public:
nsLatin1Prober(void){Reset();}
virtual ~nsLatin1Prober(void){}
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
nsProbingState HandleData(const char* aBuf, PRUint32 aLen,
int** codePointBuffer,
int* codePointBufferIdx);
const char* GetCharSetName() {return "WINDOWS-1252";}
const char* GetLanguage() {return NULL;}
nsProbingState GetState(void) {return mState;}

View File

@ -58,7 +58,12 @@ const char *ProberName[] =
nsMBCSGroupProber::nsMBCSGroupProber(PRUint32 aLanguageFilter)
{
for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
mProbers[i] = nsnull;
{
mProbers[i] = nsnull;
codePointBuffer[i] = nsnull;
codePointBufferSize[i] = 0;
codePointBufferIdx[i] = 0;
}
mProbers[0] = new nsUTF8Prober();
if (aLanguageFilter & NS_FILTER_JAPANESE)
@ -75,6 +80,24 @@ nsMBCSGroupProber::nsMBCSGroupProber(PRUint32 aLanguageFilter)
mProbers[5] = new nsBig5Prober(aLanguageFilter == NS_FILTER_CHINESE_TRADITIONAL);
mProbers[6] = new nsEUCTWProber(aLanguageFilter == NS_FILTER_CHINESE_TRADITIONAL);
}
for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
{
if (mProbers[i]->DecodeToUnicode())
{
langDetectors[i][0] = new nsLanguageDetector(&FrenchModel);
langDetectors[i][1] = new nsLanguageDetector(&ItalianModel);
langDetectors[i][2] = new nsLanguageDetector(&DanishModel);
langDetectors[i][3] = new nsLanguageDetector(&GermanModel);
langDetectors[i][4] = new nsLanguageDetector(&ArabicModel);
langDetectors[i][5] = new nsLanguageDetector(&SpanishModel);
}
else
{
for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++)
langDetectors[i][j] = nsnull;
}
}
Reset();
}
@ -83,6 +106,13 @@ nsMBCSGroupProber::~nsMBCSGroupProber()
for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
{
delete mProbers[i];
if (codePointBufferSize[i] != 0)
delete [] codePointBuffer[i];
for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++)
if (langDetectors[i][j])
delete langDetectors[i][j];
}
}
@ -99,17 +129,35 @@ const char* nsMBCSGroupProber::GetCharSetName()
const char* nsMBCSGroupProber::GetLanguage(void)
{
const char* maxLang = NULL;
int maxLangIdx = -1;
float maxConfidence = 0.0;
if (mBestGuess == -1)
{
GetConfidence();
}
if (mBestGuess == -1)
return NULL;
return NULL;
else
return mProbers[mBestGuess]->GetLanguage();
maxLang = mProbers[mBestGuess]->GetLanguage();
if (maxLang == NULL && mProbers[mBestGuess]->DecodeToUnicode())
{
for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++)
{
float conf = langDetectors[mBestGuess][j]->GetConfidence();
if (conf > maxConfidence)
{
maxLangIdx = j;
maxConfidence = conf;
}
}
if (maxLangIdx != -1)
maxLang = langDetectors[mBestGuess][maxLangIdx]->GetLanguage();
}
return maxLang;
}
void nsMBCSGroupProber::Reset(void)
void nsMBCSGroupProber::Reset(void)
{
mActiveNum = 0;
for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
@ -119,6 +167,13 @@ void nsMBCSGroupProber::Reset(void)
mProbers[i]->Reset();
mIsActive[i] = PR_TRUE;
++mActiveNum;
if (codePointBufferSize[i] == 0 && mProbers[i]->DecodeToUnicode())
{
codePointBufferSize[i] = 1024;
codePointBuffer[i] = new int[codePointBufferSize[i]];
}
codePointBufferIdx[i] = 0;
}
else
mIsActive[i] = PR_FALSE;
@ -128,7 +183,9 @@ void nsMBCSGroupProber::Reset(void)
mKeepNext = 0;
}
nsProbingState nsMBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen)
nsProbingState nsMBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen,
int** cpBuffer,
int* cpBufferIdx)
{
nsProbingState st;
PRUint32 start = 0;
@ -151,7 +208,20 @@ nsProbingState nsMBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen)
{
if (!mIsActive[i])
continue;
st = mProbers[i]->HandleData(aBuf + start, pos + 1 - start);
if (codePointBuffer[i])
st = mProbers[i]->HandleData(aBuf + start, pos + 1 - start,
&(codePointBuffer[i]), &(codePointBufferIdx[i]));
else
st = mProbers[i]->HandleData(aBuf + start, pos + 1 - start, NULL, NULL);
if (codePointBufferIdx[i] > 0 && codePointBuffer[i])
{
for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++)
langDetectors[i][j]->HandleData(codePointBuffer[i], codePointBufferIdx[i]);
codePointBufferIdx[i] = 0;
}
if (st == eFoundIt)
{
mBestGuess = i;
@ -161,6 +231,12 @@ nsProbingState nsMBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen)
}
}
}
else
{
for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
if (codePointBuffer[i])
codePointBuffer[i][(codePointBufferIdx[i])++] = aBuf[pos];
}
}
if (keepNext) {
@ -168,7 +244,20 @@ nsProbingState nsMBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen)
{
if (!mIsActive[i])
continue;
st = mProbers[i]->HandleData(aBuf + start, aLen - start);
if (codePointBuffer[i])
st = mProbers[i]->HandleData(aBuf + start, aLen - start,
&(codePointBuffer[i]), &(codePointBufferIdx[i]));
else
st = mProbers[i]->HandleData(aBuf + start, aLen - start, NULL, NULL);
if (codePointBufferIdx[i] > 0 && codePointBuffer[i])
{
for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++)
langDetectors[i][j]->HandleData(codePointBuffer[i], codePointBufferIdx[i]);
codePointBufferIdx[i] = 0;
}
if (st == eFoundIt)
{
mBestGuess = i;

View File

@ -48,12 +48,15 @@
#include "nsEUCTWProber.h"
#define NUM_OF_PROBERS 7
#define NUM_OF_LANGUAGES 6
class nsMBCSGroupProber: public nsCharSetProber {
public:
nsMBCSGroupProber(PRUint32 aLanguageFilter);
virtual ~nsMBCSGroupProber();
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
nsProbingState HandleData(const char* aBuf, PRUint32 aLen,
int** codePointBuffer,
int* codePointBufferIdx);
const char* GetCharSetName();
const char* GetLanguage();
nsProbingState GetState(void) {return mState;}
@ -75,6 +78,12 @@ protected:
PRInt32 mBestGuess;
PRUint32 mActiveNum;
PRUint32 mKeepNext;
int *codePointBuffer[NUM_OF_PROBERS];
int codePointBufferSize[NUM_OF_PROBERS];
int codePointBufferIdx[NUM_OF_PROBERS];
nsLanguageDetector *langDetectors[NUM_OF_PROBERS][NUM_OF_LANGUAGES];
};
#endif /* nsMBCSGroupProber_h__ */

View File

@ -253,7 +253,9 @@ void nsSBCSGroupProber::Reset(void)
}
nsProbingState nsSBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen)
nsProbingState nsSBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen,
int** codePointBuffer,
int* codePointBufferIdx)
{
nsProbingState st;
PRUint32 i;
@ -276,7 +278,7 @@ nsProbingState nsSBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen)
{
if (!mIsActive[i])
continue;
st = mProbers[i]->HandleData(newBuf1, newLen1);
st = mProbers[i]->HandleData(newBuf1, newLen1, codePointBuffer, codePointBufferIdx);
if (st == eFoundIt)
{
mBestGuess = i;

View File

@ -47,7 +47,9 @@ class nsSBCSGroupProber: public nsCharSetProber {
public:
nsSBCSGroupProber();
virtual ~nsSBCSGroupProber();
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
nsProbingState HandleData(const char* aBuf, PRUint32 aLen,
int** codePointBuffer,
int* codePointBufferIdx);
const char* GetCharSetName();
const char* GetLanguage();
nsProbingState GetState(void) {return mState;}

View File

@ -38,7 +38,9 @@
#include <stdio.h>
#include "nsSBCharSetProber.h"
nsProbingState nsSingleByteCharSetProber::HandleData(const char* aBuf, PRUint32 aLen)
nsProbingState nsSingleByteCharSetProber::HandleData(const char* aBuf, PRUint32 aLen,
int** codePointBuffer,
int* codePointBufferIdx)
{
unsigned char order;

View File

@ -88,7 +88,9 @@ public:
virtual const char* GetCharSetName();
virtual const char* GetLanguage();
virtual nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
virtual nsProbingState HandleData(const char* aBuf, PRUint32 aLen,
int** codePointBuffer,
int* codePointBufferIdx);
virtual nsProbingState GetState(void) {return mState;}
virtual void Reset(void);
virtual float GetConfidence(void);

View File

@ -50,7 +50,9 @@ void nsSJISProber::Reset(void)
mDistributionAnalyser.Reset(mIsPreferredLanguage);
}
nsProbingState nsSJISProber::HandleData(const char* aBuf, PRUint32 aLen)
nsProbingState nsSJISProber::HandleData(const char* aBuf, PRUint32 aLen,
int** codePointBuffer,
int* codePointBufferIdx)
{
PRUint32 codingState;

View File

@ -56,7 +56,9 @@ public:
{mCodingSM = new nsCodingStateMachine(&SJISSMModel);
Reset();}
virtual ~nsSJISProber(void){delete mCodingSM;}
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
nsProbingState HandleData(const char* aBuf, PRUint32 aLen,
int** codePointBuffer,
int* codePointBufferIdx);
const char* GetCharSetName() {return "SHIFT_JIS";}
const char* GetLanguage() {return "ja";}
nsProbingState GetState(void) {return mState;}

View File

@ -42,9 +42,12 @@ void nsUTF8Prober::Reset(void)
mCodingSM->Reset();
mNumOfMBChar = 0;
mState = eDetecting;
currentCodePoint = 0;
}
nsProbingState nsUTF8Prober::HandleData(const char* aBuf, PRUint32 aLen)
nsProbingState nsUTF8Prober::HandleData(const char* aBuf, PRUint32 aLen,
int** codePointBuffer,
int* codePointBufferIdx)
{
PRUint32 codingState;
@ -59,7 +62,28 @@ nsProbingState nsUTF8Prober::HandleData(const char* aBuf, PRUint32 aLen)
if (codingState == eStart)
{
if (mCodingSM->GetCurrentCharLen() >= 2)
{
mNumOfMBChar++;
currentCodePoint = ((0xff & aBuf[i]) & 0x3fu) | (currentCodePoint << 6);
if (mCodingSM->GetCurrentCharLen() == 2)
currentCodePoint &= 0x7ff;
else if (mCodingSM->GetCurrentCharLen() == 3)
currentCodePoint &= 0xffff;
else
currentCodePoint &= 0x1fffff;
}
else
{
currentCodePoint = 0xff & (char) aBuf[i];
}
(*codePointBuffer)[(*codePointBufferIdx)++] = currentCodePoint;
currentCodePoint = 0;
}
else
{
currentCodePoint = ((0xff & aBuf[i]) & 0x3fu) | (currentCodePoint << 6);
}
}
@ -84,4 +108,3 @@ float nsUTF8Prober::GetConfidence(void)
else
return (float)0.99;
}

View File

@ -41,6 +41,7 @@
#include <cstddef>
#include "nsCharSetProber.h"
#include "nsCodingStateMachine.h"
#include "nsLanguageDetector.h"
class nsUTF8Prober: public nsCharSetProber {
public:
@ -48,7 +49,9 @@ public:
mCodingSM = new nsCodingStateMachine(&UTF8SMModel);
Reset(); }
virtual ~nsUTF8Prober(){delete mCodingSM;}
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
nsProbingState HandleData(const char* aBuf, PRUint32 aLen,
int** codePointBuffer,
int* codePointBufferIdx);
const char* GetCharSetName() {return "UTF-8";}
const char* GetLanguage() {return NULL;}
nsProbingState GetState(void) {return mState;}
@ -56,11 +59,14 @@ public:
float GetConfidence(void);
void SetOpion() {}
virtual bool DecodeToUnicode() {return true;}
protected:
nsCodingStateMachine* mCodingSM;
nsProbingState mState;
PRUint32 mNumOfMBChar;
int currentCodePoint;
};
#endif /* nsUTF8Prober_h__ */

View File

@ -55,6 +55,8 @@ nsUniversalDetector::nsUniversalDetector(PRUint32 aLanguageFilter)
mStart = PR_TRUE;
mDetectedCharset = nsnull;
mDetectedLanguage = nsnull;
mDetectedConfidence = 0.0;
mGotData = PR_FALSE;
mInputState = ePureAscii;
mLastChar = '\0';
@ -83,6 +85,8 @@ nsUniversalDetector::Reset()
mStart = PR_TRUE;
mDetectedCharset = nsnull;
mDetectedLanguage = nsnull;
mDetectedConfidence = 0.0;
mGotData = PR_FALSE;
mInputState = ePureAscii;
mLastChar = '\0';
@ -118,13 +122,19 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
{
case '\xEF':
if (('\xBB' == aBuf[1]) && ('\xBF' == aBuf[2]))
{
/* EF BB BF: UTF-8 encoded BOM. */
mDetectedCharset = "UTF-8";
mDetectedConfidence = 0.99;
}
break;
case '\xFE':
if ('\xFF' == aBuf[1])
{
/* FE FF: UTF-16, big endian BOM. */
mDetectedCharset = "UTF-16";
mDetectedConfidence = 0.99;
}
break;
case '\xFF':
if ('\xFE' == aBuf[1])
@ -135,11 +145,13 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
{
/* FF FE 00 00: UTF-32 (LE). */
mDetectedCharset = "UTF-32";
mDetectedConfidence = 0.99;
}
else
{
/* FF FE: UTF-16, little endian BOM. */
mDetectedCharset = "UTF-16";
mDetectedConfidence = 0.99;
}
}
break;
@ -151,6 +163,7 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
{
/* 00 00 FE FF: UTF-32 (BE). */
mDetectedCharset = "UTF-32";
mDetectedConfidence = 0.99;
}
break;
}
@ -236,11 +249,12 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
if (nsnull == mEscCharSetProber)
return NS_ERROR_OUT_OF_MEMORY;
}
st = mEscCharSetProber->HandleData(aBuf, aLen);
st = mEscCharSetProber->HandleData(aBuf, aLen, NULL, NULL);
if (st == eFoundIt)
{
mDone = PR_TRUE;
mDetectedCharset = mEscCharSetProber->GetCharSetName();
mDetectedConfidence = mEscCharSetProber->GetConfidence();
}
break;
case eHighbyte:
@ -248,11 +262,13 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
{
if (mCharSetProbers[i])
{
st = mCharSetProbers[i]->HandleData(aBuf, aLen);
st = mCharSetProbers[i]->HandleData(aBuf, aLen, NULL, NULL);
if (st == eFoundIt)
{
mDone = PR_TRUE;
mDetectedCharset = mCharSetProbers[i]->GetCharSetName();
mDetectedLanguage = mCharSetProbers[i]->GetLanguage();
mDetectedConfidence = mCharSetProbers[i]->GetConfidence();
return NS_OK;
}
}
@ -305,7 +321,7 @@ void nsUniversalDetector::DataEnd()
* when finding them.
*/
mDone = PR_TRUE;
Report(mDetectedCharset, NULL, 1.0);
Report(mDetectedCharset, mDetectedLanguage, mDetectedConfidence);
return;
}

View File

@ -81,6 +81,8 @@ protected:
PRBool mGotData;
char mLastChar;
const char * mDetectedCharset;
const char * mDetectedLanguage;
float mDetectedConfidence;
PRInt32 mBestGuess;
PRUint32 mLanguageFilter;