src: nsEscCharSetProber also returns the correct language.

nsEscCharSetProber will still only return a single candidate, because
the encoding is detected by a state machine, not by language statistics.
It now also returns the language attached to the detected encoding.
This commit is contained in:
Jehan 2021-03-17 17:15:56 +01:00
parent 6138d9e0f0
commit 2a16ab2310
6 changed files with 21 additions and 6 deletions

View File

@ -77,7 +77,7 @@ public:
} }
PRUint32 GetCurrentCharLen(void) {return mCurrentCharLen;} PRUint32 GetCurrentCharLen(void) {return mCurrentCharLen;}
void Reset(void) {mCurrentState = eStart;} void Reset(void) {mCurrentState = eStart;}
const char * GetCodingStateMachine() {return mModel->name;} const SMModel* GetCodingStateMachine() {return mModel;}
protected: protected:
PRUint32 mCurrentState; PRUint32 mCurrentState;

View File

@ -55,6 +55,7 @@ nsEscCharSetProber::nsEscCharSetProber(PRUint32 aLanguageFilter)
mActiveSM = NUM_OF_ESC_CHARSETS; mActiveSM = NUM_OF_ESC_CHARSETS;
mState = eDetecting; mState = eDetecting;
mDetectedCharset = nsnull; mDetectedCharset = nsnull;
mDetectedLang = nsnull;
} }
nsEscCharSetProber::~nsEscCharSetProber(void) nsEscCharSetProber::~nsEscCharSetProber(void)
@ -71,8 +72,10 @@ void nsEscCharSetProber::Reset(void)
mCodingSM[i]->Reset(); mCodingSM[i]->Reset();
mActiveSM = NUM_OF_ESC_CHARSETS; mActiveSM = NUM_OF_ESC_CHARSETS;
mDetectedCharset = nsnull; mDetectedCharset = nsnull;
mDetectedLang = nsnull;
} }
#include <cstdio>
nsProbingState nsEscCharSetProber::HandleData(const char* aBuf, PRUint32 aLen, nsProbingState nsEscCharSetProber::HandleData(const char* aBuf, PRUint32 aLen,
int** codePointBuffer, int** codePointBuffer,
int* codePointBufferIdx) int* codePointBufferIdx)
@ -90,8 +93,19 @@ nsProbingState nsEscCharSetProber::HandleData(const char* aBuf, PRUint32 aLen,
codingState = mCodingSM[j]->NextState(aBuf[i]); codingState = mCodingSM[j]->NextState(aBuf[i]);
if (codingState == eItsMe) if (codingState == eItsMe)
{ {
const SMModel *model = mCodingSM[j]->GetCodingStateMachine();
mState = eFoundIt; mState = eFoundIt;
mDetectedCharset = mCodingSM[j]->GetCodingStateMachine(); mDetectedCharset = model->name;
if (model == &HZSMModel ||
model == &ISO2022CNSMModel)
mDetectedLang = "zh";
else if (model == &ISO2022JPSMModel)
mDetectedLang = "ja";
else if (model == &ISO2022KRSMModel)
mDetectedLang = "ko";
return mState; return mState;
} }
} }
@ -100,4 +114,3 @@ nsProbingState nsEscCharSetProber::HandleData(const char* aBuf, PRUint32 aLen,
return mState; return mState;
} }

View File

@ -54,7 +54,7 @@ public:
int* codePointBufferIdx); int* codePointBufferIdx);
virtual int GetCandidates() { return 1; } virtual int GetCandidates() { return 1; }
const char* GetCharSetName(int) {return mDetectedCharset;} const char* GetCharSetName(int) {return mDetectedCharset;}
const char* GetLanguage(int) {return NULL;} const char* GetLanguage(int) {return mDetectedLang;}
nsProbingState GetState(void) {return mState;} nsProbingState GetState(void) {return mState;}
void Reset(void); void Reset(void);
float GetConfidence(int){return (float)0.99;} float GetConfidence(int){return (float)0.99;}
@ -67,6 +67,7 @@ protected:
PRUint32 mActiveSM; PRUint32 mActiveSM;
nsProbingState mState; nsProbingState mState;
const char * mDetectedCharset; const char * mDetectedCharset;
const char * mDetectedLang;
}; };
#endif /* nsEscCharSetProber_h__ */ #endif /* nsEscCharSetProber_h__ */

View File

@ -264,4 +264,3 @@ const SMModel ISO2022KRSMModel = {
ISO2022KRCharLenTable, ISO2022KRCharLenTable,
"ISO-2022-KR", "ISO-2022-KR",
}; };

View File

@ -248,6 +248,7 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
{ {
shortcutCharset = mEscCharSetProber->GetCharSetName(0); shortcutCharset = mEscCharSetProber->GetCharSetName(0);
shortcutConfidence = mEscCharSetProber->GetConfidence(0); shortcutConfidence = mEscCharSetProber->GetConfidence(0);
shortcutLanguage = mEscCharSetProber->GetLanguage(0);
mDone = PR_TRUE; mDone = PR_TRUE;
} }
break; break;
@ -313,7 +314,7 @@ void nsUniversalDetector::DataEnd()
* when finding them. * when finding them.
*/ */
mDone = PR_TRUE; mDone = PR_TRUE;
Report(shortcutCharset, NULL, shortcutConfidence); Report(shortcutCharset, shortcutLanguage, shortcutConfidence);
return; return;
} }

View File

@ -81,6 +81,7 @@ protected:
PRBool mGotData; PRBool mGotData;
char mLastChar; char mLastChar;
const char * shortcutCharset; const char * shortcutCharset;
const char * shortcutLanguage;
float shortcutConfidence; float shortcutConfidence;
PRInt32 mBestGuess; PRInt32 mBestGuess;