src: nsEscCharSetProber also returns the correct language.

nsEscCharSetProber will still only return a single candidate, because
the encoding is detected by a state machine, not by language statistics.
It now also returns the language associated with the detected encoding.
This commit is contained in:
Jehan 2021-03-17 17:15:56 +01:00
parent 6138d9e0f0
commit 2a16ab2310
6 changed files with 21 additions and 6 deletions

View File

@ -77,7 +77,7 @@ public:
}
PRUint32 GetCurrentCharLen(void) {return mCurrentCharLen;}
void Reset(void) {mCurrentState = eStart;}
const char * GetCodingStateMachine() {return mModel->name;}
const SMModel* GetCodingStateMachine() {return mModel;}
protected:
PRUint32 mCurrentState;

View File

@ -55,6 +55,7 @@ nsEscCharSetProber::nsEscCharSetProber(PRUint32 aLanguageFilter)
mActiveSM = NUM_OF_ESC_CHARSETS;
mState = eDetecting;
mDetectedCharset = nsnull;
mDetectedLang = nsnull;
}
nsEscCharSetProber::~nsEscCharSetProber(void)
@ -71,8 +72,10 @@ void nsEscCharSetProber::Reset(void)
mCodingSM[i]->Reset();
mActiveSM = NUM_OF_ESC_CHARSETS;
mDetectedCharset = nsnull;
mDetectedLang = nsnull;
}
#include <cstdio>
nsProbingState nsEscCharSetProber::HandleData(const char* aBuf, PRUint32 aLen,
int** codePointBuffer,
int* codePointBufferIdx)
@ -90,8 +93,19 @@ nsProbingState nsEscCharSetProber::HandleData(const char* aBuf, PRUint32 aLen,
codingState = mCodingSM[j]->NextState(aBuf[i]);
if (codingState == eItsMe)
{
const SMModel *model = mCodingSM[j]->GetCodingStateMachine();
mState = eFoundIt;
mDetectedCharset = mCodingSM[j]->GetCodingStateMachine();
mDetectedCharset = model->name;
if (model == &HZSMModel ||
model == &ISO2022CNSMModel)
mDetectedLang = "zh";
else if (model == &ISO2022JPSMModel)
mDetectedLang = "ja";
else if (model == &ISO2022KRSMModel)
mDetectedLang = "ko";
return mState;
}
}
@ -100,4 +114,3 @@ nsProbingState nsEscCharSetProber::HandleData(const char* aBuf, PRUint32 aLen,
return mState;
}

View File

@ -54,7 +54,7 @@ public:
int* codePointBufferIdx);
virtual int GetCandidates() { return 1; }
const char* GetCharSetName(int) {return mDetectedCharset;}
const char* GetLanguage(int) {return NULL;}
const char* GetLanguage(int) {return mDetectedLang;}
nsProbingState GetState(void) {return mState;}
void Reset(void);
float GetConfidence(int){return (float)0.99;}
@ -67,6 +67,7 @@ protected:
PRUint32 mActiveSM;
nsProbingState mState;
const char * mDetectedCharset;
const char * mDetectedLang;
};
#endif /* nsEscCharSetProber_h__ */

View File

@ -264,4 +264,3 @@ const SMModel ISO2022KRSMModel = {
ISO2022KRCharLenTable,
"ISO-2022-KR",
};

View File

@ -248,6 +248,7 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
{
shortcutCharset = mEscCharSetProber->GetCharSetName(0);
shortcutConfidence = mEscCharSetProber->GetConfidence(0);
shortcutLanguage = mEscCharSetProber->GetLanguage(0);
mDone = PR_TRUE;
}
break;
@ -313,7 +314,7 @@ void nsUniversalDetector::DataEnd()
* when finding them.
*/
mDone = PR_TRUE;
Report(shortcutCharset, NULL, shortcutConfidence);
Report(shortcutCharset, shortcutLanguage, shortcutConfidence);
return;
}

View File

@ -81,6 +81,7 @@ protected:
PRBool mGotData;
char mLastChar;
const char * shortcutCharset;
const char * shortcutLanguage;
float shortcutConfidence;
PRInt32 mBestGuess;