mirror of
https://gitlab.freedesktop.org/uchardet/uchardet.git
synced 2025-12-06 16:56:40 +08:00
src: nsEscCharSetProber also returns the correct language.
nsEscCharSetProber still returns only a single candidate, because the encoding is detected by a state machine rather than by language statistics. It now also returns the language associated with the detected encoding.
This commit is contained in:
parent
6138d9e0f0
commit
2a16ab2310
@ -77,7 +77,7 @@ public:
|
||||
}
|
||||
/* Returns the value currently stored in mCurrentCharLen. */
PRUint32 GetCurrentCharLen(void) {return mCurrentCharLen;}
|
||||
/* Puts the state machine back into its initial state (eStart). */
void Reset(void) {mCurrentState = eStart;}
|
||||
const char * GetCodingStateMachine() {return mModel->name;}
|
||||
/* Returns the SMModel held in mModel. Handing back the model itself lets a
 * caller both read its name and compare it by address against the known
 * models, as nsEscCharSetProber::HandleData does to pick a language. */
const SMModel* GetCodingStateMachine() {return mModel;}
|
||||
|
||||
protected:
|
||||
PRUint32 mCurrentState;
|
||||
|
||||
@ -55,6 +55,7 @@ nsEscCharSetProber::nsEscCharSetProber(PRUint32 aLanguageFilter)
|
||||
mActiveSM = NUM_OF_ESC_CHARSETS;
|
||||
mState = eDetecting;
|
||||
mDetectedCharset = nsnull;
|
||||
mDetectedLang = nsnull;
|
||||
}
|
||||
|
||||
nsEscCharSetProber::~nsEscCharSetProber(void)
|
||||
@ -71,8 +72,10 @@ void nsEscCharSetProber::Reset(void)
|
||||
mCodingSM[i]->Reset();
|
||||
mActiveSM = NUM_OF_ESC_CHARSETS;
|
||||
mDetectedCharset = nsnull;
|
||||
mDetectedLang = nsnull;
|
||||
}
|
||||
|
||||
#include <cstdio>
|
||||
nsProbingState nsEscCharSetProber::HandleData(const char* aBuf, PRUint32 aLen,
|
||||
int** codePointBuffer,
|
||||
int* codePointBufferIdx)
|
||||
@ -90,8 +93,19 @@ nsProbingState nsEscCharSetProber::HandleData(const char* aBuf, PRUint32 aLen,
|
||||
codingState = mCodingSM[j]->NextState(aBuf[i]);
|
||||
if (codingState == eItsMe)
|
||||
{
|
||||
const SMModel *model = mCodingSM[j]->GetCodingStateMachine();
|
||||
|
||||
mState = eFoundIt;
|
||||
mDetectedCharset = mCodingSM[j]->GetCodingStateMachine();
|
||||
mDetectedCharset = model->name;
|
||||
|
||||
if (model == &HZSMModel ||
|
||||
model == &ISO2022CNSMModel)
|
||||
mDetectedLang = "zh";
|
||||
else if (model == &ISO2022JPSMModel)
|
||||
mDetectedLang = "ja";
|
||||
else if (model == &ISO2022KRSMModel)
|
||||
mDetectedLang = "ko";
|
||||
|
||||
return mState;
|
||||
}
|
||||
}
|
||||
@ -100,4 +114,3 @@ nsProbingState nsEscCharSetProber::HandleData(const char* aBuf, PRUint32 aLen,
|
||||
|
||||
return mState;
|
||||
}
|
||||
|
||||
|
||||
@ -54,7 +54,7 @@ public:
|
||||
int* codePointBufferIdx);
|
||||
/* This prober always reports exactly one candidate. */
virtual int GetCandidates() { return 1; }
|
||||
/* Returns mDetectedCharset; the candidate index argument is ignored.
 * The value is nsnull after construction or Reset() and is set to the
 * matching model's name when HandleData() reaches eItsMe. */
const char* GetCharSetName(int) {return mDetectedCharset;}
|
||||
const char* GetLanguage(int) {return NULL;}
|
||||
/* Returns mDetectedLang; the candidate index argument is ignored.
 * The value is nsnull after construction or Reset(); HandleData() sets it
 * to "zh", "ja" or "ko" depending on which state machine model matched. */
const char* GetLanguage(int) {return mDetectedLang;}
|
||||
/* Returns the prober's current probing state (mState). */
nsProbingState GetState(void) {return mState;}
|
||||
void Reset(void);
|
||||
/* Returns a fixed confidence of 0.99; the candidate index argument is
 * ignored. */
float GetConfidence(int){return (float)0.99;}
|
||||
@ -67,6 +67,7 @@ protected:
|
||||
PRUint32 mActiveSM;
|
||||
nsProbingState mState;
|
||||
const char * mDetectedCharset;
|
||||
const char * mDetectedLang;
|
||||
};
|
||||
|
||||
#endif /* nsEscCharSetProber_h__ */
|
||||
|
||||
@ -264,4 +264,3 @@ const SMModel ISO2022KRSMModel = {
|
||||
ISO2022KRCharLenTable,
|
||||
"ISO-2022-KR",
|
||||
};
|
||||
|
||||
|
||||
@ -248,6 +248,7 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
|
||||
{
|
||||
shortcutCharset = mEscCharSetProber->GetCharSetName(0);
|
||||
shortcutConfidence = mEscCharSetProber->GetConfidence(0);
|
||||
shortcutLanguage = mEscCharSetProber->GetLanguage(0);
|
||||
mDone = PR_TRUE;
|
||||
}
|
||||
break;
|
||||
@ -313,7 +314,7 @@ void nsUniversalDetector::DataEnd()
|
||||
* when finding them.
|
||||
*/
|
||||
mDone = PR_TRUE;
|
||||
Report(shortcutCharset, NULL, shortcutConfidence);
|
||||
Report(shortcutCharset, shortcutLanguage, shortcutConfidence);
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
@ -81,6 +81,7 @@ protected:
|
||||
PRBool mGotData;
|
||||
char mLastChar;
|
||||
const char * shortcutCharset;
|
||||
const char * shortcutLanguage;
|
||||
float shortcutConfidence;
|
||||
|
||||
PRInt32 mBestGuess;
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user