diff --git a/src/nsBig5Prober.cpp b/src/nsBig5Prober.cpp index 3e47d6e..46aea0f 100644 --- a/src/nsBig5Prober.cpp +++ b/src/nsBig5Prober.cpp @@ -44,7 +44,9 @@ void nsBig5Prober::Reset(void) mDistributionAnalyser.Reset(mIsPreferredLanguage); } -nsProbingState nsBig5Prober::HandleData(const char* aBuf, PRUint32 aLen) +nsProbingState nsBig5Prober::HandleData(const char* aBuf, PRUint32 aLen, + int** codePointBuffer, + int* codePointBufferIdx) { PRUint32 codingState; diff --git a/src/nsBig5Prober.h b/src/nsBig5Prober.h index 4b5d9fa..61726af 100644 --- a/src/nsBig5Prober.h +++ b/src/nsBig5Prober.h @@ -49,7 +49,9 @@ public: {mCodingSM = new nsCodingStateMachine(&Big5SMModel); Reset();} virtual ~nsBig5Prober(void){delete mCodingSM;} - nsProbingState HandleData(const char* aBuf, PRUint32 aLen); + nsProbingState HandleData(const char* aBuf, PRUint32 aLen, + int** codePointBuffer, + int* codePointBufferIdx); const char* GetCharSetName() {return "BIG5";} const char* GetLanguage() {return "zh";} nsProbingState GetState(void) {return mState;} diff --git a/src/nsCharSetProber.h b/src/nsCharSetProber.h index c13afb8..1aa7dbc 100644 --- a/src/nsCharSetProber.h +++ b/src/nsCharSetProber.h @@ -55,7 +55,10 @@ public: virtual ~nsCharSetProber() {} virtual const char* GetCharSetName() = 0; virtual const char* GetLanguage() = 0; - virtual nsProbingState HandleData(const char* aBuf, PRUint32 aLen) = 0; + virtual nsProbingState HandleData(const char* aBuf, PRUint32 aLen, + int** codePointBuffer, + int* codePointBufferIdx) = 0; + virtual bool DecodeToUnicode() {return false;} virtual nsProbingState GetState(void) = 0; virtual void Reset(void) = 0; virtual float GetConfidence(void) = 0; diff --git a/src/nsEUCJPProber.cpp b/src/nsEUCJPProber.cpp index f84d154..12c82a1 100644 --- a/src/nsEUCJPProber.cpp +++ b/src/nsEUCJPProber.cpp @@ -50,7 +50,9 @@ void nsEUCJPProber::Reset(void) mDistributionAnalyser.Reset(mIsPreferredLanguage); } -nsProbingState nsEUCJPProber::HandleData(const char* aBuf, 
PRUint32 aLen) +nsProbingState nsEUCJPProber::HandleData(const char* aBuf, PRUint32 aLen, + int** codePointBuffer, + int* codePointBufferIdx) { PRUint32 codingState; diff --git a/src/nsEUCJPProber.h b/src/nsEUCJPProber.h index a74c779..16fe558 100644 --- a/src/nsEUCJPProber.h +++ b/src/nsEUCJPProber.h @@ -55,7 +55,9 @@ public: {mCodingSM = new nsCodingStateMachine(&EUCJPSMModel); Reset();} virtual ~nsEUCJPProber(void){delete mCodingSM;} - nsProbingState HandleData(const char* aBuf, PRUint32 aLen); + nsProbingState HandleData(const char* aBuf, PRUint32 aLen, + int** codePointBuffer, + int* codePointBufferIdx); const char* GetCharSetName() {return "EUC-JP";} const char* GetLanguage() {return "ja";} nsProbingState GetState(void) {return mState;} diff --git a/src/nsEUCKRProber.cpp b/src/nsEUCKRProber.cpp index 6aae8ae..eff70ef 100644 --- a/src/nsEUCKRProber.cpp +++ b/src/nsEUCKRProber.cpp @@ -45,7 +45,9 @@ void nsEUCKRProber::Reset(void) //mContextAnalyser.Reset(); } -nsProbingState nsEUCKRProber::HandleData(const char* aBuf, PRUint32 aLen) +nsProbingState nsEUCKRProber::HandleData(const char* aBuf, PRUint32 aLen, + int** codePointBuffer, + int* codePointBufferIdx) { PRUint32 codingState; diff --git a/src/nsEUCKRProber.h b/src/nsEUCKRProber.h index 8ce9eb2..d41234f 100644 --- a/src/nsEUCKRProber.h +++ b/src/nsEUCKRProber.h @@ -50,7 +50,9 @@ public: Reset(); } virtual ~nsEUCKRProber(void){delete mCodingSM;} - nsProbingState HandleData(const char* aBuf, PRUint32 aLen); + nsProbingState HandleData(const char* aBuf, PRUint32 aLen, + int** codePointBuffer, + int* codePointBufferIdx); /* "Unified Hangul Code", also called "CP949" or "Windows-949" is a * superset of EUC-KR. 
Though not fully ok to return UHC here (a * separate prober would be better), it is acceptable, since many diff --git a/src/nsEUCTWProber.cpp b/src/nsEUCTWProber.cpp index 7e61ea1..a11b81a 100644 --- a/src/nsEUCTWProber.cpp +++ b/src/nsEUCTWProber.cpp @@ -45,7 +45,9 @@ void nsEUCTWProber::Reset(void) //mContextAnalyser.Reset(); } -nsProbingState nsEUCTWProber::HandleData(const char* aBuf, PRUint32 aLen) +nsProbingState nsEUCTWProber::HandleData(const char* aBuf, PRUint32 aLen, + int** codePointBuffer, + int* codePointBufferIdx) { PRUint32 codingState; diff --git a/src/nsEUCTWProber.h b/src/nsEUCTWProber.h index 6701027..7e7faf3 100644 --- a/src/nsEUCTWProber.h +++ b/src/nsEUCTWProber.h @@ -49,7 +49,9 @@ public: {mCodingSM = new nsCodingStateMachine(&EUCTWSMModel); Reset();} virtual ~nsEUCTWProber(void){delete mCodingSM;} - nsProbingState HandleData(const char* aBuf, PRUint32 aLen); + nsProbingState HandleData(const char* aBuf, PRUint32 aLen, + int** codePointBuffer, + int* codePointBufferIdx); const char* GetCharSetName() {return "EUC-TW";} const char* GetLanguage() {return "zh";} nsProbingState GetState(void) {return mState;} diff --git a/src/nsEscCharsetProber.cpp b/src/nsEscCharsetProber.cpp index d093ee4..4c31105 100644 --- a/src/nsEscCharsetProber.cpp +++ b/src/nsEscCharsetProber.cpp @@ -73,7 +73,9 @@ void nsEscCharSetProber::Reset(void) mDetectedCharset = nsnull; } -nsProbingState nsEscCharSetProber::HandleData(const char* aBuf, PRUint32 aLen) +nsProbingState nsEscCharSetProber::HandleData(const char* aBuf, PRUint32 aLen, + int** codePointBuffer, + int* codePointBufferIdx) { PRUint32 codingState; PRInt32 j; diff --git a/src/nsEscCharsetProber.h b/src/nsEscCharsetProber.h index eab3080..116153e 100644 --- a/src/nsEscCharsetProber.h +++ b/src/nsEscCharsetProber.h @@ -49,7 +49,9 @@ class nsEscCharSetProber: public nsCharSetProber { public: nsEscCharSetProber(PRUint32 aLanguageFilter); virtual ~nsEscCharSetProber(void); - nsProbingState HandleData(const char* 
aBuf, PRUint32 aLen); + nsProbingState HandleData(const char* aBuf, PRUint32 aLen, + int** codePointBuffer, + int* codePointBufferIdx); const char* GetCharSetName() {return mDetectedCharset;} const char* GetLanguage() {return NULL;} nsProbingState GetState(void) {return mState;} diff --git a/src/nsGB2312Prober.cpp b/src/nsGB2312Prober.cpp index eac0762..193358d 100644 --- a/src/nsGB2312Prober.cpp +++ b/src/nsGB2312Prober.cpp @@ -50,7 +50,9 @@ void nsGB18030Prober::Reset(void) //mContextAnalyser.Reset(); } -nsProbingState nsGB18030Prober::HandleData(const char* aBuf, PRUint32 aLen) +nsProbingState nsGB18030Prober::HandleData(const char* aBuf, PRUint32 aLen, + int** codePointBuffer, + int* codePointBufferIdx) { PRUint32 codingState; diff --git a/src/nsGB2312Prober.h b/src/nsGB2312Prober.h index a35e585..5290bd1 100644 --- a/src/nsGB2312Prober.h +++ b/src/nsGB2312Prober.h @@ -51,7 +51,9 @@ public: {mCodingSM = new nsCodingStateMachine(&GB18030SMModel); Reset();} virtual ~nsGB18030Prober(void){delete mCodingSM;} - nsProbingState HandleData(const char* aBuf, PRUint32 aLen); + nsProbingState HandleData(const char* aBuf, PRUint32 aLen, + int** codePointBuffer, + int* codePointBufferIdx); const char* GetCharSetName() {return "GB18030";} const char* GetLanguage() {return "zh";} nsProbingState GetState(void) {return mState;} diff --git a/src/nsHebrewProber.cpp b/src/nsHebrewProber.cpp index c503617..ac5a9af 100644 --- a/src/nsHebrewProber.cpp +++ b/src/nsHebrewProber.cpp @@ -106,7 +106,9 @@ PRBool nsHebrewProber::isNonFinal(char c) * The input buffer should not contain any white spaces that are not (' ') * or any low-ascii punctuation marks. */ -nsProbingState nsHebrewProber::HandleData(const char* aBuf, PRUint32 aLen) +nsProbingState nsHebrewProber::HandleData(const char* aBuf, PRUint32 aLen, + int** codePointBuffer, + int* codePointBufferIdx) { // Both model probers say it's not them. No reason to continue. 
if (GetState() == eNotMe) diff --git a/src/nsHebrewProber.h b/src/nsHebrewProber.h index 8442aab..421a7aa 100644 --- a/src/nsHebrewProber.h +++ b/src/nsHebrewProber.h @@ -48,7 +48,9 @@ public: nsHebrewProber(void) :mLogicalProb(0), mVisualProb(0) { Reset(); } virtual ~nsHebrewProber(void) {} - virtual nsProbingState HandleData(const char* aBuf, PRUint32 aLen); + virtual nsProbingState HandleData(const char* aBuf, PRUint32 aLen, + int** codePointBuffer, + int* codePointBufferIdx); virtual const char *GetCharSetName(); virtual const char *GetLanguage(void) { return "he"; } virtual void Reset(void); diff --git a/src/nsLatin1Prober.cpp b/src/nsLatin1Prober.cpp index 7694ef7..cffb391 100644 --- a/src/nsLatin1Prober.cpp +++ b/src/nsLatin1Prober.cpp @@ -114,7 +114,9 @@ void nsLatin1Prober::Reset(void) } -nsProbingState nsLatin1Prober::HandleData(const char* aBuf, PRUint32 aLen) +nsProbingState nsLatin1Prober::HandleData(const char* aBuf, PRUint32 aLen, + int** codePointBuffer, + int* codePointBufferIdx) { char *newBuf1 = 0; PRUint32 newLen1 = 0; diff --git a/src/nsLatin1Prober.h b/src/nsLatin1Prober.h index bd3a9d5..77ff331 100644 --- a/src/nsLatin1Prober.h +++ b/src/nsLatin1Prober.h @@ -49,7 +49,9 @@ class nsLatin1Prober: public nsCharSetProber { public: nsLatin1Prober(void){Reset();} virtual ~nsLatin1Prober(void){} - nsProbingState HandleData(const char* aBuf, PRUint32 aLen); + nsProbingState HandleData(const char* aBuf, PRUint32 aLen, + int** codePointBuffer, + int* codePointBufferIdx); const char* GetCharSetName() {return "WINDOWS-1252";} const char* GetLanguage() {return NULL;} nsProbingState GetState(void) {return mState;} diff --git a/src/nsMBCSGroupProber.cpp b/src/nsMBCSGroupProber.cpp index 68c896a..544a8dd 100644 --- a/src/nsMBCSGroupProber.cpp +++ b/src/nsMBCSGroupProber.cpp @@ -58,7 +58,12 @@ const char *ProberName[] = nsMBCSGroupProber::nsMBCSGroupProber(PRUint32 aLanguageFilter) { for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++) - mProbers[i] = nsnull; + { + 
mProbers[i] = nsnull; + codePointBuffer[i] = nsnull; + codePointBufferSize[i] = 0; + codePointBufferIdx[i] = 0; + } mProbers[0] = new nsUTF8Prober(); if (aLanguageFilter & NS_FILTER_JAPANESE) @@ -75,6 +80,24 @@ nsMBCSGroupProber::nsMBCSGroupProber(PRUint32 aLanguageFilter) mProbers[5] = new nsBig5Prober(aLanguageFilter == NS_FILTER_CHINESE_TRADITIONAL); mProbers[6] = new nsEUCTWProber(aLanguageFilter == NS_FILTER_CHINESE_TRADITIONAL); } + + for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++) + { + if (mProbers[i] && mProbers[i]->DecodeToUnicode()) /* slot stays nsnull when the language filter skips this prober */ + { + langDetectors[i][0] = new nsLanguageDetector(&FrenchModel); + langDetectors[i][1] = new nsLanguageDetector(&ItalianModel); + langDetectors[i][2] = new nsLanguageDetector(&DanishModel); + langDetectors[i][3] = new nsLanguageDetector(&GermanModel); + langDetectors[i][4] = new nsLanguageDetector(&ArabicModel); + langDetectors[i][5] = new nsLanguageDetector(&SpanishModel); + } + else + { + for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++) + langDetectors[i][j] = nsnull; + } + } Reset(); } @@ -83,6 +106,13 @@ nsMBCSGroupProber::~nsMBCSGroupProber() for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++) { delete mProbers[i]; + + if (codePointBufferSize[i] != 0) + delete [] codePointBuffer[i]; + + for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++) + if (langDetectors[i][j]) + delete langDetectors[i][j]; } } @@ -99,17 +129,35 @@ const char* nsMBCSGroupProber::GetCharSetName() const char* nsMBCSGroupProber::GetLanguage(void) { + const char* maxLang = NULL; + int maxLangIdx = -1; + float maxConfidence = 0.0; + if (mBestGuess == -1) - { - GetConfidence(); - } - if (mBestGuess == -1) - return NULL; + return NULL; else - return mProbers[mBestGuess]->GetLanguage(); + maxLang = mProbers[mBestGuess]->GetLanguage(); + + if (maxLang == NULL && mProbers[mBestGuess]->DecodeToUnicode()) + { + for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++) + { + float conf = langDetectors[mBestGuess][j]->GetConfidence(); + + if (conf > maxConfidence) + { + maxLangIdx = j; + maxConfidence
= conf; + } + } + if (maxLangIdx != -1) + maxLang = langDetectors[mBestGuess][maxLangIdx]->GetLanguage(); + } + + return maxLang; } -void nsMBCSGroupProber::Reset(void) +void nsMBCSGroupProber::Reset(void) { mActiveNum = 0; for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++) @@ -119,6 +167,13 @@ void nsMBCSGroupProber::Reset(void) mProbers[i]->Reset(); mIsActive[i] = PR_TRUE; ++mActiveNum; + + if (codePointBufferSize[i] == 0 && mProbers[i]->DecodeToUnicode()) + { + codePointBufferSize[i] = 1024; + codePointBuffer[i] = new int[codePointBufferSize[i]]; + } + codePointBufferIdx[i] = 0; } else mIsActive[i] = PR_FALSE; @@ -128,7 +183,9 @@ void nsMBCSGroupProber::Reset(void) mKeepNext = 0; } -nsProbingState nsMBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen) +nsProbingState nsMBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen, + int** cpBuffer, + int* cpBufferIdx) { nsProbingState st; PRUint32 start = 0; @@ -151,7 +208,20 @@ nsProbingState nsMBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen) { if (!mIsActive[i]) continue; - st = mProbers[i]->HandleData(aBuf + start, pos + 1 - start); + + if (codePointBuffer[i]) + st = mProbers[i]->HandleData(aBuf + start, pos + 1 - start, + &(codePointBuffer[i]), &(codePointBufferIdx[i])); + else + st = mProbers[i]->HandleData(aBuf + start, pos + 1 - start, NULL, NULL); + + if (codePointBufferIdx[i] > 0 && codePointBuffer[i]) + { + for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++) + langDetectors[i][j]->HandleData(codePointBuffer[i], codePointBufferIdx[i]); + codePointBufferIdx[i] = 0; + } + if (st == eFoundIt) { mBestGuess = i; @@ -161,6 +231,12 @@ nsProbingState nsMBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen) } } } + else + { + for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++) + if (codePointBuffer[i] && codePointBufferIdx[i] < codePointBufferSize[i]) /* never write past the allocated buffer; probers skipped as inactive are not flushed */ + codePointBuffer[i][(codePointBufferIdx[i])++] = aBuf[pos]; + } } if (keepNext) { @@ -168,7 +244,20 @@ nsProbingState nsMBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen) { if (!mIsActive[i]) 
continue; - st = mProbers[i]->HandleData(aBuf + start, aLen - start); + + if (codePointBuffer[i]) + st = mProbers[i]->HandleData(aBuf + start, aLen - start, + &(codePointBuffer[i]), &(codePointBufferIdx[i])); + else + st = mProbers[i]->HandleData(aBuf + start, aLen - start, NULL, NULL); + + if (codePointBufferIdx[i] > 0 && codePointBuffer[i]) + { + for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++) + langDetectors[i][j]->HandleData(codePointBuffer[i], codePointBufferIdx[i]); + codePointBufferIdx[i] = 0; + } + if (st == eFoundIt) { mBestGuess = i; diff --git a/src/nsMBCSGroupProber.h b/src/nsMBCSGroupProber.h index 0e55221..ee6669e 100644 --- a/src/nsMBCSGroupProber.h +++ b/src/nsMBCSGroupProber.h @@ -48,12 +48,15 @@ #include "nsEUCTWProber.h" #define NUM_OF_PROBERS 7 +#define NUM_OF_LANGUAGES 6 class nsMBCSGroupProber: public nsCharSetProber { public: nsMBCSGroupProber(PRUint32 aLanguageFilter); virtual ~nsMBCSGroupProber(); - nsProbingState HandleData(const char* aBuf, PRUint32 aLen); + nsProbingState HandleData(const char* aBuf, PRUint32 aLen, + int** codePointBuffer, + int* codePointBufferIdx); const char* GetCharSetName(); const char* GetLanguage(); nsProbingState GetState(void) {return mState;} @@ -75,6 +78,12 @@ protected: PRInt32 mBestGuess; PRUint32 mActiveNum; PRUint32 mKeepNext; + + int *codePointBuffer[NUM_OF_PROBERS]; + int codePointBufferSize[NUM_OF_PROBERS]; + int codePointBufferIdx[NUM_OF_PROBERS]; + + nsLanguageDetector *langDetectors[NUM_OF_PROBERS][NUM_OF_LANGUAGES]; }; #endif /* nsMBCSGroupProber_h__ */ diff --git a/src/nsSBCSGroupProber.cpp b/src/nsSBCSGroupProber.cpp index ee95cad..92d3ff1 100644 --- a/src/nsSBCSGroupProber.cpp +++ b/src/nsSBCSGroupProber.cpp @@ -253,7 +253,9 @@ void nsSBCSGroupProber::Reset(void) } -nsProbingState nsSBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen) +nsProbingState nsSBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen, + int** codePointBuffer, + int* codePointBufferIdx) { nsProbingState st; 
PRUint32 i; @@ -276,7 +278,7 @@ nsProbingState nsSBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen) { if (!mIsActive[i]) continue; - st = mProbers[i]->HandleData(newBuf1, newLen1); + st = mProbers[i]->HandleData(newBuf1, newLen1, codePointBuffer, codePointBufferIdx); if (st == eFoundIt) { mBestGuess = i; diff --git a/src/nsSBCSGroupProber.h b/src/nsSBCSGroupProber.h index fb398cc..79e0bb2 100644 --- a/src/nsSBCSGroupProber.h +++ b/src/nsSBCSGroupProber.h @@ -47,7 +47,9 @@ class nsSBCSGroupProber: public nsCharSetProber { public: nsSBCSGroupProber(); virtual ~nsSBCSGroupProber(); - nsProbingState HandleData(const char* aBuf, PRUint32 aLen); + nsProbingState HandleData(const char* aBuf, PRUint32 aLen, + int** codePointBuffer, + int* codePointBufferIdx); const char* GetCharSetName(); const char* GetLanguage(); nsProbingState GetState(void) {return mState;} diff --git a/src/nsSBCharSetProber.cpp b/src/nsSBCharSetProber.cpp index 7832f11..71f6be3 100644 --- a/src/nsSBCharSetProber.cpp +++ b/src/nsSBCharSetProber.cpp @@ -38,7 +38,9 @@ #include #include "nsSBCharSetProber.h" -nsProbingState nsSingleByteCharSetProber::HandleData(const char* aBuf, PRUint32 aLen) +nsProbingState nsSingleByteCharSetProber::HandleData(const char* aBuf, PRUint32 aLen, + int** codePointBuffer, + int* codePointBufferIdx) { unsigned char order; diff --git a/src/nsSBCharSetProber.h b/src/nsSBCharSetProber.h index b0c6faa..b3a01d7 100644 --- a/src/nsSBCharSetProber.h +++ b/src/nsSBCharSetProber.h @@ -88,7 +88,9 @@ public: virtual const char* GetCharSetName(); virtual const char* GetLanguage(); - virtual nsProbingState HandleData(const char* aBuf, PRUint32 aLen); + virtual nsProbingState HandleData(const char* aBuf, PRUint32 aLen, + int** codePointBuffer, + int* codePointBufferIdx); virtual nsProbingState GetState(void) {return mState;} virtual void Reset(void); virtual float GetConfidence(void); diff --git a/src/nsSJISProber.cpp b/src/nsSJISProber.cpp index 1c354a7..82b771e 100644 --- 
a/src/nsSJISProber.cpp +++ b/src/nsSJISProber.cpp @@ -50,7 +50,9 @@ void nsSJISProber::Reset(void) mDistributionAnalyser.Reset(mIsPreferredLanguage); } -nsProbingState nsSJISProber::HandleData(const char* aBuf, PRUint32 aLen) +nsProbingState nsSJISProber::HandleData(const char* aBuf, PRUint32 aLen, + int** codePointBuffer, + int* codePointBufferIdx) { PRUint32 codingState; diff --git a/src/nsSJISProber.h b/src/nsSJISProber.h index 61e6352..d211412 100644 --- a/src/nsSJISProber.h +++ b/src/nsSJISProber.h @@ -56,7 +56,9 @@ public: {mCodingSM = new nsCodingStateMachine(&SJISSMModel); Reset();} virtual ~nsSJISProber(void){delete mCodingSM;} - nsProbingState HandleData(const char* aBuf, PRUint32 aLen); + nsProbingState HandleData(const char* aBuf, PRUint32 aLen, + int** codePointBuffer, + int* codePointBufferIdx); const char* GetCharSetName() {return "SHIFT_JIS";} const char* GetLanguage() {return "ja";} nsProbingState GetState(void) {return mState;} diff --git a/src/nsUTF8Prober.cpp b/src/nsUTF8Prober.cpp index 93677de..f6f057c 100644 --- a/src/nsUTF8Prober.cpp +++ b/src/nsUTF8Prober.cpp @@ -42,9 +42,12 @@ void nsUTF8Prober::Reset(void) mCodingSM->Reset(); mNumOfMBChar = 0; mState = eDetecting; + currentCodePoint = 0; } -nsProbingState nsUTF8Prober::HandleData(const char* aBuf, PRUint32 aLen) +nsProbingState nsUTF8Prober::HandleData(const char* aBuf, PRUint32 aLen, + int** codePointBuffer, + int* codePointBufferIdx) { PRUint32 codingState; @@ -59,7 +62,28 @@ nsProbingState nsUTF8Prober::HandleData(const char* aBuf, PRUint32 aLen) if (codingState == eStart) { if (mCodingSM->GetCurrentCharLen() >= 2) + { mNumOfMBChar++; + + currentCodePoint = ((0xff & aBuf[i]) & 0x3fu) | (currentCodePoint << 6); + if (mCodingSM->GetCurrentCharLen() == 2) + currentCodePoint &= 0x7ff; + else if (mCodingSM->GetCurrentCharLen() == 3) + currentCodePoint &= 0xffff; + else + currentCodePoint &= 0x1fffff; + } + else + { + currentCodePoint = 0xff & (char) aBuf[i]; + } + + 
(*codePointBuffer)[(*codePointBufferIdx)++] = currentCodePoint; + currentCodePoint = 0; + } + else + { + currentCodePoint = ((0xff & aBuf[i]) & 0x3fu) | (currentCodePoint << 6); } } @@ -84,4 +108,3 @@ float nsUTF8Prober::GetConfidence(void) else return (float)0.99; } - diff --git a/src/nsUTF8Prober.h b/src/nsUTF8Prober.h index a2cf4ee..180559a 100644 --- a/src/nsUTF8Prober.h +++ b/src/nsUTF8Prober.h @@ -41,6 +41,7 @@ #include #include "nsCharSetProber.h" #include "nsCodingStateMachine.h" +#include "nsLanguageDetector.h" class nsUTF8Prober: public nsCharSetProber { public: @@ -48,7 +49,9 @@ public: mCodingSM = new nsCodingStateMachine(&UTF8SMModel); Reset(); } virtual ~nsUTF8Prober(){delete mCodingSM;} - nsProbingState HandleData(const char* aBuf, PRUint32 aLen); + nsProbingState HandleData(const char* aBuf, PRUint32 aLen, + int** codePointBuffer, + int* codePointBufferIdx); const char* GetCharSetName() {return "UTF-8";} const char* GetLanguage() {return NULL;} nsProbingState GetState(void) {return mState;} @@ -56,11 +59,14 @@ public: float GetConfidence(void); void SetOpion() {} + virtual bool DecodeToUnicode() {return true;} + protected: nsCodingStateMachine* mCodingSM; nsProbingState mState; PRUint32 mNumOfMBChar; + + int currentCodePoint; }; #endif /* nsUTF8Prober_h__ */ - diff --git a/src/nsUniversalDetector.cpp b/src/nsUniversalDetector.cpp index bc9e9b2..354b253 100644 --- a/src/nsUniversalDetector.cpp +++ b/src/nsUniversalDetector.cpp @@ -55,6 +55,8 @@ nsUniversalDetector::nsUniversalDetector(PRUint32 aLanguageFilter) mStart = PR_TRUE; mDetectedCharset = nsnull; + mDetectedLanguage = nsnull; + mDetectedConfidence = 0.0; mGotData = PR_FALSE; mInputState = ePureAscii; mLastChar = '\0'; @@ -83,6 +85,8 @@ nsUniversalDetector::Reset() mStart = PR_TRUE; mDetectedCharset = nsnull; + mDetectedLanguage = nsnull; + mDetectedConfidence = 0.0; mGotData = PR_FALSE; mInputState = ePureAscii; mLastChar = '\0'; @@ -118,13 +122,19 @@ nsresult 
nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen) { case '\xEF': if (('\xBB' == aBuf[1]) && ('\xBF' == aBuf[2])) + { /* EF BB BF: UTF-8 encoded BOM. */ mDetectedCharset = "UTF-8"; + mDetectedConfidence = 0.99; + } break; case '\xFE': if ('\xFF' == aBuf[1]) + { /* FE FF: UTF-16, big endian BOM. */ mDetectedCharset = "UTF-16"; + mDetectedConfidence = 0.99; + } break; case '\xFF': if ('\xFE' == aBuf[1]) @@ -135,11 +145,13 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen) { /* FF FE 00 00: UTF-32 (LE). */ mDetectedCharset = "UTF-32"; + mDetectedConfidence = 0.99; } else { /* FF FE: UTF-16, little endian BOM. */ mDetectedCharset = "UTF-16"; + mDetectedConfidence = 0.99; } } break; @@ -151,6 +163,7 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen) { /* 00 00 FE FF: UTF-32 (BE). */ mDetectedCharset = "UTF-32"; + mDetectedConfidence = 0.99; } break; } @@ -236,11 +249,12 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen) if (nsnull == mEscCharSetProber) return NS_ERROR_OUT_OF_MEMORY; } - st = mEscCharSetProber->HandleData(aBuf, aLen); + st = mEscCharSetProber->HandleData(aBuf, aLen, NULL, NULL); if (st == eFoundIt) { mDone = PR_TRUE; mDetectedCharset = mEscCharSetProber->GetCharSetName(); + mDetectedConfidence = mEscCharSetProber->GetConfidence(); } break; case eHighbyte: @@ -248,11 +262,13 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen) { if (mCharSetProbers[i]) { - st = mCharSetProbers[i]->HandleData(aBuf, aLen); + st = mCharSetProbers[i]->HandleData(aBuf, aLen, NULL, NULL); if (st == eFoundIt) { mDone = PR_TRUE; mDetectedCharset = mCharSetProbers[i]->GetCharSetName(); + mDetectedLanguage = mCharSetProbers[i]->GetLanguage(); + mDetectedConfidence = mCharSetProbers[i]->GetConfidence(); return NS_OK; } } @@ -305,7 +321,7 @@ void nsUniversalDetector::DataEnd() * when finding them. 
*/ mDone = PR_TRUE; - Report(mDetectedCharset, NULL, 1.0); + Report(mDetectedCharset, mDetectedLanguage, mDetectedConfidence); return; } diff --git a/src/nsUniversalDetector.h b/src/nsUniversalDetector.h index 702a9fe..521e424 100644 --- a/src/nsUniversalDetector.h +++ b/src/nsUniversalDetector.h @@ -81,6 +81,8 @@ protected: PRBool mGotData; char mLastChar; const char * mDetectedCharset; + const char * mDetectedLanguage; + float mDetectedConfidence; PRInt32 mBestGuess; PRUint32 mLanguageFilter;