From 84284eccf4d51854d3108e5683d1a4294586e1a6 Mon Sep 17 00:00:00 2001 From: BYVoid Date: Mon, 11 Jul 2011 14:42:50 +0800 Subject: [PATCH] Update code from upstream. --- debug.sh | 0 release.sh | 0 src/CMakeLists.txt | 16 ++-- src/CharDistribution.cpp | 8 +- src/CharDistribution.h | 13 ++- src/JpCntx.cpp | 11 ++- src/JpCntx.h | 14 +-- src/LangBulgarianModel.cpp | 15 ++-- src/LangCyrillicModel.cpp | 39 ++++----- src/LangGreekModel.cpp | 15 ++-- src/LangHebrewModel.cpp | 9 +- src/LangHungarianModel.cpp | 15 ++-- src/LangThaiModel.cpp | 9 +- src/nsBig5Prober.cpp | 7 +- src/nsBig5Prober.h | 10 ++- src/nsCharSetProber.h | 2 +- src/nsCodingStateMachine.h | 30 +++---- src/nsEUCJPProber.cpp | 9 +- src/nsEUCJPProber.h | 10 ++- src/nsEUCKRProber.cpp | 7 +- src/nsEUCKRProber.h | 11 ++- src/nsEUCTWProber.cpp | 7 +- src/nsEUCTWProber.h | 10 ++- src/nsEscCharsetProber.cpp | 46 +++++----- src/nsEscCharsetProber.h | 2 +- src/nsEscSM.cpp | 36 ++++---- src/nsGB2312Prober.cpp | 7 +- src/nsGB2312Prober.h | 10 ++- src/nsHebrewProber.cpp | 5 +- src/nsLatin1Prober.cpp | 4 +- src/nsLatin1Prober.h | 3 +- src/nsMBCSGroupProber.cpp | 119 ++++++++++++++----------- src/nsMBCSGroupProber.h | 7 +- src/nsMBCSSM.cpp | 170 ++++++------------------------------ src/nsPkgInt.h | 2 +- src/nsSBCSGroupProber.cpp | 13 +-- src/nsSBCSGroupProber.h | 2 +- src/nsSBCharSetProber.h | 39 +++++---- src/nsSJISProber.cpp | 9 +- src/nsSJISProber.h | 10 ++- src/nsUTF8Prober.cpp | 5 -- src/nsUTF8Prober.h | 3 +- src/nsUniversalDetector.cpp | 89 ++++++++++--------- src/nsUniversalDetector.h | 16 +++- src/nscore.h | 2 + src/tools/uchardet.cpp | 36 ++------ src/uchardet.cpp | 34 ++++---- win32.sh | 7 ++ 48 files changed, 411 insertions(+), 532 deletions(-) mode change 100755 => 100644 debug.sh mode change 100755 => 100644 release.sh create mode 100644 win32.sh diff --git a/debug.sh b/debug.sh old mode 100755 new mode 100644 diff --git a/release.sh b/release.sh old mode 100755 new mode 100644 diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index e30bb9d..0b26a3b 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -11,26 +11,26 @@ set( LangBulgarianModel.cpp LangCyrillicModel.cpp LangGreekModel.cpp - LangHebrewModel.cpp LangHungarianModel.cpp + LangHebrewModel.cpp LangThaiModel.cpp - nsBig5Prober.cpp + nsHebrewProber.cpp nsCharSetProber.cpp - nsEscCharsetProber.cpp - nsEscSM.cpp + nsBig5Prober.cpp nsEUCJPProber.cpp nsEUCKRProber.cpp nsEUCTWProber.cpp + nsEscCharsetProber.cpp + nsEscSM.cpp nsGB2312Prober.cpp - nsHebrewProber.cpp - nsLatin1Prober.cpp nsMBCSGroupProber.cpp nsMBCSSM.cpp - nsSBCharSetProber.cpp nsSBCSGroupProber.cpp + nsSBCharSetProber.cpp nsSJISProber.cpp - nsUniversalDetector.cpp nsUTF8Prober.cpp + nsLatin1Prober.cpp + nsUniversalDetector.cpp uchardet.cpp ) diff --git a/src/CharDistribution.cpp b/src/CharDistribution.cpp index 41a8fdc..488d9bc 100644 --- a/src/CharDistribution.cpp +++ b/src/CharDistribution.cpp @@ -46,15 +46,13 @@ #define SURE_YES 0.99f #define SURE_NO 0.01f -#define MINIMUM_DATA_THRESHOLD 4 - //return confidence base on received data -float CharDistributionAnalysis::GetConfidence() +float CharDistributionAnalysis::GetConfidence(void) { //if we didn't receive any character in our consideration range, or the - //number of frequent characters is below the minimum threshold, return + // number of frequent characters is below the minimum threshold, return // negative answer - if (mTotalChars <= 0 || mFreqChars <= MINIMUM_DATA_THRESHOLD) + if (mTotalChars <= 0 || mFreqChars <= mDataThreshold) return SURE_NO; if (mTotalChars != mFreqChars) { diff --git a/src/CharDistribution.h b/src/CharDistribution.h index 789a659..453c2de 100644 --- a/src/CharDistribution.h +++ b/src/CharDistribution.h @@ -42,11 +42,12 @@ #define ENOUGH_DATA_THRESHOLD 1024 +#define MINIMUM_DATA_THRESHOLD 4 + class CharDistributionAnalysis { public: - CharDistributionAnalysis() {Reset();} - virtual ~CharDistributionAnalysis(){}; + CharDistributionAnalysis() {Reset(PR_FALSE);} //feed a block of data and do distribution analysis void HandleData(const char* aBuf, PRUint32 aLen) {} @@ -72,14 +73,15 @@ public: } //return confidence base on existing data - float GetConfidence(); + float GetConfidence(void); //Reset analyser, clear any state - void Reset(void) + void Reset(PRBool aIsPreferredLanguage) { mDone = PR_FALSE; mTotalChars = 0; mFreqChars = 0; + mDataThreshold = aIsPreferredLanguage ? 0 : MINIMUM_DATA_THRESHOLD; } //This function is for future extension. Caller can use this function to control @@ -105,6 +107,9 @@ protected: //Total character encounted. PRUint32 mTotalChars; + //Number of hi-byte characters needed to trigger detection + PRUint32 mDataThreshold; + //Mapping table to get frequency order from char order (get from GetOrder()) const PRInt16 *mCharToFreqOrder; diff --git a/src/JpCntx.cpp b/src/JpCntx.cpp index 812c3bf..7da0413 100644 --- a/src/JpCntx.cpp +++ b/src/JpCntx.cpp @@ -39,7 +39,7 @@ #include "JpCntx.h" //This is hiragana 2-char sequence table, the number in each cell represents its frequency category -char jp2CharContext[83][83] = +const PRUint8 jp2CharContext[83][83] = { { 0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,}, { 2,4,0,4,0,3,0,4,0,3,4,4,4,2,4,3,3,4,3,2,3,3,4,2,3,3,3,2,4,1,4,3,3,1,5,4,3,4,3,4,3,5,3,0,3,5,4,2,0,3,1,0,3,3,0,3,3,0,1,1,0,4,3,0,3,3,0,4,0,2,0,3,5,5,5,5,4,0,4,1,0,3,4,}, @@ -170,7 +170,7 @@ void JapaneseContextAnalysis::HandleData(const char* aBuf, PRUint32 aLen) return; } -void JapaneseContextAnalysis::Reset(void) +void JapaneseContextAnalysis::Reset(PRBool aIsPreferredLanguage) { mTotalRel = 0; for (PRUint32 i = 0; i < NUM_OF_CATEGORY; i++) @@ -178,13 +178,14 @@ void JapaneseContextAnalysis::Reset(void) mNeedToSkipCharNum = 0; mLastCharOrder = -1; mDone = PR_FALSE; + mDataThreshold = aIsPreferredLanguage ? 0 : MINIMUM_DATA_THRESHOLD; } #define DONT_KNOW (float)-1 -float JapaneseContextAnalysis::GetConfidence() +float JapaneseContextAnalysis::GetConfidence(void) { //This is just one way to calculate confidence. It works well for me. - if (mTotalRel > MINIMUM_DATA_THRESHOLD) + if (mTotalRel > mDataThreshold) return ((float)(mTotalRel - mRelSample[0]))/mTotalRel; else return (float)DONT_KNOW; @@ -227,5 +228,3 @@ PRInt32 EUCJPContextAnalysis::GetOrder(const char* str, PRUint32 *charLen) return (unsigned char)*(str+1) - (unsigned char)0xa1; return -1; } - - diff --git a/src/JpCntx.h b/src/JpCntx.h index 8231a12..fe8fcb8 100644 --- a/src/JpCntx.h +++ b/src/JpCntx.h @@ -46,13 +46,12 @@ #define MAX_REL_THRESHOLD 1000 //hiragana frequency category table -extern char jp2CharContext[83][83]; +extern const PRUint8 jp2CharContext[83][83]; class JapaneseContextAnalysis { public: - JapaneseContextAnalysis() {Reset();} - virtual ~JapaneseContextAnalysis(){}; + JapaneseContextAnalysis() {Reset(PR_FALSE);} void HandleData(const char* aBuf, PRUint32 aLen); @@ -75,8 +74,8 @@ public: mLastCharOrder = order; } - float GetConfidence(); - void Reset(void); + float GetConfidence(void); + void Reset(PRBool aIsPreferredLanguage); void SetOpion(){} PRBool GotEnoughData() {return mTotalRel > ENOUGH_REL_THRESHOLD;} @@ -84,11 +83,14 @@ protected: virtual PRInt32 GetOrder(const char* str, PRUint32 *charLen) = 0; virtual PRInt32 GetOrder(const char* str) = 0; - //category counters, each interger counts sequence in its category + //category counters, each integer counts sequences in its category PRUint32 mRelSample[NUM_OF_CATEGORY]; //total sequence received PRUint32 mTotalRel; + + //Number of sequences needed to trigger detection + PRUint32 mDataThreshold; //The order of previous char PRInt32 mLastCharOrder; diff --git a/src/LangBulgarianModel.cpp b/src/LangBulgarianModel.cpp index 9babfda..0f73282 100644 --- a/src/LangBulgarianModel.cpp +++ b/src/LangBulgarianModel.cpp @@ -35,7 +35,6 @@ * * ***** END LICENSE BLOCK ***** */ -#include "uchardetDefine.h" #include "nsSBCharSetProber.h" /**************************************************************** 255: Control characters that usually does not exist in any text @@ -49,7 +48,7 @@ //this talbe is modified base on win1251BulgarianCharToOrderMap, so //only number <64 is sure valid -unsigned char Latin5_BulgarianCharToOrderMap[] = +static const unsigned char Latin5_BulgarianCharToOrderMap[] = { 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10 @@ -69,7 +68,7 @@ unsigned char Latin5_BulgarianCharToOrderMap[] = 62,242,243,244, 58,245, 98,246,247,248,249,250,251, 91,252,253, //f0 }; -unsigned char win1251BulgarianCharToOrderMap[] = +static const unsigned char win1251BulgarianCharToOrderMap[] = { 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10 @@ -95,7 +94,7 @@ unsigned char win1251BulgarianCharToOrderMap[] = //first 1024 sequences:3.0618% //rest sequences: 0.2992% //negative sequences: 0.0020% -char BulgarianLangModel[] = +static const PRUint8 BulgarianLangModel[] = { 0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,3,3,3,3,3, 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,2,2,3,2,2,1,2,2, @@ -227,20 +226,20 @@ char BulgarianLangModel[] = 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1, }; -SequenceModel Latin5BulgarianModel = +const SequenceModel Latin5BulgarianModel = { Latin5_BulgarianCharToOrderMap, BulgarianLangModel, (float)0.969392, PR_FALSE, - CHARDET_ENCODING_ISO_8859_5 + "ISO-8859-5" }; -SequenceModel Win1251BulgarianModel = +const SequenceModel Win1251BulgarianModel = { win1251BulgarianCharToOrderMap, BulgarianLangModel, (float)0.969392, PR_FALSE, - CHARDET_ENCODING_WINDOWS_1251 + "windows-1251" }; diff --git a/src/LangCyrillicModel.cpp b/src/LangCyrillicModel.cpp index deceba3..d8e73e8 100644 --- a/src/LangCyrillicModel.cpp +++ b/src/LangCyrillicModel.cpp @@ -35,14 +35,13 @@ * * ***** END LICENSE BLOCK ***** */ -#include "uchardetDefine.h" #include "nsSBCharSetProber.h" //KOI8-R language model //Character Mapping Table: -unsigned char KOI8R_CharToOrderMap[] = +static const unsigned char KOI8R_CharToOrderMap[] = { 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10 @@ -62,7 +61,7 @@ unsigned char KOI8R_CharToOrderMap[] = 35, 43, 45, 32, 40, 52, 56, 33, 61, 62, 51, 57, 47, 63, 50, 70, //f0 }; -unsigned char win1251_CharToOrderMap[] = +static const unsigned char win1251_CharToOrderMap[] = { 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10 @@ -82,7 +81,7 @@ unsigned char win1251_CharToOrderMap[] = 9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27, 16, }; -unsigned char latin5_CharToOrderMap[] = +static const unsigned char latin5_CharToOrderMap[] = { 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10 @@ -102,7 +101,7 @@ unsigned char latin5_CharToOrderMap[] = 239, 68,240,241,242,243,244,245,246,247,248,249,250,251,252,255, }; -unsigned char macCyrillic_CharToOrderMap[] = +static const unsigned char macCyrillic_CharToOrderMap[] = { 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10 @@ -122,7 +121,7 @@ unsigned char macCyrillic_CharToOrderMap[] = 9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27,255, }; -unsigned char IBM855_CharToOrderMap[] = +static const unsigned char IBM855_CharToOrderMap[] = { 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10 @@ -142,7 +141,7 @@ unsigned char IBM855_CharToOrderMap[] = 250, 18, 62, 20, 51, 25, 57, 30, 47, 29, 63, 22, 50,251,252,255, }; -unsigned char IBM866_CharToOrderMap[] = +static const unsigned char IBM866_CharToOrderMap[] = { 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10 @@ -168,7 +167,7 @@ unsigned char IBM866_CharToOrderMap[] = //first 1024 sequences: 2.3389% //rest sequences: 0.1237% //negative sequences: 0.0009% -char RussianLangModel[] = +static const PRUint8 RussianLangModel[] = { 0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,1,3,3,3,3,1,3,3,3,2,3,2,3,3, 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,2,2,2,2,2,0,0,2, @@ -301,56 +300,56 @@ char RussianLangModel[] = }; -SequenceModel Koi8rModel = +const SequenceModel Koi8rModel = { KOI8R_CharToOrderMap, RussianLangModel, (float)0.976601, PR_FALSE, - CHARDET_ENCODING_KOI8_R + "KOI8-R" }; -SequenceModel Win1251Model = +const SequenceModel Win1251Model = { win1251_CharToOrderMap, RussianLangModel, (float)0.976601, PR_FALSE, - CHARDET_ENCODING_WINDOWS_1251 + "windows-1251" }; -SequenceModel Latin5Model = +const SequenceModel Latin5Model = { latin5_CharToOrderMap, RussianLangModel, (float)0.976601, PR_FALSE, - CHARDET_ENCODING_ISO_8859_5 + "ISO-8859-5" }; -SequenceModel MacCyrillicModel = +const SequenceModel MacCyrillicModel = { macCyrillic_CharToOrderMap, RussianLangModel, (float)0.976601, PR_FALSE, - CHARDET_ENCODING_MACCYRILLIC + "x-mac-cyrillic" }; -SequenceModel Ibm866Model = +const SequenceModel Ibm866Model = { IBM866_CharToOrderMap, RussianLangModel, (float)0.976601, PR_FALSE, - CHARDET_ENCODING_IBM866 + "IBM866" }; -SequenceModel Ibm855Model = +const SequenceModel Ibm855Model = { IBM855_CharToOrderMap, RussianLangModel, (float)0.976601, PR_FALSE, - CHARDET_ENCODING_IBM855 + "IBM855" }; diff --git a/src/LangGreekModel.cpp b/src/LangGreekModel.cpp index 60cea7e..30c65dc 100644 --- a/src/LangGreekModel.cpp +++ b/src/LangGreekModel.cpp @@ -35,7 +35,6 @@ * * ***** END LICENSE BLOCK ***** */ -#include "uchardetDefine.h" #include "nsSBCharSetProber.h" /**************************************************************** 255: Control characters that usually does not exist in any text @@ -46,7 +45,7 @@ *****************************************************************/ //Character Mapping Table: -unsigned char Latin7_CharToOrderMap[] = +static const unsigned char Latin7_CharToOrderMap[] = { 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10 @@ -68,7 +67,7 @@ unsigned char Latin7_CharToOrderMap[] = -unsigned char win1253_CharToOrderMap[] = +static const unsigned char win1253_CharToOrderMap[] = { 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10 @@ -94,7 +93,7 @@ unsigned char win1253_CharToOrderMap[] = //first 1024 sequences:1.7001% //rest sequences: 0.0359% //negative sequences: 0.0148% -char GreekLangModel[] = +static const PRUint8 GreekLangModel[] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, @@ -226,20 +225,20 @@ char GreekLangModel[] = 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, }; -SequenceModel Latin7Model = +const SequenceModel Latin7Model = { Latin7_CharToOrderMap, GreekLangModel, (float)0.982851, PR_FALSE, - CHARDET_ENCODING_ISO_8859_7 + "ISO-8859-7" }; -SequenceModel Win1253Model = +const SequenceModel Win1253Model = { win1253_CharToOrderMap, GreekLangModel, (float)0.982851, PR_FALSE, - CHARDET_ENCODING_WINDOWS_1253 + "windows-1253" }; diff --git a/src/LangHebrewModel.cpp b/src/LangHebrewModel.cpp index 8c18641..a4e10ad 100644 --- a/src/LangHebrewModel.cpp +++ b/src/LangHebrewModel.cpp @@ -37,7 +37,6 @@ * * ***** END LICENSE BLOCK ***** */ -#include "uchardetDefine.h" #include "nsSBCharSetProber.h" @@ -51,7 +50,7 @@ //Windows-1255 language model //Character Mapping Table: -unsigned char win1255_CharToOrderMap[] = +static const unsigned char win1255_CharToOrderMap[] = { 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10 @@ -77,7 +76,7 @@ unsigned char win1255_CharToOrderMap[] = //first 1024 sequences: 1.5981% //rest sequences: 0.087% //negative sequences: 0.0015% -char HebrewLangModel[] = +static const PRUint8 HebrewLangModel[] = { 0,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,2,3,2,1,2,0,1,0,0, 3,0,3,1,0,0,1,3,2,0,1,1,2,0,2,2,2,1,1,1,1,2,1,1,1,2,0,0,2,2,0,1, @@ -209,12 +208,12 @@ char HebrewLangModel[] = 0,0,0,0,0,0,0,0,0,0,1,2,1,0,0,0,0,0,1,1,1,1,1,0,1,0,0,0,1,1,0,0, }; -SequenceModel Win1255Model = +const SequenceModel Win1255Model = { win1255_CharToOrderMap, HebrewLangModel, (float)0.984004, PR_FALSE, - CHARDET_ENCODING_WINDOWS_1255 + "windows-1255" }; diff --git a/src/LangHungarianModel.cpp b/src/LangHungarianModel.cpp index 66e2d5d..3af2f58 100644 --- a/src/LangHungarianModel.cpp +++ b/src/LangHungarianModel.cpp @@ -35,7 +35,6 @@ * * ***** END LICENSE BLOCK ***** */ -#include "uchardetDefine.h" #include "nsSBCharSetProber.h" /**************************************************************** 255: Control characters that usually does not exist in any text @@ -46,7 +45,7 @@ *****************************************************************/ //Character Mapping Table: -unsigned char Latin2_HungarianCharToOrderMap[] = +static const unsigned char Latin2_HungarianCharToOrderMap[] = { 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10 @@ -66,7 +65,7 @@ unsigned char Latin2_HungarianCharToOrderMap[] = 245,246,247, 25, 73, 42, 24,248,249,250, 31, 56, 29,251,252,253, }; -unsigned char win1250HungarianCharToOrderMap[] = +static const unsigned char win1250HungarianCharToOrderMap[] = { 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10 @@ -92,7 +91,7 @@ unsigned char win1250HungarianCharToOrderMap[] = //first 1024 sequences:5.2623% //rest sequences: 0.8894% //negative sequences: 0.0009% -char HungarianLangModel[] = +static const PRUint8 HungarianLangModel[] = { 0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,2,3,3,1,1,2,2,2,2,2,1,2, @@ -224,20 +223,20 @@ char HungarianLangModel[] = 0,1,1,1,1,1,1,0,1,1,0,1,0,1,0,0,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0, }; -SequenceModel Latin2HungarianModel = +const SequenceModel Latin2HungarianModel = { Latin2_HungarianCharToOrderMap, HungarianLangModel, (float)0.947368, PR_TRUE, - CHARDET_ENCODING_ISO_8859_2 + "ISO-8859-2" }; -SequenceModel Win1250HungarianModel = +const SequenceModel Win1250HungarianModel = { win1250HungarianCharToOrderMap, HungarianLangModel, (float)0.947368, PR_TRUE, - CHARDET_ENCODING_WINDOWS_1250 + "windows-1250" }; diff --git a/src/LangThaiModel.cpp b/src/LangThaiModel.cpp index 7d376cc..8145ffa 100644 --- a/src/LangThaiModel.cpp +++ b/src/LangThaiModel.cpp @@ -35,7 +35,6 @@ * * ***** END LICENSE BLOCK ***** */ -#include "uchardetDefine.h" #include "nsSBCharSetProber.h" @@ -50,7 +49,7 @@ //The following result for thai was collected from a limited sample (1M). //Character Mapping Table: -unsigned char TIS620CharToOrderMap[] = +static const unsigned char TIS620CharToOrderMap[] = { 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10 @@ -79,7 +78,7 @@ unsigned char TIS620CharToOrderMap[] = //first 1024 sequences:7.3177% //rest sequences: 1.0230% //negative sequences: 0.0436% -char ThaiLangModel[] = +static const PRUint8 ThaiLangModel[] = { 0,1,3,3,3,3,0,0,3,3,0,3,3,0,3,3,3,3,3,3,3,3,0,0,3,3,3,0,3,3,3,3, 0,3,3,0,0,0,1,3,0,3,3,2,3,3,0,1,2,3,3,3,3,0,2,0,2,0,0,3,2,1,2,2, @@ -212,11 +211,11 @@ char ThaiLangModel[] = }; -SequenceModel TIS620ThaiModel = +const SequenceModel TIS620ThaiModel = { TIS620CharToOrderMap, ThaiLangModel, (float)0.926386, PR_FALSE, - CHARDET_ENCODING_TIS_620 + "TIS-620" }; diff --git a/src/nsBig5Prober.cpp b/src/nsBig5Prober.cpp index 55d63c6..7a85abb 100644 --- a/src/nsBig5Prober.cpp +++ b/src/nsBig5Prober.cpp @@ -41,7 +41,7 @@ void nsBig5Prober::Reset(void) { mCodingSM->Reset(); mState = eDetecting; - mDistributionAnalyser.Reset(); + mDistributionAnalyser.Reset(mIsPreferredLanguage); } nsProbingState nsBig5Prober::HandleData(const char* aBuf, PRUint32 aLen) @@ -51,11 +51,6 @@ nsProbingState nsBig5Prober::HandleData(const char* aBuf, PRUint32 aLen) for (PRUint32 i = 0; i < aLen; i++) { codingState = mCodingSM->NextState(aBuf[i]); - if (codingState == eError) - { - mState = eNotMe; - break; - } if (codingState == eItsMe) { mState = eFoundIt; diff --git a/src/nsBig5Prober.h b/src/nsBig5Prober.h index 5a4ce37..5ae3576 100644 --- a/src/nsBig5Prober.h +++ b/src/nsBig5Prober.h @@ -38,18 +38,19 @@ #ifndef nsBig5Prober_h__ #define nsBig5Prober_h__ -#include "uchardetDefine.h" #include "nsCharSetProber.h" #include "nsCodingStateMachine.h" #include "CharDistribution.h" class nsBig5Prober: public nsCharSetProber { public: - nsBig5Prober(void){mCodingSM = new nsCodingStateMachine(&Big5SMModel); - Reset();} + nsBig5Prober(PRBool aIsPreferredLanguage) + :mIsPreferredLanguage(aIsPreferredLanguage) + {mCodingSM = new nsCodingStateMachine(&Big5SMModel); + Reset();} virtual ~nsBig5Prober(void){delete mCodingSM;} nsProbingState HandleData(const char* aBuf, PRUint32 aLen); - const char* GetCharSetName() {return CHARDET_ENCODING_BIG5;} + const char* GetCharSetName() {return "Big5";} nsProbingState GetState(void) {return mState;} void Reset(void); float GetConfidence(void); @@ -64,6 +65,7 @@ protected: //Big5ContextAnalysis mContextAnalyser; Big5DistributionAnalysis mDistributionAnalyser; char mLastChar[2]; + PRBool mIsPreferredLanguage; }; diff --git a/src/nsCharSetProber.h b/src/nsCharSetProber.h index 0c3ae95..c078ccf 100644 --- a/src/nsCharSetProber.h +++ b/src/nsCharSetProber.h @@ -61,7 +61,7 @@ public: virtual void SetOpion() = 0; #ifdef DEBUG_chardet - virtual void DumpStatus() {} + virtual void DumpStatus() {}; #endif // Helper functions used in the Latin1 and Group probers. diff --git a/src/nsCodingStateMachine.h b/src/nsCodingStateMachine.h index b9c2f64..819f9ab 100644 --- a/src/nsCodingStateMachine.h +++ b/src/nsCodingStateMachine.h @@ -59,10 +59,7 @@ typedef struct class nsCodingStateMachine { public: - nsCodingStateMachine(SMModel* sm){ - mCurrentState = eStart; - mModel = sm; - } + nsCodingStateMachine(const SMModel* sm) : mModel(sm) { mCurrentState = eStart; } nsSMState NextState(char c){ //for each byte we get its class , if it is first byte, we also get byte length PRUint32 byteCls = GETCLASS(c); @@ -86,23 +83,22 @@ protected: PRUint32 mCurrentCharLen; PRUint32 mCurrentBytePos; - SMModel *mModel; + const SMModel *mModel; }; -extern SMModel UTF8SMModel; -extern SMModel Big5SMModel; -extern SMModel EUCJPSMModel; -extern SMModel EUCKRSMModel; -extern SMModel EUCTWSMModel; -extern SMModel GB18030SMModel; -extern SMModel SJISSMModel; -extern SMModel UCS2BESMModel; +extern const SMModel UTF8SMModel; +extern const SMModel Big5SMModel; +extern const SMModel EUCJPSMModel; +extern const SMModel EUCKRSMModel; +extern const SMModel EUCTWSMModel; +extern const SMModel GB18030SMModel; +extern const SMModel SJISSMModel; -extern SMModel HZSMModel; -extern SMModel ISO2022CNSMModel; -extern SMModel ISO2022JPSMModel; -extern SMModel ISO2022KRSMModel; +extern const SMModel HZSMModel; +extern const SMModel ISO2022CNSMModel; +extern const SMModel ISO2022JPSMModel; +extern const SMModel ISO2022KRSMModel; #endif /* nsCodingStateMachine_h__ */ diff --git a/src/nsEUCJPProber.cpp b/src/nsEUCJPProber.cpp index fb0d296..54861b3 100644 --- a/src/nsEUCJPProber.cpp +++ b/src/nsEUCJPProber.cpp @@ -46,8 +46,8 @@ void nsEUCJPProber::Reset(void) { mCodingSM->Reset(); mState = eDetecting; - mContextAnalyser.Reset(); - mDistributionAnalyser.Reset(); + mContextAnalyser.Reset(mIsPreferredLanguage); + mDistributionAnalyser.Reset(mIsPreferredLanguage); } nsProbingState nsEUCJPProber::HandleData(const char* aBuf, PRUint32 aLen) @@ -57,11 +57,6 @@ nsProbingState nsEUCJPProber::HandleData(const char* aBuf, PRUint32 aLen) for (PRUint32 i = 0; i < aLen; i++) { codingState = mCodingSM->NextState(aBuf[i]); - if (codingState == eError) - { - mState = eNotMe; - break; - } if (codingState == eItsMe) { mState = eFoundIt; diff --git a/src/nsEUCJPProber.h b/src/nsEUCJPProber.h index e4efa5a..a7a2f51 100644 --- a/src/nsEUCJPProber.h +++ b/src/nsEUCJPProber.h @@ -43,7 +43,6 @@ #ifndef nsEUCJPProber_h__ #define nsEUCJPProber_h__ -#include "uchardetDefine.h" #include "nsCharSetProber.h" #include "nsCodingStateMachine.h" #include "JpCntx.h" @@ -51,11 +50,13 @@ class nsEUCJPProber: public nsCharSetProber { public: - nsEUCJPProber(void){mCodingSM = new nsCodingStateMachine(&EUCJPSMModel); - Reset();} + nsEUCJPProber(PRBool aIsPreferredLanguage) + :mIsPreferredLanguage(aIsPreferredLanguage) + {mCodingSM = new nsCodingStateMachine(&EUCJPSMModel); + Reset();} virtual ~nsEUCJPProber(void){delete mCodingSM;} nsProbingState HandleData(const char* aBuf, PRUint32 aLen); - const char* GetCharSetName() {return CHARDET_ENCODING_EUC_JP;} + const char* GetCharSetName() {return "EUC-JP";} nsProbingState GetState(void) {return mState;} void Reset(void); float GetConfidence(void); @@ -69,6 +70,7 @@ protected: EUCJPDistributionAnalysis mDistributionAnalyser; char mLastChar[2]; + PRBool mIsPreferredLanguage; }; diff --git a/src/nsEUCKRProber.cpp b/src/nsEUCKRProber.cpp index c91a97e..3632f1f 100644 --- a/src/nsEUCKRProber.cpp +++ b/src/nsEUCKRProber.cpp @@ -41,7 +41,7 @@ void nsEUCKRProber::Reset(void) { mCodingSM->Reset(); mState = eDetecting; - mDistributionAnalyser.Reset(); + mDistributionAnalyser.Reset(mIsPreferredLanguage); //mContextAnalyser.Reset(); } @@ -52,11 +52,6 @@ nsProbingState nsEUCKRProber::HandleData(const char* aBuf, PRUint32 aLen) for (PRUint32 i = 0; i < aLen; i++) { codingState = mCodingSM->NextState(aBuf[i]); - if (codingState == eError) - { - mState = eNotMe; - break; - } if (codingState == eItsMe) { mState = eFoundIt; diff --git a/src/nsEUCKRProber.h b/src/nsEUCKRProber.h index 53e9f30..8e09984 100644 --- a/src/nsEUCKRProber.h +++ b/src/nsEUCKRProber.h @@ -38,18 +38,20 @@ #ifndef nsEUCKRProber_h__ #define nsEUCKRProber_h__ -#include "uchardetDefine.h" #include "nsCharSetProber.h" #include "nsCodingStateMachine.h" #include "CharDistribution.h" class nsEUCKRProber: public nsCharSetProber { public: - nsEUCKRProber(void){mCodingSM = new nsCodingStateMachine(&EUCKRSMModel); - Reset();} + nsEUCKRProber(PRBool aIsPreferredLanguage) + :mIsPreferredLanguage(aIsPreferredLanguage) + {mCodingSM = new nsCodingStateMachine(&EUCKRSMModel); + Reset(); + } virtual ~nsEUCKRProber(void){delete mCodingSM;} nsProbingState HandleData(const char* aBuf, PRUint32 aLen); - const char* GetCharSetName() {return CHARDET_ENCODING_EUC_KR;} + const char* GetCharSetName() {return "EUC-KR";} nsProbingState GetState(void) {return mState;} void Reset(void); float GetConfidence(void); @@ -64,6 +66,7 @@ protected: //EUCKRContextAnalysis mContextAnalyser; EUCKRDistributionAnalysis mDistributionAnalyser; char mLastChar[2]; + PRBool mIsPreferredLanguage; }; diff --git a/src/nsEUCTWProber.cpp b/src/nsEUCTWProber.cpp index 8552941..a06e074 100644 --- a/src/nsEUCTWProber.cpp +++ b/src/nsEUCTWProber.cpp @@ -41,7 +41,7 @@ void nsEUCTWProber::Reset(void) { mCodingSM->Reset(); mState = eDetecting; - mDistributionAnalyser.Reset(); + mDistributionAnalyser.Reset(mIsPreferredLanguage); //mContextAnalyser.Reset(); } @@ -52,11 +52,6 @@ nsProbingState nsEUCTWProber::HandleData(const char* aBuf, PRUint32 aLen) for (PRUint32 i = 0; i < aLen; i++) { codingState = mCodingSM->NextState(aBuf[i]); - if (codingState == eError) - { - mState = eNotMe; - break; - } if (codingState == eItsMe) { mState = eFoundIt; diff --git a/src/nsEUCTWProber.h b/src/nsEUCTWProber.h index 7df1120..911d50b 100644 --- a/src/nsEUCTWProber.h +++ b/src/nsEUCTWProber.h @@ -38,18 +38,19 @@ #ifndef nsEUCTWProber_h__ #define nsEUCTWProber_h__ -#include "uchardetDefine.h" #include "nsCharSetProber.h" #include "nsCodingStateMachine.h" #include "CharDistribution.h" class nsEUCTWProber: public nsCharSetProber { public: - nsEUCTWProber(void){mCodingSM = new nsCodingStateMachine(&EUCTWSMModel); - Reset();} + nsEUCTWProber(PRBool aIsPreferredLanguage) + :mIsPreferredLanguage(aIsPreferredLanguage) + {mCodingSM = new nsCodingStateMachine(&EUCTWSMModel); + Reset();} virtual ~nsEUCTWProber(void){delete mCodingSM;} nsProbingState HandleData(const char* aBuf, PRUint32 aLen); - const char* GetCharSetName() {return CHARDET_ENCODING_EUC_TW;} + const char* GetCharSetName() {return "x-euc-tw";} nsProbingState GetState(void) {return mState;} void Reset(void); float GetConfidence(void); @@ -64,6 +65,7 @@ protected: //EUCTWContextAnalysis mContextAnalyser; EUCTWDistributionAnalysis mDistributionAnalyser; char mLastChar[2]; + PRBool mIsPreferredLanguage; }; diff --git a/src/nsEscCharsetProber.cpp b/src/nsEscCharsetProber.cpp index a816bab..464c753 100644 --- a/src/nsEscCharsetProber.cpp +++ b/src/nsEscCharsetProber.cpp @@ -37,13 +37,21 @@ #include "nsEscCharsetProber.h" +#include "nsUniversalDetector.h" -nsEscCharSetProber::nsEscCharSetProber(void) +nsEscCharSetProber::nsEscCharSetProber(PRUint32 aLanguageFilter) { - mCodingSM[0] = new nsCodingStateMachine(&HZSMModel); - mCodingSM[1] = new nsCodingStateMachine(&ISO2022CNSMModel); - mCodingSM[2] = new nsCodingStateMachine(&ISO2022JPSMModel); - mCodingSM[3] = new nsCodingStateMachine(&ISO2022KRSMModel); + for (PRUint32 i = 0; i < NUM_OF_ESC_CHARSETS; i++) + mCodingSM[i] = nsnull; + if (aLanguageFilter & NS_FILTER_CHINESE_SIMPLIFIED) + { + mCodingSM[0] = new nsCodingStateMachine(&HZSMModel); + mCodingSM[1] = new nsCodingStateMachine(&ISO2022CNSMModel); + } + if (aLanguageFilter & NS_FILTER_JAPANESE) + mCodingSM[2] = new nsCodingStateMachine(&ISO2022JPSMModel); + if (aLanguageFilter & NS_FILTER_KOREAN) + mCodingSM[3] = new nsCodingStateMachine(&ISO2022KRSMModel); mActiveSM = NUM_OF_ESC_CHARSETS; mState = eDetecting; mDetectedCharset = nsnull; @@ -59,7 +67,8 @@ void nsEscCharSetProber::Reset(void) { mState = eDetecting; for (PRUint32 i = 0; i < NUM_OF_ESC_CHARSETS; i++) - mCodingSM[i]->Reset(); + if (mCodingSM[i]) + mCodingSM[i]->Reset(); mActiveSM = NUM_OF_ESC_CHARSETS; mDetectedCharset = nsnull; } @@ -74,30 +83,15 @@ nsProbingState nsEscCharSetProber::HandleData(const char* aBuf, PRUint32 aLen) { for (j = mActiveSM-1; j>= 0; j--) { - //byte is feed to all active state machine - codingState = mCodingSM[j]->NextState(aBuf[i]); - if (codingState == eError) + if (mCodingSM[j]) { - //got negative answer for this state machine, make it inactive - mActiveSM--; - if (mActiveSM == 0) + codingState = mCodingSM[j]->NextState(aBuf[i]); + if (codingState == eItsMe) { - mState = eNotMe; + mState = eFoundIt; + mDetectedCharset = mCodingSM[j]->GetCodingStateMachine(); return mState; } - else if (j != (PRInt32)mActiveSM) - { - nsCodingStateMachine* t; - t = mCodingSM[mActiveSM]; - mCodingSM[mActiveSM] = mCodingSM[j]; - mCodingSM[j] = t; - } - } - else if (codingState == eItsMe) - { - mState = eFoundIt; - mDetectedCharset = mCodingSM[j]->GetCodingStateMachine(); - return mState; } } } diff --git a/src/nsEscCharsetProber.h b/src/nsEscCharsetProber.h index c08f442..4b648e0 100644 --- a/src/nsEscCharsetProber.h +++ b/src/nsEscCharsetProber.h @@ -45,7 +45,7 @@ class nsEscCharSetProber: public nsCharSetProber { public: - nsEscCharSetProber(void); + nsEscCharSetProber(PRUint32 aLanguageFilter); virtual ~nsEscCharSetProber(void); nsProbingState HandleData(const char* aBuf, PRUint32 aLen); const char* GetCharSetName() {return mDetectedCharset;} diff --git a/src/nsEscSM.cpp b/src/nsEscSM.cpp index f3b4a8d..eed1b7c 100644 --- a/src/nsEscSM.cpp +++ b/src/nsEscSM.cpp @@ -20,7 +20,6 @@ * the Initial Developer. All Rights Reserved. * * Contributor(s): - * Kazutoshi Satoda * * Alternatively, the contents of this file may be used under the terms of * either the GNU General Public License Version 2 or later (the "GPL"), or @@ -35,10 +34,9 @@ * the terms of any one of the MPL, the GPL or the LGPL. * * ***** END LICENSE BLOCK ***** */ -#include "uchardetDefine.h" #include "nsCodingStateMachine.h" -static PRUint32 HZ_cls[ 256 / 8 ] = { +static const PRUint32 HZ_cls[ 256 / 8 ] = { PCK4BITS(1,0,0,0,0,0,0,0), // 00 - 07 PCK4BITS(0,0,0,0,0,0,0,0), // 08 - 0f PCK4BITS(0,0,0,0,0,0,0,0), // 10 - 17 @@ -74,7 +72,7 @@ PCK4BITS(1,1,1,1,1,1,1,1) // f8 - ff }; -static PRUint32 HZ_st [ 6] = { +static const PRUint32 HZ_st [ 6] = { PCK4BITS(eStart,eError, 3,eStart,eStart,eStart,eError,eError),//00-07 PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//08-0f PCK4BITS(eItsMe,eItsMe,eError,eError,eStart,eStart, 4,eError),//10-17 @@ -85,16 +83,16 @@ PCK4BITS( 4,eItsMe,eStart,eStart,eStart,eStart,eStart,eStart) //28-2f static const PRUint32 HZCharLenTable[] = {0, 0, 0, 0, 0, 0}; -SMModel HZSMModel = { +const SMModel HZSMModel = { {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, HZ_cls }, 6, {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, HZ_st }, HZCharLenTable, - CHARDET_ENCODING_HZ_GB_2312, + "HZ-GB-2312", }; -static PRUint32 ISO2022CN_cls [ 256 / 8 ] = { +static const PRUint32 ISO2022CN_cls [ 256 / 8 ] = { PCK4BITS(2,0,0,0,0,0,0,0), // 00 - 07 PCK4BITS(0,0,0,0,0,0,0,0), // 08 - 0f PCK4BITS(0,0,0,0,0,0,0,0), // 10 - 17 @@ -130,7 +128,7 @@ PCK4BITS(2,2,2,2,2,2,2,2) // f8 - ff }; -static PRUint32 ISO2022CN_st [ 8] = { +static const PRUint32 ISO2022CN_st [ 8] = { PCK4BITS(eStart, 3,eError,eStart,eStart,eStart,eStart,eStart),//00-07 PCK4BITS(eStart,eError,eError,eError,eError,eError,eError,eError),//08-0f PCK4BITS(eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe),//10-17 @@ -143,15 +141,15 @@ PCK4BITS(eError,eError,eError,eError,eError,eItsMe,eError,eStart) //38-3f static const PRUint32 ISO2022CNCharLenTable[] = {0, 0, 0, 0, 0, 0, 0, 0, 0}; -SMModel ISO2022CNSMModel = { +const SMModel ISO2022CNSMModel = { {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022CN_cls }, 9, {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022CN_st }, ISO2022CNCharLenTable, - CHARDET_ENCODING_ISO_2022_CN, + "ISO-2022-CN", }; -static PRUint32 ISO2022JP_cls [ 256 / 8 ] = { +static const PRUint32 ISO2022JP_cls [ 256 / 8 ] = { PCK4BITS(2,0,0,0,0,0,0,0), // 00 - 07 PCK4BITS(0,0,0,0,0,0,2,2), // 08 - 0f PCK4BITS(0,0,0,0,0,0,0,0), // 10 - 17 @@ -187,7 +185,7 @@ PCK4BITS(2,2,2,2,2,2,2,2) // f8 - ff }; -static PRUint32 ISO2022JP_st [ 9] = { +static const PRUint32 ISO2022JP_st [ 9] = { PCK4BITS(eStart, 3,eError,eStart,eStart,eStart,eStart,eStart),//00-07 PCK4BITS(eStart,eStart,eError,eError,eError,eError,eError,eError),//08-0f PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//10-17 @@ -199,17 +197,17 @@ PCK4BITS(eError,eError,eError,eItsMe,eError,eError,eError,eError),//38-3f PCK4BITS(eError,eError,eError,eError,eItsMe,eError,eStart,eStart) //40-47 }; -static const PRUint32 ISO2022JPCharLenTable[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; +static const PRUint32 ISO2022JPCharLenTable[] = {0, 0, 0, 0, 0, 0, 0, 0}; -SMModel ISO2022JPSMModel = { +const SMModel ISO2022JPSMModel = { {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022JP_cls }, 10, {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022JP_st }, ISO2022JPCharLenTable, - CHARDET_ENCODING_ISO_2022_JP, + "ISO-2022-JP", }; -static PRUint32 ISO2022KR_cls [ 256 / 8 ] = { +static const PRUint32 ISO2022KR_cls [ 256 / 8 ] = { PCK4BITS(2,0,0,0,0,0,0,0), // 00 - 07 PCK4BITS(0,0,0,0,0,0,0,0), // 08 - 0f PCK4BITS(0,0,0,0,0,0,0,0), // 10 - 17 @@ -245,7 +243,7 @@ PCK4BITS(2,2,2,2,2,2,2,2) // f8 - ff }; -static PRUint32 ISO2022KR_st [ 5] = { +static const PRUint32 ISO2022KR_st [ 5] = { PCK4BITS(eStart, 3,eError,eStart,eStart,eStart,eError,eError),//00-07 PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//08-0f PCK4BITS(eItsMe,eItsMe,eError,eError,eError, 4,eError,eError),//10-17 @@ -255,11 +253,11 @@ PCK4BITS(eError,eError,eError,eItsMe,eStart,eStart,eStart,eStart) //20-27 static const PRUint32 ISO2022KRCharLenTable[] = {0, 0, 0, 0, 0, 0}; -SMModel ISO2022KRSMModel = { +const SMModel ISO2022KRSMModel = { {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022KR_cls }, 6, {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022KR_st }, ISO2022KRCharLenTable, - CHARDET_ENCODING_ISO_2022_KR, + "ISO-2022-KR", }; diff --git a/src/nsGB2312Prober.cpp b/src/nsGB2312Prober.cpp index 576dcd6..b6d469c 100644 --- a/src/nsGB2312Prober.cpp +++ b/src/nsGB2312Prober.cpp @@ -46,7 +46,7 @@ void nsGB18030Prober::Reset(void) { mCodingSM->Reset(); mState = eDetecting; - mDistributionAnalyser.Reset(); + mDistributionAnalyser.Reset(mIsPreferredLanguage); //mContextAnalyser.Reset(); } @@ -57,11 +57,6 @@ nsProbingState nsGB18030Prober::HandleData(const char* aBuf, PRUint32 aLen) for (PRUint32 i = 0; i < aLen; i++) { codingState = mCodingSM->NextState(aBuf[i]); - if (codingState == eError) - { - mState = eNotMe; - break; - } if (codingState == eItsMe) { mState = eFoundIt; diff --git a/src/nsGB2312Prober.h b/src/nsGB2312Prober.h index 18edae5..4bdac3b 100644 --- a/src/nsGB2312Prober.h +++ b/src/nsGB2312Prober.h @@ -38,7 +38,6 @@ #ifndef nsGB2312Prober_h__ #define nsGB2312Prober_h__ -#include "uchardetDefine.h" #include "nsCharSetProber.h" #include "nsCodingStateMachine.h" #include "CharDistribution.h" @@ -47,11 +46,13 @@ class nsGB18030Prober: public nsCharSetProber { public: - nsGB18030Prober(void){mCodingSM = new nsCodingStateMachine(&GB18030SMModel); - Reset();} + nsGB18030Prober(PRBool aIsPreferredLanguage) + :mIsPreferredLanguage(aIsPreferredLanguage) + {mCodingSM = new nsCodingStateMachine(&GB18030SMModel); + Reset();} virtual ~nsGB18030Prober(void){delete mCodingSM;} nsProbingState HandleData(const char* aBuf, PRUint32 aLen); - const char* GetCharSetName() {return CHARDET_ENCODING_GB18030;} + const char* GetCharSetName() {return "gb18030";} nsProbingState GetState(void) {return mState;} void Reset(void); float GetConfidence(void); @@ -66,6 +67,7 @@ protected: //GB2312ContextAnalysis mContextAnalyser; GB2312DistributionAnalysis mDistributionAnalyser; char mLastChar[2]; + PRBool mIsPreferredLanguage; }; diff --git a/src/nsHebrewProber.cpp b/src/nsHebrewProber.cpp index 2168de3..b148ce3 100644 --- a/src/nsHebrewProber.cpp +++ b/src/nsHebrewProber.cpp @@ -35,7 +35,6 @@ * * ***** END LICENSE BLOCK ***** */ -#include "uchardetDefine.h" #include "nsHebrewProber.h" #include @@ -59,8 +58,8 @@ // If the difference is below this, don't rely at all on the model score distance. #define MIN_MODEL_DISTANCE (0.01) -#define VISUAL_HEBREW_NAME (CHARDET_ENCODING_ISO_8859_8) -#define LOGICAL_HEBREW_NAME (CHARDET_ENCODING_WINDOWS_1255) +#define VISUAL_HEBREW_NAME ("ISO-8859-8") +#define LOGICAL_HEBREW_NAME ("windows-1255") PRBool nsHebrewProber::isFinal(char c) { diff --git a/src/nsLatin1Prober.cpp b/src/nsLatin1Prober.cpp index 05d6823..7694ef7 100644 --- a/src/nsLatin1Prober.cpp +++ b/src/nsLatin1Prober.cpp @@ -50,7 +50,7 @@ #define ASO 7 // accent small other #define CLASS_NUM 8 // total classes -static unsigned char Latin1_CharToClass[] = +static const unsigned char Latin1_CharToClass[] = { OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 00 - 07 OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 08 - 0F @@ -92,7 +92,7 @@ static unsigned char Latin1_CharToClass[] = 2 : normal 3 : very likely */ -static unsigned char Latin1ClassModel[] = +static const unsigned char Latin1ClassModel[] = { /* UDF OTH ASC ASS ACV ACO ASV ASO */ /*UDF*/ 0, 0, 0, 0, 0, 0, 0, 0, diff --git a/src/nsLatin1Prober.h b/src/nsLatin1Prober.h index 3953d8e..5145e96 100644 --- a/src/nsLatin1Prober.h +++ b/src/nsLatin1Prober.h @@ -39,7 +39,6 @@ #ifndef nsLatin1Prober_h__ #define nsLatin1Prober_h__ -#include "uchardetDefine.h" #include "nsCharSetProber.h" #define FREQ_CAT_NUM 4 @@ -49,7 +48,7 @@ public: nsLatin1Prober(void){Reset();} virtual ~nsLatin1Prober(void){} nsProbingState HandleData(const char* aBuf, PRUint32 aLen); - const char* GetCharSetName() {return CHARDET_ENCODING_WINDOWS_1252;} + const char* GetCharSetName() {return "windows-1252";} nsProbingState GetState(void) {return mState;} void Reset(void); float GetConfidence(void); diff --git a/src/nsMBCSGroupProber.cpp b/src/nsMBCSGroupProber.cpp index 35a97be..f161165 100644 --- a/src/nsMBCSGroupProber.cpp +++ b/src/nsMBCSGroupProber.cpp @@ -21,6 +21,7 @@ * * Contributor(s): * Shy Shalom + * Proofpoint, Inc. * * Alternatively, the contents of this file may be used under the terms of * either the GNU General Public License Version 2 or later (the "GPL"), or @@ -36,12 +37,12 @@ * * ***** END LICENSE BLOCK ***** */ #include -#include "prmem.h" #include "nsMBCSGroupProber.h" +#include "nsUniversalDetector.h" -#ifdef DEBUG_chardet -char *ProberName[] = +#if defined(DEBUG_chardet) || defined(DEBUG_jgmyers) +const char *ProberName[] = { "UTF8", "SJIS", @@ -54,15 +55,26 @@ char *ProberName[] = #endif -nsMBCSGroupProber::nsMBCSGroupProber() +nsMBCSGroupProber::nsMBCSGroupProber(PRUint32 aLanguageFilter) { + for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++) + mProbers[i] = nsnull; + mProbers[0] = new nsUTF8Prober(); - mProbers[1] = new nsSJISProber(); - mProbers[2] = new nsEUCJPProber(); - mProbers[3] = new nsGB18030Prober(); - mProbers[4] = new nsEUCKRProber(); - mProbers[5] = new nsBig5Prober(); - mProbers[6] = new nsEUCTWProber(); + if (aLanguageFilter & NS_FILTER_JAPANESE) + { + mProbers[1] = new nsSJISProber(aLanguageFilter == NS_FILTER_JAPANESE); + mProbers[2] = new nsEUCJPProber(aLanguageFilter == NS_FILTER_JAPANESE); + } + if (aLanguageFilter & NS_FILTER_CHINESE_SIMPLIFIED) + mProbers[3] = new nsGB18030Prober(aLanguageFilter == NS_FILTER_CHINESE_SIMPLIFIED); + if (aLanguageFilter & NS_FILTER_KOREAN) + mProbers[4] = new nsEUCKRProber(aLanguageFilter == NS_FILTER_KOREAN); + if (aLanguageFilter & NS_FILTER_CHINESE_TRADITIONAL) + { + mProbers[5] = new nsBig5Prober(aLanguageFilter == NS_FILTER_CHINESE_TRADITIONAL); + mProbers[6] = new nsEUCTWProber(aLanguageFilter == NS_FILTER_CHINESE_TRADITIONAL); + } Reset(); } @@ -101,62 +113,59 @@ void nsMBCSGroupProber::Reset(void) } mBestGuess = -1; mState = eDetecting; + mKeepNext = 0; } nsProbingState nsMBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen) { nsProbingState st; - PRUint32 i; + PRUint32 start = 0; + PRUint32 keepNext = mKeepNext; //do filtering to reduce load to probers - char *highbyteBuf; - char *hptr; - PRBool keepNext = PR_TRUE; //assume previous is not ascii, it will do no harm except add some noise - hptr = highbyteBuf = (char*)PR_Malloc(aLen); - if (!hptr) - return mState; - for (i = 0; i < aLen; i++) + for (PRUint32 pos = 0; pos < aLen; ++pos) { - if (aBuf[i] & 0x80) + if (aBuf[pos] & 0x80) { - *hptr++ = aBuf[i]; - keepNext = PR_TRUE; + if (!keepNext) + start = pos; + keepNext = 2; } - else + else if (keepNext) { - //if previous is highbyte, keep this even it is a ASCII - if (keepNext) + if (--keepNext == 0) { - *hptr++ = aBuf[i]; - keepNext = PR_FALSE; + for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++) + { + if (!mIsActive[i]) + continue; + st = mProbers[i]->HandleData(aBuf + start, pos + 1 - start); + if (st == eFoundIt) + { + mBestGuess = i; + mState = eFoundIt; + return mState; + } + } } } } - for (i = 0; i < NUM_OF_PROBERS; i++) - { - if (!mIsActive[i]) - continue; - st = mProbers[i]->HandleData(highbyteBuf, hptr - highbyteBuf); - if (st == eFoundIt) - { - mBestGuess = i; - mState = eFoundIt; - break; - } - else if (st == eNotMe) - { - mIsActive[i] = PR_FALSE; - mActiveNum--; - if (mActiveNum <= 0) - { - mState = eNotMe; - break; - } - } + if (keepNext) { + for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++) + { + if (!mIsActive[i]) + continue; + st = mProbers[i]->HandleData(aBuf + start, aLen - start); + if (st == eFoundIt) + { + mBestGuess = i; + mState = eFoundIt; + return mState; + } + } } - - PR_FREEIF(highbyteBuf); + mKeepNext = keepNext; return mState; } @@ -207,3 +216,15 @@ void nsMBCSGroupProber::DumpStatus() } } #endif + +#ifdef DEBUG_jgmyers +void nsMBCSGroupProber::GetDetectorState(nsUniversalDetector::DetectorState (&states)[nsUniversalDetector::NumDetectors], PRUint32 &offset) +{ + for (PRUint32 i = 0; i < NUM_OF_PROBERS; ++i) { + states[offset].name = ProberName[i]; + states[offset].isActive = mIsActive[i]; + states[offset].confidence = mIsActive[i] ? mProbers[i]->GetConfidence() : 0.0; + ++offset; + } +} +#endif /* DEBUG_jgmyers */ diff --git a/src/nsMBCSGroupProber.h b/src/nsMBCSGroupProber.h index 6d2730a..c4e9964 100644 --- a/src/nsMBCSGroupProber.h +++ b/src/nsMBCSGroupProber.h @@ -20,6 +20,7 @@ * the Initial Developer. All Rights Reserved. * * Contributor(s): + * Proofpoint, Inc. * * Alternatively, the contents of this file may be used under the terms of * either the GNU General Public License Version 2 or later (the "GPL"), or @@ -50,7 +51,7 @@ class nsMBCSGroupProber: public nsCharSetProber { public: - nsMBCSGroupProber(); + nsMBCSGroupProber(PRUint32 aLanguageFilter); virtual ~nsMBCSGroupProber(); nsProbingState HandleData(const char* aBuf, PRUint32 aLen); const char* GetCharSetName(); @@ -62,6 +63,9 @@ public: #ifdef DEBUG_chardet void DumpStatus(); #endif +#ifdef DEBUG_jgmyers + void GetDetectorState(nsUniversalDetector::DetectorState (&states)[nsUniversalDetector::NumDetectors], PRUint32 &offset); +#endif protected: nsProbingState mState; @@ -69,6 +73,7 @@ protected: PRBool mIsActive[NUM_OF_PROBERS]; PRInt32 mBestGuess; PRUint32 mActiveNum; + PRUint32 mKeepNext; }; #endif /* nsMBCSGroupProber_h__ */ diff --git a/src/nsMBCSSM.cpp b/src/nsMBCSSM.cpp index 0231ad6..584e931 100644 --- a/src/nsMBCSSM.cpp +++ b/src/nsMBCSSM.cpp @@ -34,7 +34,6 @@ * the terms of any one of the MPL, the GPL or the LGPL. * * ***** END LICENSE BLOCK ***** */ -#include "uchardetDefine.h" #include "nsCodingStateMachine.h" /* @@ -45,7 +44,7 @@ Modification from frank tang's original work: // BIG5 -static PRUint32 BIG5_cls [ 256 / 8 ] = { +static const PRUint32 BIG5_cls [ 256 / 8 ] = { //PCK4BITS(0,1,1,1,1,1,1,1), // 00 - 07 PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07 //allow 0x00 as legal value PCK4BITS(1,1,1,1,1,1,0,0), // 08 - 0f @@ -82,7 +81,7 @@ PCK4BITS(3,3,3,3,3,3,3,0) // f8 - ff }; -static PRUint32 BIG5_st [ 3] = { +static const PRUint32 BIG5_st [ 3] = { PCK4BITS(eError,eStart,eStart, 3,eError,eError,eError,eError),//00-07 PCK4BITS(eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError),//08-0f PCK4BITS(eError,eStart,eStart,eStart,eStart,eStart,eStart,eStart) //10-17 @@ -90,15 +89,15 @@ PCK4BITS(eError,eStart,eStart,eStart,eStart,eStart,eStart,eStart) //10-17 static const PRUint32 Big5CharLenTable[] = {0, 1, 1, 2, 0}; -SMModel Big5SMModel = { +SMModel const Big5SMModel = { {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, BIG5_cls }, 5, {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, BIG5_st }, Big5CharLenTable, - CHARDET_ENCODING_BIG5, + "Big5", }; -static PRUint32 EUCJP_cls [ 256 / 8 ] = { +static const PRUint32 EUCJP_cls [ 256 / 8 ] = { //PCK4BITS(5,4,4,4,4,4,4,4), // 00 - 07 PCK4BITS(4,4,4,4,4,4,4,4), // 00 - 07 PCK4BITS(4,4,4,4,4,4,5,5), // 08 - 0f @@ -135,7 +134,7 @@ PCK4BITS(0,0,0,0,0,0,0,5) // f8 - ff }; -static PRUint32 EUCJP_st [ 5] = { +static const PRUint32 EUCJP_st [ 5] = { PCK4BITS( 3, 4, 3, 5,eStart,eError,eError,eError),//00-07 PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//08-0f PCK4BITS(eItsMe,eItsMe,eStart,eError,eStart,eError,eError,eError),//10-17 @@ -145,15 +144,15 @@ PCK4BITS( 3,eError,eError,eError,eStart,eStart,eStart,eStart) //20-27 static const PRUint32 EUCJPCharLenTable[] = {2, 2, 2, 3, 1, 0}; -SMModel EUCJPSMModel = { +const SMModel EUCJPSMModel = { {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCJP_cls }, 6, {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCJP_st }, EUCJPCharLenTable, - CHARDET_ENCODING_EUC_JP, + "EUC-JP", }; -static PRUint32 EUCKR_cls [ 256 / 8 ] = { +static const PRUint32 EUCKR_cls [ 256 / 8 ] = { //PCK4BITS(0,1,1,1,1,1,1,1), // 00 - 07 PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07 PCK4BITS(1,1,1,1,1,1,0,0), // 08 - 0f @@ -190,22 +189,22 @@ PCK4BITS(2,2,2,2,2,2,2,0) // f8 - ff }; -static PRUint32 EUCKR_st [ 2] = { +static const PRUint32 EUCKR_st [ 2] = { PCK4BITS(eError,eStart, 3,eError,eError,eError,eError,eError),//00-07 PCK4BITS(eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,eStart,eStart) //08-0f }; static const PRUint32 EUCKRCharLenTable[] = {0, 1, 2, 0}; -SMModel EUCKRSMModel = { +const SMModel EUCKRSMModel = { {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCKR_cls }, 4, {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCKR_st }, EUCKRCharLenTable, - CHARDET_ENCODING_EUC_KR, + "EUC-KR", }; -static PRUint32 EUCTW_cls [ 256 / 8 ] = { +static const PRUint32 EUCTW_cls [ 256 / 8 ] = { //PCK4BITS(0,2,2,2,2,2,2,2), // 00 - 07 PCK4BITS(2,2,2,2,2,2,2,2), // 00 - 07 PCK4BITS(2,2,2,2,2,2,0,0), // 08 - 0f @@ -242,7 +241,7 @@ PCK4BITS(3,3,3,3,3,3,3,0) // f8 - ff }; -static PRUint32 EUCTW_st [ 6] = { +static const PRUint32 EUCTW_st [ 6] = { PCK4BITS(eError,eError,eStart, 3, 3, 3, 4,eError),//00-07 PCK4BITS(eError,eError,eError,eError,eError,eError,eItsMe,eItsMe),//08-0f PCK4BITS(eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,eStart,eError),//10-17 @@ -253,12 +252,12 @@ PCK4BITS(eStart,eError,eStart,eStart,eStart,eStart,eStart,eStart) //28-2f static const PRUint32 EUCTWCharLenTable[] = {0, 0, 1, 2, 2, 2, 3}; -SMModel EUCTWSMModel = { +const SMModel EUCTWSMModel = { {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCTW_cls }, 7, {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCTW_st }, EUCTWCharLenTable, - CHARDET_ENCODING_EUC_TW, + "x-euc-tw", }; /* obsolete GB2312 by gb18030 @@ -317,7 +316,7 @@ SMModel GB2312SMModel = { // the following state machine data was created by perl script in // intl/chardet/tools. It should be the same as in PSM detector. -static PRUint32 GB18030_cls [ 256 / 8 ] = { +static const PRUint32 GB18030_cls [ 256 / 8 ] = { PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07 PCK4BITS(1,1,1,1,1,1,0,0), // 08 - 0f PCK4BITS(1,1,1,1,1,1,1,1), // 10 - 17 @@ -353,7 +352,7 @@ PCK4BITS(6,6,6,6,6,6,6,0) // f8 - ff }; -static PRUint32 GB18030_st [ 6] = { +static const PRUint32 GB18030_st [ 6] = { PCK4BITS(eError,eStart,eStart,eStart,eStart,eStart, 3,eError),//00-07 PCK4BITS(eError,eError,eError,eError,eError,eError,eItsMe,eItsMe),//08-0f PCK4BITS(eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,eStart),//10-17 @@ -369,17 +368,17 @@ PCK4BITS(eError,eError,eStart,eStart,eStart,eStart,eStart,eStart) //28-2f // 2 here. static const PRUint32 GB18030CharLenTable[] = {0, 1, 1, 1, 1, 1, 2}; -SMModel GB18030SMModel = { +const SMModel GB18030SMModel = { {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, GB18030_cls }, 7, {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, GB18030_st }, GB18030CharLenTable, - CHARDET_ENCODING_GB18030, + "GB18030", }; // sjis -static PRUint32 SJIS_cls [ 256 / 8 ] = { +static const PRUint32 SJIS_cls [ 256 / 8 ] = { //PCK4BITS(0,1,1,1,1,1,1,1), // 00 - 07 PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07 PCK4BITS(1,1,1,1,1,1,0,0), // 08 - 0f @@ -418,7 +417,7 @@ PCK4BITS(4,4,4,4,4,0,0,0) // f8 - ff }; -static PRUint32 SJIS_st [ 3] = { +static const PRUint32 SJIS_st [ 3] = { PCK4BITS(eError,eStart,eStart, 3,eError,eError,eError,eError),//00-07 PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//08-0f PCK4BITS(eItsMe,eItsMe,eError,eError,eStart,eStart,eStart,eStart) //10-17 @@ -426,129 +425,16 @@ PCK4BITS(eItsMe,eItsMe,eError,eError,eStart,eStart,eStart,eStart) //10-17 static const PRUint32 SJISCharLenTable[] = {0, 1, 1, 2, 0, 0}; -SMModel SJISSMModel = { +const SMModel SJISSMModel = { {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, SJIS_cls }, 6, {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, SJIS_st }, SJISCharLenTable, - CHARDET_ENCODING_SHIFT_JIS, + "Shift_JIS", }; -static PRUint32 UCS2BE_cls [ 256 / 8 ] = { -PCK4BITS(0,0,0,0,0,0,0,0), // 00 - 07 -PCK4BITS(0,0,1,0,0,2,0,0), // 08 - 0f -PCK4BITS(0,0,0,0,0,0,0,0), // 10 - 17 -PCK4BITS(0,0,0,3,0,0,0,0), // 18 - 1f -PCK4BITS(0,0,0,0,0,0,0,0), // 20 - 27 -PCK4BITS(0,3,3,3,3,3,0,0), // 28 - 2f -PCK4BITS(0,0,0,0,0,0,0,0), // 30 - 37 -PCK4BITS(0,0,0,0,0,0,0,0), // 38 - 3f -PCK4BITS(0,0,0,0,0,0,0,0), // 40 - 47 -PCK4BITS(0,0,0,0,0,0,0,0), // 48 - 4f -PCK4BITS(0,0,0,0,0,0,0,0), // 50 - 57 -PCK4BITS(0,0,0,0,0,0,0,0), // 58 - 5f -PCK4BITS(0,0,0,0,0,0,0,0), // 60 - 67 -PCK4BITS(0,0,0,0,0,0,0,0), // 68 - 6f -PCK4BITS(0,0,0,0,0,0,0,0), // 70 - 77 -PCK4BITS(0,0,0,0,0,0,0,0), // 78 - 7f -PCK4BITS(0,0,0,0,0,0,0,0), // 80 - 87 -PCK4BITS(0,0,0,0,0,0,0,0), // 88 - 8f -PCK4BITS(0,0,0,0,0,0,0,0), // 90 - 97 -PCK4BITS(0,0,0,0,0,0,0,0), // 98 - 9f -PCK4BITS(0,0,0,0,0,0,0,0), // a0 - a7 -PCK4BITS(0,0,0,0,0,0,0,0), // a8 - af -PCK4BITS(0,0,0,0,0,0,0,0), // b0 - b7 -PCK4BITS(0,0,0,0,0,0,0,0), // b8 - bf -PCK4BITS(0,0,0,0,0,0,0,0), // c0 - c7 -PCK4BITS(0,0,0,0,0,0,0,0), // c8 - cf -PCK4BITS(0,0,0,0,0,0,0,0), // d0 - d7 -PCK4BITS(0,0,0,0,0,0,0,0), // d8 - df -PCK4BITS(0,0,0,0,0,0,0,0), // e0 - e7 -PCK4BITS(0,0,0,0,0,0,0,0), // e8 - ef -PCK4BITS(0,0,0,0,0,0,0,0), // f0 - f7 -PCK4BITS(0,0,0,0,0,0,4,5) // f8 - ff -}; - - -static PRUint32 UCS2BE_st [ 7] = { -PCK4BITS( 5, 7, 7,eError, 4, 3,eError,eError),//00-07 -PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//08-0f -PCK4BITS(eItsMe,eItsMe, 6, 6, 6, 6,eError,eError),//10-17 -PCK4BITS( 6, 6, 6, 6, 6,eItsMe, 6, 6),//18-1f -PCK4BITS( 6, 6, 6, 6, 5, 7, 7,eError),//20-27 -PCK4BITS( 5, 8, 6, 6,eError, 6, 6, 6),//28-2f -PCK4BITS( 6, 6, 6, 6,eError,eError,eStart,eStart) //30-37 -}; - -static const PRUint32 UCS2BECharLenTable[] = {2, 2, 2, 0, 2, 2}; - -SMModel UCS2BESMModel = { - {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UCS2BE_cls }, - 6, - {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UCS2BE_st }, - UCS2BECharLenTable, - CHARDET_ENCODING_UTF_16BE, -}; - -static PRUint32 UCS2LE_cls [ 256 / 8 ] = { -PCK4BITS(0,0,0,0,0,0,0,0), // 00 - 07 -PCK4BITS(0,0,1,0,0,2,0,0), // 08 - 0f -PCK4BITS(0,0,0,0,0,0,0,0), // 10 - 17 -PCK4BITS(0,0,0,3,0,0,0,0), // 18 - 1f -PCK4BITS(0,0,0,0,0,0,0,0), // 20 - 27 -PCK4BITS(0,3,3,3,3,3,0,0), // 28 - 2f -PCK4BITS(0,0,0,0,0,0,0,0), // 30 - 37 -PCK4BITS(0,0,0,0,0,0,0,0), // 38 - 3f -PCK4BITS(0,0,0,0,0,0,0,0), // 40 - 47 -PCK4BITS(0,0,0,0,0,0,0,0), // 48 - 4f -PCK4BITS(0,0,0,0,0,0,0,0), // 50 - 57 -PCK4BITS(0,0,0,0,0,0,0,0), // 58 - 5f -PCK4BITS(0,0,0,0,0,0,0,0), // 60 - 67 -PCK4BITS(0,0,0,0,0,0,0,0), // 68 - 6f -PCK4BITS(0,0,0,0,0,0,0,0), // 70 - 77 -PCK4BITS(0,0,0,0,0,0,0,0), // 78 - 7f -PCK4BITS(0,0,0,0,0,0,0,0), // 80 - 87 -PCK4BITS(0,0,0,0,0,0,0,0), // 88 - 8f -PCK4BITS(0,0,0,0,0,0,0,0), // 90 - 97 -PCK4BITS(0,0,0,0,0,0,0,0), // 98 - 9f -PCK4BITS(0,0,0,0,0,0,0,0), // a0 - a7 -PCK4BITS(0,0,0,0,0,0,0,0), // a8 - af -PCK4BITS(0,0,0,0,0,0,0,0), // b0 - b7 -PCK4BITS(0,0,0,0,0,0,0,0), // b8 - bf -PCK4BITS(0,0,0,0,0,0,0,0), // c0 - c7 -PCK4BITS(0,0,0,0,0,0,0,0), // c8 - cf -PCK4BITS(0,0,0,0,0,0,0,0), // d0 - d7 -PCK4BITS(0,0,0,0,0,0,0,0), // d8 - df -PCK4BITS(0,0,0,0,0,0,0,0), // e0 - e7 -PCK4BITS(0,0,0,0,0,0,0,0), // e8 - ef -PCK4BITS(0,0,0,0,0,0,0,0), // f0 - f7 -PCK4BITS(0,0,0,0,0,0,4,5) // f8 - ff -}; - - -static PRUint32 UCS2LE_st [ 7] = { -PCK4BITS( 6, 6, 7, 6, 4, 3,eError,eError),//00-07 -PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//08-0f -PCK4BITS(eItsMe,eItsMe, 5, 5, 5,eError,eItsMe,eError),//10-17 -PCK4BITS( 5, 5, 5,eError, 5,eError, 6, 6),//18-1f -PCK4BITS( 7, 6, 8, 8, 5, 5, 5,eError),//20-27 -PCK4BITS( 5, 5, 5,eError,eError,eError, 5, 5),//28-2f -PCK4BITS( 5, 5, 5,eError, 5,eError,eStart,eStart) //30-37 -}; - -static const PRUint32 UCS2LECharLenTable[] = {2, 2, 2, 2, 2, 2}; - -SMModel UCS2LESMModel = { - {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UCS2LE_cls }, - 6, - {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UCS2LE_st }, - UCS2LECharLenTable, - CHARDET_ENCODING_UTF_16LE, -}; - - -static PRUint32 UTF8_cls [ 256 / 8 ] = { +static const PRUint32 UTF8_cls [ 256 / 8 ] = { //PCK4BITS(0,1,1,1,1,1,1,1), // 00 - 07 PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07 //allow 0x00 as a legal value PCK4BITS(1,1,1,1,1,1,0,0), // 08 - 0f @@ -585,7 +471,7 @@ PCK4BITS(12,13,13,13,14,15,0,0) // f8 - ff }; -static PRUint32 UTF8_st [ 26] = { +static const PRUint32 UTF8_st [ 26] = { PCK4BITS(eError,eStart,eError,eError,eError,eError, 12, 10),//00-07 PCK4BITS( 9, 11, 8, 7, 6, 5, 4, 3),//08-0f PCK4BITS(eError,eError,eError,eError,eError,eError,eError,eError),//10-17 @@ -617,11 +503,11 @@ PCK4BITS(eError,eError,eError,eError,eError,eError,eError,eError) //c8-cf static const PRUint32 UTF8CharLenTable[] = {0, 1, 0, 0, 0, 0, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6 }; -SMModel UTF8SMModel = { +const SMModel UTF8SMModel = { {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UTF8_cls }, 16, {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UTF8_st }, UTF8CharLenTable, - CHARDET_ENCODING_UTF_8, + "UTF-8", }; diff --git a/src/nsPkgInt.h b/src/nsPkgInt.h index 7617d6c..3caa912 100644 --- a/src/nsPkgInt.h +++ b/src/nsPkgInt.h @@ -68,7 +68,7 @@ typedef struct nsPkgInt { nsSftMsk sftmsk; nsBitSft bitsft; nsUnitMsk unitmsk; - PRUint32 *data; + const PRUint32* const data; } nsPkgInt; diff --git a/src/nsSBCSGroupProber.cpp b/src/nsSBCSGroupProber.cpp index 65afdfe..d8fef87 100644 --- a/src/nsSBCSGroupProber.cpp +++ b/src/nsSBCSGroupProber.cpp @@ -56,21 +56,22 @@ nsSBCSGroupProber::nsSBCSGroupProber() mProbers[7] = new nsSingleByteCharSetProber(&Win1253Model); mProbers[8] = new nsSingleByteCharSetProber(&Latin5BulgarianModel); mProbers[9] = new nsSingleByteCharSetProber(&Win1251BulgarianModel); + mProbers[10] = new nsSingleByteCharSetProber(&TIS620ThaiModel); nsHebrewProber *hebprober = new nsHebrewProber(); // Notice: Any change in these indexes - 10,11,12 must be reflected // in the code below as well. - mProbers[10] = hebprober; - mProbers[11] = new nsSingleByteCharSetProber(&Win1255Model, PR_FALSE, hebprober); // Logical Hebrew - mProbers[12] = new nsSingleByteCharSetProber(&Win1255Model, PR_TRUE, hebprober); // Visual Hebrew + mProbers[11] = hebprober; + mProbers[12] = new nsSingleByteCharSetProber(&Win1255Model, PR_FALSE, hebprober); // Logical Hebrew + mProbers[13] = new nsSingleByteCharSetProber(&Win1255Model, PR_TRUE, hebprober); // Visual Hebrew // Tell the Hebrew prober about the logical and visual probers - if (mProbers[10] && mProbers[11] && mProbers[12]) // all are not null + if (mProbers[11] && mProbers[12] && mProbers[13]) // all are not null { - hebprober->SetModelProbers(mProbers[11], mProbers[12]); + hebprober->SetModelProbers(mProbers[12], mProbers[13]); } else // One or more is null. avoid any Hebrew probing, null them all { - for (PRUint32 i = 10; i <= 12; ++i) + for (PRUint32 i = 11; i <= 13; ++i) { delete mProbers[i]; mProbers[i] = 0; diff --git a/src/nsSBCSGroupProber.h b/src/nsSBCSGroupProber.h index faa57ed..cfbf7e1 100644 --- a/src/nsSBCSGroupProber.h +++ b/src/nsSBCSGroupProber.h @@ -40,7 +40,7 @@ #define nsSBCSGroupProber_h__ -#define NUM_OF_SBCS_PROBERS 13 +#define NUM_OF_SBCS_PROBERS 14 class nsCharSetProber; class nsSBCSGroupProber: public nsCharSetProber { diff --git a/src/nsSBCharSetProber.h b/src/nsSBCharSetProber.h index 277ab07..d7180dc 100644 --- a/src/nsSBCharSetProber.h +++ b/src/nsSBCharSetProber.h @@ -51,19 +51,19 @@ typedef struct { - unsigned char *charToOrderMap; // [256] table use to find a char's order - char *precedenceMatrix; // [SAMPLE_SIZE][SAMPLE_SIZE]; table to find a 2-char sequence's frequency + const unsigned char* const charToOrderMap; // [256] table use to find a char's order + const PRUint8* const precedenceMatrix; // [SAMPLE_SIZE][SAMPLE_SIZE]; table to find a 2-char sequence's frequency float mTypicalPositiveRatio; // = freqSeqs / totalSeqs PRBool keepEnglishLetter; // says if this script contains English characters (not implemented) - const char* charsetName; + const char* const charsetName; } SequenceModel; class nsSingleByteCharSetProber : public nsCharSetProber{ public: - nsSingleByteCharSetProber(SequenceModel *model) + nsSingleByteCharSetProber(const SequenceModel *model) :mModel(model), mReversed(PR_FALSE), mNameProber(0) { Reset(); } - nsSingleByteCharSetProber(SequenceModel *model, PRBool reversed, nsCharSetProber* nameProber) + nsSingleByteCharSetProber(const SequenceModel *model, PRBool reversed, nsCharSetProber* nameProber) :mModel(model), mReversed(reversed), mNameProber(nameProber) { Reset(); } virtual const char* GetCharSetName(); @@ -87,7 +87,7 @@ public: protected: nsProbingState mState; - const SequenceModel *mModel; + const SequenceModel* const mModel; const PRBool mReversed; // PR_TRUE if we need to reverse every pair in the model lookup //char order of last character @@ -106,19 +106,20 @@ protected: }; -extern SequenceModel Koi8rModel; -extern SequenceModel Win1251Model; -extern SequenceModel Latin5Model; -extern SequenceModel MacCyrillicModel; -extern SequenceModel Ibm866Model; -extern SequenceModel Ibm855Model; -extern SequenceModel Latin7Model; -extern SequenceModel Win1253Model; -extern SequenceModel Latin5BulgarianModel; -extern SequenceModel Win1251BulgarianModel; -extern SequenceModel Latin2HungarianModel; -extern SequenceModel Win1250HungarianModel; -extern SequenceModel Win1255Model; +extern const SequenceModel Koi8rModel; +extern const SequenceModel Win1251Model; +extern const SequenceModel Latin5Model; +extern const SequenceModel MacCyrillicModel; +extern const SequenceModel Ibm866Model; +extern const SequenceModel Ibm855Model; +extern const SequenceModel Latin7Model; +extern const SequenceModel Win1253Model; +extern const SequenceModel Latin5BulgarianModel; +extern const SequenceModel Win1251BulgarianModel; +extern const SequenceModel Latin2HungarianModel; +extern const SequenceModel Win1250HungarianModel; +extern const SequenceModel Win1255Model; +extern const SequenceModel TIS620ThaiModel; #endif /* nsSingleByteCharSetProber_h__ */ diff --git a/src/nsSJISProber.cpp b/src/nsSJISProber.cpp index 9bab506..c7842f6 100644 --- a/src/nsSJISProber.cpp +++ b/src/nsSJISProber.cpp @@ -46,8 +46,8 @@ void nsSJISProber::Reset(void) { mCodingSM->Reset(); mState = eDetecting; - mContextAnalyser.Reset(); - mDistributionAnalyser.Reset(); + mContextAnalyser.Reset(mIsPreferredLanguage); + mDistributionAnalyser.Reset(mIsPreferredLanguage); } nsProbingState nsSJISProber::HandleData(const char* aBuf, PRUint32 aLen) @@ -57,11 +57,6 @@ nsProbingState nsSJISProber::HandleData(const char* aBuf, PRUint32 aLen) for (PRUint32 i = 0; i < aLen; i++) { codingState = mCodingSM->NextState(aBuf[i]); - if (codingState == eError) - { - mState = eNotMe; - break; - } if (codingState == eItsMe) { mState = eFoundIt; diff --git a/src/nsSJISProber.h b/src/nsSJISProber.h index 77a7085..1efb6e3 100644 --- a/src/nsSJISProber.h +++ b/src/nsSJISProber.h @@ -43,7 +43,6 @@ #ifndef nsSJISProber_h__ #define nsSJISProber_h__ -#include "uchardetDefine.h" #include "nsCharSetProber.h" #include "nsCodingStateMachine.h" #include "JpCntx.h" @@ -52,11 +51,13 @@ class nsSJISProber: public nsCharSetProber { public: - nsSJISProber(void){mCodingSM = new nsCodingStateMachine(&SJISSMModel); - Reset();} + nsSJISProber(PRBool aIsPreferredLanguage) + :mIsPreferredLanguage(aIsPreferredLanguage) + {mCodingSM = new nsCodingStateMachine(&SJISSMModel); + Reset();} virtual ~nsSJISProber(void){delete mCodingSM;} nsProbingState HandleData(const char* aBuf, PRUint32 aLen); - const char* GetCharSetName() {return CHARDET_ENCODING_SHIFT_JIS;} + const char* GetCharSetName() {return "Shift_JIS";} nsProbingState GetState(void) {return mState;} void Reset(void); float GetConfidence(void); @@ -70,6 +71,7 @@ protected: SJISDistributionAnalysis mDistributionAnalyser; char mLastChar[2]; + PRBool mIsPreferredLanguage; }; diff --git a/src/nsUTF8Prober.cpp b/src/nsUTF8Prober.cpp index 6d590b4..ab8d9f7 100644 --- a/src/nsUTF8Prober.cpp +++ b/src/nsUTF8Prober.cpp @@ -51,11 +51,6 @@ nsProbingState nsUTF8Prober::HandleData(const char* aBuf, PRUint32 aLen) for (PRUint32 i = 0; i < aLen; i++) { codingState = mCodingSM->NextState(aBuf[i]); - if (codingState == eError) - { - mState = eNotMe; - break; - } if (codingState == eItsMe) { mState = eFoundIt; diff --git a/src/nsUTF8Prober.h b/src/nsUTF8Prober.h index 3bc8874..21c91c4 100644 --- a/src/nsUTF8Prober.h +++ b/src/nsUTF8Prober.h @@ -38,7 +38,6 @@ #ifndef nsUTF8Prober_h__ #define nsUTF8Prober_h__ -#include "uchardetDefine.h" #include "nsCharSetProber.h" #include "nsCodingStateMachine.h" @@ -49,7 +48,7 @@ public: Reset(); } virtual ~nsUTF8Prober(){delete mCodingSM;} nsProbingState HandleData(const char* aBuf, PRUint32 aLen); - const char* GetCharSetName() {return CHARDET_ENCODING_UTF_8;} + const char* GetCharSetName() {return "UTF-8";} nsProbingState GetState(void) {return mState;} void Reset(void); float GetConfidence(void); diff --git a/src/nsUniversalDetector.cpp b/src/nsUniversalDetector.cpp index 1e16002..7af8f95 100644 --- a/src/nsUniversalDetector.cpp +++ b/src/nsUniversalDetector.cpp @@ -38,7 +38,6 @@ #include "nscore.h" -#include "uchardetDefine.h" #include "nsUniversalDetector.h" #include "nsMBCSGroupProber.h" @@ -46,7 +45,7 @@ #include "nsEscCharsetProber.h" #include "nsLatin1Prober.h" -nsUniversalDetector::nsUniversalDetector() +nsUniversalDetector::nsUniversalDetector(PRUint32 aLanguageFilter) { mDone = PR_FALSE; mBestGuess = -1; //illegal value as signal @@ -58,6 +57,7 @@ nsUniversalDetector::nsUniversalDetector() mGotData = PR_FALSE; mInputState = ePureAscii; mLastChar = '\0'; + mLanguageFilter = aLanguageFilter; PRUint32 i; for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++) @@ -67,10 +67,9 @@ nsUniversalDetector::nsUniversalDetector() nsUniversalDetector::~nsUniversalDetector() { for (PRInt32 i = 0; i < NUM_OF_CHARSET_PROBERS; i++) - if (mCharSetProbers[i]) - delete mCharSetProbers[i]; - if (mEscCharSetProber) - delete mEscCharSetProber; + delete mCharSetProbers[i]; + + delete mEscCharSetProber; } void @@ -111,37 +110,23 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen) if (mStart) { mStart = PR_FALSE; - if (aLen > 3) + if (aLen > 2) switch (aBuf[0]) { case '\xEF': if (('\xBB' == aBuf[1]) && ('\xBF' == aBuf[2])) // EF BB BF UTF-8 encoded BOM - mDetectedCharset = CHARDET_ENCODING_UTF_8; + mDetectedCharset = "UTF-8"; break; case '\xFE': - if (('\xFF' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3])) - // FE FF 00 00 UCS-4, unusual octet order BOM (3412) - mDetectedCharset = CHARDET_ENCODING_X_ISO_10646_UCS_4_3412; - else if ('\xFF' == aBuf[1]) + if ('\xFF' == aBuf[1]) // FE FF UTF-16, big endian BOM - mDetectedCharset = CHARDET_ENCODING_UTF_16BE; - break; - case '\x00': - if (('\x00' == aBuf[1]) && ('\xFE' == aBuf[2]) && ('\xFF' == aBuf[3])) - // 00 00 FE FF UTF-32, big-endian BOM - mDetectedCharset = CHARDET_ENCODING_UTF_32BE; - else if (('\x00' == aBuf[1]) && ('\xFF' == aBuf[2]) && ('\xFE' == aBuf[3])) - // 00 00 FF FE UCS-4, unusual octet order BOM (2143) - mDetectedCharset = CHARDET_ENCODING_X_ISO_10646_UCS_4_2143; + mDetectedCharset = "UTF-16"; break; case '\xFF': - if (('\xFE' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3])) - // FF FE 00 00 UTF-32, little-endian BOM - mDetectedCharset = CHARDET_ENCODING_UTF_32LE; - else if ('\xFE' == aBuf[1]) + if ('\xFE' == aBuf[1]) // FF FE UTF-16, little endian BOM - mDetectedCharset = CHARDET_ENCODING_UTF_16LE; + mDetectedCharset = "UTF-16"; break; } // switch @@ -172,16 +157,24 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen) //start multibyte and singlebyte charset prober if (nsnull == mCharSetProbers[0]) - mCharSetProbers[0] = new nsMBCSGroupProber; - if (nsnull == mCharSetProbers[1]) - mCharSetProbers[1] = new nsSBCSGroupProber; - if (nsnull == mCharSetProbers[2]) - mCharSetProbers[2] = new nsLatin1Prober; - - if ((nsnull == mCharSetProbers[0]) || - (nsnull == mCharSetProbers[1]) || - (nsnull == mCharSetProbers[2])) + { + mCharSetProbers[0] = new nsMBCSGroupProber(mLanguageFilter); + if (nsnull == mCharSetProbers[0]) return NS_ERROR_OUT_OF_MEMORY; + } + if (nsnull == mCharSetProbers[1] && + (mLanguageFilter & NS_FILTER_NON_CJK)) + { + mCharSetProbers[1] = new nsSBCSGroupProber; + if (nsnull == mCharSetProbers[1]) + return NS_ERROR_OUT_OF_MEMORY; + } + if (nsnull == mCharSetProbers[2]) + { + mCharSetProbers[2] = new nsLatin1Prober; + if (nsnull == mCharSetProbers[2]) + return NS_ERROR_OUT_OF_MEMORY; + } } } else @@ -202,7 +195,7 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen) { case eEscAscii: if (nsnull == mEscCharSetProber) { - mEscCharSetProber = new nsEscCharSetProber; + mEscCharSetProber = new nsEscCharSetProber(mLanguageFilter); if (nsnull == mEscCharSetProber) return NS_ERROR_OUT_OF_MEMORY; } @@ -216,12 +209,15 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen) case eHighbyte: for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++) { - st = mCharSetProbers[i]->HandleData(aBuf, aLen); - if (st == eFoundIt) + if (mCharSetProbers[i]) { - mDone = PR_TRUE; - mDetectedCharset = mCharSetProbers[i]->GetCharSetName(); - return NS_OK; + st = mCharSetProbers[i]->HandleData(aBuf, aLen); + if (st == eFoundIt) + { + mDone = PR_TRUE; + mDetectedCharset = mCharSetProbers[i]->GetCharSetName(); + return NS_OK; + } } } break; @@ -260,11 +256,14 @@ void nsUniversalDetector::DataEnd() for (PRInt32 i = 0; i < NUM_OF_CHARSET_PROBERS; i++) { - proberConfidence = mCharSetProbers[i]->GetConfidence(); - if (proberConfidence > maxProberConfidence) + if (mCharSetProbers[i]) { - maxProberConfidence = proberConfidence; - maxProber = i; + proberConfidence = mCharSetProbers[i]->GetConfidence(); + if (proberConfidence > maxProberConfidence) + { + maxProberConfidence = proberConfidence; + maxProber = i; + } } } //do not report anything because we are not confident of it, that's in fact a negative answer diff --git a/src/nsUniversalDetector.h b/src/nsUniversalDetector.h index 36f3fa0..525f722 100644 --- a/src/nsUniversalDetector.h +++ b/src/nsUniversalDetector.h @@ -48,9 +48,22 @@ typedef enum { eHighbyte = 2 } nsInputState; +#define NS_FILTER_CHINESE_SIMPLIFIED 0x01 +#define NS_FILTER_CHINESE_TRADITIONAL 0x02 +#define NS_FILTER_JAPANESE 0x04 +#define NS_FILTER_KOREAN 0x08 +#define NS_FILTER_NON_CJK 0x10 +#define NS_FILTER_ALL 0x1F +#define NS_FILTER_CHINESE (NS_FILTER_CHINESE_SIMPLIFIED | \ + NS_FILTER_CHINESE_TRADITIONAL) +#define NS_FILTER_CJK (NS_FILTER_CHINESE_SIMPLIFIED | \ + NS_FILTER_CHINESE_TRADITIONAL | \ + NS_FILTER_JAPANESE | \ + NS_FILTER_KOREAN) + class nsUniversalDetector { public: - nsUniversalDetector(); + nsUniversalDetector(PRUint32 aLanguageFilter); virtual ~nsUniversalDetector(); virtual nsresult HandleData(const char* aBuf, PRUint32 aLen); virtual void DataEnd(void); @@ -66,6 +79,7 @@ protected: char mLastChar; const char * mDetectedCharset; PRInt32 mBestGuess; + PRUint32 mLanguageFilter; nsCharSetProber *mCharSetProbers[NUM_OF_CHARSET_PROBERS]; nsCharSetProber *mEscCharSetProber; diff --git a/src/nscore.h b/src/nscore.h index 83d7485..e0b5a72 100644 --- a/src/nscore.h +++ b/src/nscore.h @@ -42,6 +42,8 @@ typedef int PRInt32; typedef unsigned int PRUint32; typedef short PRInt16; typedef unsigned short PRUint16; +typedef signed char PRInt8; +typedef unsigned char PRUint8; #define PR_FALSE false #define PR_TRUE true diff --git a/src/tools/uchardet.cpp b/src/tools/uchardet.cpp index 285248c..56765bc 100644 --- a/src/tools/uchardet.cpp +++ b/src/tools/uchardet.cpp @@ -44,38 +44,18 @@ #ifndef VERSION #define VERSION "Unknown" #endif -#define BUFFER_SIZE 32768 +#define BUFFER_SIZE 65536 + +char buffer[BUFFER_SIZE]; void detect(FILE * fp) { uchardet_t handle = uchardet_new(); - size_t size = BUFFER_SIZE; - char * buffer_in = (char *) malloc(size * sizeof(char)); - - while (fgets(buffer_in, size, fp) != NULL) + while (!feof(fp)) { - size_t freesize = size; - - char * buffer_in_p = buffer_in; - size_t line_length = strlen(buffer_in_p); - while (line_length + 1 == freesize && buffer_in_p[line_length - 2] != '\n') - { - buffer_in_p += size - 1; - freesize = size + 1; - size += size; - size_t offset = buffer_in_p - buffer_in; - buffer_in = (char *) realloc(buffer_in, size * sizeof(char)); - buffer_in_p = buffer_in + offset; - - if (fgets(buffer_in_p, freesize, fp) == NULL) - break; - - line_length = strlen(buffer_in_p); - } - - int retval = uchardet_handle_data(handle, buffer_in, strlen(buffer_in)); - + size_t len = fread(buffer, 1, BUFFER_SIZE, fp); + int retval = uchardet_handle_data(handle, buffer, len); if (retval != 0) { fprintf(stderr, "Handle data error.\n"); @@ -84,10 +64,10 @@ void detect(FILE * fp) } uchardet_data_end(handle); - printf("%s\n", uchardet_get_charset(handle)); + const char * charset = uchardet_get_charset(handle); + printf("%s\n", charset); uchardet_delete(handle); - free(buffer_in); } void show_version() diff --git a/src/uchardet.cpp b/src/uchardet.cpp index bc83e0f..74ab63c 100644 --- a/src/uchardet.cpp +++ b/src/uchardet.cpp @@ -37,67 +37,69 @@ #include "uchardet.h" #include "nscore.h" #include "nsUniversalDetector.h" -#include +#include -class DllDetector : public nsUniversalDetector +using std::string; + +class HandleUniversalDetector : public nsUniversalDetector { protected: - char charset_[256]; + string m_charset; public: - DllDetector() - : nsUniversalDetector() + HandleUniversalDetector() + : nsUniversalDetector(NS_FILTER_ALL) { - *charset_=0; + m_charset = ""; } - virtual ~DllDetector() + virtual ~HandleUniversalDetector() {} virtual void Report(const char* charset) { - strncpy( charset_ , charset , sizeof(charset_) ); + m_charset = charset; } virtual void Reset() { nsUniversalDetector::Reset(); - *charset_=0; + m_charset = ""; } const char* GetCharset() const { - return charset_; + return m_charset.c_str(); } }; uchardet_t uchardet_new() { - return reinterpret_cast (new DllDetector()); + return reinterpret_cast (new HandleUniversalDetector()); } void uchardet_delete(uchardet_t ud) { - delete reinterpret_cast(ud); + delete reinterpret_cast(ud); } int uchardet_handle_data(uchardet_t ud, const char * data, size_t len) { - nsresult ret = reinterpret_cast(ud)->HandleData(data, (PRUint32)len); + nsresult ret = reinterpret_cast(ud)->HandleData(data, (PRUint32)len); return (ret != NS_OK); } void uchardet_data_end(uchardet_t ud) { - reinterpret_cast(ud)->DataEnd(); + reinterpret_cast(ud)->DataEnd(); } void uchardet_reset(uchardet_t ud) { - reinterpret_cast(ud)->Reset(); + reinterpret_cast(ud)->Reset(); } const char* uchardet_get_charset(uchardet_t ud) { - return reinterpret_cast(ud)->GetCharset(); + return reinterpret_cast(ud)->GetCharset(); } diff --git a/win32.sh b/win32.sh new file mode 100644 index 0000000..b757380 --- /dev/null +++ b/win32.sh @@ -0,0 +1,7 @@ +mkdir --parents win32 \ +&& cd win32 \ +&& cmake .. \ + -G "MSYS Makefiles" \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_INSTALL_PREFIX="" \ +&& make \ No newline at end of file