Update code from upstream.

2025-12-06 16:56:40 +08:00 · 2011-07-11 14:42:50 +08:00 · 2011-07-11 14:42:50 +08:00 · 84284eccf4
commit 84284eccf4
parent 76a1be36f8
48 changed files with 411 additions and 532 deletions
--- a/debug.sh
+++ b/debug.sh
--- a/release.sh
+++ b/release.sh
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@ -11,26 +11,26 @@ set(
 	LangBulgarianModel.cpp
 	LangCyrillicModel.cpp
 	LangGreekModel.cpp
-	LangHebrewModel.cpp
 	LangHungarianModel.cpp
+	LangHebrewModel.cpp
 	LangThaiModel.cpp
-	nsBig5Prober.cpp
+	nsHebrewProber.cpp
 	nsCharSetProber.cpp
-	nsEscCharsetProber.cpp
-	nsEscSM.cpp
+	nsBig5Prober.cpp
 	nsEUCJPProber.cpp
 	nsEUCKRProber.cpp
 	nsEUCTWProber.cpp
+	nsEscCharsetProber.cpp
+	nsEscSM.cpp
 	nsGB2312Prober.cpp
-	nsHebrewProber.cpp
-	nsLatin1Prober.cpp
 	nsMBCSGroupProber.cpp
 	nsMBCSSM.cpp
-	nsSBCharSetProber.cpp
 	nsSBCSGroupProber.cpp
+	nsSBCharSetProber.cpp
 	nsSJISProber.cpp
-	nsUniversalDetector.cpp
 	nsUTF8Prober.cpp
+	nsLatin1Prober.cpp
+	nsUniversalDetector.cpp
 	uchardet.cpp
 )

--- a/src/CharDistribution.cpp
+++ b/src/CharDistribution.cpp
@ -46,15 +46,13 @@
 #define SURE_YES 0.99f
 #define SURE_NO  0.01f

-#define MINIMUM_DATA_THRESHOLD  4
-
 //return confidence base on received data
-float CharDistributionAnalysis::GetConfidence()
+float CharDistributionAnalysis::GetConfidence(void)
 { 
  //if we didn't receive any character in our consideration range, or the
-  //number of frequent characters is below the minimum threshold, return
+  // number of frequent characters is below the minimum threshold, return
  // negative answer
-  if (mTotalChars <= 0 || mFreqChars <= MINIMUM_DATA_THRESHOLD)
+  if (mTotalChars <= 0 || mFreqChars <= mDataThreshold)
    return SURE_NO;

  if (mTotalChars != mFreqChars) {
--- a/src/CharDistribution.h
+++ b/src/CharDistribution.h
@ -42,11 +42,12 @@

 #define ENOUGH_DATA_THRESHOLD 1024
 
+#define MINIMUM_DATA_THRESHOLD  4
+
 class CharDistributionAnalysis
 {
 public:
-  CharDistributionAnalysis() {Reset();}
-  virtual ~CharDistributionAnalysis(){};
+  CharDistributionAnalysis() {Reset(PR_FALSE);}

  //feed a block of data and do distribution analysis
  void HandleData(const char* aBuf, PRUint32 aLen) {}
@ -72,14 +73,15 @@ public:
  }

  //return confidence base on existing data
-  float GetConfidence();
+  float GetConfidence(void);

  //Reset analyser, clear any state 
-  void      Reset(void) 
+  void      Reset(PRBool aIsPreferredLanguage) 
  {
    mDone = PR_FALSE;
    mTotalChars = 0;
    mFreqChars = 0;
+    mDataThreshold = aIsPreferredLanguage ? 0 : MINIMUM_DATA_THRESHOLD;
  }

  //This function is for future extension. Caller can use this function to control
@ -105,6 +107,9 @@ protected:
  //Total character encounted.
  PRUint32 mTotalChars;

+  //Count that mFreqChars must exceed before GetConfidence() returns anything but a negative answer
+  PRUint32 mDataThreshold;
+
  //Mapping table to get frequency order from char order (get from GetOrder())
  const PRInt16  *mCharToFreqOrder;

--- a/src/JpCntx.cpp
+++ b/src/JpCntx.cpp
@ -39,7 +39,7 @@
 #include "JpCntx.h"

 //This is hiragana 2-char sequence table, the number in each cell represents its frequency category
-char jp2CharContext[83][83] = 
+const PRUint8 jp2CharContext[83][83] = 
 { 
 { 0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,},
 { 2,4,0,4,0,3,0,4,0,3,4,4,4,2,4,3,3,4,3,2,3,3,4,2,3,3,3,2,4,1,4,3,3,1,5,4,3,4,3,4,3,5,3,0,3,5,4,2,0,3,1,0,3,3,0,3,3,0,1,1,0,4,3,0,3,3,0,4,0,2,0,3,5,5,5,5,4,0,4,1,0,3,4,},
@ -170,7 +170,7 @@ void JapaneseContextAnalysis::HandleData(const char* aBuf, PRUint32 aLen)
  return;
 }

-void JapaneseContextAnalysis::Reset(void)
+void JapaneseContextAnalysis::Reset(PRBool aIsPreferredLanguage)
 {
  mTotalRel = 0;
  for (PRUint32 i = 0; i < NUM_OF_CATEGORY; i++)
@ -178,13 +178,14 @@ void JapaneseContextAnalysis::Reset(void)
  mNeedToSkipCharNum = 0;
  mLastCharOrder = -1;
  mDone = PR_FALSE;
+  mDataThreshold = aIsPreferredLanguage ? 0 : MINIMUM_DATA_THRESHOLD;
 }
 #define DONT_KNOW (float)-1

-float  JapaneseContextAnalysis::GetConfidence()
+float  JapaneseContextAnalysis::GetConfidence(void)
 {
  //This is just one way to calculate confidence. It works well for me.
-  if (mTotalRel > MINIMUM_DATA_THRESHOLD)
+  if (mTotalRel > mDataThreshold)
    return ((float)(mTotalRel - mRelSample[0]))/mTotalRel;
  else 
    return (float)DONT_KNOW;
@ -227,5 +228,3 @@ PRInt32 EUCJPContextAnalysis::GetOrder(const char* str, PRUint32 *charLen)
     return (unsigned char)*(str+1) - (unsigned char)0xa1;
  return -1;
 }
-
-
--- a/src/JpCntx.h
+++ b/src/JpCntx.h
@ -46,13 +46,12 @@
 #define MAX_REL_THRESHOLD     1000

 //hiragana frequency category table
-extern char jp2CharContext[83][83];
+extern const PRUint8 jp2CharContext[83][83];

 class JapaneseContextAnalysis
 {
 public:
-  JapaneseContextAnalysis() {Reset();}
-  virtual ~JapaneseContextAnalysis(){};
+  JapaneseContextAnalysis() {Reset(PR_FALSE);}

  void HandleData(const char* aBuf, PRUint32 aLen);

@ -75,8 +74,8 @@ public:
    mLastCharOrder = order;
  }

-  float GetConfidence();
-  void      Reset(void);
+  float GetConfidence(void);
+  void      Reset(PRBool aIsPreferredLanguage);
  void      SetOpion(){}
  PRBool GotEnoughData() {return mTotalRel > ENOUGH_REL_THRESHOLD;}

@ -84,11 +83,14 @@ protected:
  virtual PRInt32 GetOrder(const char* str, PRUint32 *charLen) = 0;
  virtual PRInt32 GetOrder(const char* str) = 0;

-  //category counters, each interger counts sequence in its category
+  //category counters, each integer counts sequences in its category
  PRUint32 mRelSample[NUM_OF_CATEGORY];

  //total sequence received
  PRUint32 mTotalRel;
+
+  //Number of sequences needed to trigger detection
+  PRUint32 mDataThreshold;
  
  //The order of previous char
  PRInt32  mLastCharOrder;
--- a/src/LangBulgarianModel.cpp
+++ b/src/LangBulgarianModel.cpp
@ -35,7 +35,6 @@
 *
 * ***** END LICENSE BLOCK ***** */

-#include "uchardetDefine.h"
 #include "nsSBCharSetProber.h"
 /****************************************************************
 255: Control characters that usually does not exist in any text
@ -49,7 +48,7 @@
 //this talbe is modified base on win1251BulgarianCharToOrderMap, so 
 //only number <64 is sure valid

-unsigned char Latin5_BulgarianCharToOrderMap[] =
+static const unsigned char Latin5_BulgarianCharToOrderMap[] =
 {
 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255,  //00
 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,  //10
@ -69,7 +68,7 @@ unsigned char Latin5_BulgarianCharToOrderMap[] =
 62,242,243,244, 58,245, 98,246,247,248,249,250,251, 91,252,253,  //f0
 };

-unsigned char win1251BulgarianCharToOrderMap[] =
+static const unsigned char win1251BulgarianCharToOrderMap[] =
 {
 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255,  //00
 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,  //10
@ -95,7 +94,7 @@ unsigned char win1251BulgarianCharToOrderMap[] =
 //first 1024 sequences:3.0618%
 //rest  sequences:     0.2992%
 //negative sequences:  0.0020% 
-char BulgarianLangModel[] = 
+static const PRUint8 BulgarianLangModel[] = 
 {
 0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,3,3,3,3,3,
 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,2,2,3,2,2,1,2,2,
@ -227,20 +226,20 @@ char BulgarianLangModel[] =
 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
 };

-SequenceModel Latin5BulgarianModel = 
+const SequenceModel Latin5BulgarianModel = 
 {
  Latin5_BulgarianCharToOrderMap,
  BulgarianLangModel,
  (float)0.969392,
  PR_FALSE,
-  CHARDET_ENCODING_ISO_8859_5
+  "ISO-8859-5"
 };

-SequenceModel Win1251BulgarianModel = 
+const SequenceModel Win1251BulgarianModel = 
 {
  win1251BulgarianCharToOrderMap,
  BulgarianLangModel,
  (float)0.969392,
  PR_FALSE,
-  CHARDET_ENCODING_WINDOWS_1251
+  "windows-1251"
 };
--- a/src/LangCyrillicModel.cpp
+++ b/src/LangCyrillicModel.cpp
@ -35,14 +35,13 @@
 *
 * ***** END LICENSE BLOCK ***** */

-#include "uchardetDefine.h"
 #include "nsSBCharSetProber.h"



 //KOI8-R language model
 //Character Mapping Table:
-unsigned char KOI8R_CharToOrderMap[] =
+static const unsigned char KOI8R_CharToOrderMap[] =
 {
 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255,  //00
 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,  //10
@ -62,7 +61,7 @@ unsigned char KOI8R_CharToOrderMap[] =
 35, 43, 45, 32, 40, 52, 56, 33, 61, 62, 51, 57, 47, 63, 50, 70,  //f0
 };

-unsigned char win1251_CharToOrderMap[] =
+static const unsigned char win1251_CharToOrderMap[] =
 {
 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255,  //00
 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,  //10
@ -82,7 +81,7 @@ unsigned char win1251_CharToOrderMap[] =
  9,  7,  6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27, 16,
 };

-unsigned char latin5_CharToOrderMap[] =
+static const unsigned char latin5_CharToOrderMap[] =
 {
 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255,  //00
 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,  //10
@ -102,7 +101,7 @@ unsigned char latin5_CharToOrderMap[] =
 239, 68,240,241,242,243,244,245,246,247,248,249,250,251,252,255,
 };

-unsigned char macCyrillic_CharToOrderMap[] =
+static const unsigned char macCyrillic_CharToOrderMap[] =
 {
 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255,  //00
 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,  //10
@ -122,7 +121,7 @@ unsigned char macCyrillic_CharToOrderMap[] =
  9,  7,  6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27,255,
 };

-unsigned char IBM855_CharToOrderMap[] =
+static const unsigned char IBM855_CharToOrderMap[] =
 {
 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255,  //00
 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,  //10
@ -142,7 +141,7 @@ unsigned char IBM855_CharToOrderMap[] =
 250, 18, 62, 20, 51, 25, 57, 30, 47, 29, 63, 22, 50,251,252,255,
 };

-unsigned char IBM866_CharToOrderMap[] =
+static const unsigned char IBM866_CharToOrderMap[] =
 {
 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255,  //00
 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,  //10
@ -168,7 +167,7 @@ unsigned char IBM866_CharToOrderMap[] =
 //first 1024 sequences: 2.3389%
 //rest  sequences:      0.1237%
 //negative sequences:   0.0009% 
-char RussianLangModel[] = 
+static const PRUint8 RussianLangModel[] = 
 {
 0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,1,3,3,3,3,1,3,3,3,2,3,2,3,3,
 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,2,2,2,2,2,0,0,2,
@ -301,56 +300,56 @@ char RussianLangModel[] =
 };


-SequenceModel Koi8rModel = 
+const SequenceModel Koi8rModel = 
 {
  KOI8R_CharToOrderMap,
  RussianLangModel,
  (float)0.976601,
  PR_FALSE,
-  CHARDET_ENCODING_KOI8_R
+  "KOI8-R"
 };

-SequenceModel Win1251Model = 
+const SequenceModel Win1251Model = 
 {
  win1251_CharToOrderMap,
  RussianLangModel,
  (float)0.976601,
  PR_FALSE,
-  CHARDET_ENCODING_WINDOWS_1251
+  "windows-1251"
 };

-SequenceModel Latin5Model = 
+const SequenceModel Latin5Model = 
 {
  latin5_CharToOrderMap,
  RussianLangModel,
  (float)0.976601,
  PR_FALSE,
-  CHARDET_ENCODING_ISO_8859_5
+  "ISO-8859-5"
 };

-SequenceModel MacCyrillicModel = 
+const SequenceModel MacCyrillicModel = 
 {
  macCyrillic_CharToOrderMap,
  RussianLangModel,
  (float)0.976601,
  PR_FALSE,
-  CHARDET_ENCODING_MACCYRILLIC
+  "x-mac-cyrillic"
 };

-SequenceModel Ibm866Model = 
+const SequenceModel Ibm866Model = 
 {
  IBM866_CharToOrderMap,
  RussianLangModel,
  (float)0.976601,
  PR_FALSE,
-  CHARDET_ENCODING_IBM866
+  "IBM866"
 };

-SequenceModel Ibm855Model = 
+const SequenceModel Ibm855Model = 
 {
  IBM855_CharToOrderMap,
  RussianLangModel,
  (float)0.976601,
  PR_FALSE,
-  CHARDET_ENCODING_IBM855
+  "IBM855"
 };
--- a/src/LangGreekModel.cpp
+++ b/src/LangGreekModel.cpp
@ -35,7 +35,6 @@
 *
 * ***** END LICENSE BLOCK ***** */

-#include "uchardetDefine.h"
 #include "nsSBCharSetProber.h"
 /****************************************************************
 255: Control characters that usually does not exist in any text
@ -46,7 +45,7 @@
 *****************************************************************/

 //Character Mapping Table:
-unsigned char Latin7_CharToOrderMap[] =
+static const unsigned char Latin7_CharToOrderMap[] =
 {
 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255,  //00
 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,  //10
@ -68,7 +67,7 @@ unsigned char Latin7_CharToOrderMap[] =



-unsigned char win1253_CharToOrderMap[] =
+static const unsigned char win1253_CharToOrderMap[] =
 {
 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255,  //00
 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,  //10
@ -94,7 +93,7 @@ unsigned char win1253_CharToOrderMap[] =
 //first 1024 sequences:1.7001%
 //rest  sequences:     0.0359%
 //negative sequences:  0.0148% 
-char GreekLangModel[] = 
+static const PRUint8 GreekLangModel[] = 
 {
 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
@ -226,20 +225,20 @@ char GreekLangModel[] =
 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 };

-SequenceModel Latin7Model = 
+const SequenceModel Latin7Model = 
 {
  Latin7_CharToOrderMap,
  GreekLangModel,
  (float)0.982851,
  PR_FALSE,
-  CHARDET_ENCODING_ISO_8859_7
+  "ISO-8859-7"
 };

-SequenceModel Win1253Model = 
+const SequenceModel Win1253Model = 
 {
  win1253_CharToOrderMap,
  GreekLangModel,
  (float)0.982851,
  PR_FALSE,
-  CHARDET_ENCODING_WINDOWS_1253
+  "windows-1253"
 };
--- a/src/LangHebrewModel.cpp
+++ b/src/LangHebrewModel.cpp
@ -37,7 +37,6 @@
 *
 * ***** END LICENSE BLOCK ***** */

-#include "uchardetDefine.h"
 #include "nsSBCharSetProber.h"


@ -51,7 +50,7 @@

 //Windows-1255 language model
 //Character Mapping Table:
-unsigned char win1255_CharToOrderMap[] =
+static const unsigned char win1255_CharToOrderMap[] =
 {
 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255,  //00
 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,  //10
@ -77,7 +76,7 @@ unsigned char win1255_CharToOrderMap[] =
 //first 1024 sequences: 1.5981%
 //rest  sequences:      0.087%
 //negative sequences:   0.0015% 
-char HebrewLangModel[] = 
+static const PRUint8 HebrewLangModel[] = 
 {
 0,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,2,3,2,1,2,0,1,0,0,
 3,0,3,1,0,0,1,3,2,0,1,1,2,0,2,2,2,1,1,1,1,2,1,1,1,2,0,0,2,2,0,1,
@ -209,12 +208,12 @@ char HebrewLangModel[] =
 0,0,0,0,0,0,0,0,0,0,1,2,1,0,0,0,0,0,1,1,1,1,1,0,1,0,0,0,1,1,0,0,
 };

-SequenceModel Win1255Model = 
+const SequenceModel Win1255Model = 
 {
  win1255_CharToOrderMap,
  HebrewLangModel,
  (float)0.984004,
  PR_FALSE,
-  CHARDET_ENCODING_WINDOWS_1255
+  "windows-1255"
 };

--- a/src/LangHungarianModel.cpp
+++ b/src/LangHungarianModel.cpp
@ -35,7 +35,6 @@
 *
 * ***** END LICENSE BLOCK ***** */

-#include "uchardetDefine.h"
 #include "nsSBCharSetProber.h"
 /****************************************************************
 255: Control characters that usually does not exist in any text
@ -46,7 +45,7 @@
 *****************************************************************/

 //Character Mapping Table:
-unsigned char Latin2_HungarianCharToOrderMap[] =
+static const unsigned char Latin2_HungarianCharToOrderMap[] =
 {
 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255,  //00
 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,  //10
@ -66,7 +65,7 @@ unsigned char Latin2_HungarianCharToOrderMap[] =
 245,246,247, 25, 73, 42, 24,248,249,250, 31, 56, 29,251,252,253,
 };

-unsigned char win1250HungarianCharToOrderMap[] =
+static const unsigned char win1250HungarianCharToOrderMap[] =
 {
 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255,  //00
 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,  //10
@ -92,7 +91,7 @@ unsigned char win1250HungarianCharToOrderMap[] =
 //first 1024 sequences:5.2623%
 //rest  sequences:     0.8894%
 //negative sequences:  0.0009% 
-char HungarianLangModel[] = 
+static const PRUint8 HungarianLangModel[] = 
 {
 0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
 3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,2,3,3,1,1,2,2,2,2,2,1,2,
@ -224,20 +223,20 @@ char HungarianLangModel[] =
 0,1,1,1,1,1,1,0,1,1,0,1,0,1,0,0,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,
 };

-SequenceModel Latin2HungarianModel = 
+const SequenceModel Latin2HungarianModel = 
 {
  Latin2_HungarianCharToOrderMap,
  HungarianLangModel,
  (float)0.947368,
  PR_TRUE,
-  CHARDET_ENCODING_ISO_8859_2
+  "ISO-8859-2"
 };

-SequenceModel Win1250HungarianModel = 
+const SequenceModel Win1250HungarianModel = 
 {
  win1250HungarianCharToOrderMap,
  HungarianLangModel,
  (float)0.947368,
  PR_TRUE,
-  CHARDET_ENCODING_WINDOWS_1250
+  "windows-1250"
 };
--- a/src/LangThaiModel.cpp
+++ b/src/LangThaiModel.cpp
@ -35,7 +35,6 @@
 *
 * ***** END LICENSE BLOCK ***** */

-#include "uchardetDefine.h"
 #include "nsSBCharSetProber.h"


@ -50,7 +49,7 @@
 //The following result for thai was collected from a limited sample (1M). 

 //Character Mapping Table:
-unsigned char TIS620CharToOrderMap[] =
+static const unsigned char TIS620CharToOrderMap[] =
 {
 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255,  //00
 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,  //10
@ -79,7 +78,7 @@ unsigned char TIS620CharToOrderMap[] =
 //first 1024 sequences:7.3177%
 //rest  sequences:     1.0230%
 //negative sequences:  0.0436% 
-char ThaiLangModel[] = 
+static const PRUint8 ThaiLangModel[] = 
 {
 0,1,3,3,3,3,0,0,3,3,0,3,3,0,3,3,3,3,3,3,3,3,0,0,3,3,3,0,3,3,3,3,
 0,3,3,0,0,0,1,3,0,3,3,2,3,3,0,1,2,3,3,3,3,0,2,0,2,0,0,3,2,1,2,2,
@ -212,11 +211,11 @@ char ThaiLangModel[] =
 };


-SequenceModel TIS620ThaiModel = 
+const SequenceModel TIS620ThaiModel = 
 {
  TIS620CharToOrderMap,
  ThaiLangModel,
  (float)0.926386,
  PR_FALSE,
-  CHARDET_ENCODING_TIS_620
+  "TIS-620"
 };
--- a/src/nsBig5Prober.cpp
+++ b/src/nsBig5Prober.cpp
@ -41,7 +41,7 @@ void  nsBig5Prober::Reset(void)
 {
  mCodingSM->Reset(); 
  mState = eDetecting;
-  mDistributionAnalyser.Reset();
+  mDistributionAnalyser.Reset(mIsPreferredLanguage);
 }

 nsProbingState nsBig5Prober::HandleData(const char* aBuf, PRUint32 aLen)
@ -51,11 +51,6 @@ nsProbingState nsBig5Prober::HandleData(const char* aBuf, PRUint32 aLen)
  for (PRUint32 i = 0; i < aLen; i++)
  {
    codingState = mCodingSM->NextState(aBuf[i]);
-    if (codingState == eError)
-    {
-      mState = eNotMe;
-      break;
-    }
    if (codingState == eItsMe)
    {
      mState = eFoundIt;
--- a/src/nsBig5Prober.h
+++ b/src/nsBig5Prober.h
@ -38,18 +38,19 @@
 #ifndef nsBig5Prober_h__
 #define nsBig5Prober_h__

-#include "uchardetDefine.h"
 #include "nsCharSetProber.h"
 #include "nsCodingStateMachine.h"
 #include "CharDistribution.h"

 class nsBig5Prober: public nsCharSetProber {
 public:
-  nsBig5Prober(void){mCodingSM = new nsCodingStateMachine(&Big5SMModel);
-                      Reset();}
+  nsBig5Prober(PRBool aIsPreferredLanguage)
+    :mIsPreferredLanguage(aIsPreferredLanguage) 
+  {mCodingSM = new nsCodingStateMachine(&Big5SMModel); 
+    Reset();}
  virtual ~nsBig5Prober(void){delete mCodingSM;}
  nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
-  const char* GetCharSetName() {return CHARDET_ENCODING_BIG5;}
+  const char* GetCharSetName() {return "Big5";}
  nsProbingState GetState(void) {return mState;}
  void      Reset(void);
  float     GetConfidence(void);
@ -64,6 +65,7 @@ protected:
  //Big5ContextAnalysis mContextAnalyser;
  Big5DistributionAnalysis mDistributionAnalyser;
  char mLastChar[2];
+  PRBool mIsPreferredLanguage;

 };

--- a/src/nsCharSetProber.h
+++ b/src/nsCharSetProber.h
@ -61,7 +61,7 @@ public:
  virtual void      SetOpion() = 0;

 #ifdef DEBUG_chardet
-  virtual void  DumpStatus() {}
+  virtual void  DumpStatus() {};
 #endif

  // Helper functions used in the Latin1 and Group probers.
--- a/src/nsCodingStateMachine.h
+++ b/src/nsCodingStateMachine.h
@ -59,10 +59,7 @@ typedef struct

 class nsCodingStateMachine {
 public:
-  nsCodingStateMachine(SMModel* sm){
-          mCurrentState = eStart;
-          mModel = sm;
-        }
+  nsCodingStateMachine(const SMModel* sm) : mModel(sm) { mCurrentState = eStart; }
  nsSMState NextState(char c){
    //for each byte we get its class , if it is first byte, we also get byte length
    PRUint32 byteCls = GETCLASS(c);
@ -86,23 +83,22 @@ protected:
  PRUint32 mCurrentCharLen;
  PRUint32 mCurrentBytePos;

-  SMModel *mModel;
+  const SMModel *mModel;
 };

-extern SMModel UTF8SMModel;
-extern SMModel Big5SMModel;
-extern SMModel EUCJPSMModel;
-extern SMModel EUCKRSMModel;
-extern SMModel EUCTWSMModel;
-extern SMModel GB18030SMModel;
-extern SMModel SJISSMModel;
-extern SMModel UCS2BESMModel;
+extern const SMModel UTF8SMModel;
+extern const SMModel Big5SMModel;
+extern const SMModel EUCJPSMModel;
+extern const SMModel EUCKRSMModel;
+extern const SMModel EUCTWSMModel;
+extern const SMModel GB18030SMModel;
+extern const SMModel SJISSMModel;


-extern SMModel HZSMModel;
-extern SMModel ISO2022CNSMModel;
-extern SMModel ISO2022JPSMModel;
-extern SMModel ISO2022KRSMModel;
+extern const SMModel HZSMModel;
+extern const SMModel ISO2022CNSMModel;
+extern const SMModel ISO2022JPSMModel;
+extern const SMModel ISO2022KRSMModel;

 #endif /* nsCodingStateMachine_h__ */

--- a/src/nsEUCJPProber.cpp
+++ b/src/nsEUCJPProber.cpp
@ -46,8 +46,8 @@ void  nsEUCJPProber::Reset(void)
 {
  mCodingSM->Reset(); 
  mState = eDetecting;
-  mContextAnalyser.Reset();
-  mDistributionAnalyser.Reset();
+  mContextAnalyser.Reset(mIsPreferredLanguage);
+  mDistributionAnalyser.Reset(mIsPreferredLanguage);
 }

 nsProbingState nsEUCJPProber::HandleData(const char* aBuf, PRUint32 aLen)
@ -57,11 +57,6 @@ nsProbingState nsEUCJPProber::HandleData(const char* aBuf, PRUint32 aLen)
  for (PRUint32 i = 0; i < aLen; i++)
  {
    codingState = mCodingSM->NextState(aBuf[i]);
-    if (codingState == eError)
-    {
-      mState = eNotMe;
-      break;
-    }
    if (codingState == eItsMe)
    {
      mState = eFoundIt;
--- a/src/nsEUCJPProber.h
+++ b/src/nsEUCJPProber.h
@ -43,7 +43,6 @@
 #ifndef nsEUCJPProber_h__
 #define nsEUCJPProber_h__

-#include "uchardetDefine.h"
 #include "nsCharSetProber.h"
 #include "nsCodingStateMachine.h"
 #include "JpCntx.h"
@ -51,11 +50,13 @@

 class nsEUCJPProber: public nsCharSetProber {
 public:
-  nsEUCJPProber(void){mCodingSM = new nsCodingStateMachine(&EUCJPSMModel);
-                      Reset();}
+  nsEUCJPProber(PRBool aIsPreferredLanguage)
+    :mIsPreferredLanguage(aIsPreferredLanguage)
+  {mCodingSM = new nsCodingStateMachine(&EUCJPSMModel);
+    Reset();}
  virtual ~nsEUCJPProber(void){delete mCodingSM;}
  nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
-  const char* GetCharSetName() {return CHARDET_ENCODING_EUC_JP;}
+  const char* GetCharSetName() {return "EUC-JP";}
  nsProbingState GetState(void) {return mState;}
  void      Reset(void);
  float     GetConfidence(void);
@ -69,6 +70,7 @@ protected:
  EUCJPDistributionAnalysis mDistributionAnalyser;

  char mLastChar[2];
+  PRBool mIsPreferredLanguage;
 };


--- a/src/nsEUCKRProber.cpp
+++ b/src/nsEUCKRProber.cpp
@ -41,7 +41,7 @@ void  nsEUCKRProber::Reset(void)
 {
  mCodingSM->Reset(); 
  mState = eDetecting;
-  mDistributionAnalyser.Reset();
+  mDistributionAnalyser.Reset(mIsPreferredLanguage);
  //mContextAnalyser.Reset();
 }

@ -52,11 +52,6 @@ nsProbingState nsEUCKRProber::HandleData(const char* aBuf, PRUint32 aLen)
  for (PRUint32 i = 0; i < aLen; i++)
  {
    codingState = mCodingSM->NextState(aBuf[i]);
-    if (codingState == eError)
-    {
-      mState = eNotMe;
-      break;
-    }
    if (codingState == eItsMe)
    {
      mState = eFoundIt;
--- a/src/nsEUCKRProber.h
+++ b/src/nsEUCKRProber.h
@ -38,18 +38,20 @@
 #ifndef nsEUCKRProber_h__
 #define nsEUCKRProber_h__

-#include "uchardetDefine.h"
 #include "nsCharSetProber.h"
 #include "nsCodingStateMachine.h"
 #include "CharDistribution.h"

 class nsEUCKRProber: public nsCharSetProber {
 public:
-  nsEUCKRProber(void){mCodingSM = new nsCodingStateMachine(&EUCKRSMModel);
-                      Reset();}
+  nsEUCKRProber(PRBool aIsPreferredLanguage)
+    :mIsPreferredLanguage(aIsPreferredLanguage)
+  {mCodingSM = new nsCodingStateMachine(&EUCKRSMModel);
+    Reset();
+  }
  virtual ~nsEUCKRProber(void){delete mCodingSM;}
  nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
-  const char* GetCharSetName() {return CHARDET_ENCODING_EUC_KR;}
+  const char* GetCharSetName() {return "EUC-KR";}
  nsProbingState GetState(void) {return mState;}
  void      Reset(void);
  float     GetConfidence(void);
@ -64,6 +66,7 @@ protected:
  //EUCKRContextAnalysis mContextAnalyser;
  EUCKRDistributionAnalysis mDistributionAnalyser;
  char mLastChar[2];
+  PRBool mIsPreferredLanguage;

 };

--- a/src/nsEUCTWProber.cpp
+++ b/src/nsEUCTWProber.cpp
@ -41,7 +41,7 @@ void  nsEUCTWProber::Reset(void)
 {
  mCodingSM->Reset(); 
  mState = eDetecting;
-  mDistributionAnalyser.Reset();
+  mDistributionAnalyser.Reset(mIsPreferredLanguage);
  //mContextAnalyser.Reset();
 }

@ -52,11 +52,6 @@ nsProbingState nsEUCTWProber::HandleData(const char* aBuf, PRUint32 aLen)
  for (PRUint32 i = 0; i < aLen; i++)
  {
    codingState = mCodingSM->NextState(aBuf[i]);
-    if (codingState == eError)
-    {
-      mState = eNotMe;
-      break;
-    }
    if (codingState == eItsMe)
    {
      mState = eFoundIt;
--- a/src/nsEUCTWProber.h
+++ b/src/nsEUCTWProber.h
@ -38,18 +38,19 @@
 #ifndef nsEUCTWProber_h__
 #define nsEUCTWProber_h__

-#include "uchardetDefine.h"
 #include "nsCharSetProber.h"
 #include "nsCodingStateMachine.h"
 #include "CharDistribution.h"

 class nsEUCTWProber: public nsCharSetProber {
 public:
-  nsEUCTWProber(void){mCodingSM = new nsCodingStateMachine(&EUCTWSMModel);
-                      Reset();}
+  nsEUCTWProber(PRBool aIsPreferredLanguage)
+    :mIsPreferredLanguage(aIsPreferredLanguage)
+  {mCodingSM = new nsCodingStateMachine(&EUCTWSMModel);
+    Reset();}
  virtual ~nsEUCTWProber(void){delete mCodingSM;}
  nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
-  const char* GetCharSetName() {return CHARDET_ENCODING_EUC_TW;}
+  const char* GetCharSetName() {return "x-euc-tw";}
  nsProbingState GetState(void) {return mState;}
  void      Reset(void);
  float     GetConfidence(void);
@ -64,6 +65,7 @@ protected:
  //EUCTWContextAnalysis mContextAnalyser;
  EUCTWDistributionAnalysis mDistributionAnalyser;
  char mLastChar[2];
+  PRBool mIsPreferredLanguage;

 };

--- a/src/nsEscCharsetProber.cpp
+++ b/src/nsEscCharsetProber.cpp
@ -37,13 +37,21 @@


 #include "nsEscCharsetProber.h"
+#include "nsUniversalDetector.h"

-nsEscCharSetProber::nsEscCharSetProber(void)
+nsEscCharSetProber::nsEscCharSetProber(PRUint32 aLanguageFilter)
 {
-  mCodingSM[0] = new nsCodingStateMachine(&HZSMModel);
-  mCodingSM[1] = new nsCodingStateMachine(&ISO2022CNSMModel);
-  mCodingSM[2] = new nsCodingStateMachine(&ISO2022JPSMModel);
-  mCodingSM[3] = new nsCodingStateMachine(&ISO2022KRSMModel);
+  for (PRUint32 i = 0; i < NUM_OF_ESC_CHARSETS; i++)
+    mCodingSM[i] = nsnull;
+  if (aLanguageFilter & NS_FILTER_CHINESE_SIMPLIFIED) 
+  {
+    mCodingSM[0] = new nsCodingStateMachine(&HZSMModel);
+    mCodingSM[1] = new nsCodingStateMachine(&ISO2022CNSMModel);
+  }
+  if (aLanguageFilter & NS_FILTER_JAPANESE)
+    mCodingSM[2] = new nsCodingStateMachine(&ISO2022JPSMModel);
+  if (aLanguageFilter & NS_FILTER_KOREAN)
+    mCodingSM[3] = new nsCodingStateMachine(&ISO2022KRSMModel);
  mActiveSM = NUM_OF_ESC_CHARSETS;
  mState = eDetecting;
  mDetectedCharset = nsnull;
@ -59,7 +67,8 @@ void nsEscCharSetProber::Reset(void)
 {
  mState = eDetecting;
  for (PRUint32 i = 0; i < NUM_OF_ESC_CHARSETS; i++)
-    mCodingSM[i]->Reset();
+    if (mCodingSM[i])
+      mCodingSM[i]->Reset();
  mActiveSM = NUM_OF_ESC_CHARSETS;
  mDetectedCharset = nsnull;
 }
@ -74,30 +83,15 @@ nsProbingState nsEscCharSetProber::HandleData(const char* aBuf, PRUint32 aLen)
  {
    for (j = mActiveSM-1; j>= 0; j--)
    {
-      //byte is feed to all active state machine 
-      codingState = mCodingSM[j]->NextState(aBuf[i]);
-      if (codingState == eError)
+      if (mCodingSM[j])
      {
-        //got negative answer for this state machine, make it inactive
-        mActiveSM--;
-        if (mActiveSM == 0)
+        codingState = mCodingSM[j]->NextState(aBuf[i]);
+        if (codingState == eItsMe)
        {
-          mState = eNotMe;
+          mState = eFoundIt;
+          mDetectedCharset = mCodingSM[j]->GetCodingStateMachine();
          return mState;
        }
-        else if (j != (PRInt32)mActiveSM)
-        {
-          nsCodingStateMachine* t;
-          t = mCodingSM[mActiveSM];
-          mCodingSM[mActiveSM] = mCodingSM[j];
-          mCodingSM[j] = t;
-        }
-      }
-      else if (codingState == eItsMe)
-      {
-        mState = eFoundIt;
-        mDetectedCharset = mCodingSM[j]->GetCodingStateMachine();
-        return mState;
      }
    }
  }
--- a/src/nsEscCharsetProber.h
+++ b/src/nsEscCharsetProber.h
@ -45,7 +45,7 @@

 class nsEscCharSetProber: public nsCharSetProber {
 public:
-  nsEscCharSetProber(void);
+  nsEscCharSetProber(PRUint32 aLanguageFilter);
  virtual ~nsEscCharSetProber(void);
  nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
  const char* GetCharSetName() {return mDetectedCharset;}
--- a/src/nsEscSM.cpp
+++ b/src/nsEscSM.cpp
@ -20,7 +20,6 @@
 * the Initial Developer. All Rights Reserved.
 *
 * Contributor(s):
- *   Kazutoshi Satoda
 *
 * Alternatively, the contents of this file may be used under the terms of
 * either the GNU General Public License Version 2 or later (the "GPL"), or
@ -35,10 +34,9 @@
 * the terms of any one of the MPL, the GPL or the LGPL.
 *
 * ***** END LICENSE BLOCK ***** */
-#include "uchardetDefine.h"
 #include "nsCodingStateMachine.h"

-static PRUint32 HZ_cls[ 256 / 8 ] = {
+static const PRUint32 HZ_cls[ 256 / 8 ] = {
 PCK4BITS(1,0,0,0,0,0,0,0),  // 00 - 07 
 PCK4BITS(0,0,0,0,0,0,0,0),  // 08 - 0f 
 PCK4BITS(0,0,0,0,0,0,0,0),  // 10 - 17 
@ -74,7 +72,7 @@ PCK4BITS(1,1,1,1,1,1,1,1)   // f8 - ff
 };


-static PRUint32 HZ_st [ 6] = {
+static const PRUint32 HZ_st [ 6] = {
 PCK4BITS(eStart,eError,     3,eStart,eStart,eStart,eError,eError),//00-07 
 PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//08-0f 
 PCK4BITS(eItsMe,eItsMe,eError,eError,eStart,eStart,     4,eError),//10-17 
@ -85,16 +83,16 @@ PCK4BITS(     4,eItsMe,eStart,eStart,eStart,eStart,eStart,eStart) //28-2f

 static const PRUint32 HZCharLenTable[] = {0, 0, 0, 0, 0, 0};

-SMModel HZSMModel = {
+const SMModel HZSMModel = {
  {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, HZ_cls },
   6,
  {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, HZ_st },
  HZCharLenTable,
-  CHARDET_ENCODING_HZ_GB_2312,
+  "HZ-GB-2312",
 };


-static PRUint32 ISO2022CN_cls [ 256 / 8 ] = {
+static const PRUint32 ISO2022CN_cls [ 256 / 8 ] = {
 PCK4BITS(2,0,0,0,0,0,0,0),  // 00 - 07 
 PCK4BITS(0,0,0,0,0,0,0,0),  // 08 - 0f 
 PCK4BITS(0,0,0,0,0,0,0,0),  // 10 - 17 
@ -130,7 +128,7 @@ PCK4BITS(2,2,2,2,2,2,2,2)   // f8 - ff
 };


-static PRUint32 ISO2022CN_st [ 8] = {
+static const PRUint32 ISO2022CN_st [ 8] = {
 PCK4BITS(eStart,     3,eError,eStart,eStart,eStart,eStart,eStart),//00-07 
 PCK4BITS(eStart,eError,eError,eError,eError,eError,eError,eError),//08-0f 
 PCK4BITS(eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe),//10-17 
@ -143,15 +141,15 @@ PCK4BITS(eError,eError,eError,eError,eError,eItsMe,eError,eStart) //38-3f

 static const PRUint32 ISO2022CNCharLenTable[] = {0, 0, 0, 0, 0, 0, 0, 0, 0};

-SMModel ISO2022CNSMModel = {
+const SMModel ISO2022CNSMModel = {
  {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022CN_cls },
  9,
  {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022CN_st },
  ISO2022CNCharLenTable,
-  CHARDET_ENCODING_ISO_2022_CN,
+  "ISO-2022-CN",
 };

-static PRUint32 ISO2022JP_cls [ 256 / 8 ] = {
+static const PRUint32 ISO2022JP_cls [ 256 / 8 ] = {
 PCK4BITS(2,0,0,0,0,0,0,0),  // 00 - 07 
 PCK4BITS(0,0,0,0,0,0,2,2),  // 08 - 0f 
 PCK4BITS(0,0,0,0,0,0,0,0),  // 10 - 17 
@ -187,7 +185,7 @@ PCK4BITS(2,2,2,2,2,2,2,2)   // f8 - ff
 };


-static PRUint32 ISO2022JP_st [ 9] = {
+static const PRUint32 ISO2022JP_st [ 9] = {
 PCK4BITS(eStart,     3,eError,eStart,eStart,eStart,eStart,eStart),//00-07 
 PCK4BITS(eStart,eStart,eError,eError,eError,eError,eError,eError),//08-0f 
 PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//10-17 
@ -199,17 +197,17 @@ PCK4BITS(eError,eError,eError,eItsMe,eError,eError,eError,eError),//38-3f
 PCK4BITS(eError,eError,eError,eError,eItsMe,eError,eStart,eStart) //40-47 
 };

-static const PRUint32 ISO2022JPCharLenTable[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+static const PRUint32 ISO2022JPCharLenTable[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; // one entry per class; ISO2022JPSMModel declares 10 classes

-SMModel ISO2022JPSMModel = {
+const SMModel ISO2022JPSMModel = {
  {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022JP_cls },
  10,
  {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022JP_st },
  ISO2022JPCharLenTable,
-  CHARDET_ENCODING_ISO_2022_JP,
+  "ISO-2022-JP",
 };

-static PRUint32 ISO2022KR_cls [ 256 / 8 ] = {
+static const PRUint32 ISO2022KR_cls [ 256 / 8 ] = {
 PCK4BITS(2,0,0,0,0,0,0,0),  // 00 - 07 
 PCK4BITS(0,0,0,0,0,0,0,0),  // 08 - 0f 
 PCK4BITS(0,0,0,0,0,0,0,0),  // 10 - 17 
@ -245,7 +243,7 @@ PCK4BITS(2,2,2,2,2,2,2,2)   // f8 - ff
 };


-static PRUint32 ISO2022KR_st [ 5] = {
+static const PRUint32 ISO2022KR_st [ 5] = {
 PCK4BITS(eStart,     3,eError,eStart,eStart,eStart,eError,eError),//00-07 
 PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//08-0f 
 PCK4BITS(eItsMe,eItsMe,eError,eError,eError,     4,eError,eError),//10-17 
@ -255,11 +253,11 @@ PCK4BITS(eError,eError,eError,eItsMe,eStart,eStart,eStart,eStart) //20-27

 static const PRUint32 ISO2022KRCharLenTable[] = {0, 0, 0, 0, 0, 0};

-SMModel ISO2022KRSMModel = {
+const SMModel ISO2022KRSMModel = {
  {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022KR_cls },
   6,
  {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022KR_st },
  ISO2022KRCharLenTable,
-  CHARDET_ENCODING_ISO_2022_KR,
+  "ISO-2022-KR",
 };

--- a/src/nsGB2312Prober.cpp
+++ b/src/nsGB2312Prober.cpp
@ -46,7 +46,7 @@ void  nsGB18030Prober::Reset(void)
 {
  mCodingSM->Reset(); 
  mState = eDetecting;
-  mDistributionAnalyser.Reset();
+  mDistributionAnalyser.Reset(mIsPreferredLanguage);
  //mContextAnalyser.Reset();
 }

@ -57,11 +57,6 @@ nsProbingState nsGB18030Prober::HandleData(const char* aBuf, PRUint32 aLen)
  for (PRUint32 i = 0; i < aLen; i++)
  {
    codingState = mCodingSM->NextState(aBuf[i]);
-    if (codingState == eError)
-    {
-      mState = eNotMe;
-      break;
-    }
    if (codingState == eItsMe)
    {
      mState = eFoundIt;
--- a/src/nsGB2312Prober.h
+++ b/src/nsGB2312Prober.h
@ -38,7 +38,6 @@
 #ifndef nsGB2312Prober_h__
 #define nsGB2312Prober_h__

-#include "uchardetDefine.h"
 #include "nsCharSetProber.h"
 #include "nsCodingStateMachine.h"
 #include "CharDistribution.h"
@ -47,11 +46,13 @@

 class nsGB18030Prober: public nsCharSetProber {
 public:
-  nsGB18030Prober(void){mCodingSM = new nsCodingStateMachine(&GB18030SMModel);
-                      Reset();}
+  nsGB18030Prober(PRBool aIsPreferredLanguage)
+    :mIsPreferredLanguage(aIsPreferredLanguage)
+  {mCodingSM = new nsCodingStateMachine(&GB18030SMModel);
+    Reset();}
  virtual ~nsGB18030Prober(void){delete mCodingSM;}
  nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
-  const char* GetCharSetName() {return CHARDET_ENCODING_GB18030;}
+  const char* GetCharSetName() {return "gb18030";}
  nsProbingState GetState(void) {return mState;}
  void      Reset(void);
  float     GetConfidence(void);
@ -66,6 +67,7 @@ protected:
  //GB2312ContextAnalysis mContextAnalyser;
  GB2312DistributionAnalysis mDistributionAnalyser;
  char mLastChar[2];
+  PRBool mIsPreferredLanguage;

 };

--- a/src/nsHebrewProber.cpp
+++ b/src/nsHebrewProber.cpp
@ -35,7 +35,6 @@
 *
 * ***** END LICENSE BLOCK ***** */

-#include "uchardetDefine.h"
 #include "nsHebrewProber.h"
 #include <stdio.h>

@ -59,8 +58,8 @@
 // If the difference is below this, don't rely at all on the model score distance.
 #define MIN_MODEL_DISTANCE (0.01)

-#define VISUAL_HEBREW_NAME (CHARDET_ENCODING_ISO_8859_8)
-#define LOGICAL_HEBREW_NAME (CHARDET_ENCODING_WINDOWS_1255)
+#define VISUAL_HEBREW_NAME ("ISO-8859-8")
+#define LOGICAL_HEBREW_NAME ("windows-1255")

 PRBool nsHebrewProber::isFinal(char c)
 {
--- a/src/nsLatin1Prober.cpp
+++ b/src/nsLatin1Prober.cpp
@ -50,7 +50,7 @@
 #define ASO    7        // accent small other
 #define CLASS_NUM   8    // total classes

-static unsigned char Latin1_CharToClass[] = 
+static const unsigned char Latin1_CharToClass[] = 
 {
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 00 - 07
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 08 - 0F
@ -92,7 +92,7 @@ static unsigned char Latin1_CharToClass[] =
   2 : normal 
   3 : very likely
 */
-static unsigned char Latin1ClassModel[] = 
+static const unsigned char Latin1ClassModel[] = 
 {
 /*      UDF OTH ASC ASS ACV ACO ASV ASO  */
 /*UDF*/  0,  0,  0,  0,  0,  0,  0,  0,
--- a/src/nsLatin1Prober.h
+++ b/src/nsLatin1Prober.h
@ -39,7 +39,6 @@
 #ifndef nsLatin1Prober_h__
 #define nsLatin1Prober_h__

-#include "uchardetDefine.h"
 #include "nsCharSetProber.h"

 #define FREQ_CAT_NUM    4
@ -49,7 +48,7 @@ public:
  nsLatin1Prober(void){Reset();}
  virtual ~nsLatin1Prober(void){}
  nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
-  const char* GetCharSetName() {return CHARDET_ENCODING_WINDOWS_1252;}
+  const char* GetCharSetName() {return "windows-1252";}
  nsProbingState GetState(void) {return mState;}
  void      Reset(void);
  float     GetConfidence(void);
--- a/src/nsMBCSGroupProber.cpp
+++ b/src/nsMBCSGroupProber.cpp
@ -21,6 +21,7 @@
 *
 * Contributor(s):
 *          Shy Shalom <shooshX@gmail.com>
+ *			Proofpoint, Inc.
 *
 * Alternatively, the contents of this file may be used under the terms of
 * either the GNU General Public License Version 2 or later (the "GPL"), or
@ -36,12 +37,12 @@
 *
 * ***** END LICENSE BLOCK ***** */
 #include <stdio.h>
-#include "prmem.h"

 #include "nsMBCSGroupProber.h"
+#include "nsUniversalDetector.h"

-#ifdef DEBUG_chardet
-char *ProberName[] = 
+#if defined(DEBUG_chardet) || defined(DEBUG_jgmyers)
+const char *ProberName[] = 
 {
  "UTF8",
  "SJIS",
@ -54,15 +55,26 @@ char *ProberName[] =

 #endif

-nsMBCSGroupProber::nsMBCSGroupProber()
+nsMBCSGroupProber::nsMBCSGroupProber(PRUint32 aLanguageFilter)
 {
+  for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
+    mProbers[i] = nsnull;
+
  mProbers[0] = new nsUTF8Prober();
-  mProbers[1] = new nsSJISProber();
-  mProbers[2] = new nsEUCJPProber();
-  mProbers[3] = new nsGB18030Prober();
-  mProbers[4] = new nsEUCKRProber();
-  mProbers[5] = new nsBig5Prober();
-  mProbers[6] = new nsEUCTWProber();
+  if (aLanguageFilter & NS_FILTER_JAPANESE) 
+  {
+    mProbers[1] = new nsSJISProber(aLanguageFilter == NS_FILTER_JAPANESE);
+    mProbers[2] = new nsEUCJPProber(aLanguageFilter == NS_FILTER_JAPANESE);
+  }
+  if (aLanguageFilter & NS_FILTER_CHINESE_SIMPLIFIED)
+    mProbers[3] = new nsGB18030Prober(aLanguageFilter == NS_FILTER_CHINESE_SIMPLIFIED);
+  if (aLanguageFilter & NS_FILTER_KOREAN)
+    mProbers[4] = new nsEUCKRProber(aLanguageFilter == NS_FILTER_KOREAN);
+  if (aLanguageFilter & NS_FILTER_CHINESE_TRADITIONAL) 
+  {
+    mProbers[5] = new nsBig5Prober(aLanguageFilter == NS_FILTER_CHINESE_TRADITIONAL);
+    mProbers[6] = new nsEUCTWProber(aLanguageFilter == NS_FILTER_CHINESE_TRADITIONAL);
+  }
  Reset();
 }

@ -101,62 +113,59 @@ void  nsMBCSGroupProber::Reset(void)
  }
  mBestGuess = -1;
  mState = eDetecting;
+  mKeepNext = 0;
 }

 nsProbingState nsMBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen)
 {
  nsProbingState st;
-  PRUint32 i;
+  PRUint32 start = 0;
+  PRUint32 keepNext = mKeepNext;

  //do filtering to reduce load to probers
-  char *highbyteBuf;
-  char *hptr;
-  PRBool keepNext = PR_TRUE;   //assume previous is not ascii, it will do no harm except add some noise
-  hptr = highbyteBuf = (char*)PR_Malloc(aLen);
-  if (!hptr)
-      return mState;
-  for (i = 0; i < aLen; i++)
+  for (PRUint32 pos = 0; pos < aLen; ++pos)
  {
-    if (aBuf[i] & 0x80)
+    if (aBuf[pos] & 0x80)
    {
-      *hptr++ = aBuf[i];
-      keepNext = PR_TRUE;
+      if (!keepNext)
+        start = pos;
+      keepNext = 2;
    }
-    else
+    else if (keepNext)
    {
-      //if previous is highbyte, keep this even it is a ASCII
-      if (keepNext)
+      if (--keepNext == 0)
      {
-          *hptr++ = aBuf[i];
-          keepNext = PR_FALSE;
+        for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
+        {
+          if (!mIsActive[i])
+            continue;
+          st = mProbers[i]->HandleData(aBuf + start, pos + 1 - start);
+          if (st == eFoundIt)
+          {
+            mBestGuess = i;
+            mState = eFoundIt;
+            return mState;
+          }
+        }
      }
    }
  }

-  for (i = 0; i < NUM_OF_PROBERS; i++)
-  {
-     if (!mIsActive[i])
-       continue;
-     st = mProbers[i]->HandleData(highbyteBuf, hptr - highbyteBuf);
-     if (st == eFoundIt)
-     {
-       mBestGuess = i;
-       mState = eFoundIt;
-       break;
-     }
-     else if (st == eNotMe)
-     {
-       mIsActive[i] = PR_FALSE;
-       mActiveNum--;
-       if (mActiveNum <= 0)
-       {
-         mState = eNotMe;
-         break;
-       }
-     }
+  if (keepNext) {
+    for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
+    {
+      if (!mIsActive[i])
+        continue;
+      st = mProbers[i]->HandleData(aBuf + start, aLen - start);
+      if (st == eFoundIt)
+      {
+        mBestGuess = i;
+        mState = eFoundIt;
+        return mState;
+      }
+    }
  }
-
-  PR_FREEIF(highbyteBuf);
+  mKeepNext = keepNext;

  return mState;
 }
@ -207,3 +216,15 @@ void nsMBCSGroupProber::DumpStatus()
  }
 }
 #endif
+
+#ifdef DEBUG_jgmyers
+void nsMBCSGroupProber::GetDetectorState(nsUniversalDetector::DetectorState (&states)[nsUniversalDetector::NumDetectors], PRUint32 &offset)
+{
+  for (PRUint32 i = 0; i < NUM_OF_PROBERS; ++i) {
+    states[offset].name = ProberName[i];
+    states[offset].isActive = mIsActive[i];
+    states[offset].confidence = mIsActive[i] ? mProbers[i]->GetConfidence() : 0.0;
+    ++offset;
+  }
+}
+#endif /* DEBUG_jgmyers */
--- a/src/nsMBCSGroupProber.h
+++ b/src/nsMBCSGroupProber.h
@ -20,6 +20,7 @@
 * the Initial Developer. All Rights Reserved.
 *
 * Contributor(s):
+ *			Proofpoint, Inc.
 *
 * Alternatively, the contents of this file may be used under the terms of
 * either the GNU General Public License Version 2 or later (the "GPL"), or
@ -50,7 +51,7 @@

 class nsMBCSGroupProber: public nsCharSetProber {
 public:
-  nsMBCSGroupProber();
+  nsMBCSGroupProber(PRUint32 aLanguageFilter);
  virtual ~nsMBCSGroupProber();
  nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
  const char* GetCharSetName();
@ -62,6 +63,9 @@ public:
 #ifdef DEBUG_chardet
  void  DumpStatus();
 #endif
+#ifdef DEBUG_jgmyers
+  void GetDetectorState(nsUniversalDetector::DetectorState (&states)[nsUniversalDetector::NumDetectors], PRUint32 &offset);
+#endif

 protected:
  nsProbingState mState;
@ -69,6 +73,7 @@ protected:
  PRBool          mIsActive[NUM_OF_PROBERS];
  PRInt32 mBestGuess;
  PRUint32 mActiveNum;
+  PRUint32 mKeepNext;
 };

 #endif /* nsMBCSGroupProber_h__ */
--- a/src/nsMBCSSM.cpp
+++ b/src/nsMBCSSM.cpp
@ -34,7 +34,6 @@
 * the terms of any one of the MPL, the GPL or the LGPL.
 *
 * ***** END LICENSE BLOCK ***** */
-#include "uchardetDefine.h"
 #include "nsCodingStateMachine.h"

 /*
@ -45,7 +44,7 @@ Modification from frank tang's original work:

 // BIG5 

-static PRUint32 BIG5_cls [ 256 / 8 ] = {
+static const PRUint32 BIG5_cls [ 256 / 8 ] = {
 //PCK4BITS(0,1,1,1,1,1,1,1),  // 00 - 07 
 PCK4BITS(1,1,1,1,1,1,1,1),  // 00 - 07    //allow 0x00 as legal value
 PCK4BITS(1,1,1,1,1,1,0,0),  // 08 - 0f 
@ -82,7 +81,7 @@ PCK4BITS(3,3,3,3,3,3,3,0)   // f8 - ff
 };


-static PRUint32 BIG5_st [ 3] = {
+static const PRUint32 BIG5_st [ 3] = {
 PCK4BITS(eError,eStart,eStart,     3,eError,eError,eError,eError),//00-07 
 PCK4BITS(eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError),//08-0f 
 PCK4BITS(eError,eStart,eStart,eStart,eStart,eStart,eStart,eStart) //10-17 
@ -90,15 +89,15 @@ PCK4BITS(eError,eStart,eStart,eStart,eStart,eStart,eStart,eStart) //10-17

 static const PRUint32 Big5CharLenTable[] = {0, 1, 1, 2, 0};

-SMModel Big5SMModel = {
+SMModel const Big5SMModel = {
  {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, BIG5_cls },
    5,
  {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, BIG5_st },
  Big5CharLenTable,
-  CHARDET_ENCODING_BIG5,
+  "Big5",
 };

-static PRUint32 EUCJP_cls [ 256 / 8 ] = {
+static const PRUint32 EUCJP_cls [ 256 / 8 ] = {
 //PCK4BITS(5,4,4,4,4,4,4,4),  // 00 - 07 
 PCK4BITS(4,4,4,4,4,4,4,4),  // 00 - 07 
 PCK4BITS(4,4,4,4,4,4,5,5),  // 08 - 0f 
@ -135,7 +134,7 @@ PCK4BITS(0,0,0,0,0,0,0,5)   // f8 - ff
 };


-static PRUint32 EUCJP_st [ 5] = {
+static const PRUint32 EUCJP_st [ 5] = {
 PCK4BITS(     3,     4,     3,     5,eStart,eError,eError,eError),//00-07 
 PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//08-0f 
 PCK4BITS(eItsMe,eItsMe,eStart,eError,eStart,eError,eError,eError),//10-17 
@ -145,15 +144,15 @@ PCK4BITS(     3,eError,eError,eError,eStart,eStart,eStart,eStart) //20-27

 static const PRUint32 EUCJPCharLenTable[] = {2, 2, 2, 3, 1, 0};

-SMModel EUCJPSMModel = {
+const SMModel EUCJPSMModel = {
  {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCJP_cls },
   6,
  {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCJP_st },
  EUCJPCharLenTable,
-  CHARDET_ENCODING_EUC_JP,
+  "EUC-JP",
 };

-static PRUint32 EUCKR_cls [ 256 / 8 ] = {
+static const PRUint32 EUCKR_cls [ 256 / 8 ] = {
 //PCK4BITS(0,1,1,1,1,1,1,1),  // 00 - 07 
 PCK4BITS(1,1,1,1,1,1,1,1),  // 00 - 07 
 PCK4BITS(1,1,1,1,1,1,0,0),  // 08 - 0f 
@ -190,22 +189,22 @@ PCK4BITS(2,2,2,2,2,2,2,0)   // f8 - ff
 };


-static PRUint32 EUCKR_st [ 2] = {
+static const PRUint32 EUCKR_st [ 2] = {
 PCK4BITS(eError,eStart,     3,eError,eError,eError,eError,eError),//00-07 
 PCK4BITS(eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,eStart,eStart) //08-0f 
 };

 static const PRUint32 EUCKRCharLenTable[] = {0, 1, 2, 0};

-SMModel EUCKRSMModel = {
+const SMModel EUCKRSMModel = {
  {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCKR_cls },
  4,
  {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCKR_st },
  EUCKRCharLenTable,
-  CHARDET_ENCODING_EUC_KR,
+  "EUC-KR",
 };

-static PRUint32 EUCTW_cls [ 256 / 8 ] = {
+static const PRUint32 EUCTW_cls [ 256 / 8 ] = {
 //PCK4BITS(0,2,2,2,2,2,2,2),  // 00 - 07 
 PCK4BITS(2,2,2,2,2,2,2,2),  // 00 - 07 
 PCK4BITS(2,2,2,2,2,2,0,0),  // 08 - 0f 
@ -242,7 +241,7 @@ PCK4BITS(3,3,3,3,3,3,3,0)   // f8 - ff
 };


-static PRUint32 EUCTW_st [ 6] = {
+static const PRUint32 EUCTW_st [ 6] = {
 PCK4BITS(eError,eError,eStart,     3,     3,     3,     4,eError),//00-07 
 PCK4BITS(eError,eError,eError,eError,eError,eError,eItsMe,eItsMe),//08-0f 
 PCK4BITS(eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,eStart,eError),//10-17 
@ -253,12 +252,12 @@ PCK4BITS(eStart,eError,eStart,eStart,eStart,eStart,eStart,eStart) //28-2f

 static const PRUint32 EUCTWCharLenTable[] = {0, 0, 1, 2, 2, 2, 3};

-SMModel EUCTWSMModel = {
+const SMModel EUCTWSMModel = {
  {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCTW_cls },
   7,
  {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCTW_st },
  EUCTWCharLenTable,
-  CHARDET_ENCODING_EUC_TW,
+  "x-euc-tw",
 };

 /* obsolete GB2312 by gb18030
@ -317,7 +316,7 @@ SMModel GB2312SMModel = {

 // the following state machine data was created by perl script in 
 // intl/chardet/tools. It should be the same as in PSM detector.
-static PRUint32 GB18030_cls [ 256 / 8 ] = {
+static const PRUint32 GB18030_cls [ 256 / 8 ] = {
 PCK4BITS(1,1,1,1,1,1,1,1),  // 00 - 07 
 PCK4BITS(1,1,1,1,1,1,0,0),  // 08 - 0f 
 PCK4BITS(1,1,1,1,1,1,1,1),  // 10 - 17 
@ -353,7 +352,7 @@ PCK4BITS(6,6,6,6,6,6,6,0)   // f8 - ff
 };


-static PRUint32 GB18030_st [ 6] = {
+static const PRUint32 GB18030_st [ 6] = {
 PCK4BITS(eError,eStart,eStart,eStart,eStart,eStart,     3,eError),//00-07 
 PCK4BITS(eError,eError,eError,eError,eError,eError,eItsMe,eItsMe),//08-0f 
 PCK4BITS(eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,eStart),//10-17 
@ -369,17 +368,17 @@ PCK4BITS(eError,eError,eStart,eStart,eStart,eStart,eStart,eStart) //28-2f
 // 2 here. 
 static const PRUint32 GB18030CharLenTable[] = {0, 1, 1, 1, 1, 1, 2};

-SMModel GB18030SMModel = {
+const SMModel GB18030SMModel = {
  {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, GB18030_cls },
   7,
  {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, GB18030_st },
  GB18030CharLenTable,
-  CHARDET_ENCODING_GB18030,
+  "GB18030",
 };

 // sjis

-static PRUint32 SJIS_cls [ 256 / 8 ] = {
+static const PRUint32 SJIS_cls [ 256 / 8 ] = {
 //PCK4BITS(0,1,1,1,1,1,1,1),  // 00 - 07 
 PCK4BITS(1,1,1,1,1,1,1,1),  // 00 - 07 
 PCK4BITS(1,1,1,1,1,1,0,0),  // 08 - 0f 
@ -418,7 +417,7 @@ PCK4BITS(4,4,4,4,4,0,0,0)   // f8 - ff
 };


-static PRUint32 SJIS_st [ 3] = {
+static const PRUint32 SJIS_st [ 3] = {
 PCK4BITS(eError,eStart,eStart,     3,eError,eError,eError,eError),//00-07 
 PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//08-0f 
 PCK4BITS(eItsMe,eItsMe,eError,eError,eStart,eStart,eStart,eStart) //10-17 
@ -426,129 +425,16 @@ PCK4BITS(eItsMe,eItsMe,eError,eError,eStart,eStart,eStart,eStart) //10-17

 static const PRUint32 SJISCharLenTable[] = {0, 1, 1, 2, 0, 0};

-SMModel SJISSMModel = {
+const SMModel SJISSMModel = {
  {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, SJIS_cls },
   6,
  {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, SJIS_st },
  SJISCharLenTable,
-  CHARDET_ENCODING_SHIFT_JIS,
+  "Shift_JIS",
 };


-static PRUint32 UCS2BE_cls [ 256 / 8 ] = {
-PCK4BITS(0,0,0,0,0,0,0,0),  // 00 - 07 
-PCK4BITS(0,0,1,0,0,2,0,0),  // 08 - 0f 
-PCK4BITS(0,0,0,0,0,0,0,0),  // 10 - 17 
-PCK4BITS(0,0,0,3,0,0,0,0),  // 18 - 1f 
-PCK4BITS(0,0,0,0,0,0,0,0),  // 20 - 27 
-PCK4BITS(0,3,3,3,3,3,0,0),  // 28 - 2f 
-PCK4BITS(0,0,0,0,0,0,0,0),  // 30 - 37 
-PCK4BITS(0,0,0,0,0,0,0,0),  // 38 - 3f 
-PCK4BITS(0,0,0,0,0,0,0,0),  // 40 - 47 
-PCK4BITS(0,0,0,0,0,0,0,0),  // 48 - 4f 
-PCK4BITS(0,0,0,0,0,0,0,0),  // 50 - 57 
-PCK4BITS(0,0,0,0,0,0,0,0),  // 58 - 5f 
-PCK4BITS(0,0,0,0,0,0,0,0),  // 60 - 67 
-PCK4BITS(0,0,0,0,0,0,0,0),  // 68 - 6f 
-PCK4BITS(0,0,0,0,0,0,0,0),  // 70 - 77 
-PCK4BITS(0,0,0,0,0,0,0,0),  // 78 - 7f 
-PCK4BITS(0,0,0,0,0,0,0,0),  // 80 - 87 
-PCK4BITS(0,0,0,0,0,0,0,0),  // 88 - 8f 
-PCK4BITS(0,0,0,0,0,0,0,0),  // 90 - 97 
-PCK4BITS(0,0,0,0,0,0,0,0),  // 98 - 9f 
-PCK4BITS(0,0,0,0,0,0,0,0),  // a0 - a7 
-PCK4BITS(0,0,0,0,0,0,0,0),  // a8 - af 
-PCK4BITS(0,0,0,0,0,0,0,0),  // b0 - b7 
-PCK4BITS(0,0,0,0,0,0,0,0),  // b8 - bf 
-PCK4BITS(0,0,0,0,0,0,0,0),  // c0 - c7 
-PCK4BITS(0,0,0,0,0,0,0,0),  // c8 - cf 
-PCK4BITS(0,0,0,0,0,0,0,0),  // d0 - d7 
-PCK4BITS(0,0,0,0,0,0,0,0),  // d8 - df 
-PCK4BITS(0,0,0,0,0,0,0,0),  // e0 - e7 
-PCK4BITS(0,0,0,0,0,0,0,0),  // e8 - ef 
-PCK4BITS(0,0,0,0,0,0,0,0),  // f0 - f7 
-PCK4BITS(0,0,0,0,0,0,4,5)   // f8 - ff 
-};
-
-
-static PRUint32 UCS2BE_st [ 7] = {
-PCK4BITS(     5,     7,     7,eError,     4,     3,eError,eError),//00-07 
-PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//08-0f 
-PCK4BITS(eItsMe,eItsMe,     6,     6,     6,     6,eError,eError),//10-17 
-PCK4BITS(     6,     6,     6,     6,     6,eItsMe,     6,     6),//18-1f 
-PCK4BITS(     6,     6,     6,     6,     5,     7,     7,eError),//20-27 
-PCK4BITS(     5,     8,     6,     6,eError,     6,     6,     6),//28-2f 
-PCK4BITS(     6,     6,     6,     6,eError,eError,eStart,eStart) //30-37 
-};
-
-static const PRUint32 UCS2BECharLenTable[] = {2, 2, 2, 0, 2, 2};
-
-SMModel UCS2BESMModel = {
-  {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UCS2BE_cls },
-   6,
-  {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UCS2BE_st },
-  UCS2BECharLenTable,
-  CHARDET_ENCODING_UTF_16BE,
-};
-
-static PRUint32 UCS2LE_cls [ 256 / 8 ] = {
-PCK4BITS(0,0,0,0,0,0,0,0),  // 00 - 07 
-PCK4BITS(0,0,1,0,0,2,0,0),  // 08 - 0f 
-PCK4BITS(0,0,0,0,0,0,0,0),  // 10 - 17 
-PCK4BITS(0,0,0,3,0,0,0,0),  // 18 - 1f 
-PCK4BITS(0,0,0,0,0,0,0,0),  // 20 - 27 
-PCK4BITS(0,3,3,3,3,3,0,0),  // 28 - 2f 
-PCK4BITS(0,0,0,0,0,0,0,0),  // 30 - 37 
-PCK4BITS(0,0,0,0,0,0,0,0),  // 38 - 3f 
-PCK4BITS(0,0,0,0,0,0,0,0),  // 40 - 47 
-PCK4BITS(0,0,0,0,0,0,0,0),  // 48 - 4f 
-PCK4BITS(0,0,0,0,0,0,0,0),  // 50 - 57 
-PCK4BITS(0,0,0,0,0,0,0,0),  // 58 - 5f 
-PCK4BITS(0,0,0,0,0,0,0,0),  // 60 - 67 
-PCK4BITS(0,0,0,0,0,0,0,0),  // 68 - 6f 
-PCK4BITS(0,0,0,0,0,0,0,0),  // 70 - 77 
-PCK4BITS(0,0,0,0,0,0,0,0),  // 78 - 7f 
-PCK4BITS(0,0,0,0,0,0,0,0),  // 80 - 87 
-PCK4BITS(0,0,0,0,0,0,0,0),  // 88 - 8f 
-PCK4BITS(0,0,0,0,0,0,0,0),  // 90 - 97 
-PCK4BITS(0,0,0,0,0,0,0,0),  // 98 - 9f 
-PCK4BITS(0,0,0,0,0,0,0,0),  // a0 - a7 
-PCK4BITS(0,0,0,0,0,0,0,0),  // a8 - af 
-PCK4BITS(0,0,0,0,0,0,0,0),  // b0 - b7 
-PCK4BITS(0,0,0,0,0,0,0,0),  // b8 - bf 
-PCK4BITS(0,0,0,0,0,0,0,0),  // c0 - c7 
-PCK4BITS(0,0,0,0,0,0,0,0),  // c8 - cf 
-PCK4BITS(0,0,0,0,0,0,0,0),  // d0 - d7 
-PCK4BITS(0,0,0,0,0,0,0,0),  // d8 - df 
-PCK4BITS(0,0,0,0,0,0,0,0),  // e0 - e7 
-PCK4BITS(0,0,0,0,0,0,0,0),  // e8 - ef 
-PCK4BITS(0,0,0,0,0,0,0,0),  // f0 - f7 
-PCK4BITS(0,0,0,0,0,0,4,5)   // f8 - ff 
-};
-
-
-static PRUint32 UCS2LE_st [ 7] = {
-PCK4BITS(     6,     6,     7,     6,     4,     3,eError,eError),//00-07 
-PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//08-0f 
-PCK4BITS(eItsMe,eItsMe,     5,     5,     5,eError,eItsMe,eError),//10-17 
-PCK4BITS(     5,     5,     5,eError,     5,eError,     6,     6),//18-1f 
-PCK4BITS(     7,     6,     8,     8,     5,     5,     5,eError),//20-27 
-PCK4BITS(     5,     5,     5,eError,eError,eError,     5,     5),//28-2f 
-PCK4BITS(     5,     5,     5,eError,     5,eError,eStart,eStart) //30-37 
-};
-
-static const PRUint32 UCS2LECharLenTable[] = {2, 2, 2, 2, 2, 2};
-
-SMModel UCS2LESMModel = {
-  {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UCS2LE_cls },
-   6,
-  {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UCS2LE_st },
-  UCS2LECharLenTable,
-  CHARDET_ENCODING_UTF_16LE,
-};
-
-
-static PRUint32 UTF8_cls [ 256 / 8 ] = {
+static const PRUint32 UTF8_cls [ 256 / 8 ] = {
 //PCK4BITS(0,1,1,1,1,1,1,1),  // 00 - 07 
 PCK4BITS(1,1,1,1,1,1,1,1),  // 00 - 07  //allow 0x00 as a legal value
 PCK4BITS(1,1,1,1,1,1,0,0),  // 08 - 0f 
@ -585,7 +471,7 @@ PCK4BITS(12,13,13,13,14,15,0,0)   // f8 - ff
 };


-static PRUint32 UTF8_st [ 26] = {
+static const PRUint32 UTF8_st [ 26] = {
 PCK4BITS(eError,eStart,eError,eError,eError,eError,     12,     10),//00-07 
 PCK4BITS(     9,     11,     8,     7,     6,     5,     4,     3),//08-0f 
 PCK4BITS(eError,eError,eError,eError,eError,eError,eError,eError),//10-17 
@ -617,11 +503,11 @@ PCK4BITS(eError,eError,eError,eError,eError,eError,eError,eError) //c8-cf
 static const PRUint32 UTF8CharLenTable[] = {0, 1, 0, 0, 0, 0, 2, 3, 
                            3, 3, 4, 4, 5, 5, 6, 6 };

-SMModel UTF8SMModel = {
+const SMModel UTF8SMModel = {
  {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UTF8_cls },
   16,
  {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UTF8_st },
  UTF8CharLenTable,
-  CHARDET_ENCODING_UTF_8,
+  "UTF-8",
 };

--- a/src/nsPkgInt.h
+++ b/src/nsPkgInt.h
@ -68,7 +68,7 @@ typedef struct nsPkgInt {
  nsSftMsk  sftmsk;
  nsBitSft  bitsft;
  nsUnitMsk unitmsk;
-  PRUint32  *data;
+  const PRUint32* const data;
 } nsPkgInt;


--- a/src/nsSBCSGroupProber.cpp
+++ b/src/nsSBCSGroupProber.cpp
@ -56,21 +56,22 @@ nsSBCSGroupProber::nsSBCSGroupProber()
  mProbers[7] = new nsSingleByteCharSetProber(&Win1253Model);
  mProbers[8] = new nsSingleByteCharSetProber(&Latin5BulgarianModel);
  mProbers[9] = new nsSingleByteCharSetProber(&Win1251BulgarianModel);
+  mProbers[10] = new nsSingleByteCharSetProber(&TIS620ThaiModel);

  nsHebrewProber *hebprober = new nsHebrewProber();
  // Notice: Any change in these indexes - 10,11,12 must be reflected
  // in the code below as well.
-  mProbers[10] = hebprober;
-  mProbers[11] = new nsSingleByteCharSetProber(&Win1255Model, PR_FALSE, hebprober); // Logical Hebrew
-  mProbers[12] = new nsSingleByteCharSetProber(&Win1255Model, PR_TRUE, hebprober); // Visual Hebrew
+  mProbers[11] = hebprober;
+  mProbers[12] = new nsSingleByteCharSetProber(&Win1255Model, PR_FALSE, hebprober); // Logical Hebrew
+  mProbers[13] = new nsSingleByteCharSetProber(&Win1255Model, PR_TRUE, hebprober); // Visual Hebrew
  // Tell the Hebrew prober about the logical and visual probers
-  if (mProbers[10] && mProbers[11] && mProbers[12]) // all are not null
+  if (mProbers[11] && mProbers[12] && mProbers[13]) // all are not null
  {
-    hebprober->SetModelProbers(mProbers[11], mProbers[12]);
+    hebprober->SetModelProbers(mProbers[12], mProbers[13]);
  }
  else // One or more is null. avoid any Hebrew probing, null them all
  {
-    for (PRUint32 i = 10; i <= 12; ++i)
+    for (PRUint32 i = 11; i <= 13; ++i)
    { 
      delete mProbers[i]; 
      mProbers[i] = 0; 
--- a/src/nsSBCSGroupProber.h
+++ b/src/nsSBCSGroupProber.h
@ -40,7 +40,7 @@
 #define nsSBCSGroupProber_h__


-#define NUM_OF_SBCS_PROBERS    13
+#define NUM_OF_SBCS_PROBERS    14

 class nsCharSetProber;
 class nsSBCSGroupProber: public nsCharSetProber {
--- a/src/nsSBCharSetProber.h
+++ b/src/nsSBCharSetProber.h
@ -51,19 +51,19 @@

 typedef struct
 {
-  unsigned char *charToOrderMap;    // [256] table use to find a char's order
-  char *precedenceMatrix;           // [SAMPLE_SIZE][SAMPLE_SIZE]; table to find a 2-char sequence's frequency
+  const unsigned char* const charToOrderMap;    // [256] table use to find a char's order
+  const PRUint8* const precedenceMatrix;  // [SAMPLE_SIZE][SAMPLE_SIZE]; table to find a 2-char sequence's frequency
  float  mTypicalPositiveRatio;     // = freqSeqs / totalSeqs 
  PRBool keepEnglishLetter;         // says if this script contains English characters (not implemented)
-  const char* charsetName;
+  const char* const charsetName;
 } SequenceModel;


 class nsSingleByteCharSetProber : public nsCharSetProber{
 public:
-  nsSingleByteCharSetProber(SequenceModel *model) 
+  nsSingleByteCharSetProber(const SequenceModel *model) 
    :mModel(model), mReversed(PR_FALSE), mNameProber(0) { Reset(); }
-  nsSingleByteCharSetProber(SequenceModel *model, PRBool reversed, nsCharSetProber* nameProber)
+  nsSingleByteCharSetProber(const SequenceModel *model, PRBool reversed, nsCharSetProber* nameProber)
    :mModel(model), mReversed(reversed), mNameProber(nameProber) { Reset(); }

  virtual const char* GetCharSetName();
@ -87,7 +87,7 @@ public:

 protected:
  nsProbingState mState;
-  const SequenceModel *mModel;
+  const SequenceModel* const mModel;
  const PRBool mReversed; // PR_TRUE if we need to reverse every pair in the model lookup

  //char order of last character
@ -106,19 +106,20 @@ protected:
 };


-extern SequenceModel Koi8rModel;
-extern SequenceModel Win1251Model;
-extern SequenceModel Latin5Model;
-extern SequenceModel MacCyrillicModel;
-extern SequenceModel Ibm866Model;
-extern SequenceModel Ibm855Model;
-extern SequenceModel Latin7Model;
-extern SequenceModel Win1253Model;
-extern SequenceModel Latin5BulgarianModel;
-extern SequenceModel Win1251BulgarianModel;
-extern SequenceModel Latin2HungarianModel;
-extern SequenceModel Win1250HungarianModel;
-extern SequenceModel Win1255Model;
+extern const SequenceModel Koi8rModel;
+extern const SequenceModel Win1251Model;
+extern const SequenceModel Latin5Model;
+extern const SequenceModel MacCyrillicModel;
+extern const SequenceModel Ibm866Model;
+extern const SequenceModel Ibm855Model;
+extern const SequenceModel Latin7Model;
+extern const SequenceModel Win1253Model;
+extern const SequenceModel Latin5BulgarianModel;
+extern const SequenceModel Win1251BulgarianModel;
+extern const SequenceModel Latin2HungarianModel;
+extern const SequenceModel Win1250HungarianModel;
+extern const SequenceModel Win1255Model;
+extern const SequenceModel TIS620ThaiModel;

 #endif /* nsSingleByteCharSetProber_h__ */

--- a/src/nsSJISProber.cpp
+++ b/src/nsSJISProber.cpp
@ -46,8 +46,8 @@ void  nsSJISProber::Reset(void)
 {
  mCodingSM->Reset(); 
  mState = eDetecting;
-  mContextAnalyser.Reset();
-  mDistributionAnalyser.Reset();
+  mContextAnalyser.Reset(mIsPreferredLanguage);
+  mDistributionAnalyser.Reset(mIsPreferredLanguage);
 }

 nsProbingState nsSJISProber::HandleData(const char* aBuf, PRUint32 aLen)
@ -57,11 +57,6 @@ nsProbingState nsSJISProber::HandleData(const char* aBuf, PRUint32 aLen)
  for (PRUint32 i = 0; i < aLen; i++)
  {
    codingState = mCodingSM->NextState(aBuf[i]);
-    if (codingState == eError)
-    {
-      mState = eNotMe;
-      break;
-    }
    if (codingState == eItsMe)
    {
      mState = eFoundIt;
--- a/src/nsSJISProber.h
+++ b/src/nsSJISProber.h
@ -43,7 +43,6 @@
 #ifndef nsSJISProber_h__
 #define nsSJISProber_h__

-#include "uchardetDefine.h"
 #include "nsCharSetProber.h"
 #include "nsCodingStateMachine.h"
 #include "JpCntx.h"
@ -52,11 +51,13 @@

 class nsSJISProber: public nsCharSetProber {
 public:
-  nsSJISProber(void){mCodingSM = new nsCodingStateMachine(&SJISSMModel);
-                      Reset();}
+  nsSJISProber(PRBool aIsPreferredLanguage)
+    :mIsPreferredLanguage(aIsPreferredLanguage)
+  {mCodingSM = new nsCodingStateMachine(&SJISSMModel);
+    Reset();}
  virtual ~nsSJISProber(void){delete mCodingSM;}
  nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
-  const char* GetCharSetName() {return CHARDET_ENCODING_SHIFT_JIS;}
+  const char* GetCharSetName() {return "Shift_JIS";}
  nsProbingState GetState(void) {return mState;}
  void      Reset(void);
  float     GetConfidence(void);
@ -70,6 +71,7 @@ protected:
  SJISDistributionAnalysis mDistributionAnalyser;

  char mLastChar[2];
+  PRBool mIsPreferredLanguage;

 };

--- a/src/nsUTF8Prober.cpp
+++ b/src/nsUTF8Prober.cpp
@ -51,11 +51,6 @@ nsProbingState nsUTF8Prober::HandleData(const char* aBuf, PRUint32 aLen)
  for (PRUint32 i = 0; i < aLen; i++)
  {
    codingState = mCodingSM->NextState(aBuf[i]);
-    if (codingState == eError)
-    {
-      mState = eNotMe;
-      break;
-    }
    if (codingState == eItsMe)
    {
      mState = eFoundIt;
--- a/src/nsUTF8Prober.h
+++ b/src/nsUTF8Prober.h
@ -38,7 +38,6 @@
 #ifndef nsUTF8Prober_h__
 #define nsUTF8Prober_h__

-#include "uchardetDefine.h"
 #include "nsCharSetProber.h"
 #include "nsCodingStateMachine.h"

@ -49,7 +48,7 @@ public:
                Reset(); }
  virtual ~nsUTF8Prober(){delete mCodingSM;}
  nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
-  const char* GetCharSetName() {return CHARDET_ENCODING_UTF_8;}
+  const char* GetCharSetName() {return "UTF-8";}
  nsProbingState GetState(void) {return mState;}
  void      Reset(void);
  float     GetConfidence(void);
--- a/src/nsUniversalDetector.cpp
+++ b/src/nsUniversalDetector.cpp
@ -38,7 +38,6 @@

 #include "nscore.h"

-#include "uchardetDefine.h"
 #include "nsUniversalDetector.h"

 #include "nsMBCSGroupProber.h"
@ -46,7 +45,7 @@
 #include "nsEscCharsetProber.h"
 #include "nsLatin1Prober.h"

-nsUniversalDetector::nsUniversalDetector()
+nsUniversalDetector::nsUniversalDetector(PRUint32 aLanguageFilter)
 {
  mDone = PR_FALSE;
  mBestGuess = -1;   //illegal value as signal
@ -58,6 +57,7 @@ nsUniversalDetector::nsUniversalDetector()
  mGotData = PR_FALSE;
  mInputState = ePureAscii;
  mLastChar = '\0';
+  mLanguageFilter = aLanguageFilter;

  PRUint32 i;
  for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
@ -67,10 +67,9 @@ nsUniversalDetector::nsUniversalDetector()
 nsUniversalDetector::~nsUniversalDetector() 
 {
   for (PRInt32 i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
-    if (mCharSetProbers[i])      
-      delete mCharSetProbers[i];
-  if (mEscCharSetProber)
-    delete mEscCharSetProber;
+    delete mCharSetProbers[i];  // a slot may still be null (probers are created lazily in HandleData); delete on a null pointer is a no-op
+
+  delete mEscCharSetProber;  // likewise null unless HandleData allocated the escape prober
 }

 void 
@ -111,37 +110,23 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
  if (mStart)
  {
    mStart = PR_FALSE;
-    if (aLen > 3)
+    if (aLen > 2)
      switch (aBuf[0])
        {
        case '\xEF':
          if (('\xBB' == aBuf[1]) && ('\xBF' == aBuf[2]))
            // EF BB BF  UTF-8 encoded BOM
-            mDetectedCharset = CHARDET_ENCODING_UTF_8;
+            mDetectedCharset = "UTF-8";
        break;
        case '\xFE':
-          if (('\xFF' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3]))
-            // FE FF 00 00  UCS-4, unusual octet order BOM (3412)
-            mDetectedCharset = CHARDET_ENCODING_X_ISO_10646_UCS_4_3412;
-          else if ('\xFF' == aBuf[1])
+          if ('\xFF' == aBuf[1])
            // FE FF  UTF-16, big endian BOM
-            mDetectedCharset = CHARDET_ENCODING_UTF_16BE;
-        break;
-        case '\x00':
-          if (('\x00' == aBuf[1]) && ('\xFE' == aBuf[2]) && ('\xFF' == aBuf[3]))
-            // 00 00 FE FF  UTF-32, big-endian BOM
-            mDetectedCharset = CHARDET_ENCODING_UTF_32BE;
-          else if (('\x00' == aBuf[1]) && ('\xFF' == aBuf[2]) && ('\xFE' == aBuf[3]))
-            // 00 00 FF FE  UCS-4, unusual octet order BOM (2143)
-            mDetectedCharset = CHARDET_ENCODING_X_ISO_10646_UCS_4_2143;
+            mDetectedCharset = "UTF-16";
        break;
        case '\xFF':
-          if (('\xFE' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3]))
-            // FF FE 00 00  UTF-32, little-endian BOM
-            mDetectedCharset = CHARDET_ENCODING_UTF_32LE;
-          else if ('\xFE' == aBuf[1])
+          if ('\xFE' == aBuf[1])
            // FF FE  UTF-16, little endian BOM
-            mDetectedCharset = CHARDET_ENCODING_UTF_16LE;
+            mDetectedCharset = "UTF-16";
        break;
      }  // switch

@ -172,16 +157,24 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)

        //start multibyte and singlebyte charset prober
        if (nsnull == mCharSetProbers[0])
-          mCharSetProbers[0] = new nsMBCSGroupProber;
-        if (nsnull == mCharSetProbers[1])
-          mCharSetProbers[1] = new nsSBCSGroupProber;
-        if (nsnull == mCharSetProbers[2])
-          mCharSetProbers[2] = new nsLatin1Prober; 
-
-        if ((nsnull == mCharSetProbers[0]) ||
-            (nsnull == mCharSetProbers[1]) ||
-            (nsnull == mCharSetProbers[2]))
+        {
+          mCharSetProbers[0] = new nsMBCSGroupProber(mLanguageFilter);
+          if (nsnull == mCharSetProbers[0])
            return NS_ERROR_OUT_OF_MEMORY;
+        }
+        if (nsnull == mCharSetProbers[1] &&
+            (mLanguageFilter & NS_FILTER_NON_CJK))
+        {
+          mCharSetProbers[1] = new nsSBCSGroupProber;
+          if (nsnull == mCharSetProbers[1])
+            return NS_ERROR_OUT_OF_MEMORY;
+        }
+        if (nsnull == mCharSetProbers[2])
+        {
+          mCharSetProbers[2] = new nsLatin1Prober; 
+          if (nsnull == mCharSetProbers[2])
+            return NS_ERROR_OUT_OF_MEMORY;
+        }
      }
    }
    else
@ -202,7 +195,7 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
  {
  case eEscAscii:
    if (nsnull == mEscCharSetProber) {
-      mEscCharSetProber = new nsEscCharSetProber;
+      mEscCharSetProber = new nsEscCharSetProber(mLanguageFilter);
      if (nsnull == mEscCharSetProber)
        return NS_ERROR_OUT_OF_MEMORY;
    }
@ -216,12 +209,15 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
  case eHighbyte:
    for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
    {
-      st = mCharSetProbers[i]->HandleData(aBuf, aLen);
-      if (st == eFoundIt) 
+      if (mCharSetProbers[i])
      {
-        mDone = PR_TRUE;
-        mDetectedCharset = mCharSetProbers[i]->GetCharSetName();
-        return NS_OK;
+        st = mCharSetProbers[i]->HandleData(aBuf, aLen);
+        if (st == eFoundIt) 
+        {
+          mDone = PR_TRUE;
+          mDetectedCharset = mCharSetProbers[i]->GetCharSetName();
+          return NS_OK;
+        }
      } 
    }
    break;
@ -260,11 +256,14 @@ void nsUniversalDetector::DataEnd()

      for (PRInt32 i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
      {
-        proberConfidence = mCharSetProbers[i]->GetConfidence();
-        if (proberConfidence > maxProberConfidence)
+        if (mCharSetProbers[i])
        {
-          maxProberConfidence = proberConfidence;
-          maxProber = i;
+          proberConfidence = mCharSetProbers[i]->GetConfidence();
+          if (proberConfidence > maxProberConfidence)
+          {
+            maxProberConfidence = proberConfidence;
+            maxProber = i;
+          }
        }
      }
      //do not report anything because we are not confident of it, that's in fact a negative answer
--- a/src/nsUniversalDetector.h
+++ b/src/nsUniversalDetector.h
@ -48,9 +48,22 @@ typedef enum {
  eHighbyte  = 2
 } nsInputState;

+#define NS_FILTER_CHINESE_SIMPLIFIED  0x01  // language-filter bit flags; OR them together for nsUniversalDetector(PRUint32 aLanguageFilter)
+#define NS_FILTER_CHINESE_TRADITIONAL 0x02
+#define NS_FILTER_JAPANESE            0x04
+#define NS_FILTER_KOREAN              0x08
+#define NS_FILTER_NON_CJK             0x10  // when set, nsUniversalDetector::HandleData also creates the single-byte group prober
+#define NS_FILTER_ALL                 0x1F  // union of the five flags above
+#define NS_FILTER_CHINESE (NS_FILTER_CHINESE_SIMPLIFIED | \
+                           NS_FILTER_CHINESE_TRADITIONAL)
+#define NS_FILTER_CJK (NS_FILTER_CHINESE_SIMPLIFIED | \
+                       NS_FILTER_CHINESE_TRADITIONAL | \
+                       NS_FILTER_JAPANESE | \
+                       NS_FILTER_KOREAN)
+
 class nsUniversalDetector {
 public:
-   nsUniversalDetector();
+   nsUniversalDetector(PRUint32 aLanguageFilter);
   virtual ~nsUniversalDetector();
   virtual nsresult HandleData(const char* aBuf, PRUint32 aLen);
   virtual void DataEnd(void);
@ -66,6 +79,7 @@ protected:
   char    mLastChar;
   const char *  mDetectedCharset;
   PRInt32 mBestGuess;
+   PRUint32 mLanguageFilter;

   nsCharSetProber  *mCharSetProbers[NUM_OF_CHARSET_PROBERS];
   nsCharSetProber  *mEscCharSetProber;
--- a/src/nscore.h
+++ b/src/nscore.h
@ -42,6 +42,8 @@ typedef int PRInt32;
 typedef unsigned int PRUint32;
 typedef short PRInt16;
 typedef unsigned short PRUint16;
+typedef signed char PRInt8;
+typedef unsigned char PRUint8;

 #define PR_FALSE false
 #define PR_TRUE true
--- a/src/tools/uchardet.cpp
+++ b/src/tools/uchardet.cpp
@ -44,38 +44,18 @@
 #ifndef VERSION
 #define VERSION "Unknown"
 #endif
-#define BUFFER_SIZE 32768
+#define BUFFER_SIZE 65536
+
+char buffer[BUFFER_SIZE];

 void detect(FILE * fp)
 {
    uchardet_t handle = uchardet_new();

-    size_t size = BUFFER_SIZE;
-    char * buffer_in = (char *) malloc(size * sizeof(char));
-
-    while (fgets(buffer_in, size, fp) != NULL)
+    while (!feof(fp))
    {
-        size_t freesize = size;
-
-        char * buffer_in_p = buffer_in;
-        size_t line_length = strlen(buffer_in_p);
-        while (line_length + 1 == freesize && buffer_in_p[line_length - 2] != '\n')
-        {
-            buffer_in_p += size - 1;
-            freesize = size + 1;
-            size += size;
-            size_t offset = buffer_in_p - buffer_in;
-            buffer_in = (char *) realloc(buffer_in, size * sizeof(char));
-            buffer_in_p = buffer_in + offset;
-
-            if (fgets(buffer_in_p, freesize, fp) == NULL)
-                break;
-
-            line_length = strlen(buffer_in_p);
-        }
-
-        int retval = uchardet_handle_data(handle, buffer_in, strlen(buffer_in));
-
+        size_t len = fread(buffer, 1, BUFFER_SIZE, fp);
+        int retval = uchardet_handle_data(handle, buffer, len);
        if (retval != 0)
        {
            fprintf(stderr, "Handle data error.\n");
@ -84,10 +64,10 @@ void detect(FILE * fp)
    }
    uchardet_data_end(handle);

-    printf("%s\n", uchardet_get_charset(handle));
+    const char * charset = uchardet_get_charset(handle);
+    printf("%s\n", charset);

    uchardet_delete(handle);
-    free(buffer_in);
 }

 void show_version()
--- a/src/uchardet.cpp
+++ b/src/uchardet.cpp
@ -37,67 +37,69 @@
 #include "uchardet.h"
 #include "nscore.h"
 #include "nsUniversalDetector.h"
-#include <string.h>
+#include <string>

-class DllDetector : public nsUniversalDetector
+using std::string;
+
+class HandleUniversalDetector : public nsUniversalDetector  // detector behind the C handle API; remembers the last charset name passed to Report()
 {
 protected:
-    char charset_[256];
+    string m_charset;  // empty after construction and after Reset(); otherwise the last reported name

 public:
-    DllDetector()
-    : nsUniversalDetector()
+    HandleUniversalDetector()
+    : nsUniversalDetector(NS_FILTER_ALL)  // every language-filter bit set, i.e. nothing is filtered out
     {
-        *charset_=0;
+        m_charset = "";
     }

-    virtual ~DllDetector()
+    virtual ~HandleUniversalDetector()
     {}

     virtual void Report(const char* charset)
     {
-        strncpy( charset_ , charset , sizeof(charset_) );
+        m_charset = charset ? charset : "";  // assigning a null const char* to std::string is undefined behaviour, so fall back to ""
     }

     virtual void Reset()
     {
         nsUniversalDetector::Reset();
-        *charset_=0;
+        m_charset = "";
     }

     const char* GetCharset() const
     {
-        return charset_;
+        return m_charset.c_str();  // points into m_charset; stale once Report()/Reset() reassign it or the detector is destroyed
     }
 };

 uchardet_t uchardet_new()
 {
-    return reinterpret_cast<uchardet_t> (new DllDetector());
+    return reinterpret_cast<uchardet_t> (new HandleUniversalDetector());  // heap-allocated detector handed out as an opaque handle; uchardet_delete() is the matching release
 }

 void uchardet_delete(uchardet_t ud)
 {
-    delete reinterpret_cast<DllDetector*>(ud);
+    delete reinterpret_cast<HandleUniversalDetector*>(ud);  // casts the handle back to the type allocated in uchardet_new() and destroys it
 }

 int uchardet_handle_data(uchardet_t ud, const char * data, size_t len)
 {
-    nsresult ret = reinterpret_cast<DllDetector*>(ud)->HandleData(data, (PRUint32)len);
+    nsresult ret = reinterpret_cast<HandleUniversalDetector*>(ud)->HandleData(data, (PRUint32)len);  // NOTE(review): len is cast to PRUint32 (unsigned int), so it is truncated wherever size_t is wider; result below is 0 on NS_OK, 1 otherwise
     return (ret != NS_OK);
 }

 void uchardet_data_end(uchardet_t ud)
 {
-    reinterpret_cast<DllDetector*>(ud)->DataEnd();
+    reinterpret_cast<HandleUniversalDetector*>(ud)->DataEnd();  // forwards the end-of-input signal to nsUniversalDetector::DataEnd()
 }

 void uchardet_reset(uchardet_t ud)
 {
-    reinterpret_cast<DllDetector*>(ud)->Reset();
+    reinterpret_cast<HandleUniversalDetector*>(ud)->Reset();  // resets the base detector and clears the stored charset name
 }

 const char* uchardet_get_charset(uchardet_t ud)
 {
-    return reinterpret_cast<DllDetector*>(ud)->GetCharset();
+    return reinterpret_cast<HandleUniversalDetector*>(ud)->GetCharset();  // returned string is owned by the detector; read or copy it before uchardet_reset()/uchardet_delete()
 }
--- a/win32.sh
+++ b/win32.sh
@ -0,0 +1,7 @@
+mkdir --parents win32 \
+&& cd win32 \
+&& cmake .. \
+	-G "MSYS Makefiles" \
+	-DCMAKE_BUILD_TYPE=Release \
+	-DCMAKE_INSTALL_PREFIX="" \
+&& make  # whole chain: create ./win32, configure a Release tree there with the MSYS Makefiles generator, then build; each && stops the chain on the first failure