Using the generic language detector in UTF-8 detection.

The UTF-8 prober now not only detects valid UTF-8 but also detects the most probable language. Using the data generated two commits ago, this works very well. It is still basic and will need further improvement. In particular, nsUTF8Prober should eventually return an array of candidate ("UTF-8", language) pairs, and nsMBCSGroupProber should itself forward these candidates along with the candidates from the other multi-byte detectors. This way, the public-facing API would expose several probable candidates, in case the algorithm's top guess is slightly wrong. Also, the UTF-8 confidence is currently unreasonably high as soon as we consider the input to be valid UTF-8. We should probably weight it by the language detection result: in particular, if no language is detected, this should severely lower the UTF-8 confidence — not to 0, but to a value high enough to remain a fallback when no other encoding+language pair is valid, and low enough to give other good candidate pairs a chance.
2026-01-01 03:12:24 +08:00 · 2021-03-15 12:01:35 +01:00 · 2021-03-15 12:01:35 +01:00 · 5257fc1abf
commit 5257fc1abf
parent dac7cbd30f
29 changed files with 235 additions and 43 deletions
--- a/src/nsBig5Prober.cpp
+++ b/src/nsBig5Prober.cpp
@ -44,7 +44,9 @@ void  nsBig5Prober::Reset(void)
  mDistributionAnalyser.Reset(mIsPreferredLanguage);
 }

-nsProbingState nsBig5Prober::HandleData(const char* aBuf, PRUint32 aLen)
+nsProbingState nsBig5Prober::HandleData(const char* aBuf, PRUint32 aLen,
+                                        int** codePointBuffer,
+                                        int*  codePointBufferIdx)
 {
  PRUint32 codingState;

--- a/src/nsBig5Prober.h
+++ b/src/nsBig5Prober.h
@ -49,7 +49,9 @@ public:
  {mCodingSM = new nsCodingStateMachine(&Big5SMModel); 
    Reset();}
  virtual ~nsBig5Prober(void){delete mCodingSM;}
-  nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
+  nsProbingState HandleData(const char* aBuf, PRUint32 aLen,
+                            int** codePointBuffer,
+                            int*  codePointBufferIdx);
  const char* GetCharSetName() {return "BIG5";}
  const char* GetLanguage() {return "zh";}
  nsProbingState GetState(void) {return mState;}
--- a/src/nsCharSetProber.h
+++ b/src/nsCharSetProber.h
@ -55,7 +55,10 @@ public:
  virtual ~nsCharSetProber() {}
  virtual const char* GetCharSetName() = 0;
  virtual const char* GetLanguage() = 0;
-  virtual nsProbingState HandleData(const char* aBuf, PRUint32 aLen) = 0;
+  virtual nsProbingState HandleData(const char* aBuf, PRUint32 aLen,
+                                    int** codePointBuffer,
+                                    int*  codePointBufferIdx) = 0;
+  virtual bool DecodeToUnicode() {return false;}
  virtual nsProbingState GetState(void) = 0;
  virtual void      Reset(void)  = 0;
  virtual float     GetConfidence(void) = 0;
--- a/src/nsEUCJPProber.cpp
+++ b/src/nsEUCJPProber.cpp
@ -50,7 +50,9 @@ void  nsEUCJPProber::Reset(void)
  mDistributionAnalyser.Reset(mIsPreferredLanguage);
 }

-nsProbingState nsEUCJPProber::HandleData(const char* aBuf, PRUint32 aLen)
+nsProbingState nsEUCJPProber::HandleData(const char* aBuf, PRUint32 aLen,
+                                         int** codePointBuffer,
+                                         int*  codePointBufferIdx)
 {
  PRUint32 codingState;

--- a/src/nsEUCJPProber.h
+++ b/src/nsEUCJPProber.h
@ -55,7 +55,9 @@ public:
  {mCodingSM = new nsCodingStateMachine(&EUCJPSMModel);
    Reset();}
  virtual ~nsEUCJPProber(void){delete mCodingSM;}
-  nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
+  nsProbingState HandleData(const char* aBuf, PRUint32 aLen,
+                            int** codePointBuffer,
+                            int*  codePointBufferIdx);
  const char* GetCharSetName() {return "EUC-JP";}
  const char* GetLanguage() {return "ja";}
  nsProbingState GetState(void) {return mState;}
--- a/src/nsEUCKRProber.cpp
+++ b/src/nsEUCKRProber.cpp
@ -45,7 +45,9 @@ void  nsEUCKRProber::Reset(void)
  //mContextAnalyser.Reset();
 }

-nsProbingState nsEUCKRProber::HandleData(const char* aBuf, PRUint32 aLen)
+nsProbingState nsEUCKRProber::HandleData(const char* aBuf, PRUint32 aLen,
+                                         int** codePointBuffer,
+                                         int*  codePointBufferIdx)
 {
  PRUint32 codingState;

--- a/src/nsEUCKRProber.h
+++ b/src/nsEUCKRProber.h
@ -50,7 +50,9 @@ public:
    Reset();
  }
  virtual ~nsEUCKRProber(void){delete mCodingSM;}
-  nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
+  nsProbingState HandleData(const char* aBuf, PRUint32 aLen,
+                            int** codePointBuffer,
+                            int*  codePointBufferIdx);
  /* "Unified Hangul Code", also called "CP949" or "Windows-949" is a
   * superset of EUC-KR. Though not fully ok to return UHC here (a
   * separate prober would be better), it is acceptable, since many
--- a/src/nsEUCTWProber.cpp
+++ b/src/nsEUCTWProber.cpp
@ -45,7 +45,9 @@ void  nsEUCTWProber::Reset(void)
  //mContextAnalyser.Reset();
 }

-nsProbingState nsEUCTWProber::HandleData(const char* aBuf, PRUint32 aLen)
+nsProbingState nsEUCTWProber::HandleData(const char* aBuf, PRUint32 aLen,
+                                         int** codePointBuffer,
+                                         int*  codePointBufferIdx)
 {
  PRUint32 codingState;

--- a/src/nsEUCTWProber.h
+++ b/src/nsEUCTWProber.h
@ -49,7 +49,9 @@ public:
  {mCodingSM = new nsCodingStateMachine(&EUCTWSMModel);
    Reset();}
  virtual ~nsEUCTWProber(void){delete mCodingSM;}
-  nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
+  nsProbingState HandleData(const char* aBuf, PRUint32 aLen,
+                            int** codePointBuffer,
+                            int*  codePointBufferIdx);
  const char* GetCharSetName() {return "EUC-TW";}
  const char* GetLanguage() {return "zh";}
  nsProbingState GetState(void) {return mState;}
--- a/src/nsEscCharsetProber.cpp
+++ b/src/nsEscCharsetProber.cpp
@ -73,7 +73,9 @@ void nsEscCharSetProber::Reset(void)
  mDetectedCharset = nsnull;
 }

-nsProbingState nsEscCharSetProber::HandleData(const char* aBuf, PRUint32 aLen)
+nsProbingState nsEscCharSetProber::HandleData(const char* aBuf, PRUint32 aLen,
+                                              int** codePointBuffer,
+                                              int*  codePointBufferIdx)
 {
  PRUint32 codingState;
  PRInt32 j;
--- a/src/nsEscCharsetProber.h
+++ b/src/nsEscCharsetProber.h
@ -49,7 +49,9 @@ class nsEscCharSetProber: public nsCharSetProber {
 public:
  nsEscCharSetProber(PRUint32 aLanguageFilter);
  virtual ~nsEscCharSetProber(void);
-  nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
+  nsProbingState HandleData(const char* aBuf, PRUint32 aLen,
+                            int** codePointBuffer,
+                            int*  codePointBufferIdx);
  const char* GetCharSetName() {return mDetectedCharset;}
  const char* GetLanguage() {return NULL;}
  nsProbingState GetState(void) {return mState;}
--- a/src/nsGB2312Prober.cpp
+++ b/src/nsGB2312Prober.cpp
@ -50,7 +50,9 @@ void  nsGB18030Prober::Reset(void)
  //mContextAnalyser.Reset();
 }

-nsProbingState nsGB18030Prober::HandleData(const char* aBuf, PRUint32 aLen)
+nsProbingState nsGB18030Prober::HandleData(const char* aBuf, PRUint32 aLen,
+                                           int** codePointBuffer,
+                                           int*  codePointBufferIdx)
 {
  PRUint32 codingState;

--- a/src/nsGB2312Prober.h
+++ b/src/nsGB2312Prober.h
@ -51,7 +51,9 @@ public:
  {mCodingSM = new nsCodingStateMachine(&GB18030SMModel);
    Reset();}
  virtual ~nsGB18030Prober(void){delete mCodingSM;}
-  nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
+  nsProbingState HandleData(const char* aBuf, PRUint32 aLen,
+                            int** codePointBuffer,
+                            int*  codePointBufferIdx);
  const char* GetCharSetName() {return "GB18030";}
  const char* GetLanguage() {return "zh";}
  nsProbingState GetState(void) {return mState;}
--- a/src/nsHebrewProber.cpp
+++ b/src/nsHebrewProber.cpp
@ -106,7 +106,9 @@ PRBool nsHebrewProber::isNonFinal(char c)
 * The input buffer should not contain any white spaces that are not (' ')
 * or any low-ascii punctuation marks. 
 */
-nsProbingState nsHebrewProber::HandleData(const char* aBuf, PRUint32 aLen)
+nsProbingState nsHebrewProber::HandleData(const char* aBuf, PRUint32 aLen,
+                                          int** codePointBuffer,
+                                          int*  codePointBufferIdx)
 {
  // Both model probers say it's not them. No reason to continue.
  if (GetState() == eNotMe)
--- a/src/nsHebrewProber.h
+++ b/src/nsHebrewProber.h
@ -48,7 +48,9 @@ public:
  nsHebrewProber(void) :mLogicalProb(0), mVisualProb(0) { Reset(); }

  virtual ~nsHebrewProber(void) {}
-  virtual nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
+  virtual nsProbingState HandleData(const char* aBuf, PRUint32 aLen,
+                                    int** codePointBuffer,
+                                    int*  codePointBufferIdx);
  virtual const char *GetCharSetName();
  virtual const char *GetLanguage(void) { return "he"; }
  virtual void Reset(void);
--- a/src/nsLatin1Prober.cpp
+++ b/src/nsLatin1Prober.cpp
@ -114,7 +114,9 @@ void  nsLatin1Prober::Reset(void)
 }


-nsProbingState nsLatin1Prober::HandleData(const char* aBuf, PRUint32 aLen)
+nsProbingState nsLatin1Prober::HandleData(const char* aBuf, PRUint32 aLen,
+                                          int** codePointBuffer,
+                                          int*  codePointBufferIdx)
 {
  char *newBuf1 = 0;
  PRUint32 newLen1 = 0;
--- a/src/nsLatin1Prober.h
+++ b/src/nsLatin1Prober.h
@ -49,7 +49,9 @@ class nsLatin1Prober: public nsCharSetProber {
 public:
  nsLatin1Prober(void){Reset();}
  virtual ~nsLatin1Prober(void){}
-  nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
+  nsProbingState HandleData(const char* aBuf, PRUint32 aLen,
+                            int** codePointBuffer,
+                            int*  codePointBufferIdx);
  const char* GetCharSetName() {return "WINDOWS-1252";}
  const char* GetLanguage() {return NULL;}
  nsProbingState GetState(void) {return mState;}
--- a/src/nsMBCSGroupProber.cpp
+++ b/src/nsMBCSGroupProber.cpp
@ -58,7 +58,12 @@ const char *ProberName[] =
 nsMBCSGroupProber::nsMBCSGroupProber(PRUint32 aLanguageFilter)
 {
  for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
-    mProbers[i] = nsnull;
+  {
+    mProbers[i]            = nsnull;
+    codePointBuffer[i]     = nsnull;
+    codePointBufferSize[i] = 0;
+    codePointBufferIdx[i]  = 0;
+  }

  mProbers[0] = new nsUTF8Prober();
  if (aLanguageFilter & NS_FILTER_JAPANESE) 
@ -75,6 +80,24 @@ nsMBCSGroupProber::nsMBCSGroupProber(PRUint32 aLanguageFilter)
    mProbers[5] = new nsBig5Prober(aLanguageFilter == NS_FILTER_CHINESE_TRADITIONAL);
    mProbers[6] = new nsEUCTWProber(aLanguageFilter == NS_FILTER_CHINESE_TRADITIONAL);
  }
+
+  for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
+  {
+    if (mProbers[i]->DecodeToUnicode())
+    {
+      langDetectors[i][0] = new nsLanguageDetector(&FrenchModel);
+      langDetectors[i][1] = new nsLanguageDetector(&ItalianModel);
+      langDetectors[i][2] = new nsLanguageDetector(&DanishModel);
+      langDetectors[i][3] = new nsLanguageDetector(&GermanModel);
+      langDetectors[i][4] = new nsLanguageDetector(&ArabicModel);
+      langDetectors[i][5] = new nsLanguageDetector(&SpanishModel);
+    }
+    else
+    {
+      for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++)
+        langDetectors[i][j] = nsnull;
+    }
+  }
  Reset();
 }

@ -83,6 +106,13 @@ nsMBCSGroupProber::~nsMBCSGroupProber()
  for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
  {
    delete mProbers[i];
+
+    if (codePointBufferSize[i] != 0)
+      delete [] codePointBuffer[i];
+
+    for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++)
+      if (langDetectors[i][j])
+        delete langDetectors[i][j];
  }
 }

@ -99,17 +129,35 @@ const char* nsMBCSGroupProber::GetCharSetName()

 const char* nsMBCSGroupProber::GetLanguage(void)
 {
+  const char* maxLang       = NULL;
+  int         maxLangIdx    = -1;
+  float       maxConfidence = 0.0;
+
  if (mBestGuess == -1)
-  {
-    GetConfidence();
-  }
-  if (mBestGuess == -1)
-      return NULL;
+    return NULL;
  else
-      return mProbers[mBestGuess]->GetLanguage();
+    maxLang = mProbers[mBestGuess]->GetLanguage();
+
+  if (maxLang == NULL && mProbers[mBestGuess]->DecodeToUnicode())
+  {
+    for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++)
+    {
+      float conf = langDetectors[mBestGuess][j]->GetConfidence();
+
+      if (conf > maxConfidence)
+      {
+        maxLangIdx = j;
+        maxConfidence = conf;
+      }
+    }
+    if (maxLangIdx != -1)
+      maxLang = langDetectors[mBestGuess][maxLangIdx]->GetLanguage();
+  }
+
+  return maxLang;
 }

-void  nsMBCSGroupProber::Reset(void)
+void nsMBCSGroupProber::Reset(void)
 {
  mActiveNum = 0;
  for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
@ -119,6 +167,13 @@ void  nsMBCSGroupProber::Reset(void)
      mProbers[i]->Reset();
      mIsActive[i] = PR_TRUE;
      ++mActiveNum;
+
+      if (codePointBufferSize[i] == 0 && mProbers[i]->DecodeToUnicode())
+      {
+        codePointBufferSize[i] = 1024;
+        codePointBuffer[i] = new int[codePointBufferSize[i]];
+      }
+      codePointBufferIdx[i] = 0;
    }
    else
      mIsActive[i] = PR_FALSE;
@ -128,7 +183,9 @@ void  nsMBCSGroupProber::Reset(void)
  mKeepNext = 0;
 }

-nsProbingState nsMBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen)
+nsProbingState nsMBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen,
+                                             int** cpBuffer,
+                                             int*  cpBufferIdx)
 {
  nsProbingState st;
  PRUint32 start = 0;
@ -151,7 +208,20 @@ nsProbingState nsMBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen)
        {
          if (!mIsActive[i])
            continue;
-          st = mProbers[i]->HandleData(aBuf + start, pos + 1 - start);
+
+          if (codePointBuffer[i])
+            st = mProbers[i]->HandleData(aBuf + start, pos + 1 - start,
+                                         &(codePointBuffer[i]), &(codePointBufferIdx[i]));
+          else
+            st = mProbers[i]->HandleData(aBuf + start, pos + 1 - start, NULL, NULL);
+
+          if (codePointBufferIdx[i] > 0 && codePointBuffer[i])
+          {
+            for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++)
+              langDetectors[i][j]->HandleData(codePointBuffer[i], codePointBufferIdx[i]);
+            codePointBufferIdx[i] = 0;
+          }
+
          if (st == eFoundIt)
          {
            mBestGuess = i;
@ -161,6 +231,12 @@ nsProbingState nsMBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen)
        }
      }
    }
+    else
+    {
+      for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
+        if (codePointBuffer[i])
+          codePointBuffer[i][(codePointBufferIdx[i])++] = aBuf[pos];
+    }
  }

  if (keepNext) {
@ -168,7 +244,20 @@ nsProbingState nsMBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen)
    {
      if (!mIsActive[i])
        continue;
-      st = mProbers[i]->HandleData(aBuf + start, aLen - start);
+
+      if (codePointBuffer[i])
+        st = mProbers[i]->HandleData(aBuf + start, aLen - start,
+                                     &(codePointBuffer[i]), &(codePointBufferIdx[i]));
+      else
+        st = mProbers[i]->HandleData(aBuf + start, aLen - start, NULL, NULL);
+
+      if (codePointBufferIdx[i] > 0 && codePointBuffer[i])
+      {
+        for (PRUint32 j = 0; j < NUM_OF_LANGUAGES; j++)
+          langDetectors[i][j]->HandleData(codePointBuffer[i], codePointBufferIdx[i]);
+        codePointBufferIdx[i] = 0;
+      }
+
      if (st == eFoundIt)
      {
        mBestGuess = i;
--- a/src/nsMBCSGroupProber.h
+++ b/src/nsMBCSGroupProber.h
@ -48,12 +48,15 @@
 #include "nsEUCTWProber.h"

 #define NUM_OF_PROBERS    7
+#define NUM_OF_LANGUAGES  6

 class nsMBCSGroupProber: public nsCharSetProber {
 public:
  nsMBCSGroupProber(PRUint32 aLanguageFilter);
  virtual ~nsMBCSGroupProber();
-  nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
+  nsProbingState HandleData(const char* aBuf, PRUint32 aLen,
+                            int** codePointBuffer,
+                            int*  codePointBufferIdx);
  const char* GetCharSetName();
  const char* GetLanguage();
  nsProbingState GetState(void) {return mState;}
@ -75,6 +78,12 @@ protected:
  PRInt32 mBestGuess;
  PRUint32 mActiveNum;
  PRUint32 mKeepNext;
+
+  int *codePointBuffer[NUM_OF_PROBERS];
+  int  codePointBufferSize[NUM_OF_PROBERS];
+  int  codePointBufferIdx[NUM_OF_PROBERS];
+
+  nsLanguageDetector *langDetectors[NUM_OF_PROBERS][NUM_OF_LANGUAGES];
 };

 #endif /* nsMBCSGroupProber_h__ */
--- a/src/nsSBCSGroupProber.cpp
+++ b/src/nsSBCSGroupProber.cpp
@ -253,7 +253,9 @@ void  nsSBCSGroupProber::Reset(void)
 }


-nsProbingState nsSBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen)
+nsProbingState nsSBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen,
+                                             int** codePointBuffer,
+                                             int*  codePointBufferIdx)
 {
  nsProbingState st;
  PRUint32 i;
@ -276,7 +278,7 @@ nsProbingState nsSBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen)
  {
     if (!mIsActive[i])
       continue;
-     st = mProbers[i]->HandleData(newBuf1, newLen1);
+     st = mProbers[i]->HandleData(newBuf1, newLen1, codePointBuffer, codePointBufferIdx);
     if (st == eFoundIt)
     {
       mBestGuess = i;
--- a/src/nsSBCSGroupProber.h
+++ b/src/nsSBCSGroupProber.h
@ -47,7 +47,9 @@ class nsSBCSGroupProber: public nsCharSetProber {
 public:
  nsSBCSGroupProber();
  virtual ~nsSBCSGroupProber();
-  nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
+  nsProbingState HandleData(const char* aBuf, PRUint32 aLen,
+                            int** codePointBuffer,
+                            int*  codePointBufferIdx);
  const char* GetCharSetName();
  const char* GetLanguage();
  nsProbingState GetState(void) {return mState;}
--- a/src/nsSBCharSetProber.cpp
+++ b/src/nsSBCharSetProber.cpp
@ -38,7 +38,9 @@
 #include <stdio.h>
 #include "nsSBCharSetProber.h"

-nsProbingState nsSingleByteCharSetProber::HandleData(const char* aBuf, PRUint32 aLen)
+nsProbingState nsSingleByteCharSetProber::HandleData(const char* aBuf, PRUint32 aLen,
+                                                     int** codePointBuffer,
+                                                     int*  codePointBufferIdx)
 {
  unsigned char order;

--- a/src/nsSBCharSetProber.h
+++ b/src/nsSBCharSetProber.h
@ -88,7 +88,9 @@ public:

  virtual const char* GetCharSetName();
  virtual const char* GetLanguage();
-  virtual nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
+  virtual nsProbingState HandleData(const char* aBuf, PRUint32 aLen,
+                                    int** codePointBuffer,
+                                    int*  codePointBufferIdx);
  virtual nsProbingState GetState(void) {return mState;}
  virtual void      Reset(void);
  virtual float     GetConfidence(void);
--- a/src/nsSJISProber.cpp
+++ b/src/nsSJISProber.cpp
@ -50,7 +50,9 @@ void  nsSJISProber::Reset(void)
  mDistributionAnalyser.Reset(mIsPreferredLanguage);
 }

-nsProbingState nsSJISProber::HandleData(const char* aBuf, PRUint32 aLen)
+nsProbingState nsSJISProber::HandleData(const char* aBuf, PRUint32 aLen,
+                                        int** codePointBuffer,
+                                        int*  codePointBufferIdx)
 {
  PRUint32 codingState;

--- a/src/nsSJISProber.h
+++ b/src/nsSJISProber.h
@ -56,7 +56,9 @@ public:
  {mCodingSM = new nsCodingStateMachine(&SJISSMModel);
    Reset();}
  virtual ~nsSJISProber(void){delete mCodingSM;}
-  nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
+  nsProbingState HandleData(const char* aBuf, PRUint32 aLen,
+                            int** codePointBuffer,
+                            int*  codePointBufferIdx);
  const char* GetCharSetName() {return "SHIFT_JIS";}
  const char* GetLanguage() {return "ja";}
  nsProbingState GetState(void) {return mState;}
--- a/src/nsUTF8Prober.cpp
+++ b/src/nsUTF8Prober.cpp
@ -42,9 +42,12 @@ void  nsUTF8Prober::Reset(void)
  mCodingSM->Reset(); 
  mNumOfMBChar = 0;
  mState = eDetecting;
+  currentCodePoint = 0;
 }

-nsProbingState nsUTF8Prober::HandleData(const char* aBuf, PRUint32 aLen)
+nsProbingState nsUTF8Prober::HandleData(const char* aBuf, PRUint32 aLen,
+                                        int** codePointBuffer,
+                                        int*  codePointBufferIdx)
 {
  PRUint32 codingState;

@ -59,7 +62,28 @@ nsProbingState nsUTF8Prober::HandleData(const char* aBuf, PRUint32 aLen)
    if (codingState == eStart)
    {
      if (mCodingSM->GetCurrentCharLen() >= 2)
+      {
        mNumOfMBChar++;
+
+        currentCodePoint = ((0xff & aBuf[i]) & 0x3fu) | (currentCodePoint << 6);
+        if (mCodingSM->GetCurrentCharLen() == 2)
+            currentCodePoint &= 0x7ff;
+        else if (mCodingSM->GetCurrentCharLen() == 3)
+            currentCodePoint &= 0xffff;
+        else
+            currentCodePoint &= 0x1fffff;
+      }
+      else
+      {
+        currentCodePoint = 0xff & (char) aBuf[i];
+      }
+
+      (*codePointBuffer)[(*codePointBufferIdx)++] = currentCodePoint;
+      currentCodePoint = 0;
+    }
+    else
+    {
+        currentCodePoint = ((0xff & aBuf[i]) & 0x3fu) | (currentCodePoint << 6);
    }
  }

@ -84,4 +108,3 @@ float nsUTF8Prober::GetConfidence(void)
  else
    return (float)0.99;
 }
-
--- a/src/nsUTF8Prober.h
+++ b/src/nsUTF8Prober.h
@ -41,6 +41,7 @@
 #include <cstddef>
 #include "nsCharSetProber.h"
 #include "nsCodingStateMachine.h"
+#include "nsLanguageDetector.h"

 class nsUTF8Prober: public nsCharSetProber {
 public:
@ -48,7 +49,9 @@ public:
                mCodingSM = new nsCodingStateMachine(&UTF8SMModel);
                Reset(); }
  virtual ~nsUTF8Prober(){delete mCodingSM;}
-  nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
+  nsProbingState HandleData(const char* aBuf, PRUint32 aLen,
+                            int** codePointBuffer,
+                            int*  codePointBufferIdx);
  const char* GetCharSetName() {return "UTF-8";}
  const char* GetLanguage() {return NULL;}
  nsProbingState GetState(void) {return mState;}
@ -56,11 +59,14 @@ public:
  float     GetConfidence(void);
  void      SetOpion() {}

+  virtual bool DecodeToUnicode() {return true;}
+
 protected:
  nsCodingStateMachine* mCodingSM;
  nsProbingState mState;
  PRUint32 mNumOfMBChar;
+
+  int currentCodePoint;
 };

 #endif /* nsUTF8Prober_h__ */
-
--- a/src/nsUniversalDetector.cpp
+++ b/src/nsUniversalDetector.cpp
@ -55,6 +55,8 @@ nsUniversalDetector::nsUniversalDetector(PRUint32 aLanguageFilter)

  mStart = PR_TRUE;
  mDetectedCharset = nsnull;
+  mDetectedLanguage = nsnull;
+  mDetectedConfidence = 0.0;
  mGotData = PR_FALSE;
  mInputState = ePureAscii;
  mLastChar = '\0';
@ -83,6 +85,8 @@ nsUniversalDetector::Reset()

  mStart = PR_TRUE;
  mDetectedCharset = nsnull;
+  mDetectedLanguage = nsnull;
+  mDetectedConfidence = 0.0;
  mGotData = PR_FALSE;
  mInputState = ePureAscii;
  mLastChar = '\0';
@ -118,13 +122,19 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
        {
        case '\xEF':
          if (('\xBB' == aBuf[1]) && ('\xBF' == aBuf[2]))
+          {
            /* EF BB BF: UTF-8 encoded BOM. */
            mDetectedCharset = "UTF-8";
+            mDetectedConfidence = 0.99;
+          }
        break;
        case '\xFE':
          if ('\xFF' == aBuf[1])
+          {
            /* FE FF: UTF-16, big endian BOM. */
            mDetectedCharset = "UTF-16";
+            mDetectedConfidence = 0.99;
+          }
        break;
        case '\xFF':
          if ('\xFE' == aBuf[1])
@ -135,11 +145,13 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
            {
                /* FF FE 00 00: UTF-32 (LE). */
                mDetectedCharset = "UTF-32";
+                mDetectedConfidence = 0.99;
            }
            else
            {
                /* FF FE: UTF-16, little endian BOM. */
                mDetectedCharset = "UTF-16";
+                mDetectedConfidence = 0.99;
            }
          }
          break;
@ -151,6 +163,7 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
          {
              /* 00 00 FE FF: UTF-32 (BE). */
              mDetectedCharset = "UTF-32";
+              mDetectedConfidence = 0.99;
          }
          break;
        }
@ -236,11 +249,12 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
      if (nsnull == mEscCharSetProber)
        return NS_ERROR_OUT_OF_MEMORY;
    }
-    st = mEscCharSetProber->HandleData(aBuf, aLen);
+    st = mEscCharSetProber->HandleData(aBuf, aLen, NULL, NULL);
    if (st == eFoundIt)
    {
      mDone = PR_TRUE;
      mDetectedCharset = mEscCharSetProber->GetCharSetName();
+      mDetectedConfidence = mEscCharSetProber->GetConfidence();
    }
    break;
  case eHighbyte:
@ -248,11 +262,13 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
    {
      if (mCharSetProbers[i])
      {
-        st = mCharSetProbers[i]->HandleData(aBuf, aLen);
+        st = mCharSetProbers[i]->HandleData(aBuf, aLen, NULL, NULL);
        if (st == eFoundIt)
        {
          mDone = PR_TRUE;
          mDetectedCharset = mCharSetProbers[i]->GetCharSetName();
+          mDetectedLanguage = mCharSetProbers[i]->GetLanguage();
+          mDetectedConfidence = mCharSetProbers[i]->GetConfidence();
          return NS_OK;
        }
      }
@ -305,7 +321,7 @@ void nsUniversalDetector::DataEnd()
       * when finding them.
       */
      mDone = PR_TRUE;
-      Report(mDetectedCharset, NULL, 1.0);
+      Report(mDetectedCharset, mDetectedLanguage, mDetectedConfidence);
      return;
  }

--- a/src/nsUniversalDetector.h
+++ b/src/nsUniversalDetector.h
@ -81,6 +81,8 @@ protected:
   PRBool  mGotData;
   char    mLastChar;
   const char *  mDetectedCharset;
+   const char *  mDetectedLanguage;
+   float         mDetectedConfidence;
   PRInt32 mBestGuess;
   PRUint32 mLanguageFilter;