Update code from upstream.

This commit is contained in:
BYVoid 2011-07-11 14:42:50 +08:00
parent 76a1be36f8
commit 84284eccf4
48 changed files with 411 additions and 532 deletions

0
debug.sh Executable file → Normal file
View File

0
release.sh Executable file → Normal file
View File

View File

@ -11,26 +11,26 @@ set(
LangBulgarianModel.cpp
LangCyrillicModel.cpp
LangGreekModel.cpp
LangHebrewModel.cpp
LangHungarianModel.cpp
LangHebrewModel.cpp
LangThaiModel.cpp
nsBig5Prober.cpp
nsHebrewProber.cpp
nsCharSetProber.cpp
nsEscCharsetProber.cpp
nsEscSM.cpp
nsBig5Prober.cpp
nsEUCJPProber.cpp
nsEUCKRProber.cpp
nsEUCTWProber.cpp
nsEscCharsetProber.cpp
nsEscSM.cpp
nsGB2312Prober.cpp
nsHebrewProber.cpp
nsLatin1Prober.cpp
nsMBCSGroupProber.cpp
nsMBCSSM.cpp
nsSBCharSetProber.cpp
nsSBCSGroupProber.cpp
nsSBCharSetProber.cpp
nsSJISProber.cpp
nsUniversalDetector.cpp
nsUTF8Prober.cpp
nsLatin1Prober.cpp
nsUniversalDetector.cpp
uchardet.cpp
)

View File

@ -46,15 +46,13 @@
#define SURE_YES 0.99f
#define SURE_NO 0.01f
#define MINIMUM_DATA_THRESHOLD 4
//return confidence based on received data
float CharDistributionAnalysis::GetConfidence()
float CharDistributionAnalysis::GetConfidence(void)
{
//if we didn't receive any character in our consideration range, or the
//number of frequent characters is below the minimum threshold, return
// number of frequent characters is below the minimum threshold, return
// negative answer
if (mTotalChars <= 0 || mFreqChars <= MINIMUM_DATA_THRESHOLD)
if (mTotalChars <= 0 || mFreqChars <= mDataThreshold)
return SURE_NO;
if (mTotalChars != mFreqChars) {

View File

@ -42,11 +42,12 @@
#define ENOUGH_DATA_THRESHOLD 1024
#define MINIMUM_DATA_THRESHOLD 4
class CharDistributionAnalysis
{
public:
CharDistributionAnalysis() {Reset();}
virtual ~CharDistributionAnalysis(){};
CharDistributionAnalysis() {Reset(PR_FALSE);}
//feed a block of data and do distribution analysis
void HandleData(const char* aBuf, PRUint32 aLen) {}
@ -72,14 +73,15 @@ public:
}
//return confidence based on existing data
float GetConfidence();
float GetConfidence(void);
//Reset analyser, clear any state
void Reset(void)
void Reset(PRBool aIsPreferredLanguage)
{
mDone = PR_FALSE;
mTotalChars = 0;
mFreqChars = 0;
mDataThreshold = aIsPreferredLanguage ? 0 : MINIMUM_DATA_THRESHOLD;
}
//This function is for future extension. Caller can use this function to control
@ -105,6 +107,9 @@ protected:
//Total characters encountered.
PRUint32 mTotalChars;
//Number of hi-byte characters needed to trigger detection
PRUint32 mDataThreshold;
//Mapping table to get frequency order from char order (get from GetOrder())
const PRInt16 *mCharToFreqOrder;

View File

@ -39,7 +39,7 @@
#include "JpCntx.h"
//This is hiragana 2-char sequence table, the number in each cell represents its frequency category
char jp2CharContext[83][83] =
const PRUint8 jp2CharContext[83][83] =
{
{ 0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,},
{ 2,4,0,4,0,3,0,4,0,3,4,4,4,2,4,3,3,4,3,2,3,3,4,2,3,3,3,2,4,1,4,3,3,1,5,4,3,4,3,4,3,5,3,0,3,5,4,2,0,3,1,0,3,3,0,3,3,0,1,1,0,4,3,0,3,3,0,4,0,2,0,3,5,5,5,5,4,0,4,1,0,3,4,},
@ -170,7 +170,7 @@ void JapaneseContextAnalysis::HandleData(const char* aBuf, PRUint32 aLen)
return;
}
void JapaneseContextAnalysis::Reset(void)
void JapaneseContextAnalysis::Reset(PRBool aIsPreferredLanguage)
{
mTotalRel = 0;
for (PRUint32 i = 0; i < NUM_OF_CATEGORY; i++)
@ -178,13 +178,14 @@ void JapaneseContextAnalysis::Reset(void)
mNeedToSkipCharNum = 0;
mLastCharOrder = -1;
mDone = PR_FALSE;
mDataThreshold = aIsPreferredLanguage ? 0 : MINIMUM_DATA_THRESHOLD;
}
#define DONT_KNOW (float)-1
float JapaneseContextAnalysis::GetConfidence()
float JapaneseContextAnalysis::GetConfidence(void)
{
//This is just one way to calculate confidence. It works well for me.
if (mTotalRel > MINIMUM_DATA_THRESHOLD)
if (mTotalRel > mDataThreshold)
return ((float)(mTotalRel - mRelSample[0]))/mTotalRel;
else
return (float)DONT_KNOW;
@ -227,5 +228,3 @@ PRInt32 EUCJPContextAnalysis::GetOrder(const char* str, PRUint32 *charLen)
return (unsigned char)*(str+1) - (unsigned char)0xa1;
return -1;
}

View File

@ -46,13 +46,12 @@
#define MAX_REL_THRESHOLD 1000
//hiragana frequency category table
extern char jp2CharContext[83][83];
extern const PRUint8 jp2CharContext[83][83];
class JapaneseContextAnalysis
{
public:
JapaneseContextAnalysis() {Reset();}
virtual ~JapaneseContextAnalysis(){};
JapaneseContextAnalysis() {Reset(PR_FALSE);}
void HandleData(const char* aBuf, PRUint32 aLen);
@ -75,8 +74,8 @@ public:
mLastCharOrder = order;
}
float GetConfidence();
void Reset(void);
float GetConfidence(void);
void Reset(PRBool aIsPreferredLanguage);
void SetOpion(){}
PRBool GotEnoughData() {return mTotalRel > ENOUGH_REL_THRESHOLD;}
@ -84,11 +83,14 @@ protected:
virtual PRInt32 GetOrder(const char* str, PRUint32 *charLen) = 0;
virtual PRInt32 GetOrder(const char* str) = 0;
//category counters, each interger counts sequence in its category
//category counters, each integer counts sequences in its category
PRUint32 mRelSample[NUM_OF_CATEGORY];
//total sequence received
PRUint32 mTotalRel;
//Number of sequences needed to trigger detection
PRUint32 mDataThreshold;
//The order of previous char
PRInt32 mLastCharOrder;

View File

@ -35,7 +35,6 @@
*
* ***** END LICENSE BLOCK ***** */
#include "uchardetDefine.h"
#include "nsSBCharSetProber.h"
/****************************************************************
255: Control characters that usually do not exist in any text
@ -49,7 +48,7 @@
//this table is modified based on win1251BulgarianCharToOrderMap, so
//only numbers <64 are sure to be valid
unsigned char Latin5_BulgarianCharToOrderMap[] =
static const unsigned char Latin5_BulgarianCharToOrderMap[] =
{
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
@ -69,7 +68,7 @@ unsigned char Latin5_BulgarianCharToOrderMap[] =
62,242,243,244, 58,245, 98,246,247,248,249,250,251, 91,252,253, //f0
};
unsigned char win1251BulgarianCharToOrderMap[] =
static const unsigned char win1251BulgarianCharToOrderMap[] =
{
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
@ -95,7 +94,7 @@ unsigned char win1251BulgarianCharToOrderMap[] =
//first 1024 sequences:3.0618%
//rest sequences: 0.2992%
//negative sequences: 0.0020%
char BulgarianLangModel[] =
static const PRUint8 BulgarianLangModel[] =
{
0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,3,3,3,3,3,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,2,2,3,2,2,1,2,2,
@ -227,20 +226,20 @@ char BulgarianLangModel[] =
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
};
SequenceModel Latin5BulgarianModel =
const SequenceModel Latin5BulgarianModel =
{
Latin5_BulgarianCharToOrderMap,
BulgarianLangModel,
(float)0.969392,
PR_FALSE,
CHARDET_ENCODING_ISO_8859_5
"ISO-8859-5"
};
SequenceModel Win1251BulgarianModel =
const SequenceModel Win1251BulgarianModel =
{
win1251BulgarianCharToOrderMap,
BulgarianLangModel,
(float)0.969392,
PR_FALSE,
CHARDET_ENCODING_WINDOWS_1251
"windows-1251"
};

View File

@ -35,14 +35,13 @@
*
* ***** END LICENSE BLOCK ***** */
#include "uchardetDefine.h"
#include "nsSBCharSetProber.h"
//KOI8-R language model
//Character Mapping Table:
unsigned char KOI8R_CharToOrderMap[] =
static const unsigned char KOI8R_CharToOrderMap[] =
{
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
@ -62,7 +61,7 @@ unsigned char KOI8R_CharToOrderMap[] =
35, 43, 45, 32, 40, 52, 56, 33, 61, 62, 51, 57, 47, 63, 50, 70, //f0
};
unsigned char win1251_CharToOrderMap[] =
static const unsigned char win1251_CharToOrderMap[] =
{
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
@ -82,7 +81,7 @@ unsigned char win1251_CharToOrderMap[] =
9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27, 16,
};
unsigned char latin5_CharToOrderMap[] =
static const unsigned char latin5_CharToOrderMap[] =
{
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
@ -102,7 +101,7 @@ unsigned char latin5_CharToOrderMap[] =
239, 68,240,241,242,243,244,245,246,247,248,249,250,251,252,255,
};
unsigned char macCyrillic_CharToOrderMap[] =
static const unsigned char macCyrillic_CharToOrderMap[] =
{
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
@ -122,7 +121,7 @@ unsigned char macCyrillic_CharToOrderMap[] =
9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27,255,
};
unsigned char IBM855_CharToOrderMap[] =
static const unsigned char IBM855_CharToOrderMap[] =
{
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
@ -142,7 +141,7 @@ unsigned char IBM855_CharToOrderMap[] =
250, 18, 62, 20, 51, 25, 57, 30, 47, 29, 63, 22, 50,251,252,255,
};
unsigned char IBM866_CharToOrderMap[] =
static const unsigned char IBM866_CharToOrderMap[] =
{
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
@ -168,7 +167,7 @@ unsigned char IBM866_CharToOrderMap[] =
//first 1024 sequences: 2.3389%
//rest sequences: 0.1237%
//negative sequences: 0.0009%
char RussianLangModel[] =
static const PRUint8 RussianLangModel[] =
{
0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,1,3,3,3,3,1,3,3,3,2,3,2,3,3,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,2,2,2,2,2,0,0,2,
@ -301,56 +300,56 @@ char RussianLangModel[] =
};
SequenceModel Koi8rModel =
const SequenceModel Koi8rModel =
{
KOI8R_CharToOrderMap,
RussianLangModel,
(float)0.976601,
PR_FALSE,
CHARDET_ENCODING_KOI8_R
"KOI8-R"
};
SequenceModel Win1251Model =
const SequenceModel Win1251Model =
{
win1251_CharToOrderMap,
RussianLangModel,
(float)0.976601,
PR_FALSE,
CHARDET_ENCODING_WINDOWS_1251
"windows-1251"
};
SequenceModel Latin5Model =
const SequenceModel Latin5Model =
{
latin5_CharToOrderMap,
RussianLangModel,
(float)0.976601,
PR_FALSE,
CHARDET_ENCODING_ISO_8859_5
"ISO-8859-5"
};
SequenceModel MacCyrillicModel =
const SequenceModel MacCyrillicModel =
{
macCyrillic_CharToOrderMap,
RussianLangModel,
(float)0.976601,
PR_FALSE,
CHARDET_ENCODING_MACCYRILLIC
"x-mac-cyrillic"
};
SequenceModel Ibm866Model =
const SequenceModel Ibm866Model =
{
IBM866_CharToOrderMap,
RussianLangModel,
(float)0.976601,
PR_FALSE,
CHARDET_ENCODING_IBM866
"IBM866"
};
SequenceModel Ibm855Model =
const SequenceModel Ibm855Model =
{
IBM855_CharToOrderMap,
RussianLangModel,
(float)0.976601,
PR_FALSE,
CHARDET_ENCODING_IBM855
"IBM855"
};

View File

@ -35,7 +35,6 @@
*
* ***** END LICENSE BLOCK ***** */
#include "uchardetDefine.h"
#include "nsSBCharSetProber.h"
/****************************************************************
255: Control characters that usually do not exist in any text
@ -46,7 +45,7 @@
*****************************************************************/
//Character Mapping Table:
unsigned char Latin7_CharToOrderMap[] =
static const unsigned char Latin7_CharToOrderMap[] =
{
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
@ -68,7 +67,7 @@ unsigned char Latin7_CharToOrderMap[] =
unsigned char win1253_CharToOrderMap[] =
static const unsigned char win1253_CharToOrderMap[] =
{
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
@ -94,7 +93,7 @@ unsigned char win1253_CharToOrderMap[] =
//first 1024 sequences:1.7001%
//rest sequences: 0.0359%
//negative sequences: 0.0148%
char GreekLangModel[] =
static const PRUint8 GreekLangModel[] =
{
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
@ -226,20 +225,20 @@ char GreekLangModel[] =
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
};
SequenceModel Latin7Model =
const SequenceModel Latin7Model =
{
Latin7_CharToOrderMap,
GreekLangModel,
(float)0.982851,
PR_FALSE,
CHARDET_ENCODING_ISO_8859_7
"ISO-8859-7"
};
SequenceModel Win1253Model =
const SequenceModel Win1253Model =
{
win1253_CharToOrderMap,
GreekLangModel,
(float)0.982851,
PR_FALSE,
CHARDET_ENCODING_WINDOWS_1253
"windows-1253"
};

View File

@ -37,7 +37,6 @@
*
* ***** END LICENSE BLOCK ***** */
#include "uchardetDefine.h"
#include "nsSBCharSetProber.h"
@ -51,7 +50,7 @@
//Windows-1255 language model
//Character Mapping Table:
unsigned char win1255_CharToOrderMap[] =
static const unsigned char win1255_CharToOrderMap[] =
{
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
@ -77,7 +76,7 @@ unsigned char win1255_CharToOrderMap[] =
//first 1024 sequences: 1.5981%
//rest sequences: 0.087%
//negative sequences: 0.0015%
char HebrewLangModel[] =
static const PRUint8 HebrewLangModel[] =
{
0,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,2,3,2,1,2,0,1,0,0,
3,0,3,1,0,0,1,3,2,0,1,1,2,0,2,2,2,1,1,1,1,2,1,1,1,2,0,0,2,2,0,1,
@ -209,12 +208,12 @@ char HebrewLangModel[] =
0,0,0,0,0,0,0,0,0,0,1,2,1,0,0,0,0,0,1,1,1,1,1,0,1,0,0,0,1,1,0,0,
};
SequenceModel Win1255Model =
const SequenceModel Win1255Model =
{
win1255_CharToOrderMap,
HebrewLangModel,
(float)0.984004,
PR_FALSE,
CHARDET_ENCODING_WINDOWS_1255
"windows-1255"
};

View File

@ -35,7 +35,6 @@
*
* ***** END LICENSE BLOCK ***** */
#include "uchardetDefine.h"
#include "nsSBCharSetProber.h"
/****************************************************************
255: Control characters that usually do not exist in any text
@ -46,7 +45,7 @@
*****************************************************************/
//Character Mapping Table:
unsigned char Latin2_HungarianCharToOrderMap[] =
static const unsigned char Latin2_HungarianCharToOrderMap[] =
{
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
@ -66,7 +65,7 @@ unsigned char Latin2_HungarianCharToOrderMap[] =
245,246,247, 25, 73, 42, 24,248,249,250, 31, 56, 29,251,252,253,
};
unsigned char win1250HungarianCharToOrderMap[] =
static const unsigned char win1250HungarianCharToOrderMap[] =
{
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
@ -92,7 +91,7 @@ unsigned char win1250HungarianCharToOrderMap[] =
//first 1024 sequences:5.2623%
//rest sequences: 0.8894%
//negative sequences: 0.0009%
char HungarianLangModel[] =
static const PRUint8 HungarianLangModel[] =
{
0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,2,3,3,1,1,2,2,2,2,2,1,2,
@ -224,20 +223,20 @@ char HungarianLangModel[] =
0,1,1,1,1,1,1,0,1,1,0,1,0,1,0,0,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,
};
SequenceModel Latin2HungarianModel =
const SequenceModel Latin2HungarianModel =
{
Latin2_HungarianCharToOrderMap,
HungarianLangModel,
(float)0.947368,
PR_TRUE,
CHARDET_ENCODING_ISO_8859_2
"ISO-8859-2"
};
SequenceModel Win1250HungarianModel =
const SequenceModel Win1250HungarianModel =
{
win1250HungarianCharToOrderMap,
HungarianLangModel,
(float)0.947368,
PR_TRUE,
CHARDET_ENCODING_WINDOWS_1250
"windows-1250"
};

View File

@ -35,7 +35,6 @@
*
* ***** END LICENSE BLOCK ***** */
#include "uchardetDefine.h"
#include "nsSBCharSetProber.h"
@ -50,7 +49,7 @@
//The following result for thai was collected from a limited sample (1M).
//Character Mapping Table:
unsigned char TIS620CharToOrderMap[] =
static const unsigned char TIS620CharToOrderMap[] =
{
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
@ -79,7 +78,7 @@ unsigned char TIS620CharToOrderMap[] =
//first 1024 sequences:7.3177%
//rest sequences: 1.0230%
//negative sequences: 0.0436%
char ThaiLangModel[] =
static const PRUint8 ThaiLangModel[] =
{
0,1,3,3,3,3,0,0,3,3,0,3,3,0,3,3,3,3,3,3,3,3,0,0,3,3,3,0,3,3,3,3,
0,3,3,0,0,0,1,3,0,3,3,2,3,3,0,1,2,3,3,3,3,0,2,0,2,0,0,3,2,1,2,2,
@ -212,11 +211,11 @@ char ThaiLangModel[] =
};
SequenceModel TIS620ThaiModel =
const SequenceModel TIS620ThaiModel =
{
TIS620CharToOrderMap,
ThaiLangModel,
(float)0.926386,
PR_FALSE,
CHARDET_ENCODING_TIS_620
"TIS-620"
};

View File

@ -41,7 +41,7 @@ void nsBig5Prober::Reset(void)
{
mCodingSM->Reset();
mState = eDetecting;
mDistributionAnalyser.Reset();
mDistributionAnalyser.Reset(mIsPreferredLanguage);
}
nsProbingState nsBig5Prober::HandleData(const char* aBuf, PRUint32 aLen)
@ -51,11 +51,6 @@ nsProbingState nsBig5Prober::HandleData(const char* aBuf, PRUint32 aLen)
for (PRUint32 i = 0; i < aLen; i++)
{
codingState = mCodingSM->NextState(aBuf[i]);
if (codingState == eError)
{
mState = eNotMe;
break;
}
if (codingState == eItsMe)
{
mState = eFoundIt;

View File

@ -38,18 +38,19 @@
#ifndef nsBig5Prober_h__
#define nsBig5Prober_h__
#include "uchardetDefine.h"
#include "nsCharSetProber.h"
#include "nsCodingStateMachine.h"
#include "CharDistribution.h"
class nsBig5Prober: public nsCharSetProber {
public:
nsBig5Prober(void){mCodingSM = new nsCodingStateMachine(&Big5SMModel);
Reset();}
nsBig5Prober(PRBool aIsPreferredLanguage)
:mIsPreferredLanguage(aIsPreferredLanguage)
{mCodingSM = new nsCodingStateMachine(&Big5SMModel);
Reset();}
virtual ~nsBig5Prober(void){delete mCodingSM;}
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
const char* GetCharSetName() {return CHARDET_ENCODING_BIG5;}
const char* GetCharSetName() {return "Big5";}
nsProbingState GetState(void) {return mState;}
void Reset(void);
float GetConfidence(void);
@ -64,6 +65,7 @@ protected:
//Big5ContextAnalysis mContextAnalyser;
Big5DistributionAnalysis mDistributionAnalyser;
char mLastChar[2];
PRBool mIsPreferredLanguage;
};

View File

@ -61,7 +61,7 @@ public:
virtual void SetOpion() = 0;
#ifdef DEBUG_chardet
virtual void DumpStatus() {}
virtual void DumpStatus() {};
#endif
// Helper functions used in the Latin1 and Group probers.

View File

@ -59,10 +59,7 @@ typedef struct
class nsCodingStateMachine {
public:
nsCodingStateMachine(SMModel* sm){
mCurrentState = eStart;
mModel = sm;
}
nsCodingStateMachine(const SMModel* sm) : mModel(sm) { mCurrentState = eStart; }
nsSMState NextState(char c){
//for each byte we get its class; if it is the first byte, we also get the byte length
PRUint32 byteCls = GETCLASS(c);
@ -86,23 +83,22 @@ protected:
PRUint32 mCurrentCharLen;
PRUint32 mCurrentBytePos;
SMModel *mModel;
const SMModel *mModel;
};
extern SMModel UTF8SMModel;
extern SMModel Big5SMModel;
extern SMModel EUCJPSMModel;
extern SMModel EUCKRSMModel;
extern SMModel EUCTWSMModel;
extern SMModel GB18030SMModel;
extern SMModel SJISSMModel;
extern SMModel UCS2BESMModel;
extern const SMModel UTF8SMModel;
extern const SMModel Big5SMModel;
extern const SMModel EUCJPSMModel;
extern const SMModel EUCKRSMModel;
extern const SMModel EUCTWSMModel;
extern const SMModel GB18030SMModel;
extern const SMModel SJISSMModel;
extern SMModel HZSMModel;
extern SMModel ISO2022CNSMModel;
extern SMModel ISO2022JPSMModel;
extern SMModel ISO2022KRSMModel;
extern const SMModel HZSMModel;
extern const SMModel ISO2022CNSMModel;
extern const SMModel ISO2022JPSMModel;
extern const SMModel ISO2022KRSMModel;
#endif /* nsCodingStateMachine_h__ */

View File

@ -46,8 +46,8 @@ void nsEUCJPProber::Reset(void)
{
mCodingSM->Reset();
mState = eDetecting;
mContextAnalyser.Reset();
mDistributionAnalyser.Reset();
mContextAnalyser.Reset(mIsPreferredLanguage);
mDistributionAnalyser.Reset(mIsPreferredLanguage);
}
nsProbingState nsEUCJPProber::HandleData(const char* aBuf, PRUint32 aLen)
@ -57,11 +57,6 @@ nsProbingState nsEUCJPProber::HandleData(const char* aBuf, PRUint32 aLen)
for (PRUint32 i = 0; i < aLen; i++)
{
codingState = mCodingSM->NextState(aBuf[i]);
if (codingState == eError)
{
mState = eNotMe;
break;
}
if (codingState == eItsMe)
{
mState = eFoundIt;

View File

@ -43,7 +43,6 @@
#ifndef nsEUCJPProber_h__
#define nsEUCJPProber_h__
#include "uchardetDefine.h"
#include "nsCharSetProber.h"
#include "nsCodingStateMachine.h"
#include "JpCntx.h"
@ -51,11 +50,13 @@
class nsEUCJPProber: public nsCharSetProber {
public:
nsEUCJPProber(void){mCodingSM = new nsCodingStateMachine(&EUCJPSMModel);
Reset();}
nsEUCJPProber(PRBool aIsPreferredLanguage)
:mIsPreferredLanguage(aIsPreferredLanguage)
{mCodingSM = new nsCodingStateMachine(&EUCJPSMModel);
Reset();}
virtual ~nsEUCJPProber(void){delete mCodingSM;}
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
const char* GetCharSetName() {return CHARDET_ENCODING_EUC_JP;}
const char* GetCharSetName() {return "EUC-JP";}
nsProbingState GetState(void) {return mState;}
void Reset(void);
float GetConfidence(void);
@ -69,6 +70,7 @@ protected:
EUCJPDistributionAnalysis mDistributionAnalyser;
char mLastChar[2];
PRBool mIsPreferredLanguage;
};

View File

@ -41,7 +41,7 @@ void nsEUCKRProber::Reset(void)
{
mCodingSM->Reset();
mState = eDetecting;
mDistributionAnalyser.Reset();
mDistributionAnalyser.Reset(mIsPreferredLanguage);
//mContextAnalyser.Reset();
}
@ -52,11 +52,6 @@ nsProbingState nsEUCKRProber::HandleData(const char* aBuf, PRUint32 aLen)
for (PRUint32 i = 0; i < aLen; i++)
{
codingState = mCodingSM->NextState(aBuf[i]);
if (codingState == eError)
{
mState = eNotMe;
break;
}
if (codingState == eItsMe)
{
mState = eFoundIt;

View File

@ -38,18 +38,20 @@
#ifndef nsEUCKRProber_h__
#define nsEUCKRProber_h__
#include "uchardetDefine.h"
#include "nsCharSetProber.h"
#include "nsCodingStateMachine.h"
#include "CharDistribution.h"
class nsEUCKRProber: public nsCharSetProber {
public:
nsEUCKRProber(void){mCodingSM = new nsCodingStateMachine(&EUCKRSMModel);
Reset();}
nsEUCKRProber(PRBool aIsPreferredLanguage)
:mIsPreferredLanguage(aIsPreferredLanguage)
{mCodingSM = new nsCodingStateMachine(&EUCKRSMModel);
Reset();
}
virtual ~nsEUCKRProber(void){delete mCodingSM;}
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
const char* GetCharSetName() {return CHARDET_ENCODING_EUC_KR;}
const char* GetCharSetName() {return "EUC-KR";}
nsProbingState GetState(void) {return mState;}
void Reset(void);
float GetConfidence(void);
@ -64,6 +66,7 @@ protected:
//EUCKRContextAnalysis mContextAnalyser;
EUCKRDistributionAnalysis mDistributionAnalyser;
char mLastChar[2];
PRBool mIsPreferredLanguage;
};

View File

@ -41,7 +41,7 @@ void nsEUCTWProber::Reset(void)
{
mCodingSM->Reset();
mState = eDetecting;
mDistributionAnalyser.Reset();
mDistributionAnalyser.Reset(mIsPreferredLanguage);
//mContextAnalyser.Reset();
}
@ -52,11 +52,6 @@ nsProbingState nsEUCTWProber::HandleData(const char* aBuf, PRUint32 aLen)
for (PRUint32 i = 0; i < aLen; i++)
{
codingState = mCodingSM->NextState(aBuf[i]);
if (codingState == eError)
{
mState = eNotMe;
break;
}
if (codingState == eItsMe)
{
mState = eFoundIt;

View File

@ -38,18 +38,19 @@
#ifndef nsEUCTWProber_h__
#define nsEUCTWProber_h__
#include "uchardetDefine.h"
#include "nsCharSetProber.h"
#include "nsCodingStateMachine.h"
#include "CharDistribution.h"
class nsEUCTWProber: public nsCharSetProber {
public:
nsEUCTWProber(void){mCodingSM = new nsCodingStateMachine(&EUCTWSMModel);
Reset();}
nsEUCTWProber(PRBool aIsPreferredLanguage)
:mIsPreferredLanguage(aIsPreferredLanguage)
{mCodingSM = new nsCodingStateMachine(&EUCTWSMModel);
Reset();}
virtual ~nsEUCTWProber(void){delete mCodingSM;}
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
const char* GetCharSetName() {return CHARDET_ENCODING_EUC_TW;}
const char* GetCharSetName() {return "x-euc-tw";}
nsProbingState GetState(void) {return mState;}
void Reset(void);
float GetConfidence(void);
@ -64,6 +65,7 @@ protected:
//EUCTWContextAnalysis mContextAnalyser;
EUCTWDistributionAnalysis mDistributionAnalyser;
char mLastChar[2];
PRBool mIsPreferredLanguage;
};

View File

@ -37,13 +37,21 @@
#include "nsEscCharsetProber.h"
#include "nsUniversalDetector.h"
nsEscCharSetProber::nsEscCharSetProber(void)
nsEscCharSetProber::nsEscCharSetProber(PRUint32 aLanguageFilter)
{
mCodingSM[0] = new nsCodingStateMachine(&HZSMModel);
mCodingSM[1] = new nsCodingStateMachine(&ISO2022CNSMModel);
mCodingSM[2] = new nsCodingStateMachine(&ISO2022JPSMModel);
mCodingSM[3] = new nsCodingStateMachine(&ISO2022KRSMModel);
for (PRUint32 i = 0; i < NUM_OF_ESC_CHARSETS; i++)
mCodingSM[i] = nsnull;
if (aLanguageFilter & NS_FILTER_CHINESE_SIMPLIFIED)
{
mCodingSM[0] = new nsCodingStateMachine(&HZSMModel);
mCodingSM[1] = new nsCodingStateMachine(&ISO2022CNSMModel);
}
if (aLanguageFilter & NS_FILTER_JAPANESE)
mCodingSM[2] = new nsCodingStateMachine(&ISO2022JPSMModel);
if (aLanguageFilter & NS_FILTER_KOREAN)
mCodingSM[3] = new nsCodingStateMachine(&ISO2022KRSMModel);
mActiveSM = NUM_OF_ESC_CHARSETS;
mState = eDetecting;
mDetectedCharset = nsnull;
@ -59,7 +67,8 @@ void nsEscCharSetProber::Reset(void)
{
mState = eDetecting;
for (PRUint32 i = 0; i < NUM_OF_ESC_CHARSETS; i++)
mCodingSM[i]->Reset();
if (mCodingSM[i])
mCodingSM[i]->Reset();
mActiveSM = NUM_OF_ESC_CHARSETS;
mDetectedCharset = nsnull;
}
@ -74,30 +83,15 @@ nsProbingState nsEscCharSetProber::HandleData(const char* aBuf, PRUint32 aLen)
{
for (j = mActiveSM-1; j>= 0; j--)
{
//byte is feed to all active state machine
codingState = mCodingSM[j]->NextState(aBuf[i]);
if (codingState == eError)
if (mCodingSM[j])
{
//got negative answer for this state machine, make it inactive
mActiveSM--;
if (mActiveSM == 0)
codingState = mCodingSM[j]->NextState(aBuf[i]);
if (codingState == eItsMe)
{
mState = eNotMe;
mState = eFoundIt;
mDetectedCharset = mCodingSM[j]->GetCodingStateMachine();
return mState;
}
else if (j != (PRInt32)mActiveSM)
{
nsCodingStateMachine* t;
t = mCodingSM[mActiveSM];
mCodingSM[mActiveSM] = mCodingSM[j];
mCodingSM[j] = t;
}
}
else if (codingState == eItsMe)
{
mState = eFoundIt;
mDetectedCharset = mCodingSM[j]->GetCodingStateMachine();
return mState;
}
}
}

View File

@ -45,7 +45,7 @@
class nsEscCharSetProber: public nsCharSetProber {
public:
nsEscCharSetProber(void);
nsEscCharSetProber(PRUint32 aLanguageFilter);
virtual ~nsEscCharSetProber(void);
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
const char* GetCharSetName() {return mDetectedCharset;}

View File

@ -20,7 +20,6 @@
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Kazutoshi Satoda
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
@ -35,10 +34,9 @@
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
#include "uchardetDefine.h"
#include "nsCodingStateMachine.h"
static PRUint32 HZ_cls[ 256 / 8 ] = {
static const PRUint32 HZ_cls[ 256 / 8 ] = {
PCK4BITS(1,0,0,0,0,0,0,0), // 00 - 07
PCK4BITS(0,0,0,0,0,0,0,0), // 08 - 0f
PCK4BITS(0,0,0,0,0,0,0,0), // 10 - 17
@ -74,7 +72,7 @@ PCK4BITS(1,1,1,1,1,1,1,1) // f8 - ff
};
static PRUint32 HZ_st [ 6] = {
static const PRUint32 HZ_st [ 6] = {
PCK4BITS(eStart,eError, 3,eStart,eStart,eStart,eError,eError),//00-07
PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//08-0f
PCK4BITS(eItsMe,eItsMe,eError,eError,eStart,eStart, 4,eError),//10-17
@ -85,16 +83,16 @@ PCK4BITS( 4,eItsMe,eStart,eStart,eStart,eStart,eStart,eStart) //28-2f
static const PRUint32 HZCharLenTable[] = {0, 0, 0, 0, 0, 0};
SMModel HZSMModel = {
const SMModel HZSMModel = {
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, HZ_cls },
6,
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, HZ_st },
HZCharLenTable,
CHARDET_ENCODING_HZ_GB_2312,
"HZ-GB-2312",
};
static PRUint32 ISO2022CN_cls [ 256 / 8 ] = {
static const PRUint32 ISO2022CN_cls [ 256 / 8 ] = {
PCK4BITS(2,0,0,0,0,0,0,0), // 00 - 07
PCK4BITS(0,0,0,0,0,0,0,0), // 08 - 0f
PCK4BITS(0,0,0,0,0,0,0,0), // 10 - 17
@ -130,7 +128,7 @@ PCK4BITS(2,2,2,2,2,2,2,2) // f8 - ff
};
static PRUint32 ISO2022CN_st [ 8] = {
static const PRUint32 ISO2022CN_st [ 8] = {
PCK4BITS(eStart, 3,eError,eStart,eStart,eStart,eStart,eStart),//00-07
PCK4BITS(eStart,eError,eError,eError,eError,eError,eError,eError),//08-0f
PCK4BITS(eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe),//10-17
@ -143,15 +141,15 @@ PCK4BITS(eError,eError,eError,eError,eError,eItsMe,eError,eStart) //38-3f
static const PRUint32 ISO2022CNCharLenTable[] = {0, 0, 0, 0, 0, 0, 0, 0, 0};
SMModel ISO2022CNSMModel = {
const SMModel ISO2022CNSMModel = {
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022CN_cls },
9,
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022CN_st },
ISO2022CNCharLenTable,
CHARDET_ENCODING_ISO_2022_CN,
"ISO-2022-CN",
};
static PRUint32 ISO2022JP_cls [ 256 / 8 ] = {
static const PRUint32 ISO2022JP_cls [ 256 / 8 ] = {
PCK4BITS(2,0,0,0,0,0,0,0), // 00 - 07
PCK4BITS(0,0,0,0,0,0,2,2), // 08 - 0f
PCK4BITS(0,0,0,0,0,0,0,0), // 10 - 17
@ -187,7 +185,7 @@ PCK4BITS(2,2,2,2,2,2,2,2) // f8 - ff
};
static PRUint32 ISO2022JP_st [ 9] = {
static const PRUint32 ISO2022JP_st [ 9] = {
PCK4BITS(eStart, 3,eError,eStart,eStart,eStart,eStart,eStart),//00-07
PCK4BITS(eStart,eStart,eError,eError,eError,eError,eError,eError),//08-0f
PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//10-17
@ -199,17 +197,17 @@ PCK4BITS(eError,eError,eError,eItsMe,eError,eError,eError,eError),//38-3f
PCK4BITS(eError,eError,eError,eError,eItsMe,eError,eStart,eStart) //40-47
};
static const PRUint32 ISO2022JPCharLenTable[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
static const PRUint32 ISO2022JPCharLenTable[] = {0, 0, 0, 0, 0, 0, 0, 0};
SMModel ISO2022JPSMModel = {
const SMModel ISO2022JPSMModel = {
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022JP_cls },
10,
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022JP_st },
ISO2022JPCharLenTable,
CHARDET_ENCODING_ISO_2022_JP,
"ISO-2022-JP",
};
static PRUint32 ISO2022KR_cls [ 256 / 8 ] = {
static const PRUint32 ISO2022KR_cls [ 256 / 8 ] = {
PCK4BITS(2,0,0,0,0,0,0,0), // 00 - 07
PCK4BITS(0,0,0,0,0,0,0,0), // 08 - 0f
PCK4BITS(0,0,0,0,0,0,0,0), // 10 - 17
@ -245,7 +243,7 @@ PCK4BITS(2,2,2,2,2,2,2,2) // f8 - ff
};
static PRUint32 ISO2022KR_st [ 5] = {
static const PRUint32 ISO2022KR_st [ 5] = {
PCK4BITS(eStart, 3,eError,eStart,eStart,eStart,eError,eError),//00-07
PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//08-0f
PCK4BITS(eItsMe,eItsMe,eError,eError,eError, 4,eError,eError),//10-17
@ -255,11 +253,11 @@ PCK4BITS(eError,eError,eError,eItsMe,eStart,eStart,eStart,eStart) //20-27
static const PRUint32 ISO2022KRCharLenTable[] = {0, 0, 0, 0, 0, 0};
SMModel ISO2022KRSMModel = {
const SMModel ISO2022KRSMModel = {
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022KR_cls },
6,
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022KR_st },
ISO2022KRCharLenTable,
CHARDET_ENCODING_ISO_2022_KR,
"ISO-2022-KR",
};

View File

@ -46,7 +46,7 @@ void nsGB18030Prober::Reset(void)
{
mCodingSM->Reset();
mState = eDetecting;
mDistributionAnalyser.Reset();
mDistributionAnalyser.Reset(mIsPreferredLanguage);
//mContextAnalyser.Reset();
}
@ -57,11 +57,6 @@ nsProbingState nsGB18030Prober::HandleData(const char* aBuf, PRUint32 aLen)
for (PRUint32 i = 0; i < aLen; i++)
{
codingState = mCodingSM->NextState(aBuf[i]);
if (codingState == eError)
{
mState = eNotMe;
break;
}
if (codingState == eItsMe)
{
mState = eFoundIt;

View File

@ -38,7 +38,6 @@
#ifndef nsGB2312Prober_h__
#define nsGB2312Prober_h__
#include "uchardetDefine.h"
#include "nsCharSetProber.h"
#include "nsCodingStateMachine.h"
#include "CharDistribution.h"
@ -47,11 +46,13 @@
class nsGB18030Prober: public nsCharSetProber {
public:
nsGB18030Prober(void){mCodingSM = new nsCodingStateMachine(&GB18030SMModel);
Reset();}
nsGB18030Prober(PRBool aIsPreferredLanguage)
:mIsPreferredLanguage(aIsPreferredLanguage)
{mCodingSM = new nsCodingStateMachine(&GB18030SMModel);
Reset();}
virtual ~nsGB18030Prober(void){delete mCodingSM;}
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
const char* GetCharSetName() {return CHARDET_ENCODING_GB18030;}
const char* GetCharSetName() {return "gb18030";}
nsProbingState GetState(void) {return mState;}
void Reset(void);
float GetConfidence(void);
@ -66,6 +67,7 @@ protected:
//GB2312ContextAnalysis mContextAnalyser;
GB2312DistributionAnalysis mDistributionAnalyser;
char mLastChar[2];
PRBool mIsPreferredLanguage;
};

View File

@ -35,7 +35,6 @@
*
* ***** END LICENSE BLOCK ***** */
#include "uchardetDefine.h"
#include "nsHebrewProber.h"
#include <stdio.h>
@ -59,8 +58,8 @@
// If the difference is below this, don't rely at all on the model score distance.
#define MIN_MODEL_DISTANCE (0.01)
#define VISUAL_HEBREW_NAME (CHARDET_ENCODING_ISO_8859_8)
#define LOGICAL_HEBREW_NAME (CHARDET_ENCODING_WINDOWS_1255)
#define VISUAL_HEBREW_NAME ("ISO-8859-8")
#define LOGICAL_HEBREW_NAME ("windows-1255")
PRBool nsHebrewProber::isFinal(char c)
{

View File

@ -50,7 +50,7 @@
#define ASO 7 // accent small other
#define CLASS_NUM 8 // total classes
static unsigned char Latin1_CharToClass[] =
static const unsigned char Latin1_CharToClass[] =
{
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 00 - 07
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 08 - 0F
@ -92,7 +92,7 @@ static unsigned char Latin1_CharToClass[] =
2 : normal
3 : very likely
*/
static unsigned char Latin1ClassModel[] =
static const unsigned char Latin1ClassModel[] =
{
/* UDF OTH ASC ASS ACV ACO ASV ASO */
/*UDF*/ 0, 0, 0, 0, 0, 0, 0, 0,

View File

@ -39,7 +39,6 @@
#ifndef nsLatin1Prober_h__
#define nsLatin1Prober_h__
#include "uchardetDefine.h"
#include "nsCharSetProber.h"
#define FREQ_CAT_NUM 4
@ -49,7 +48,7 @@ public:
nsLatin1Prober(void){Reset();}
virtual ~nsLatin1Prober(void){}
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
const char* GetCharSetName() {return CHARDET_ENCODING_WINDOWS_1252;}
const char* GetCharSetName() {return "windows-1252";}
nsProbingState GetState(void) {return mState;}
void Reset(void);
float GetConfidence(void);

View File

@ -21,6 +21,7 @@
*
* Contributor(s):
* Shy Shalom <shooshX@gmail.com>
* Proofpoint, Inc.
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
@ -36,12 +37,12 @@
*
* ***** END LICENSE BLOCK ***** */
#include <stdio.h>
#include "prmem.h"
#include "nsMBCSGroupProber.h"
#include "nsUniversalDetector.h"
#ifdef DEBUG_chardet
char *ProberName[] =
#if defined(DEBUG_chardet) || defined(DEBUG_jgmyers)
const char *ProberName[] =
{
"UTF8",
"SJIS",
@ -54,15 +55,26 @@ char *ProberName[] =
#endif
nsMBCSGroupProber::nsMBCSGroupProber()
nsMBCSGroupProber::nsMBCSGroupProber(PRUint32 aLanguageFilter)
{
for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
mProbers[i] = nsnull;
mProbers[0] = new nsUTF8Prober();
mProbers[1] = new nsSJISProber();
mProbers[2] = new nsEUCJPProber();
mProbers[3] = new nsGB18030Prober();
mProbers[4] = new nsEUCKRProber();
mProbers[5] = new nsBig5Prober();
mProbers[6] = new nsEUCTWProber();
if (aLanguageFilter & NS_FILTER_JAPANESE)
{
mProbers[1] = new nsSJISProber(aLanguageFilter == NS_FILTER_JAPANESE);
mProbers[2] = new nsEUCJPProber(aLanguageFilter == NS_FILTER_JAPANESE);
}
if (aLanguageFilter & NS_FILTER_CHINESE_SIMPLIFIED)
mProbers[3] = new nsGB18030Prober(aLanguageFilter == NS_FILTER_CHINESE_SIMPLIFIED);
if (aLanguageFilter & NS_FILTER_KOREAN)
mProbers[4] = new nsEUCKRProber(aLanguageFilter == NS_FILTER_KOREAN);
if (aLanguageFilter & NS_FILTER_CHINESE_TRADITIONAL)
{
mProbers[5] = new nsBig5Prober(aLanguageFilter == NS_FILTER_CHINESE_TRADITIONAL);
mProbers[6] = new nsEUCTWProber(aLanguageFilter == NS_FILTER_CHINESE_TRADITIONAL);
}
Reset();
}
@ -101,62 +113,59 @@ void nsMBCSGroupProber::Reset(void)
}
mBestGuess = -1;
mState = eDetecting;
mKeepNext = 0;
}
nsProbingState nsMBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen)
{
nsProbingState st;
PRUint32 i;
PRUint32 start = 0;
PRUint32 keepNext = mKeepNext;
//do filtering to reduce load to probers
char *highbyteBuf;
char *hptr;
PRBool keepNext = PR_TRUE; //assume previous is not ascii, it will do no harm except add some noise
hptr = highbyteBuf = (char*)PR_Malloc(aLen);
if (!hptr)
return mState;
for (i = 0; i < aLen; i++)
for (PRUint32 pos = 0; pos < aLen; ++pos)
{
if (aBuf[i] & 0x80)
if (aBuf[pos] & 0x80)
{
*hptr++ = aBuf[i];
keepNext = PR_TRUE;
if (!keepNext)
start = pos;
keepNext = 2;
}
else
else if (keepNext)
{
//if previous is highbyte, keep this even it is a ASCII
if (keepNext)
if (--keepNext == 0)
{
*hptr++ = aBuf[i];
keepNext = PR_FALSE;
for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
{
if (!mIsActive[i])
continue;
st = mProbers[i]->HandleData(aBuf + start, pos + 1 - start);
if (st == eFoundIt)
{
mBestGuess = i;
mState = eFoundIt;
return mState;
}
}
}
}
}
for (i = 0; i < NUM_OF_PROBERS; i++)
{
if (!mIsActive[i])
continue;
st = mProbers[i]->HandleData(highbyteBuf, hptr - highbyteBuf);
if (st == eFoundIt)
{
mBestGuess = i;
mState = eFoundIt;
break;
}
else if (st == eNotMe)
{
mIsActive[i] = PR_FALSE;
mActiveNum--;
if (mActiveNum <= 0)
{
mState = eNotMe;
break;
}
}
if (keepNext) {
for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
{
if (!mIsActive[i])
continue;
st = mProbers[i]->HandleData(aBuf + start, aLen - start);
if (st == eFoundIt)
{
mBestGuess = i;
mState = eFoundIt;
return mState;
}
}
}
PR_FREEIF(highbyteBuf);
mKeepNext = keepNext;
return mState;
}
@ -207,3 +216,15 @@ void nsMBCSGroupProber::DumpStatus()
}
}
#endif
#ifdef DEBUG_jgmyers
// Debug helper: writes one DetectorState record per prober slot into
// `states`, starting at index `offset`, and advances `offset` past the
// records written (NUM_OF_PROBERS in total).
//   states - caller-provided array that receives the records
//   offset - in/out index of the next free record in `states`
// A slot that is not active reports a confidence of 0.0; its prober is
// never queried.
void nsMBCSGroupProber::GetDetectorState(nsUniversalDetector::DetectorState (&states)[nsUniversalDetector::NumDetectors], PRUint32 &offset)
{
  for (PRUint32 slot = 0; slot < NUM_OF_PROBERS; ++slot, ++offset) {
    states[offset].name = ProberName[slot];
    states[offset].isActive = mIsActive[slot];
    if (mIsActive[slot])
      states[offset].confidence = mProbers[slot]->GetConfidence();
    else
      states[offset].confidence = 0.0;
  }
}
#endif /* DEBUG_jgmyers */

View File

@ -20,6 +20,7 @@
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Proofpoint, Inc.
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
@ -50,7 +51,7 @@
class nsMBCSGroupProber: public nsCharSetProber {
public:
nsMBCSGroupProber();
nsMBCSGroupProber(PRUint32 aLanguageFilter);
virtual ~nsMBCSGroupProber();
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
const char* GetCharSetName();
@ -62,6 +63,9 @@ public:
#ifdef DEBUG_chardet
void DumpStatus();
#endif
#ifdef DEBUG_jgmyers
void GetDetectorState(nsUniversalDetector::DetectorState (&states)[nsUniversalDetector::NumDetectors], PRUint32 &offset);
#endif
protected:
nsProbingState mState;
@ -69,6 +73,7 @@ protected:
PRBool mIsActive[NUM_OF_PROBERS];
PRInt32 mBestGuess;
PRUint32 mActiveNum;
PRUint32 mKeepNext;
};
#endif /* nsMBCSGroupProber_h__ */

View File

@ -34,7 +34,6 @@
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
#include "uchardetDefine.h"
#include "nsCodingStateMachine.h"
/*
@ -45,7 +44,7 @@ Modification from frank tang's original work:
// BIG5
static PRUint32 BIG5_cls [ 256 / 8 ] = {
static const PRUint32 BIG5_cls [ 256 / 8 ] = {
//PCK4BITS(0,1,1,1,1,1,1,1), // 00 - 07
PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07 //allow 0x00 as legal value
PCK4BITS(1,1,1,1,1,1,0,0), // 08 - 0f
@ -82,7 +81,7 @@ PCK4BITS(3,3,3,3,3,3,3,0) // f8 - ff
};
static PRUint32 BIG5_st [ 3] = {
static const PRUint32 BIG5_st [ 3] = {
PCK4BITS(eError,eStart,eStart, 3,eError,eError,eError,eError),//00-07
PCK4BITS(eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError),//08-0f
PCK4BITS(eError,eStart,eStart,eStart,eStart,eStart,eStart,eStart) //10-17
@ -90,15 +89,15 @@ PCK4BITS(eError,eStart,eStart,eStart,eStart,eStart,eStart,eStart) //10-17
static const PRUint32 Big5CharLenTable[] = {0, 1, 1, 2, 0};
SMModel Big5SMModel = {
SMModel const Big5SMModel = {
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, BIG5_cls },
5,
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, BIG5_st },
Big5CharLenTable,
CHARDET_ENCODING_BIG5,
"Big5",
};
static PRUint32 EUCJP_cls [ 256 / 8 ] = {
static const PRUint32 EUCJP_cls [ 256 / 8 ] = {
//PCK4BITS(5,4,4,4,4,4,4,4), // 00 - 07
PCK4BITS(4,4,4,4,4,4,4,4), // 00 - 07
PCK4BITS(4,4,4,4,4,4,5,5), // 08 - 0f
@ -135,7 +134,7 @@ PCK4BITS(0,0,0,0,0,0,0,5) // f8 - ff
};
static PRUint32 EUCJP_st [ 5] = {
static const PRUint32 EUCJP_st [ 5] = {
PCK4BITS( 3, 4, 3, 5,eStart,eError,eError,eError),//00-07
PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//08-0f
PCK4BITS(eItsMe,eItsMe,eStart,eError,eStart,eError,eError,eError),//10-17
@ -145,15 +144,15 @@ PCK4BITS( 3,eError,eError,eError,eStart,eStart,eStart,eStart) //20-27
static const PRUint32 EUCJPCharLenTable[] = {2, 2, 2, 3, 1, 0};
SMModel EUCJPSMModel = {
const SMModel EUCJPSMModel = {
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCJP_cls },
6,
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCJP_st },
EUCJPCharLenTable,
CHARDET_ENCODING_EUC_JP,
"EUC-JP",
};
static PRUint32 EUCKR_cls [ 256 / 8 ] = {
static const PRUint32 EUCKR_cls [ 256 / 8 ] = {
//PCK4BITS(0,1,1,1,1,1,1,1), // 00 - 07
PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07
PCK4BITS(1,1,1,1,1,1,0,0), // 08 - 0f
@ -190,22 +189,22 @@ PCK4BITS(2,2,2,2,2,2,2,0) // f8 - ff
};
static PRUint32 EUCKR_st [ 2] = {
static const PRUint32 EUCKR_st [ 2] = {
PCK4BITS(eError,eStart, 3,eError,eError,eError,eError,eError),//00-07
PCK4BITS(eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,eStart,eStart) //08-0f
};
static const PRUint32 EUCKRCharLenTable[] = {0, 1, 2, 0};
SMModel EUCKRSMModel = {
const SMModel EUCKRSMModel = {
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCKR_cls },
4,
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCKR_st },
EUCKRCharLenTable,
CHARDET_ENCODING_EUC_KR,
"EUC-KR",
};
static PRUint32 EUCTW_cls [ 256 / 8 ] = {
static const PRUint32 EUCTW_cls [ 256 / 8 ] = {
//PCK4BITS(0,2,2,2,2,2,2,2), // 00 - 07
PCK4BITS(2,2,2,2,2,2,2,2), // 00 - 07
PCK4BITS(2,2,2,2,2,2,0,0), // 08 - 0f
@ -242,7 +241,7 @@ PCK4BITS(3,3,3,3,3,3,3,0) // f8 - ff
};
static PRUint32 EUCTW_st [ 6] = {
static const PRUint32 EUCTW_st [ 6] = {
PCK4BITS(eError,eError,eStart, 3, 3, 3, 4,eError),//00-07
PCK4BITS(eError,eError,eError,eError,eError,eError,eItsMe,eItsMe),//08-0f
PCK4BITS(eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,eStart,eError),//10-17
@ -253,12 +252,12 @@ PCK4BITS(eStart,eError,eStart,eStart,eStart,eStart,eStart,eStart) //28-2f
static const PRUint32 EUCTWCharLenTable[] = {0, 0, 1, 2, 2, 2, 3};
SMModel EUCTWSMModel = {
const SMModel EUCTWSMModel = {
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCTW_cls },
7,
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCTW_st },
EUCTWCharLenTable,
CHARDET_ENCODING_EUC_TW,
"x-euc-tw",
};
/* obsolete GB2312 by gb18030
@ -317,7 +316,7 @@ SMModel GB2312SMModel = {
// the following state machine data was created by perl script in
// intl/chardet/tools. It should be the same as in PSM detector.
static PRUint32 GB18030_cls [ 256 / 8 ] = {
static const PRUint32 GB18030_cls [ 256 / 8 ] = {
PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07
PCK4BITS(1,1,1,1,1,1,0,0), // 08 - 0f
PCK4BITS(1,1,1,1,1,1,1,1), // 10 - 17
@ -353,7 +352,7 @@ PCK4BITS(6,6,6,6,6,6,6,0) // f8 - ff
};
static PRUint32 GB18030_st [ 6] = {
static const PRUint32 GB18030_st [ 6] = {
PCK4BITS(eError,eStart,eStart,eStart,eStart,eStart, 3,eError),//00-07
PCK4BITS(eError,eError,eError,eError,eError,eError,eItsMe,eItsMe),//08-0f
PCK4BITS(eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,eStart),//10-17
@ -369,17 +368,17 @@ PCK4BITS(eError,eError,eStart,eStart,eStart,eStart,eStart,eStart) //28-2f
// 2 here.
static const PRUint32 GB18030CharLenTable[] = {0, 1, 1, 1, 1, 1, 2};
SMModel GB18030SMModel = {
const SMModel GB18030SMModel = {
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, GB18030_cls },
7,
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, GB18030_st },
GB18030CharLenTable,
CHARDET_ENCODING_GB18030,
"GB18030",
};
// sjis
static PRUint32 SJIS_cls [ 256 / 8 ] = {
static const PRUint32 SJIS_cls [ 256 / 8 ] = {
//PCK4BITS(0,1,1,1,1,1,1,1), // 00 - 07
PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07
PCK4BITS(1,1,1,1,1,1,0,0), // 08 - 0f
@ -418,7 +417,7 @@ PCK4BITS(4,4,4,4,4,0,0,0) // f8 - ff
};
static PRUint32 SJIS_st [ 3] = {
static const PRUint32 SJIS_st [ 3] = {
PCK4BITS(eError,eStart,eStart, 3,eError,eError,eError,eError),//00-07
PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//08-0f
PCK4BITS(eItsMe,eItsMe,eError,eError,eStart,eStart,eStart,eStart) //10-17
@ -426,129 +425,16 @@ PCK4BITS(eItsMe,eItsMe,eError,eError,eStart,eStart,eStart,eStart) //10-17
static const PRUint32 SJISCharLenTable[] = {0, 1, 1, 2, 0, 0};
SMModel SJISSMModel = {
const SMModel SJISSMModel = {
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, SJIS_cls },
6,
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, SJIS_st },
SJISCharLenTable,
CHARDET_ENCODING_SHIFT_JIS,
"Shift_JIS",
};
static PRUint32 UCS2BE_cls [ 256 / 8 ] = {
PCK4BITS(0,0,0,0,0,0,0,0), // 00 - 07
PCK4BITS(0,0,1,0,0,2,0,0), // 08 - 0f
PCK4BITS(0,0,0,0,0,0,0,0), // 10 - 17
PCK4BITS(0,0,0,3,0,0,0,0), // 18 - 1f
PCK4BITS(0,0,0,0,0,0,0,0), // 20 - 27
PCK4BITS(0,3,3,3,3,3,0,0), // 28 - 2f
PCK4BITS(0,0,0,0,0,0,0,0), // 30 - 37
PCK4BITS(0,0,0,0,0,0,0,0), // 38 - 3f
PCK4BITS(0,0,0,0,0,0,0,0), // 40 - 47
PCK4BITS(0,0,0,0,0,0,0,0), // 48 - 4f
PCK4BITS(0,0,0,0,0,0,0,0), // 50 - 57
PCK4BITS(0,0,0,0,0,0,0,0), // 58 - 5f
PCK4BITS(0,0,0,0,0,0,0,0), // 60 - 67
PCK4BITS(0,0,0,0,0,0,0,0), // 68 - 6f
PCK4BITS(0,0,0,0,0,0,0,0), // 70 - 77
PCK4BITS(0,0,0,0,0,0,0,0), // 78 - 7f
PCK4BITS(0,0,0,0,0,0,0,0), // 80 - 87
PCK4BITS(0,0,0,0,0,0,0,0), // 88 - 8f
PCK4BITS(0,0,0,0,0,0,0,0), // 90 - 97
PCK4BITS(0,0,0,0,0,0,0,0), // 98 - 9f
PCK4BITS(0,0,0,0,0,0,0,0), // a0 - a7
PCK4BITS(0,0,0,0,0,0,0,0), // a8 - af
PCK4BITS(0,0,0,0,0,0,0,0), // b0 - b7
PCK4BITS(0,0,0,0,0,0,0,0), // b8 - bf
PCK4BITS(0,0,0,0,0,0,0,0), // c0 - c7
PCK4BITS(0,0,0,0,0,0,0,0), // c8 - cf
PCK4BITS(0,0,0,0,0,0,0,0), // d0 - d7
PCK4BITS(0,0,0,0,0,0,0,0), // d8 - df
PCK4BITS(0,0,0,0,0,0,0,0), // e0 - e7
PCK4BITS(0,0,0,0,0,0,0,0), // e8 - ef
PCK4BITS(0,0,0,0,0,0,0,0), // f0 - f7
PCK4BITS(0,0,0,0,0,0,4,5) // f8 - ff
};
static PRUint32 UCS2BE_st [ 7] = {
PCK4BITS( 5, 7, 7,eError, 4, 3,eError,eError),//00-07
PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//08-0f
PCK4BITS(eItsMe,eItsMe, 6, 6, 6, 6,eError,eError),//10-17
PCK4BITS( 6, 6, 6, 6, 6,eItsMe, 6, 6),//18-1f
PCK4BITS( 6, 6, 6, 6, 5, 7, 7,eError),//20-27
PCK4BITS( 5, 8, 6, 6,eError, 6, 6, 6),//28-2f
PCK4BITS( 6, 6, 6, 6,eError,eError,eStart,eStart) //30-37
};
static const PRUint32 UCS2BECharLenTable[] = {2, 2, 2, 0, 2, 2};
SMModel UCS2BESMModel = {
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UCS2BE_cls },
6,
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UCS2BE_st },
UCS2BECharLenTable,
CHARDET_ENCODING_UTF_16BE,
};
static PRUint32 UCS2LE_cls [ 256 / 8 ] = {
PCK4BITS(0,0,0,0,0,0,0,0), // 00 - 07
PCK4BITS(0,0,1,0,0,2,0,0), // 08 - 0f
PCK4BITS(0,0,0,0,0,0,0,0), // 10 - 17
PCK4BITS(0,0,0,3,0,0,0,0), // 18 - 1f
PCK4BITS(0,0,0,0,0,0,0,0), // 20 - 27
PCK4BITS(0,3,3,3,3,3,0,0), // 28 - 2f
PCK4BITS(0,0,0,0,0,0,0,0), // 30 - 37
PCK4BITS(0,0,0,0,0,0,0,0), // 38 - 3f
PCK4BITS(0,0,0,0,0,0,0,0), // 40 - 47
PCK4BITS(0,0,0,0,0,0,0,0), // 48 - 4f
PCK4BITS(0,0,0,0,0,0,0,0), // 50 - 57
PCK4BITS(0,0,0,0,0,0,0,0), // 58 - 5f
PCK4BITS(0,0,0,0,0,0,0,0), // 60 - 67
PCK4BITS(0,0,0,0,0,0,0,0), // 68 - 6f
PCK4BITS(0,0,0,0,0,0,0,0), // 70 - 77
PCK4BITS(0,0,0,0,0,0,0,0), // 78 - 7f
PCK4BITS(0,0,0,0,0,0,0,0), // 80 - 87
PCK4BITS(0,0,0,0,0,0,0,0), // 88 - 8f
PCK4BITS(0,0,0,0,0,0,0,0), // 90 - 97
PCK4BITS(0,0,0,0,0,0,0,0), // 98 - 9f
PCK4BITS(0,0,0,0,0,0,0,0), // a0 - a7
PCK4BITS(0,0,0,0,0,0,0,0), // a8 - af
PCK4BITS(0,0,0,0,0,0,0,0), // b0 - b7
PCK4BITS(0,0,0,0,0,0,0,0), // b8 - bf
PCK4BITS(0,0,0,0,0,0,0,0), // c0 - c7
PCK4BITS(0,0,0,0,0,0,0,0), // c8 - cf
PCK4BITS(0,0,0,0,0,0,0,0), // d0 - d7
PCK4BITS(0,0,0,0,0,0,0,0), // d8 - df
PCK4BITS(0,0,0,0,0,0,0,0), // e0 - e7
PCK4BITS(0,0,0,0,0,0,0,0), // e8 - ef
PCK4BITS(0,0,0,0,0,0,0,0), // f0 - f7
PCK4BITS(0,0,0,0,0,0,4,5) // f8 - ff
};
static PRUint32 UCS2LE_st [ 7] = {
PCK4BITS( 6, 6, 7, 6, 4, 3,eError,eError),//00-07
PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//08-0f
PCK4BITS(eItsMe,eItsMe, 5, 5, 5,eError,eItsMe,eError),//10-17
PCK4BITS( 5, 5, 5,eError, 5,eError, 6, 6),//18-1f
PCK4BITS( 7, 6, 8, 8, 5, 5, 5,eError),//20-27
PCK4BITS( 5, 5, 5,eError,eError,eError, 5, 5),//28-2f
PCK4BITS( 5, 5, 5,eError, 5,eError,eStart,eStart) //30-37
};
static const PRUint32 UCS2LECharLenTable[] = {2, 2, 2, 2, 2, 2};
SMModel UCS2LESMModel = {
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UCS2LE_cls },
6,
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UCS2LE_st },
UCS2LECharLenTable,
CHARDET_ENCODING_UTF_16LE,
};
static PRUint32 UTF8_cls [ 256 / 8 ] = {
static const PRUint32 UTF8_cls [ 256 / 8 ] = {
//PCK4BITS(0,1,1,1,1,1,1,1), // 00 - 07
PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07 //allow 0x00 as a legal value
PCK4BITS(1,1,1,1,1,1,0,0), // 08 - 0f
@ -585,7 +471,7 @@ PCK4BITS(12,13,13,13,14,15,0,0) // f8 - ff
};
static PRUint32 UTF8_st [ 26] = {
static const PRUint32 UTF8_st [ 26] = {
PCK4BITS(eError,eStart,eError,eError,eError,eError, 12, 10),//00-07
PCK4BITS( 9, 11, 8, 7, 6, 5, 4, 3),//08-0f
PCK4BITS(eError,eError,eError,eError,eError,eError,eError,eError),//10-17
@ -617,11 +503,11 @@ PCK4BITS(eError,eError,eError,eError,eError,eError,eError,eError) //c8-cf
static const PRUint32 UTF8CharLenTable[] = {0, 1, 0, 0, 0, 0, 2, 3,
3, 3, 4, 4, 5, 5, 6, 6 };
SMModel UTF8SMModel = {
const SMModel UTF8SMModel = {
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UTF8_cls },
16,
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UTF8_st },
UTF8CharLenTable,
CHARDET_ENCODING_UTF_8,
"UTF-8",
};

View File

@ -68,7 +68,7 @@ typedef struct nsPkgInt {
nsSftMsk sftmsk;
nsBitSft bitsft;
nsUnitMsk unitmsk;
PRUint32 *data;
const PRUint32* const data;
} nsPkgInt;

View File

@ -56,21 +56,22 @@ nsSBCSGroupProber::nsSBCSGroupProber()
mProbers[7] = new nsSingleByteCharSetProber(&Win1253Model);
mProbers[8] = new nsSingleByteCharSetProber(&Latin5BulgarianModel);
mProbers[9] = new nsSingleByteCharSetProber(&Win1251BulgarianModel);
mProbers[10] = new nsSingleByteCharSetProber(&TIS620ThaiModel);
nsHebrewProber *hebprober = new nsHebrewProber();
// Notice: Any change in these indexes - 10,11,12 must be reflected
// in the code below as well.
mProbers[10] = hebprober;
mProbers[11] = new nsSingleByteCharSetProber(&Win1255Model, PR_FALSE, hebprober); // Logical Hebrew
mProbers[12] = new nsSingleByteCharSetProber(&Win1255Model, PR_TRUE, hebprober); // Visual Hebrew
mProbers[11] = hebprober;
mProbers[12] = new nsSingleByteCharSetProber(&Win1255Model, PR_FALSE, hebprober); // Logical Hebrew
mProbers[13] = new nsSingleByteCharSetProber(&Win1255Model, PR_TRUE, hebprober); // Visual Hebrew
// Tell the Hebrew prober about the logical and visual probers
if (mProbers[10] && mProbers[11] && mProbers[12]) // all are not null
if (mProbers[11] && mProbers[12] && mProbers[13]) // all are not null
{
hebprober->SetModelProbers(mProbers[11], mProbers[12]);
hebprober->SetModelProbers(mProbers[12], mProbers[13]);
}
else // One or more is null. avoid any Hebrew probing, null them all
{
for (PRUint32 i = 10; i <= 12; ++i)
for (PRUint32 i = 11; i <= 13; ++i)
{
delete mProbers[i];
mProbers[i] = 0;

View File

@ -40,7 +40,7 @@
#define nsSBCSGroupProber_h__
#define NUM_OF_SBCS_PROBERS 13
#define NUM_OF_SBCS_PROBERS 14
class nsCharSetProber;
class nsSBCSGroupProber: public nsCharSetProber {

View File

@ -51,19 +51,19 @@
typedef struct
{
unsigned char *charToOrderMap; // [256] table use to find a char's order
char *precedenceMatrix; // [SAMPLE_SIZE][SAMPLE_SIZE]; table to find a 2-char sequence's frequency
const unsigned char* const charToOrderMap; // [256] table use to find a char's order
const PRUint8* const precedenceMatrix; // [SAMPLE_SIZE][SAMPLE_SIZE]; table to find a 2-char sequence's frequency
float mTypicalPositiveRatio; // = freqSeqs / totalSeqs
PRBool keepEnglishLetter; // says if this script contains English characters (not implemented)
const char* charsetName;
const char* const charsetName;
} SequenceModel;
class nsSingleByteCharSetProber : public nsCharSetProber{
public:
nsSingleByteCharSetProber(SequenceModel *model)
nsSingleByteCharSetProber(const SequenceModel *model)
:mModel(model), mReversed(PR_FALSE), mNameProber(0) { Reset(); }
nsSingleByteCharSetProber(SequenceModel *model, PRBool reversed, nsCharSetProber* nameProber)
nsSingleByteCharSetProber(const SequenceModel *model, PRBool reversed, nsCharSetProber* nameProber)
:mModel(model), mReversed(reversed), mNameProber(nameProber) { Reset(); }
virtual const char* GetCharSetName();
@ -87,7 +87,7 @@ public:
protected:
nsProbingState mState;
const SequenceModel *mModel;
const SequenceModel* const mModel;
const PRBool mReversed; // PR_TRUE if we need to reverse every pair in the model lookup
//char order of last character
@ -106,19 +106,20 @@ protected:
};
extern SequenceModel Koi8rModel;
extern SequenceModel Win1251Model;
extern SequenceModel Latin5Model;
extern SequenceModel MacCyrillicModel;
extern SequenceModel Ibm866Model;
extern SequenceModel Ibm855Model;
extern SequenceModel Latin7Model;
extern SequenceModel Win1253Model;
extern SequenceModel Latin5BulgarianModel;
extern SequenceModel Win1251BulgarianModel;
extern SequenceModel Latin2HungarianModel;
extern SequenceModel Win1250HungarianModel;
extern SequenceModel Win1255Model;
extern const SequenceModel Koi8rModel;
extern const SequenceModel Win1251Model;
extern const SequenceModel Latin5Model;
extern const SequenceModel MacCyrillicModel;
extern const SequenceModel Ibm866Model;
extern const SequenceModel Ibm855Model;
extern const SequenceModel Latin7Model;
extern const SequenceModel Win1253Model;
extern const SequenceModel Latin5BulgarianModel;
extern const SequenceModel Win1251BulgarianModel;
extern const SequenceModel Latin2HungarianModel;
extern const SequenceModel Win1250HungarianModel;
extern const SequenceModel Win1255Model;
extern const SequenceModel TIS620ThaiModel;
#endif /* nsSingleByteCharSetProber_h__ */

View File

@ -46,8 +46,8 @@ void nsSJISProber::Reset(void)
{
mCodingSM->Reset();
mState = eDetecting;
mContextAnalyser.Reset();
mDistributionAnalyser.Reset();
mContextAnalyser.Reset(mIsPreferredLanguage);
mDistributionAnalyser.Reset(mIsPreferredLanguage);
}
nsProbingState nsSJISProber::HandleData(const char* aBuf, PRUint32 aLen)
@ -57,11 +57,6 @@ nsProbingState nsSJISProber::HandleData(const char* aBuf, PRUint32 aLen)
for (PRUint32 i = 0; i < aLen; i++)
{
codingState = mCodingSM->NextState(aBuf[i]);
if (codingState == eError)
{
mState = eNotMe;
break;
}
if (codingState == eItsMe)
{
mState = eFoundIt;

View File

@ -43,7 +43,6 @@
#ifndef nsSJISProber_h__
#define nsSJISProber_h__
#include "uchardetDefine.h"
#include "nsCharSetProber.h"
#include "nsCodingStateMachine.h"
#include "JpCntx.h"
@ -52,11 +51,13 @@
class nsSJISProber: public nsCharSetProber {
public:
nsSJISProber(void){mCodingSM = new nsCodingStateMachine(&SJISSMModel);
Reset();}
nsSJISProber(PRBool aIsPreferredLanguage)
:mIsPreferredLanguage(aIsPreferredLanguage)
{mCodingSM = new nsCodingStateMachine(&SJISSMModel);
Reset();}
virtual ~nsSJISProber(void){delete mCodingSM;}
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
const char* GetCharSetName() {return CHARDET_ENCODING_SHIFT_JIS;}
const char* GetCharSetName() {return "Shift_JIS";}
nsProbingState GetState(void) {return mState;}
void Reset(void);
float GetConfidence(void);
@ -70,6 +71,7 @@ protected:
SJISDistributionAnalysis mDistributionAnalyser;
char mLastChar[2];
PRBool mIsPreferredLanguage;
};

View File

@ -51,11 +51,6 @@ nsProbingState nsUTF8Prober::HandleData(const char* aBuf, PRUint32 aLen)
for (PRUint32 i = 0; i < aLen; i++)
{
codingState = mCodingSM->NextState(aBuf[i]);
if (codingState == eError)
{
mState = eNotMe;
break;
}
if (codingState == eItsMe)
{
mState = eFoundIt;

View File

@ -38,7 +38,6 @@
#ifndef nsUTF8Prober_h__
#define nsUTF8Prober_h__
#include "uchardetDefine.h"
#include "nsCharSetProber.h"
#include "nsCodingStateMachine.h"
@ -49,7 +48,7 @@ public:
Reset(); }
virtual ~nsUTF8Prober(){delete mCodingSM;}
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
const char* GetCharSetName() {return CHARDET_ENCODING_UTF_8;}
const char* GetCharSetName() {return "UTF-8";}
nsProbingState GetState(void) {return mState;}
void Reset(void);
float GetConfidence(void);

View File

@ -38,7 +38,6 @@
#include "nscore.h"
#include "uchardetDefine.h"
#include "nsUniversalDetector.h"
#include "nsMBCSGroupProber.h"
@ -46,7 +45,7 @@
#include "nsEscCharsetProber.h"
#include "nsLatin1Prober.h"
nsUniversalDetector::nsUniversalDetector()
nsUniversalDetector::nsUniversalDetector(PRUint32 aLanguageFilter)
{
mDone = PR_FALSE;
mBestGuess = -1; //illegal value as signal
@ -58,6 +57,7 @@ nsUniversalDetector::nsUniversalDetector()
mGotData = PR_FALSE;
mInputState = ePureAscii;
mLastChar = '\0';
mLanguageFilter = aLanguageFilter;
PRUint32 i;
for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
@ -67,10 +67,9 @@ nsUniversalDetector::nsUniversalDetector()
nsUniversalDetector::~nsUniversalDetector()
{
for (PRInt32 i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
if (mCharSetProbers[i])
delete mCharSetProbers[i];
if (mEscCharSetProber)
delete mEscCharSetProber;
delete mCharSetProbers[i];
delete mEscCharSetProber;
}
void
@ -111,37 +110,23 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
if (mStart)
{
mStart = PR_FALSE;
if (aLen > 3)
if (aLen > 2)
switch (aBuf[0])
{
case '\xEF':
if (('\xBB' == aBuf[1]) && ('\xBF' == aBuf[2]))
// EF BB BF UTF-8 encoded BOM
mDetectedCharset = CHARDET_ENCODING_UTF_8;
mDetectedCharset = "UTF-8";
break;
case '\xFE':
if (('\xFF' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3]))
// FE FF 00 00 UCS-4, unusual octet order BOM (3412)
mDetectedCharset = CHARDET_ENCODING_X_ISO_10646_UCS_4_3412;
else if ('\xFF' == aBuf[1])
if ('\xFF' == aBuf[1])
// FE FF UTF-16, big endian BOM
mDetectedCharset = CHARDET_ENCODING_UTF_16BE;
break;
case '\x00':
if (('\x00' == aBuf[1]) && ('\xFE' == aBuf[2]) && ('\xFF' == aBuf[3]))
// 00 00 FE FF UTF-32, big-endian BOM
mDetectedCharset = CHARDET_ENCODING_UTF_32BE;
else if (('\x00' == aBuf[1]) && ('\xFF' == aBuf[2]) && ('\xFE' == aBuf[3]))
// 00 00 FF FE UCS-4, unusual octet order BOM (2143)
mDetectedCharset = CHARDET_ENCODING_X_ISO_10646_UCS_4_2143;
mDetectedCharset = "UTF-16";
break;
case '\xFF':
if (('\xFE' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3]))
// FF FE 00 00 UTF-32, little-endian BOM
mDetectedCharset = CHARDET_ENCODING_UTF_32LE;
else if ('\xFE' == aBuf[1])
if ('\xFE' == aBuf[1])
// FF FE UTF-16, little endian BOM
mDetectedCharset = CHARDET_ENCODING_UTF_16LE;
mDetectedCharset = "UTF-16";
break;
} // switch
@ -172,16 +157,24 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
//start multibyte and singlebyte charset prober
if (nsnull == mCharSetProbers[0])
mCharSetProbers[0] = new nsMBCSGroupProber;
if (nsnull == mCharSetProbers[1])
mCharSetProbers[1] = new nsSBCSGroupProber;
if (nsnull == mCharSetProbers[2])
mCharSetProbers[2] = new nsLatin1Prober;
if ((nsnull == mCharSetProbers[0]) ||
(nsnull == mCharSetProbers[1]) ||
(nsnull == mCharSetProbers[2]))
{
mCharSetProbers[0] = new nsMBCSGroupProber(mLanguageFilter);
if (nsnull == mCharSetProbers[0])
return NS_ERROR_OUT_OF_MEMORY;
}
if (nsnull == mCharSetProbers[1] &&
(mLanguageFilter & NS_FILTER_NON_CJK))
{
mCharSetProbers[1] = new nsSBCSGroupProber;
if (nsnull == mCharSetProbers[1])
return NS_ERROR_OUT_OF_MEMORY;
}
if (nsnull == mCharSetProbers[2])
{
mCharSetProbers[2] = new nsLatin1Prober;
if (nsnull == mCharSetProbers[2])
return NS_ERROR_OUT_OF_MEMORY;
}
}
}
else
@ -202,7 +195,7 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
{
case eEscAscii:
if (nsnull == mEscCharSetProber) {
mEscCharSetProber = new nsEscCharSetProber;
mEscCharSetProber = new nsEscCharSetProber(mLanguageFilter);
if (nsnull == mEscCharSetProber)
return NS_ERROR_OUT_OF_MEMORY;
}
@ -216,12 +209,15 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
case eHighbyte:
for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
{
st = mCharSetProbers[i]->HandleData(aBuf, aLen);
if (st == eFoundIt)
if (mCharSetProbers[i])
{
mDone = PR_TRUE;
mDetectedCharset = mCharSetProbers[i]->GetCharSetName();
return NS_OK;
st = mCharSetProbers[i]->HandleData(aBuf, aLen);
if (st == eFoundIt)
{
mDone = PR_TRUE;
mDetectedCharset = mCharSetProbers[i]->GetCharSetName();
return NS_OK;
}
}
}
break;
@ -260,11 +256,14 @@ void nsUniversalDetector::DataEnd()
for (PRInt32 i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
{
proberConfidence = mCharSetProbers[i]->GetConfidence();
if (proberConfidence > maxProberConfidence)
if (mCharSetProbers[i])
{
maxProberConfidence = proberConfidence;
maxProber = i;
proberConfidence = mCharSetProbers[i]->GetConfidence();
if (proberConfidence > maxProberConfidence)
{
maxProberConfidence = proberConfidence;
maxProber = i;
}
}
}
//do not report anything because we are not confident of it, that's in fact a negative answer

View File

@ -48,9 +48,22 @@ typedef enum {
eHighbyte = 2
} nsInputState;
// Language-filter bit flags taken by the nsUniversalDetector constructor
// (aLanguageFilter). Each language group owns one bit, so masks can be
// OR-ed together.
#define NS_FILTER_CHINESE_SIMPLIFIED  0x01
#define NS_FILTER_CHINESE_TRADITIONAL 0x02
#define NS_FILTER_JAPANESE            0x04
#define NS_FILTER_KOREAN              0x08
#define NS_FILTER_NON_CJK             0x10
// All five bits above.
#define NS_FILTER_ALL                 0x1F
// Convenience unions of the single-language bits.
#define NS_FILTER_CHINESE \
  (NS_FILTER_CHINESE_SIMPLIFIED | NS_FILTER_CHINESE_TRADITIONAL)
#define NS_FILTER_CJK \
  (NS_FILTER_CHINESE_SIMPLIFIED | NS_FILTER_CHINESE_TRADITIONAL | \
   NS_FILTER_JAPANESE | NS_FILTER_KOREAN)
class nsUniversalDetector {
public:
nsUniversalDetector();
nsUniversalDetector(PRUint32 aLanguageFilter);
virtual ~nsUniversalDetector();
virtual nsresult HandleData(const char* aBuf, PRUint32 aLen);
virtual void DataEnd(void);
@ -66,6 +79,7 @@ protected:
char mLastChar;
const char * mDetectedCharset;
PRInt32 mBestGuess;
PRUint32 mLanguageFilter;
nsCharSetProber *mCharSetProbers[NUM_OF_CHARSET_PROBERS];
nsCharSetProber *mEscCharSetProber;

View File

@ -42,6 +42,8 @@ typedef int PRInt32;
// Integer type aliases used by the detector sources, each mapped onto a
// built-in C++ integer type.
typedef unsigned int PRUint32;
typedef short PRInt16;
typedef unsigned short PRUint16;
// Character-sized aliases with explicit signedness.
typedef signed char PRInt8;
typedef unsigned char PRUint8;
// Boolean constants, mapped onto the C++ bool literals.
#define PR_FALSE false
#define PR_TRUE true

View File

@ -44,38 +44,18 @@
#ifndef VERSION
#define VERSION "Unknown"
#endif
#define BUFFER_SIZE 32768
#define BUFFER_SIZE 65536
char buffer[BUFFER_SIZE];
void detect(FILE * fp)
{
uchardet_t handle = uchardet_new();
size_t size = BUFFER_SIZE;
char * buffer_in = (char *) malloc(size * sizeof(char));
while (fgets(buffer_in, size, fp) != NULL)
while (!feof(fp))
{
size_t freesize = size;
char * buffer_in_p = buffer_in;
size_t line_length = strlen(buffer_in_p);
while (line_length + 1 == freesize && buffer_in_p[line_length - 2] != '\n')
{
buffer_in_p += size - 1;
freesize = size + 1;
size += size;
size_t offset = buffer_in_p - buffer_in;
buffer_in = (char *) realloc(buffer_in, size * sizeof(char));
buffer_in_p = buffer_in + offset;
if (fgets(buffer_in_p, freesize, fp) == NULL)
break;
line_length = strlen(buffer_in_p);
}
int retval = uchardet_handle_data(handle, buffer_in, strlen(buffer_in));
size_t len = fread(buffer, 1, BUFFER_SIZE, fp);
int retval = uchardet_handle_data(handle, buffer, len);
if (retval != 0)
{
fprintf(stderr, "Handle data error.\n");
@ -84,10 +64,10 @@ void detect(FILE * fp)
}
uchardet_data_end(handle);
printf("%s\n", uchardet_get_charset(handle));
const char * charset = uchardet_get_charset(handle);
printf("%s\n", charset);
uchardet_delete(handle);
free(buffer_in);
}
void show_version()

View File

@ -37,67 +37,69 @@
#include "uchardet.h"
#include "nscore.h"
#include "nsUniversalDetector.h"
#include <string.h>
#include <string>
class DllDetector : public nsUniversalDetector
using std::string;
class HandleUniversalDetector : public nsUniversalDetector
{
protected:
char charset_[256];
string m_charset;
public:
DllDetector()
: nsUniversalDetector()
HandleUniversalDetector()
: nsUniversalDetector(NS_FILTER_ALL)
{
*charset_=0;
m_charset = "";
}
virtual ~DllDetector()
virtual ~HandleUniversalDetector()
{}
virtual void Report(const char* charset)
{
strncpy( charset_ , charset , sizeof(charset_) );
m_charset = charset;
}
virtual void Reset()
{
nsUniversalDetector::Reset();
*charset_=0;
m_charset = "";
}
const char* GetCharset() const
{
return charset_;
return m_charset.c_str();
}
};
uchardet_t uchardet_new()
{
return reinterpret_cast<uchardet_t> (new DllDetector());
return reinterpret_cast<uchardet_t> (new HandleUniversalDetector());
}
void uchardet_delete(uchardet_t ud)
{
delete reinterpret_cast<DllDetector*>(ud);
delete reinterpret_cast<HandleUniversalDetector*>(ud);
}
int uchardet_handle_data(uchardet_t ud, const char * data, size_t len)
{
nsresult ret = reinterpret_cast<DllDetector*>(ud)->HandleData(data, (PRUint32)len);
nsresult ret = reinterpret_cast<HandleUniversalDetector*>(ud)->HandleData(data, (PRUint32)len);
return (ret != NS_OK);
}
void uchardet_data_end(uchardet_t ud)
{
reinterpret_cast<DllDetector*>(ud)->DataEnd();
reinterpret_cast<HandleUniversalDetector*>(ud)->DataEnd();
}
void uchardet_reset(uchardet_t ud)
{
reinterpret_cast<DllDetector*>(ud)->Reset();
reinterpret_cast<HandleUniversalDetector*>(ud)->Reset();
}
const char* uchardet_get_charset(uchardet_t ud)
{
return reinterpret_cast<DllDetector*>(ud)->GetCharset();
return reinterpret_cast<HandleUniversalDetector*>(ud)->GetCharset();
}

7
win32.sh Normal file
View File

@ -0,0 +1,7 @@
# Configure and build uchardet in ./win32 using CMake's "MSYS Makefiles"
# generator (Release build type, empty install prefix).
# The steps are chained with &&, so each one runs only if the previous
# step succeeded.
mkdir --parents win32 &&
cd win32 &&
cmake .. -G "MSYS Makefiles" -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX="" &&
make