mirror of
https://gitlab.freedesktop.org/uchardet/uchardet.git
synced 2025-12-06 16:56:40 +08:00
Update code from upstream.
This commit is contained in:
parent
76a1be36f8
commit
84284eccf4
0
release.sh
Executable file → Normal file
0
release.sh
Executable file → Normal file
@ -11,26 +11,26 @@ set(
|
||||
LangBulgarianModel.cpp
|
||||
LangCyrillicModel.cpp
|
||||
LangGreekModel.cpp
|
||||
LangHebrewModel.cpp
|
||||
LangHungarianModel.cpp
|
||||
LangHebrewModel.cpp
|
||||
LangThaiModel.cpp
|
||||
nsBig5Prober.cpp
|
||||
nsHebrewProber.cpp
|
||||
nsCharSetProber.cpp
|
||||
nsEscCharsetProber.cpp
|
||||
nsEscSM.cpp
|
||||
nsBig5Prober.cpp
|
||||
nsEUCJPProber.cpp
|
||||
nsEUCKRProber.cpp
|
||||
nsEUCTWProber.cpp
|
||||
nsEscCharsetProber.cpp
|
||||
nsEscSM.cpp
|
||||
nsGB2312Prober.cpp
|
||||
nsHebrewProber.cpp
|
||||
nsLatin1Prober.cpp
|
||||
nsMBCSGroupProber.cpp
|
||||
nsMBCSSM.cpp
|
||||
nsSBCharSetProber.cpp
|
||||
nsSBCSGroupProber.cpp
|
||||
nsSBCharSetProber.cpp
|
||||
nsSJISProber.cpp
|
||||
nsUniversalDetector.cpp
|
||||
nsUTF8Prober.cpp
|
||||
nsLatin1Prober.cpp
|
||||
nsUniversalDetector.cpp
|
||||
uchardet.cpp
|
||||
)
|
||||
|
||||
|
||||
@ -46,15 +46,13 @@
|
||||
#define SURE_YES 0.99f
|
||||
#define SURE_NO 0.01f
|
||||
|
||||
#define MINIMUM_DATA_THRESHOLD 4
|
||||
|
||||
//return confidence based on received data
|
||||
float CharDistributionAnalysis::GetConfidence()
|
||||
float CharDistributionAnalysis::GetConfidence(void)
|
||||
{
|
||||
//if we didn't receive any character in our consideration range, or the
|
||||
//number of frequent characters is below the minimum threshold, return
|
||||
// number of frequent characters is below the minimum threshold, return
|
||||
// negative answer
|
||||
if (mTotalChars <= 0 || mFreqChars <= MINIMUM_DATA_THRESHOLD)
|
||||
if (mTotalChars <= 0 || mFreqChars <= mDataThreshold)
|
||||
return SURE_NO;
|
||||
|
||||
if (mTotalChars != mFreqChars) {
|
||||
|
||||
@ -42,11 +42,12 @@
|
||||
|
||||
#define ENOUGH_DATA_THRESHOLD 1024
|
||||
|
||||
#define MINIMUM_DATA_THRESHOLD 4
|
||||
|
||||
class CharDistributionAnalysis
|
||||
{
|
||||
public:
|
||||
CharDistributionAnalysis() {Reset();}
|
||||
virtual ~CharDistributionAnalysis(){};
|
||||
CharDistributionAnalysis() {Reset(PR_FALSE);}
|
||||
|
||||
//feed a block of data and do distribution analysis
|
||||
void HandleData(const char* aBuf, PRUint32 aLen) {}
|
||||
@ -72,14 +73,15 @@ public:
|
||||
}
|
||||
|
||||
//return confidence based on existing data
|
||||
float GetConfidence();
|
||||
float GetConfidence(void);
|
||||
|
||||
//Reset analyser, clear any state
|
||||
void Reset(void)
|
||||
void Reset(PRBool aIsPreferredLanguage)
|
||||
{
|
||||
mDone = PR_FALSE;
|
||||
mTotalChars = 0;
|
||||
mFreqChars = 0;
|
||||
mDataThreshold = aIsPreferredLanguage ? 0 : MINIMUM_DATA_THRESHOLD;
|
||||
}
|
||||
|
||||
//This function is for future extension. Caller can use this function to control
|
||||
@ -105,6 +107,9 @@ protected:
|
||||
//Total characters encountered.
|
||||
PRUint32 mTotalChars;
|
||||
|
||||
//Number of hi-byte characters needed to trigger detection
|
||||
PRUint32 mDataThreshold;
|
||||
|
||||
//Mapping table to get frequency order from char order (get from GetOrder())
|
||||
const PRInt16 *mCharToFreqOrder;
|
||||
|
||||
|
||||
@ -39,7 +39,7 @@
|
||||
#include "JpCntx.h"
|
||||
|
||||
//This is hiragana 2-char sequence table, the number in each cell represents its frequency category
|
||||
char jp2CharContext[83][83] =
|
||||
const PRUint8 jp2CharContext[83][83] =
|
||||
{
|
||||
{ 0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,},
|
||||
{ 2,4,0,4,0,3,0,4,0,3,4,4,4,2,4,3,3,4,3,2,3,3,4,2,3,3,3,2,4,1,4,3,3,1,5,4,3,4,3,4,3,5,3,0,3,5,4,2,0,3,1,0,3,3,0,3,3,0,1,1,0,4,3,0,3,3,0,4,0,2,0,3,5,5,5,5,4,0,4,1,0,3,4,},
|
||||
@ -170,7 +170,7 @@ void JapaneseContextAnalysis::HandleData(const char* aBuf, PRUint32 aLen)
|
||||
return;
|
||||
}
|
||||
|
||||
void JapaneseContextAnalysis::Reset(void)
|
||||
void JapaneseContextAnalysis::Reset(PRBool aIsPreferredLanguage)
|
||||
{
|
||||
mTotalRel = 0;
|
||||
for (PRUint32 i = 0; i < NUM_OF_CATEGORY; i++)
|
||||
@ -178,13 +178,14 @@ void JapaneseContextAnalysis::Reset(void)
|
||||
mNeedToSkipCharNum = 0;
|
||||
mLastCharOrder = -1;
|
||||
mDone = PR_FALSE;
|
||||
mDataThreshold = aIsPreferredLanguage ? 0 : MINIMUM_DATA_THRESHOLD;
|
||||
}
|
||||
#define DONT_KNOW (float)-1
|
||||
|
||||
float JapaneseContextAnalysis::GetConfidence()
|
||||
float JapaneseContextAnalysis::GetConfidence(void)
|
||||
{
|
||||
//This is just one way to calculate confidence. It works well for me.
|
||||
if (mTotalRel > MINIMUM_DATA_THRESHOLD)
|
||||
if (mTotalRel > mDataThreshold)
|
||||
return ((float)(mTotalRel - mRelSample[0]))/mTotalRel;
|
||||
else
|
||||
return (float)DONT_KNOW;
|
||||
@ -227,5 +228,3 @@ PRInt32 EUCJPContextAnalysis::GetOrder(const char* str, PRUint32 *charLen)
|
||||
return (unsigned char)*(str+1) - (unsigned char)0xa1;
|
||||
return -1;
|
||||
}
|
||||
|
||||
|
||||
|
||||
14
src/JpCntx.h
14
src/JpCntx.h
@ -46,13 +46,12 @@
|
||||
#define MAX_REL_THRESHOLD 1000
|
||||
|
||||
//hiragana frequency category table
|
||||
extern char jp2CharContext[83][83];
|
||||
extern const PRUint8 jp2CharContext[83][83];
|
||||
|
||||
class JapaneseContextAnalysis
|
||||
{
|
||||
public:
|
||||
JapaneseContextAnalysis() {Reset();}
|
||||
virtual ~JapaneseContextAnalysis(){};
|
||||
JapaneseContextAnalysis() {Reset(PR_FALSE);}
|
||||
|
||||
void HandleData(const char* aBuf, PRUint32 aLen);
|
||||
|
||||
@ -75,8 +74,8 @@ public:
|
||||
mLastCharOrder = order;
|
||||
}
|
||||
|
||||
float GetConfidence();
|
||||
void Reset(void);
|
||||
float GetConfidence(void);
|
||||
void Reset(PRBool aIsPreferredLanguage);
|
||||
void SetOpion(){}
|
||||
PRBool GotEnoughData() {return mTotalRel > ENOUGH_REL_THRESHOLD;}
|
||||
|
||||
@ -84,11 +83,14 @@ protected:
|
||||
virtual PRInt32 GetOrder(const char* str, PRUint32 *charLen) = 0;
|
||||
virtual PRInt32 GetOrder(const char* str) = 0;
|
||||
|
||||
//category counters, each interger counts sequence in its category
|
||||
//category counters, each integer counts sequences in its category
|
||||
PRUint32 mRelSample[NUM_OF_CATEGORY];
|
||||
|
||||
//total sequence received
|
||||
PRUint32 mTotalRel;
|
||||
|
||||
//Number of sequences needed to trigger detection
|
||||
PRUint32 mDataThreshold;
|
||||
|
||||
//The order of previous char
|
||||
PRInt32 mLastCharOrder;
|
||||
|
||||
@ -35,7 +35,6 @@
|
||||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
#include "uchardetDefine.h"
|
||||
#include "nsSBCharSetProber.h"
|
||||
/****************************************************************
|
||||
255: Control characters that usually do not exist in any text
|
||||
@ -49,7 +48,7 @@
|
||||
//this table is modified based on win1251BulgarianCharToOrderMap, so
|
||||
//only numbers <64 are sure to be valid
|
||||
|
||||
unsigned char Latin5_BulgarianCharToOrderMap[] =
|
||||
static const unsigned char Latin5_BulgarianCharToOrderMap[] =
|
||||
{
|
||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
|
||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
|
||||
@ -69,7 +68,7 @@ unsigned char Latin5_BulgarianCharToOrderMap[] =
|
||||
62,242,243,244, 58,245, 98,246,247,248,249,250,251, 91,252,253, //f0
|
||||
};
|
||||
|
||||
unsigned char win1251BulgarianCharToOrderMap[] =
|
||||
static const unsigned char win1251BulgarianCharToOrderMap[] =
|
||||
{
|
||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
|
||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
|
||||
@ -95,7 +94,7 @@ unsigned char win1251BulgarianCharToOrderMap[] =
|
||||
//first 1024 sequences:3.0618%
|
||||
//rest sequences: 0.2992%
|
||||
//negative sequences: 0.0020%
|
||||
char BulgarianLangModel[] =
|
||||
static const PRUint8 BulgarianLangModel[] =
|
||||
{
|
||||
0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,3,3,3,3,3,
|
||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,2,2,3,2,2,1,2,2,
|
||||
@ -227,20 +226,20 @@ char BulgarianLangModel[] =
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
|
||||
};
|
||||
|
||||
SequenceModel Latin5BulgarianModel =
|
||||
const SequenceModel Latin5BulgarianModel =
|
||||
{
|
||||
Latin5_BulgarianCharToOrderMap,
|
||||
BulgarianLangModel,
|
||||
(float)0.969392,
|
||||
PR_FALSE,
|
||||
CHARDET_ENCODING_ISO_8859_5
|
||||
"ISO-8859-5"
|
||||
};
|
||||
|
||||
SequenceModel Win1251BulgarianModel =
|
||||
const SequenceModel Win1251BulgarianModel =
|
||||
{
|
||||
win1251BulgarianCharToOrderMap,
|
||||
BulgarianLangModel,
|
||||
(float)0.969392,
|
||||
PR_FALSE,
|
||||
CHARDET_ENCODING_WINDOWS_1251
|
||||
"windows-1251"
|
||||
};
|
||||
|
||||
@ -35,14 +35,13 @@
|
||||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
#include "uchardetDefine.h"
|
||||
#include "nsSBCharSetProber.h"
|
||||
|
||||
|
||||
|
||||
//KOI8-R language model
|
||||
//Character Mapping Table:
|
||||
unsigned char KOI8R_CharToOrderMap[] =
|
||||
static const unsigned char KOI8R_CharToOrderMap[] =
|
||||
{
|
||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
|
||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
|
||||
@ -62,7 +61,7 @@ unsigned char KOI8R_CharToOrderMap[] =
|
||||
35, 43, 45, 32, 40, 52, 56, 33, 61, 62, 51, 57, 47, 63, 50, 70, //f0
|
||||
};
|
||||
|
||||
unsigned char win1251_CharToOrderMap[] =
|
||||
static const unsigned char win1251_CharToOrderMap[] =
|
||||
{
|
||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
|
||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
|
||||
@ -82,7 +81,7 @@ unsigned char win1251_CharToOrderMap[] =
|
||||
9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27, 16,
|
||||
};
|
||||
|
||||
unsigned char latin5_CharToOrderMap[] =
|
||||
static const unsigned char latin5_CharToOrderMap[] =
|
||||
{
|
||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
|
||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
|
||||
@ -102,7 +101,7 @@ unsigned char latin5_CharToOrderMap[] =
|
||||
239, 68,240,241,242,243,244,245,246,247,248,249,250,251,252,255,
|
||||
};
|
||||
|
||||
unsigned char macCyrillic_CharToOrderMap[] =
|
||||
static const unsigned char macCyrillic_CharToOrderMap[] =
|
||||
{
|
||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
|
||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
|
||||
@ -122,7 +121,7 @@ unsigned char macCyrillic_CharToOrderMap[] =
|
||||
9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27,255,
|
||||
};
|
||||
|
||||
unsigned char IBM855_CharToOrderMap[] =
|
||||
static const unsigned char IBM855_CharToOrderMap[] =
|
||||
{
|
||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
|
||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
|
||||
@ -142,7 +141,7 @@ unsigned char IBM855_CharToOrderMap[] =
|
||||
250, 18, 62, 20, 51, 25, 57, 30, 47, 29, 63, 22, 50,251,252,255,
|
||||
};
|
||||
|
||||
unsigned char IBM866_CharToOrderMap[] =
|
||||
static const unsigned char IBM866_CharToOrderMap[] =
|
||||
{
|
||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
|
||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
|
||||
@ -168,7 +167,7 @@ unsigned char IBM866_CharToOrderMap[] =
|
||||
//first 1024 sequences: 2.3389%
|
||||
//rest sequences: 0.1237%
|
||||
//negative sequences: 0.0009%
|
||||
char RussianLangModel[] =
|
||||
static const PRUint8 RussianLangModel[] =
|
||||
{
|
||||
0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,1,3,3,3,3,1,3,3,3,2,3,2,3,3,
|
||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,2,2,2,2,2,0,0,2,
|
||||
@ -301,56 +300,56 @@ char RussianLangModel[] =
|
||||
};
|
||||
|
||||
|
||||
SequenceModel Koi8rModel =
|
||||
const SequenceModel Koi8rModel =
|
||||
{
|
||||
KOI8R_CharToOrderMap,
|
||||
RussianLangModel,
|
||||
(float)0.976601,
|
||||
PR_FALSE,
|
||||
CHARDET_ENCODING_KOI8_R
|
||||
"KOI8-R"
|
||||
};
|
||||
|
||||
SequenceModel Win1251Model =
|
||||
const SequenceModel Win1251Model =
|
||||
{
|
||||
win1251_CharToOrderMap,
|
||||
RussianLangModel,
|
||||
(float)0.976601,
|
||||
PR_FALSE,
|
||||
CHARDET_ENCODING_WINDOWS_1251
|
||||
"windows-1251"
|
||||
};
|
||||
|
||||
SequenceModel Latin5Model =
|
||||
const SequenceModel Latin5Model =
|
||||
{
|
||||
latin5_CharToOrderMap,
|
||||
RussianLangModel,
|
||||
(float)0.976601,
|
||||
PR_FALSE,
|
||||
CHARDET_ENCODING_ISO_8859_5
|
||||
"ISO-8859-5"
|
||||
};
|
||||
|
||||
SequenceModel MacCyrillicModel =
|
||||
const SequenceModel MacCyrillicModel =
|
||||
{
|
||||
macCyrillic_CharToOrderMap,
|
||||
RussianLangModel,
|
||||
(float)0.976601,
|
||||
PR_FALSE,
|
||||
CHARDET_ENCODING_MACCYRILLIC
|
||||
"x-mac-cyrillic"
|
||||
};
|
||||
|
||||
SequenceModel Ibm866Model =
|
||||
const SequenceModel Ibm866Model =
|
||||
{
|
||||
IBM866_CharToOrderMap,
|
||||
RussianLangModel,
|
||||
(float)0.976601,
|
||||
PR_FALSE,
|
||||
CHARDET_ENCODING_IBM866
|
||||
"IBM866"
|
||||
};
|
||||
|
||||
SequenceModel Ibm855Model =
|
||||
const SequenceModel Ibm855Model =
|
||||
{
|
||||
IBM855_CharToOrderMap,
|
||||
RussianLangModel,
|
||||
(float)0.976601,
|
||||
PR_FALSE,
|
||||
CHARDET_ENCODING_IBM855
|
||||
"IBM855"
|
||||
};
|
||||
|
||||
@ -35,7 +35,6 @@
|
||||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
#include "uchardetDefine.h"
|
||||
#include "nsSBCharSetProber.h"
|
||||
/****************************************************************
|
||||
255: Control characters that usually do not exist in any text
|
||||
@ -46,7 +45,7 @@
|
||||
*****************************************************************/
|
||||
|
||||
//Character Mapping Table:
|
||||
unsigned char Latin7_CharToOrderMap[] =
|
||||
static const unsigned char Latin7_CharToOrderMap[] =
|
||||
{
|
||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
|
||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
|
||||
@ -68,7 +67,7 @@ unsigned char Latin7_CharToOrderMap[] =
|
||||
|
||||
|
||||
|
||||
unsigned char win1253_CharToOrderMap[] =
|
||||
static const unsigned char win1253_CharToOrderMap[] =
|
||||
{
|
||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
|
||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
|
||||
@ -94,7 +93,7 @@ unsigned char win1253_CharToOrderMap[] =
|
||||
//first 1024 sequences:1.7001%
|
||||
//rest sequences: 0.0359%
|
||||
//negative sequences: 0.0148%
|
||||
char GreekLangModel[] =
|
||||
static const PRUint8 GreekLangModel[] =
|
||||
{
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
@ -226,20 +225,20 @@ char GreekLangModel[] =
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
};
|
||||
|
||||
SequenceModel Latin7Model =
|
||||
const SequenceModel Latin7Model =
|
||||
{
|
||||
Latin7_CharToOrderMap,
|
||||
GreekLangModel,
|
||||
(float)0.982851,
|
||||
PR_FALSE,
|
||||
CHARDET_ENCODING_ISO_8859_7
|
||||
"ISO-8859-7"
|
||||
};
|
||||
|
||||
SequenceModel Win1253Model =
|
||||
const SequenceModel Win1253Model =
|
||||
{
|
||||
win1253_CharToOrderMap,
|
||||
GreekLangModel,
|
||||
(float)0.982851,
|
||||
PR_FALSE,
|
||||
CHARDET_ENCODING_WINDOWS_1253
|
||||
"windows-1253"
|
||||
};
|
||||
|
||||
@ -37,7 +37,6 @@
|
||||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
#include "uchardetDefine.h"
|
||||
#include "nsSBCharSetProber.h"
|
||||
|
||||
|
||||
@ -51,7 +50,7 @@
|
||||
|
||||
//Windows-1255 language model
|
||||
//Character Mapping Table:
|
||||
unsigned char win1255_CharToOrderMap[] =
|
||||
static const unsigned char win1255_CharToOrderMap[] =
|
||||
{
|
||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
|
||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
|
||||
@ -77,7 +76,7 @@ unsigned char win1255_CharToOrderMap[] =
|
||||
//first 1024 sequences: 1.5981%
|
||||
//rest sequences: 0.087%
|
||||
//negative sequences: 0.0015%
|
||||
char HebrewLangModel[] =
|
||||
static const PRUint8 HebrewLangModel[] =
|
||||
{
|
||||
0,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,2,3,2,1,2,0,1,0,0,
|
||||
3,0,3,1,0,0,1,3,2,0,1,1,2,0,2,2,2,1,1,1,1,2,1,1,1,2,0,0,2,2,0,1,
|
||||
@ -209,12 +208,12 @@ char HebrewLangModel[] =
|
||||
0,0,0,0,0,0,0,0,0,0,1,2,1,0,0,0,0,0,1,1,1,1,1,0,1,0,0,0,1,1,0,0,
|
||||
};
|
||||
|
||||
SequenceModel Win1255Model =
|
||||
const SequenceModel Win1255Model =
|
||||
{
|
||||
win1255_CharToOrderMap,
|
||||
HebrewLangModel,
|
||||
(float)0.984004,
|
||||
PR_FALSE,
|
||||
CHARDET_ENCODING_WINDOWS_1255
|
||||
"windows-1255"
|
||||
};
|
||||
|
||||
|
||||
@ -35,7 +35,6 @@
|
||||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
#include "uchardetDefine.h"
|
||||
#include "nsSBCharSetProber.h"
|
||||
/****************************************************************
|
||||
255: Control characters that usually do not exist in any text
|
||||
@ -46,7 +45,7 @@
|
||||
*****************************************************************/
|
||||
|
||||
//Character Mapping Table:
|
||||
unsigned char Latin2_HungarianCharToOrderMap[] =
|
||||
static const unsigned char Latin2_HungarianCharToOrderMap[] =
|
||||
{
|
||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
|
||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
|
||||
@ -66,7 +65,7 @@ unsigned char Latin2_HungarianCharToOrderMap[] =
|
||||
245,246,247, 25, 73, 42, 24,248,249,250, 31, 56, 29,251,252,253,
|
||||
};
|
||||
|
||||
unsigned char win1250HungarianCharToOrderMap[] =
|
||||
static const unsigned char win1250HungarianCharToOrderMap[] =
|
||||
{
|
||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
|
||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
|
||||
@ -92,7 +91,7 @@ unsigned char win1250HungarianCharToOrderMap[] =
|
||||
//first 1024 sequences:5.2623%
|
||||
//rest sequences: 0.8894%
|
||||
//negative sequences: 0.0009%
|
||||
char HungarianLangModel[] =
|
||||
static const PRUint8 HungarianLangModel[] =
|
||||
{
|
||||
0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
|
||||
3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,2,3,3,1,1,2,2,2,2,2,1,2,
|
||||
@ -224,20 +223,20 @@ char HungarianLangModel[] =
|
||||
0,1,1,1,1,1,1,0,1,1,0,1,0,1,0,0,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,
|
||||
};
|
||||
|
||||
SequenceModel Latin2HungarianModel =
|
||||
const SequenceModel Latin2HungarianModel =
|
||||
{
|
||||
Latin2_HungarianCharToOrderMap,
|
||||
HungarianLangModel,
|
||||
(float)0.947368,
|
||||
PR_TRUE,
|
||||
CHARDET_ENCODING_ISO_8859_2
|
||||
"ISO-8859-2"
|
||||
};
|
||||
|
||||
SequenceModel Win1250HungarianModel =
|
||||
const SequenceModel Win1250HungarianModel =
|
||||
{
|
||||
win1250HungarianCharToOrderMap,
|
||||
HungarianLangModel,
|
||||
(float)0.947368,
|
||||
PR_TRUE,
|
||||
CHARDET_ENCODING_WINDOWS_1250
|
||||
"windows-1250"
|
||||
};
|
||||
|
||||
@ -35,7 +35,6 @@
|
||||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
#include "uchardetDefine.h"
|
||||
#include "nsSBCharSetProber.h"
|
||||
|
||||
|
||||
@ -50,7 +49,7 @@
|
||||
//The following result for thai was collected from a limited sample (1M).
|
||||
|
||||
//Character Mapping Table:
|
||||
unsigned char TIS620CharToOrderMap[] =
|
||||
static const unsigned char TIS620CharToOrderMap[] =
|
||||
{
|
||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
|
||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
|
||||
@ -79,7 +78,7 @@ unsigned char TIS620CharToOrderMap[] =
|
||||
//first 1024 sequences:7.3177%
|
||||
//rest sequences: 1.0230%
|
||||
//negative sequences: 0.0436%
|
||||
char ThaiLangModel[] =
|
||||
static const PRUint8 ThaiLangModel[] =
|
||||
{
|
||||
0,1,3,3,3,3,0,0,3,3,0,3,3,0,3,3,3,3,3,3,3,3,0,0,3,3,3,0,3,3,3,3,
|
||||
0,3,3,0,0,0,1,3,0,3,3,2,3,3,0,1,2,3,3,3,3,0,2,0,2,0,0,3,2,1,2,2,
|
||||
@ -212,11 +211,11 @@ char ThaiLangModel[] =
|
||||
};
|
||||
|
||||
|
||||
SequenceModel TIS620ThaiModel =
|
||||
const SequenceModel TIS620ThaiModel =
|
||||
{
|
||||
TIS620CharToOrderMap,
|
||||
ThaiLangModel,
|
||||
(float)0.926386,
|
||||
PR_FALSE,
|
||||
CHARDET_ENCODING_TIS_620
|
||||
"TIS-620"
|
||||
};
|
||||
|
||||
@ -41,7 +41,7 @@ void nsBig5Prober::Reset(void)
|
||||
{
|
||||
mCodingSM->Reset();
|
||||
mState = eDetecting;
|
||||
mDistributionAnalyser.Reset();
|
||||
mDistributionAnalyser.Reset(mIsPreferredLanguage);
|
||||
}
|
||||
|
||||
nsProbingState nsBig5Prober::HandleData(const char* aBuf, PRUint32 aLen)
|
||||
@ -51,11 +51,6 @@ nsProbingState nsBig5Prober::HandleData(const char* aBuf, PRUint32 aLen)
|
||||
for (PRUint32 i = 0; i < aLen; i++)
|
||||
{
|
||||
codingState = mCodingSM->NextState(aBuf[i]);
|
||||
if (codingState == eError)
|
||||
{
|
||||
mState = eNotMe;
|
||||
break;
|
||||
}
|
||||
if (codingState == eItsMe)
|
||||
{
|
||||
mState = eFoundIt;
|
||||
|
||||
@ -38,18 +38,19 @@
|
||||
#ifndef nsBig5Prober_h__
|
||||
#define nsBig5Prober_h__
|
||||
|
||||
#include "uchardetDefine.h"
|
||||
#include "nsCharSetProber.h"
|
||||
#include "nsCodingStateMachine.h"
|
||||
#include "CharDistribution.h"
|
||||
|
||||
class nsBig5Prober: public nsCharSetProber {
|
||||
public:
|
||||
nsBig5Prober(void){mCodingSM = new nsCodingStateMachine(&Big5SMModel);
|
||||
Reset();}
|
||||
nsBig5Prober(PRBool aIsPreferredLanguage)
|
||||
:mIsPreferredLanguage(aIsPreferredLanguage)
|
||||
{mCodingSM = new nsCodingStateMachine(&Big5SMModel);
|
||||
Reset();}
|
||||
virtual ~nsBig5Prober(void){delete mCodingSM;}
|
||||
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
||||
const char* GetCharSetName() {return CHARDET_ENCODING_BIG5;}
|
||||
const char* GetCharSetName() {return "Big5";}
|
||||
nsProbingState GetState(void) {return mState;}
|
||||
void Reset(void);
|
||||
float GetConfidence(void);
|
||||
@ -64,6 +65,7 @@ protected:
|
||||
//Big5ContextAnalysis mContextAnalyser;
|
||||
Big5DistributionAnalysis mDistributionAnalyser;
|
||||
char mLastChar[2];
|
||||
PRBool mIsPreferredLanguage;
|
||||
|
||||
};
|
||||
|
||||
|
||||
@ -61,7 +61,7 @@ public:
|
||||
virtual void SetOpion() = 0;
|
||||
|
||||
#ifdef DEBUG_chardet
|
||||
virtual void DumpStatus() {}
|
||||
virtual void DumpStatus() {};
|
||||
#endif
|
||||
|
||||
// Helper functions used in the Latin1 and Group probers.
|
||||
|
||||
@ -59,10 +59,7 @@ typedef struct
|
||||
|
||||
class nsCodingStateMachine {
|
||||
public:
|
||||
nsCodingStateMachine(SMModel* sm){
|
||||
mCurrentState = eStart;
|
||||
mModel = sm;
|
||||
}
|
||||
nsCodingStateMachine(const SMModel* sm) : mModel(sm) { mCurrentState = eStart; }
|
||||
nsSMState NextState(char c){
|
||||
//for each byte we get its class; if it is the first byte, we also get the byte length
|
||||
PRUint32 byteCls = GETCLASS(c);
|
||||
@ -86,23 +83,22 @@ protected:
|
||||
PRUint32 mCurrentCharLen;
|
||||
PRUint32 mCurrentBytePos;
|
||||
|
||||
SMModel *mModel;
|
||||
const SMModel *mModel;
|
||||
};
|
||||
|
||||
extern SMModel UTF8SMModel;
|
||||
extern SMModel Big5SMModel;
|
||||
extern SMModel EUCJPSMModel;
|
||||
extern SMModel EUCKRSMModel;
|
||||
extern SMModel EUCTWSMModel;
|
||||
extern SMModel GB18030SMModel;
|
||||
extern SMModel SJISSMModel;
|
||||
extern SMModel UCS2BESMModel;
|
||||
extern const SMModel UTF8SMModel;
|
||||
extern const SMModel Big5SMModel;
|
||||
extern const SMModel EUCJPSMModel;
|
||||
extern const SMModel EUCKRSMModel;
|
||||
extern const SMModel EUCTWSMModel;
|
||||
extern const SMModel GB18030SMModel;
|
||||
extern const SMModel SJISSMModel;
|
||||
|
||||
|
||||
extern SMModel HZSMModel;
|
||||
extern SMModel ISO2022CNSMModel;
|
||||
extern SMModel ISO2022JPSMModel;
|
||||
extern SMModel ISO2022KRSMModel;
|
||||
extern const SMModel HZSMModel;
|
||||
extern const SMModel ISO2022CNSMModel;
|
||||
extern const SMModel ISO2022JPSMModel;
|
||||
extern const SMModel ISO2022KRSMModel;
|
||||
|
||||
#endif /* nsCodingStateMachine_h__ */
|
||||
|
||||
|
||||
@ -46,8 +46,8 @@ void nsEUCJPProber::Reset(void)
|
||||
{
|
||||
mCodingSM->Reset();
|
||||
mState = eDetecting;
|
||||
mContextAnalyser.Reset();
|
||||
mDistributionAnalyser.Reset();
|
||||
mContextAnalyser.Reset(mIsPreferredLanguage);
|
||||
mDistributionAnalyser.Reset(mIsPreferredLanguage);
|
||||
}
|
||||
|
||||
nsProbingState nsEUCJPProber::HandleData(const char* aBuf, PRUint32 aLen)
|
||||
@ -57,11 +57,6 @@ nsProbingState nsEUCJPProber::HandleData(const char* aBuf, PRUint32 aLen)
|
||||
for (PRUint32 i = 0; i < aLen; i++)
|
||||
{
|
||||
codingState = mCodingSM->NextState(aBuf[i]);
|
||||
if (codingState == eError)
|
||||
{
|
||||
mState = eNotMe;
|
||||
break;
|
||||
}
|
||||
if (codingState == eItsMe)
|
||||
{
|
||||
mState = eFoundIt;
|
||||
|
||||
@ -43,7 +43,6 @@
|
||||
#ifndef nsEUCJPProber_h__
|
||||
#define nsEUCJPProber_h__
|
||||
|
||||
#include "uchardetDefine.h"
|
||||
#include "nsCharSetProber.h"
|
||||
#include "nsCodingStateMachine.h"
|
||||
#include "JpCntx.h"
|
||||
@ -51,11 +50,13 @@
|
||||
|
||||
class nsEUCJPProber: public nsCharSetProber {
|
||||
public:
|
||||
nsEUCJPProber(void){mCodingSM = new nsCodingStateMachine(&EUCJPSMModel);
|
||||
Reset();}
|
||||
nsEUCJPProber(PRBool aIsPreferredLanguage)
|
||||
:mIsPreferredLanguage(aIsPreferredLanguage)
|
||||
{mCodingSM = new nsCodingStateMachine(&EUCJPSMModel);
|
||||
Reset();}
|
||||
virtual ~nsEUCJPProber(void){delete mCodingSM;}
|
||||
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
||||
const char* GetCharSetName() {return CHARDET_ENCODING_EUC_JP;}
|
||||
const char* GetCharSetName() {return "EUC-JP";}
|
||||
nsProbingState GetState(void) {return mState;}
|
||||
void Reset(void);
|
||||
float GetConfidence(void);
|
||||
@ -69,6 +70,7 @@ protected:
|
||||
EUCJPDistributionAnalysis mDistributionAnalyser;
|
||||
|
||||
char mLastChar[2];
|
||||
PRBool mIsPreferredLanguage;
|
||||
};
|
||||
|
||||
|
||||
|
||||
@ -41,7 +41,7 @@ void nsEUCKRProber::Reset(void)
|
||||
{
|
||||
mCodingSM->Reset();
|
||||
mState = eDetecting;
|
||||
mDistributionAnalyser.Reset();
|
||||
mDistributionAnalyser.Reset(mIsPreferredLanguage);
|
||||
//mContextAnalyser.Reset();
|
||||
}
|
||||
|
||||
@ -52,11 +52,6 @@ nsProbingState nsEUCKRProber::HandleData(const char* aBuf, PRUint32 aLen)
|
||||
for (PRUint32 i = 0; i < aLen; i++)
|
||||
{
|
||||
codingState = mCodingSM->NextState(aBuf[i]);
|
||||
if (codingState == eError)
|
||||
{
|
||||
mState = eNotMe;
|
||||
break;
|
||||
}
|
||||
if (codingState == eItsMe)
|
||||
{
|
||||
mState = eFoundIt;
|
||||
|
||||
@ -38,18 +38,20 @@
|
||||
#ifndef nsEUCKRProber_h__
|
||||
#define nsEUCKRProber_h__
|
||||
|
||||
#include "uchardetDefine.h"
|
||||
#include "nsCharSetProber.h"
|
||||
#include "nsCodingStateMachine.h"
|
||||
#include "CharDistribution.h"
|
||||
|
||||
class nsEUCKRProber: public nsCharSetProber {
|
||||
public:
|
||||
nsEUCKRProber(void){mCodingSM = new nsCodingStateMachine(&EUCKRSMModel);
|
||||
Reset();}
|
||||
nsEUCKRProber(PRBool aIsPreferredLanguage)
|
||||
:mIsPreferredLanguage(aIsPreferredLanguage)
|
||||
{mCodingSM = new nsCodingStateMachine(&EUCKRSMModel);
|
||||
Reset();
|
||||
}
|
||||
virtual ~nsEUCKRProber(void){delete mCodingSM;}
|
||||
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
||||
const char* GetCharSetName() {return CHARDET_ENCODING_EUC_KR;}
|
||||
const char* GetCharSetName() {return "EUC-KR";}
|
||||
nsProbingState GetState(void) {return mState;}
|
||||
void Reset(void);
|
||||
float GetConfidence(void);
|
||||
@ -64,6 +66,7 @@ protected:
|
||||
//EUCKRContextAnalysis mContextAnalyser;
|
||||
EUCKRDistributionAnalysis mDistributionAnalyser;
|
||||
char mLastChar[2];
|
||||
PRBool mIsPreferredLanguage;
|
||||
|
||||
};
|
||||
|
||||
|
||||
@ -41,7 +41,7 @@ void nsEUCTWProber::Reset(void)
|
||||
{
|
||||
mCodingSM->Reset();
|
||||
mState = eDetecting;
|
||||
mDistributionAnalyser.Reset();
|
||||
mDistributionAnalyser.Reset(mIsPreferredLanguage);
|
||||
//mContextAnalyser.Reset();
|
||||
}
|
||||
|
||||
@ -52,11 +52,6 @@ nsProbingState nsEUCTWProber::HandleData(const char* aBuf, PRUint32 aLen)
|
||||
for (PRUint32 i = 0; i < aLen; i++)
|
||||
{
|
||||
codingState = mCodingSM->NextState(aBuf[i]);
|
||||
if (codingState == eError)
|
||||
{
|
||||
mState = eNotMe;
|
||||
break;
|
||||
}
|
||||
if (codingState == eItsMe)
|
||||
{
|
||||
mState = eFoundIt;
|
||||
|
||||
@ -38,18 +38,19 @@
|
||||
#ifndef nsEUCTWProber_h__
|
||||
#define nsEUCTWProber_h__
|
||||
|
||||
#include "uchardetDefine.h"
|
||||
#include "nsCharSetProber.h"
|
||||
#include "nsCodingStateMachine.h"
|
||||
#include "CharDistribution.h"
|
||||
|
||||
class nsEUCTWProber: public nsCharSetProber {
|
||||
public:
|
||||
nsEUCTWProber(void){mCodingSM = new nsCodingStateMachine(&EUCTWSMModel);
|
||||
Reset();}
|
||||
nsEUCTWProber(PRBool aIsPreferredLanguage)
|
||||
:mIsPreferredLanguage(aIsPreferredLanguage)
|
||||
{mCodingSM = new nsCodingStateMachine(&EUCTWSMModel);
|
||||
Reset();}
|
||||
virtual ~nsEUCTWProber(void){delete mCodingSM;}
|
||||
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
||||
const char* GetCharSetName() {return CHARDET_ENCODING_EUC_TW;}
|
||||
const char* GetCharSetName() {return "x-euc-tw";}
|
||||
nsProbingState GetState(void) {return mState;}
|
||||
void Reset(void);
|
||||
float GetConfidence(void);
|
||||
@ -64,6 +65,7 @@ protected:
|
||||
//EUCTWContextAnalysis mContextAnalyser;
|
||||
EUCTWDistributionAnalysis mDistributionAnalyser;
|
||||
char mLastChar[2];
|
||||
PRBool mIsPreferredLanguage;
|
||||
|
||||
};
|
||||
|
||||
|
||||
@ -37,13 +37,21 @@
|
||||
|
||||
|
||||
#include "nsEscCharsetProber.h"
|
||||
#include "nsUniversalDetector.h"
|
||||
|
||||
nsEscCharSetProber::nsEscCharSetProber(void)
|
||||
nsEscCharSetProber::nsEscCharSetProber(PRUint32 aLanguageFilter)
|
||||
{
|
||||
mCodingSM[0] = new nsCodingStateMachine(&HZSMModel);
|
||||
mCodingSM[1] = new nsCodingStateMachine(&ISO2022CNSMModel);
|
||||
mCodingSM[2] = new nsCodingStateMachine(&ISO2022JPSMModel);
|
||||
mCodingSM[3] = new nsCodingStateMachine(&ISO2022KRSMModel);
|
||||
for (PRUint32 i = 0; i < NUM_OF_ESC_CHARSETS; i++)
|
||||
mCodingSM[i] = nsnull;
|
||||
if (aLanguageFilter & NS_FILTER_CHINESE_SIMPLIFIED)
|
||||
{
|
||||
mCodingSM[0] = new nsCodingStateMachine(&HZSMModel);
|
||||
mCodingSM[1] = new nsCodingStateMachine(&ISO2022CNSMModel);
|
||||
}
|
||||
if (aLanguageFilter & NS_FILTER_JAPANESE)
|
||||
mCodingSM[2] = new nsCodingStateMachine(&ISO2022JPSMModel);
|
||||
if (aLanguageFilter & NS_FILTER_KOREAN)
|
||||
mCodingSM[3] = new nsCodingStateMachine(&ISO2022KRSMModel);
|
||||
mActiveSM = NUM_OF_ESC_CHARSETS;
|
||||
mState = eDetecting;
|
||||
mDetectedCharset = nsnull;
|
||||
@ -59,7 +67,8 @@ void nsEscCharSetProber::Reset(void)
|
||||
{
|
||||
mState = eDetecting;
|
||||
for (PRUint32 i = 0; i < NUM_OF_ESC_CHARSETS; i++)
|
||||
mCodingSM[i]->Reset();
|
||||
if (mCodingSM[i])
|
||||
mCodingSM[i]->Reset();
|
||||
mActiveSM = NUM_OF_ESC_CHARSETS;
|
||||
mDetectedCharset = nsnull;
|
||||
}
|
||||
@ -74,30 +83,15 @@ nsProbingState nsEscCharSetProber::HandleData(const char* aBuf, PRUint32 aLen)
|
||||
{
|
||||
for (j = mActiveSM-1; j>= 0; j--)
|
||||
{
|
||||
//byte is feed to all active state machine
|
||||
codingState = mCodingSM[j]->NextState(aBuf[i]);
|
||||
if (codingState == eError)
|
||||
if (mCodingSM[j])
|
||||
{
|
||||
//got negative answer for this state machine, make it inactive
|
||||
mActiveSM--;
|
||||
if (mActiveSM == 0)
|
||||
codingState = mCodingSM[j]->NextState(aBuf[i]);
|
||||
if (codingState == eItsMe)
|
||||
{
|
||||
mState = eNotMe;
|
||||
mState = eFoundIt;
|
||||
mDetectedCharset = mCodingSM[j]->GetCodingStateMachine();
|
||||
return mState;
|
||||
}
|
||||
else if (j != (PRInt32)mActiveSM)
|
||||
{
|
||||
nsCodingStateMachine* t;
|
||||
t = mCodingSM[mActiveSM];
|
||||
mCodingSM[mActiveSM] = mCodingSM[j];
|
||||
mCodingSM[j] = t;
|
||||
}
|
||||
}
|
||||
else if (codingState == eItsMe)
|
||||
{
|
||||
mState = eFoundIt;
|
||||
mDetectedCharset = mCodingSM[j]->GetCodingStateMachine();
|
||||
return mState;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -45,7 +45,7 @@
|
||||
|
||||
class nsEscCharSetProber: public nsCharSetProber {
|
||||
public:
|
||||
nsEscCharSetProber(void);
|
||||
nsEscCharSetProber(PRUint32 aLanguageFilter);
|
||||
virtual ~nsEscCharSetProber(void);
|
||||
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
||||
const char* GetCharSetName() {return mDetectedCharset;}
|
||||
|
||||
@ -20,7 +20,6 @@
|
||||
* the Initial Developer. All Rights Reserved.
|
||||
*
|
||||
* Contributor(s):
|
||||
* Kazutoshi Satoda
|
||||
*
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
@ -35,10 +34,9 @@
|
||||
* the terms of any one of the MPL, the GPL or the LGPL.
|
||||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
#include "uchardetDefine.h"
|
||||
#include "nsCodingStateMachine.h"
|
||||
|
||||
static PRUint32 HZ_cls[ 256 / 8 ] = {
|
||||
static const PRUint32 HZ_cls[ 256 / 8 ] = {
|
||||
PCK4BITS(1,0,0,0,0,0,0,0), // 00 - 07
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 08 - 0f
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 10 - 17
|
||||
@ -74,7 +72,7 @@ PCK4BITS(1,1,1,1,1,1,1,1) // f8 - ff
|
||||
};
|
||||
|
||||
|
||||
static PRUint32 HZ_st [ 6] = {
|
||||
static const PRUint32 HZ_st [ 6] = {
|
||||
PCK4BITS(eStart,eError, 3,eStart,eStart,eStart,eError,eError),//00-07
|
||||
PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//08-0f
|
||||
PCK4BITS(eItsMe,eItsMe,eError,eError,eStart,eStart, 4,eError),//10-17
|
||||
@ -85,16 +83,16 @@ PCK4BITS( 4,eItsMe,eStart,eStart,eStart,eStart,eStart,eStart) //28-2f
|
||||
|
||||
static const PRUint32 HZCharLenTable[] = {0, 0, 0, 0, 0, 0};
|
||||
|
||||
SMModel HZSMModel = {
|
||||
const SMModel HZSMModel = {
|
||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, HZ_cls },
|
||||
6,
|
||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, HZ_st },
|
||||
HZCharLenTable,
|
||||
CHARDET_ENCODING_HZ_GB_2312,
|
||||
"HZ-GB-2312",
|
||||
};
|
||||
|
||||
|
||||
static PRUint32 ISO2022CN_cls [ 256 / 8 ] = {
|
||||
static const PRUint32 ISO2022CN_cls [ 256 / 8 ] = {
|
||||
PCK4BITS(2,0,0,0,0,0,0,0), // 00 - 07
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 08 - 0f
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 10 - 17
|
||||
@ -130,7 +128,7 @@ PCK4BITS(2,2,2,2,2,2,2,2) // f8 - ff
|
||||
};
|
||||
|
||||
|
||||
static PRUint32 ISO2022CN_st [ 8] = {
|
||||
static const PRUint32 ISO2022CN_st [ 8] = {
|
||||
PCK4BITS(eStart, 3,eError,eStart,eStart,eStart,eStart,eStart),//00-07
|
||||
PCK4BITS(eStart,eError,eError,eError,eError,eError,eError,eError),//08-0f
|
||||
PCK4BITS(eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe),//10-17
|
||||
@ -143,15 +141,15 @@ PCK4BITS(eError,eError,eError,eError,eError,eItsMe,eError,eStart) //38-3f
|
||||
|
||||
static const PRUint32 ISO2022CNCharLenTable[] = {0, 0, 0, 0, 0, 0, 0, 0, 0};
|
||||
|
||||
SMModel ISO2022CNSMModel = {
|
||||
const SMModel ISO2022CNSMModel = {
|
||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022CN_cls },
|
||||
9,
|
||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022CN_st },
|
||||
ISO2022CNCharLenTable,
|
||||
CHARDET_ENCODING_ISO_2022_CN,
|
||||
"ISO-2022-CN",
|
||||
};
|
||||
|
||||
static PRUint32 ISO2022JP_cls [ 256 / 8 ] = {
|
||||
static const PRUint32 ISO2022JP_cls [ 256 / 8 ] = {
|
||||
PCK4BITS(2,0,0,0,0,0,0,0), // 00 - 07
|
||||
PCK4BITS(0,0,0,0,0,0,2,2), // 08 - 0f
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 10 - 17
|
||||
@ -187,7 +185,7 @@ PCK4BITS(2,2,2,2,2,2,2,2) // f8 - ff
|
||||
};
|
||||
|
||||
|
||||
static PRUint32 ISO2022JP_st [ 9] = {
|
||||
static const PRUint32 ISO2022JP_st [ 9] = {
|
||||
PCK4BITS(eStart, 3,eError,eStart,eStart,eStart,eStart,eStart),//00-07
|
||||
PCK4BITS(eStart,eStart,eError,eError,eError,eError,eError,eError),//08-0f
|
||||
PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//10-17
|
||||
@ -199,17 +197,17 @@ PCK4BITS(eError,eError,eError,eItsMe,eError,eError,eError,eError),//38-3f
|
||||
PCK4BITS(eError,eError,eError,eError,eItsMe,eError,eStart,eStart) //40-47
|
||||
};
|
||||
|
||||
static const PRUint32 ISO2022JPCharLenTable[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
|
||||
static const PRUint32 ISO2022JPCharLenTable[] = {0, 0, 0, 0, 0, 0, 0, 0};
|
||||
|
||||
SMModel ISO2022JPSMModel = {
|
||||
const SMModel ISO2022JPSMModel = {
|
||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022JP_cls },
|
||||
10,
|
||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022JP_st },
|
||||
ISO2022JPCharLenTable,
|
||||
CHARDET_ENCODING_ISO_2022_JP,
|
||||
"ISO-2022-JP",
|
||||
};
|
||||
|
||||
static PRUint32 ISO2022KR_cls [ 256 / 8 ] = {
|
||||
static const PRUint32 ISO2022KR_cls [ 256 / 8 ] = {
|
||||
PCK4BITS(2,0,0,0,0,0,0,0), // 00 - 07
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 08 - 0f
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 10 - 17
|
||||
@ -245,7 +243,7 @@ PCK4BITS(2,2,2,2,2,2,2,2) // f8 - ff
|
||||
};
|
||||
|
||||
|
||||
static PRUint32 ISO2022KR_st [ 5] = {
|
||||
static const PRUint32 ISO2022KR_st [ 5] = {
|
||||
PCK4BITS(eStart, 3,eError,eStart,eStart,eStart,eError,eError),//00-07
|
||||
PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//08-0f
|
||||
PCK4BITS(eItsMe,eItsMe,eError,eError,eError, 4,eError,eError),//10-17
|
||||
@ -255,11 +253,11 @@ PCK4BITS(eError,eError,eError,eItsMe,eStart,eStart,eStart,eStart) //20-27
|
||||
|
||||
static const PRUint32 ISO2022KRCharLenTable[] = {0, 0, 0, 0, 0, 0};
|
||||
|
||||
SMModel ISO2022KRSMModel = {
|
||||
const SMModel ISO2022KRSMModel = {
|
||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022KR_cls },
|
||||
6,
|
||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022KR_st },
|
||||
ISO2022KRCharLenTable,
|
||||
CHARDET_ENCODING_ISO_2022_KR,
|
||||
"ISO-2022-KR",
|
||||
};
|
||||
|
||||
|
||||
@ -46,7 +46,7 @@ void nsGB18030Prober::Reset(void)
|
||||
{
|
||||
mCodingSM->Reset();
|
||||
mState = eDetecting;
|
||||
mDistributionAnalyser.Reset();
|
||||
mDistributionAnalyser.Reset(mIsPreferredLanguage);
|
||||
//mContextAnalyser.Reset();
|
||||
}
|
||||
|
||||
@ -57,11 +57,6 @@ nsProbingState nsGB18030Prober::HandleData(const char* aBuf, PRUint32 aLen)
|
||||
for (PRUint32 i = 0; i < aLen; i++)
|
||||
{
|
||||
codingState = mCodingSM->NextState(aBuf[i]);
|
||||
if (codingState == eError)
|
||||
{
|
||||
mState = eNotMe;
|
||||
break;
|
||||
}
|
||||
if (codingState == eItsMe)
|
||||
{
|
||||
mState = eFoundIt;
|
||||
|
||||
@ -38,7 +38,6 @@
|
||||
#ifndef nsGB2312Prober_h__
|
||||
#define nsGB2312Prober_h__
|
||||
|
||||
#include "uchardetDefine.h"
|
||||
#include "nsCharSetProber.h"
|
||||
#include "nsCodingStateMachine.h"
|
||||
#include "CharDistribution.h"
|
||||
@ -47,11 +46,13 @@
|
||||
|
||||
class nsGB18030Prober: public nsCharSetProber {
|
||||
public:
|
||||
nsGB18030Prober(void){mCodingSM = new nsCodingStateMachine(&GB18030SMModel);
|
||||
Reset();}
|
||||
nsGB18030Prober(PRBool aIsPreferredLanguage)
|
||||
:mIsPreferredLanguage(aIsPreferredLanguage)
|
||||
{mCodingSM = new nsCodingStateMachine(&GB18030SMModel);
|
||||
Reset();}
|
||||
virtual ~nsGB18030Prober(void){delete mCodingSM;}
|
||||
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
||||
const char* GetCharSetName() {return CHARDET_ENCODING_GB18030;}
|
||||
const char* GetCharSetName() {return "gb18030";}
|
||||
nsProbingState GetState(void) {return mState;}
|
||||
void Reset(void);
|
||||
float GetConfidence(void);
|
||||
@ -66,6 +67,7 @@ protected:
|
||||
//GB2312ContextAnalysis mContextAnalyser;
|
||||
GB2312DistributionAnalysis mDistributionAnalyser;
|
||||
char mLastChar[2];
|
||||
PRBool mIsPreferredLanguage;
|
||||
|
||||
};
|
||||
|
||||
|
||||
@ -35,7 +35,6 @@
|
||||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
#include "uchardetDefine.h"
|
||||
#include "nsHebrewProber.h"
|
||||
#include <stdio.h>
|
||||
|
||||
@ -59,8 +58,8 @@
|
||||
// If the difference is below this, don't rely at all on the model score distance.
|
||||
#define MIN_MODEL_DISTANCE (0.01)
|
||||
|
||||
#define VISUAL_HEBREW_NAME (CHARDET_ENCODING_ISO_8859_8)
|
||||
#define LOGICAL_HEBREW_NAME (CHARDET_ENCODING_WINDOWS_1255)
|
||||
#define VISUAL_HEBREW_NAME ("ISO-8859-8")
|
||||
#define LOGICAL_HEBREW_NAME ("windows-1255")
|
||||
|
||||
PRBool nsHebrewProber::isFinal(char c)
|
||||
{
|
||||
|
||||
@ -50,7 +50,7 @@
|
||||
#define ASO 7 // accent small other
|
||||
#define CLASS_NUM 8 // total classes
|
||||
|
||||
static unsigned char Latin1_CharToClass[] =
|
||||
static const unsigned char Latin1_CharToClass[] =
|
||||
{
|
||||
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 00 - 07
|
||||
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 08 - 0F
|
||||
@ -92,7 +92,7 @@ static unsigned char Latin1_CharToClass[] =
|
||||
2 : normal
|
||||
3 : very likely
|
||||
*/
|
||||
static unsigned char Latin1ClassModel[] =
|
||||
static const unsigned char Latin1ClassModel[] =
|
||||
{
|
||||
/* UDF OTH ASC ASS ACV ACO ASV ASO */
|
||||
/*UDF*/ 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
@ -39,7 +39,6 @@
|
||||
#ifndef nsLatin1Prober_h__
|
||||
#define nsLatin1Prober_h__
|
||||
|
||||
#include "uchardetDefine.h"
|
||||
#include "nsCharSetProber.h"
|
||||
|
||||
#define FREQ_CAT_NUM 4
|
||||
@ -49,7 +48,7 @@ public:
|
||||
nsLatin1Prober(void){Reset();}
|
||||
virtual ~nsLatin1Prober(void){}
|
||||
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
||||
const char* GetCharSetName() {return CHARDET_ENCODING_WINDOWS_1252;}
|
||||
const char* GetCharSetName() {return "windows-1252";}
|
||||
nsProbingState GetState(void) {return mState;}
|
||||
void Reset(void);
|
||||
float GetConfidence(void);
|
||||
|
||||
@ -21,6 +21,7 @@
|
||||
*
|
||||
* Contributor(s):
|
||||
* Shy Shalom <shooshX@gmail.com>
|
||||
* Proofpoint, Inc.
|
||||
*
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
@ -36,12 +37,12 @@
|
||||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
#include <stdio.h>
|
||||
#include "prmem.h"
|
||||
|
||||
#include "nsMBCSGroupProber.h"
|
||||
#include "nsUniversalDetector.h"
|
||||
|
||||
#ifdef DEBUG_chardet
|
||||
char *ProberName[] =
|
||||
#if defined(DEBUG_chardet) || defined(DEBUG_jgmyers)
|
||||
const char *ProberName[] =
|
||||
{
|
||||
"UTF8",
|
||||
"SJIS",
|
||||
@ -54,15 +55,26 @@ char *ProberName[] =
|
||||
|
||||
#endif
|
||||
|
||||
nsMBCSGroupProber::nsMBCSGroupProber()
|
||||
nsMBCSGroupProber::nsMBCSGroupProber(PRUint32 aLanguageFilter)
|
||||
{
|
||||
for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
|
||||
mProbers[i] = nsnull;
|
||||
|
||||
mProbers[0] = new nsUTF8Prober();
|
||||
mProbers[1] = new nsSJISProber();
|
||||
mProbers[2] = new nsEUCJPProber();
|
||||
mProbers[3] = new nsGB18030Prober();
|
||||
mProbers[4] = new nsEUCKRProber();
|
||||
mProbers[5] = new nsBig5Prober();
|
||||
mProbers[6] = new nsEUCTWProber();
|
||||
if (aLanguageFilter & NS_FILTER_JAPANESE)
|
||||
{
|
||||
mProbers[1] = new nsSJISProber(aLanguageFilter == NS_FILTER_JAPANESE);
|
||||
mProbers[2] = new nsEUCJPProber(aLanguageFilter == NS_FILTER_JAPANESE);
|
||||
}
|
||||
if (aLanguageFilter & NS_FILTER_CHINESE_SIMPLIFIED)
|
||||
mProbers[3] = new nsGB18030Prober(aLanguageFilter == NS_FILTER_CHINESE_SIMPLIFIED);
|
||||
if (aLanguageFilter & NS_FILTER_KOREAN)
|
||||
mProbers[4] = new nsEUCKRProber(aLanguageFilter == NS_FILTER_KOREAN);
|
||||
if (aLanguageFilter & NS_FILTER_CHINESE_TRADITIONAL)
|
||||
{
|
||||
mProbers[5] = new nsBig5Prober(aLanguageFilter == NS_FILTER_CHINESE_TRADITIONAL);
|
||||
mProbers[6] = new nsEUCTWProber(aLanguageFilter == NS_FILTER_CHINESE_TRADITIONAL);
|
||||
}
|
||||
Reset();
|
||||
}
|
||||
|
||||
@ -101,62 +113,59 @@ void nsMBCSGroupProber::Reset(void)
|
||||
}
|
||||
mBestGuess = -1;
|
||||
mState = eDetecting;
|
||||
mKeepNext = 0;
|
||||
}
|
||||
|
||||
nsProbingState nsMBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen)
|
||||
{
|
||||
nsProbingState st;
|
||||
PRUint32 i;
|
||||
PRUint32 start = 0;
|
||||
PRUint32 keepNext = mKeepNext;
|
||||
|
||||
//do filtering to reduce load to probers
|
||||
char *highbyteBuf;
|
||||
char *hptr;
|
||||
PRBool keepNext = PR_TRUE; //assume previous is not ascii, it will do no harm except add some noise
|
||||
hptr = highbyteBuf = (char*)PR_Malloc(aLen);
|
||||
if (!hptr)
|
||||
return mState;
|
||||
for (i = 0; i < aLen; i++)
|
||||
for (PRUint32 pos = 0; pos < aLen; ++pos)
|
||||
{
|
||||
if (aBuf[i] & 0x80)
|
||||
if (aBuf[pos] & 0x80)
|
||||
{
|
||||
*hptr++ = aBuf[i];
|
||||
keepNext = PR_TRUE;
|
||||
if (!keepNext)
|
||||
start = pos;
|
||||
keepNext = 2;
|
||||
}
|
||||
else
|
||||
else if (keepNext)
|
||||
{
|
||||
//if previous is highbyte, keep this even it is a ASCII
|
||||
if (keepNext)
|
||||
if (--keepNext == 0)
|
||||
{
|
||||
*hptr++ = aBuf[i];
|
||||
keepNext = PR_FALSE;
|
||||
for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
|
||||
{
|
||||
if (!mIsActive[i])
|
||||
continue;
|
||||
st = mProbers[i]->HandleData(aBuf + start, pos + 1 - start);
|
||||
if (st == eFoundIt)
|
||||
{
|
||||
mBestGuess = i;
|
||||
mState = eFoundIt;
|
||||
return mState;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (i = 0; i < NUM_OF_PROBERS; i++)
|
||||
{
|
||||
if (!mIsActive[i])
|
||||
continue;
|
||||
st = mProbers[i]->HandleData(highbyteBuf, hptr - highbyteBuf);
|
||||
if (st == eFoundIt)
|
||||
{
|
||||
mBestGuess = i;
|
||||
mState = eFoundIt;
|
||||
break;
|
||||
}
|
||||
else if (st == eNotMe)
|
||||
{
|
||||
mIsActive[i] = PR_FALSE;
|
||||
mActiveNum--;
|
||||
if (mActiveNum <= 0)
|
||||
{
|
||||
mState = eNotMe;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (keepNext) {
|
||||
for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
|
||||
{
|
||||
if (!mIsActive[i])
|
||||
continue;
|
||||
st = mProbers[i]->HandleData(aBuf + start, aLen - start);
|
||||
if (st == eFoundIt)
|
||||
{
|
||||
mBestGuess = i;
|
||||
mState = eFoundIt;
|
||||
return mState;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
PR_FREEIF(highbyteBuf);
|
||||
mKeepNext = keepNext;
|
||||
|
||||
return mState;
|
||||
}
|
||||
@ -207,3 +216,15 @@ void nsMBCSGroupProber::DumpStatus()
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef DEBUG_jgmyers
|
||||
void nsMBCSGroupProber::GetDetectorState(nsUniversalDetector::DetectorState (&states)[nsUniversalDetector::NumDetectors], PRUint32 &offset)
|
||||
{
|
||||
for (PRUint32 i = 0; i < NUM_OF_PROBERS; ++i) {
|
||||
states[offset].name = ProberName[i];
|
||||
states[offset].isActive = mIsActive[i];
|
||||
states[offset].confidence = mIsActive[i] ? mProbers[i]->GetConfidence() : 0.0;
|
||||
++offset;
|
||||
}
|
||||
}
|
||||
#endif /* DEBUG_jgmyers */
|
||||
|
||||
@ -20,6 +20,7 @@
|
||||
* the Initial Developer. All Rights Reserved.
|
||||
*
|
||||
* Contributor(s):
|
||||
* Proofpoint, Inc.
|
||||
*
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
@ -50,7 +51,7 @@
|
||||
|
||||
class nsMBCSGroupProber: public nsCharSetProber {
|
||||
public:
|
||||
nsMBCSGroupProber();
|
||||
nsMBCSGroupProber(PRUint32 aLanguageFilter);
|
||||
virtual ~nsMBCSGroupProber();
|
||||
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
||||
const char* GetCharSetName();
|
||||
@ -62,6 +63,9 @@ public:
|
||||
#ifdef DEBUG_chardet
|
||||
void DumpStatus();
|
||||
#endif
|
||||
#ifdef DEBUG_jgmyers
|
||||
void GetDetectorState(nsUniversalDetector::DetectorState (&states)[nsUniversalDetector::NumDetectors], PRUint32 &offset);
|
||||
#endif
|
||||
|
||||
protected:
|
||||
nsProbingState mState;
|
||||
@ -69,6 +73,7 @@ protected:
|
||||
PRBool mIsActive[NUM_OF_PROBERS];
|
||||
PRInt32 mBestGuess;
|
||||
PRUint32 mActiveNum;
|
||||
PRUint32 mKeepNext;
|
||||
};
|
||||
|
||||
#endif /* nsMBCSGroupProber_h__ */
|
||||
|
||||
170
src/nsMBCSSM.cpp
170
src/nsMBCSSM.cpp
@ -34,7 +34,6 @@
|
||||
* the terms of any one of the MPL, the GPL or the LGPL.
|
||||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
#include "uchardetDefine.h"
|
||||
#include "nsCodingStateMachine.h"
|
||||
|
||||
/*
|
||||
@ -45,7 +44,7 @@ Modification from frank tang's original work:
|
||||
|
||||
// BIG5
|
||||
|
||||
static PRUint32 BIG5_cls [ 256 / 8 ] = {
|
||||
static const PRUint32 BIG5_cls [ 256 / 8 ] = {
|
||||
//PCK4BITS(0,1,1,1,1,1,1,1), // 00 - 07
|
||||
PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07 //allow 0x00 as legal value
|
||||
PCK4BITS(1,1,1,1,1,1,0,0), // 08 - 0f
|
||||
@ -82,7 +81,7 @@ PCK4BITS(3,3,3,3,3,3,3,0) // f8 - ff
|
||||
};
|
||||
|
||||
|
||||
static PRUint32 BIG5_st [ 3] = {
|
||||
static const PRUint32 BIG5_st [ 3] = {
|
||||
PCK4BITS(eError,eStart,eStart, 3,eError,eError,eError,eError),//00-07
|
||||
PCK4BITS(eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError),//08-0f
|
||||
PCK4BITS(eError,eStart,eStart,eStart,eStart,eStart,eStart,eStart) //10-17
|
||||
@ -90,15 +89,15 @@ PCK4BITS(eError,eStart,eStart,eStart,eStart,eStart,eStart,eStart) //10-17
|
||||
|
||||
static const PRUint32 Big5CharLenTable[] = {0, 1, 1, 2, 0};
|
||||
|
||||
SMModel Big5SMModel = {
|
||||
SMModel const Big5SMModel = {
|
||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, BIG5_cls },
|
||||
5,
|
||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, BIG5_st },
|
||||
Big5CharLenTable,
|
||||
CHARDET_ENCODING_BIG5,
|
||||
"Big5",
|
||||
};
|
||||
|
||||
static PRUint32 EUCJP_cls [ 256 / 8 ] = {
|
||||
static const PRUint32 EUCJP_cls [ 256 / 8 ] = {
|
||||
//PCK4BITS(5,4,4,4,4,4,4,4), // 00 - 07
|
||||
PCK4BITS(4,4,4,4,4,4,4,4), // 00 - 07
|
||||
PCK4BITS(4,4,4,4,4,4,5,5), // 08 - 0f
|
||||
@ -135,7 +134,7 @@ PCK4BITS(0,0,0,0,0,0,0,5) // f8 - ff
|
||||
};
|
||||
|
||||
|
||||
static PRUint32 EUCJP_st [ 5] = {
|
||||
static const PRUint32 EUCJP_st [ 5] = {
|
||||
PCK4BITS( 3, 4, 3, 5,eStart,eError,eError,eError),//00-07
|
||||
PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//08-0f
|
||||
PCK4BITS(eItsMe,eItsMe,eStart,eError,eStart,eError,eError,eError),//10-17
|
||||
@ -145,15 +144,15 @@ PCK4BITS( 3,eError,eError,eError,eStart,eStart,eStart,eStart) //20-27
|
||||
|
||||
static const PRUint32 EUCJPCharLenTable[] = {2, 2, 2, 3, 1, 0};
|
||||
|
||||
SMModel EUCJPSMModel = {
|
||||
const SMModel EUCJPSMModel = {
|
||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCJP_cls },
|
||||
6,
|
||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCJP_st },
|
||||
EUCJPCharLenTable,
|
||||
CHARDET_ENCODING_EUC_JP,
|
||||
"EUC-JP",
|
||||
};
|
||||
|
||||
static PRUint32 EUCKR_cls [ 256 / 8 ] = {
|
||||
static const PRUint32 EUCKR_cls [ 256 / 8 ] = {
|
||||
//PCK4BITS(0,1,1,1,1,1,1,1), // 00 - 07
|
||||
PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07
|
||||
PCK4BITS(1,1,1,1,1,1,0,0), // 08 - 0f
|
||||
@ -190,22 +189,22 @@ PCK4BITS(2,2,2,2,2,2,2,0) // f8 - ff
|
||||
};
|
||||
|
||||
|
||||
static PRUint32 EUCKR_st [ 2] = {
|
||||
static const PRUint32 EUCKR_st [ 2] = {
|
||||
PCK4BITS(eError,eStart, 3,eError,eError,eError,eError,eError),//00-07
|
||||
PCK4BITS(eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,eStart,eStart) //08-0f
|
||||
};
|
||||
|
||||
static const PRUint32 EUCKRCharLenTable[] = {0, 1, 2, 0};
|
||||
|
||||
SMModel EUCKRSMModel = {
|
||||
const SMModel EUCKRSMModel = {
|
||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCKR_cls },
|
||||
4,
|
||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCKR_st },
|
||||
EUCKRCharLenTable,
|
||||
CHARDET_ENCODING_EUC_KR,
|
||||
"EUC-KR",
|
||||
};
|
||||
|
||||
static PRUint32 EUCTW_cls [ 256 / 8 ] = {
|
||||
static const PRUint32 EUCTW_cls [ 256 / 8 ] = {
|
||||
//PCK4BITS(0,2,2,2,2,2,2,2), // 00 - 07
|
||||
PCK4BITS(2,2,2,2,2,2,2,2), // 00 - 07
|
||||
PCK4BITS(2,2,2,2,2,2,0,0), // 08 - 0f
|
||||
@ -242,7 +241,7 @@ PCK4BITS(3,3,3,3,3,3,3,0) // f8 - ff
|
||||
};
|
||||
|
||||
|
||||
static PRUint32 EUCTW_st [ 6] = {
|
||||
static const PRUint32 EUCTW_st [ 6] = {
|
||||
PCK4BITS(eError,eError,eStart, 3, 3, 3, 4,eError),//00-07
|
||||
PCK4BITS(eError,eError,eError,eError,eError,eError,eItsMe,eItsMe),//08-0f
|
||||
PCK4BITS(eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,eStart,eError),//10-17
|
||||
@ -253,12 +252,12 @@ PCK4BITS(eStart,eError,eStart,eStart,eStart,eStart,eStart,eStart) //28-2f
|
||||
|
||||
static const PRUint32 EUCTWCharLenTable[] = {0, 0, 1, 2, 2, 2, 3};
|
||||
|
||||
SMModel EUCTWSMModel = {
|
||||
const SMModel EUCTWSMModel = {
|
||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCTW_cls },
|
||||
7,
|
||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCTW_st },
|
||||
EUCTWCharLenTable,
|
||||
CHARDET_ENCODING_EUC_TW,
|
||||
"x-euc-tw",
|
||||
};
|
||||
|
||||
/* obsolete GB2312 by gb18030
|
||||
@ -317,7 +316,7 @@ SMModel GB2312SMModel = {
|
||||
|
||||
// the following state machine data was created by perl script in
|
||||
// intl/chardet/tools. It should be the same as in PSM detector.
|
||||
static PRUint32 GB18030_cls [ 256 / 8 ] = {
|
||||
static const PRUint32 GB18030_cls [ 256 / 8 ] = {
|
||||
PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07
|
||||
PCK4BITS(1,1,1,1,1,1,0,0), // 08 - 0f
|
||||
PCK4BITS(1,1,1,1,1,1,1,1), // 10 - 17
|
||||
@ -353,7 +352,7 @@ PCK4BITS(6,6,6,6,6,6,6,0) // f8 - ff
|
||||
};
|
||||
|
||||
|
||||
static PRUint32 GB18030_st [ 6] = {
|
||||
static const PRUint32 GB18030_st [ 6] = {
|
||||
PCK4BITS(eError,eStart,eStart,eStart,eStart,eStart, 3,eError),//00-07
|
||||
PCK4BITS(eError,eError,eError,eError,eError,eError,eItsMe,eItsMe),//08-0f
|
||||
PCK4BITS(eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,eStart),//10-17
|
||||
@ -369,17 +368,17 @@ PCK4BITS(eError,eError,eStart,eStart,eStart,eStart,eStart,eStart) //28-2f
|
||||
// 2 here.
|
||||
static const PRUint32 GB18030CharLenTable[] = {0, 1, 1, 1, 1, 1, 2};
|
||||
|
||||
SMModel GB18030SMModel = {
|
||||
const SMModel GB18030SMModel = {
|
||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, GB18030_cls },
|
||||
7,
|
||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, GB18030_st },
|
||||
GB18030CharLenTable,
|
||||
CHARDET_ENCODING_GB18030,
|
||||
"GB18030",
|
||||
};
|
||||
|
||||
// sjis
|
||||
|
||||
static PRUint32 SJIS_cls [ 256 / 8 ] = {
|
||||
static const PRUint32 SJIS_cls [ 256 / 8 ] = {
|
||||
//PCK4BITS(0,1,1,1,1,1,1,1), // 00 - 07
|
||||
PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07
|
||||
PCK4BITS(1,1,1,1,1,1,0,0), // 08 - 0f
|
||||
@ -418,7 +417,7 @@ PCK4BITS(4,4,4,4,4,0,0,0) // f8 - ff
|
||||
};
|
||||
|
||||
|
||||
static PRUint32 SJIS_st [ 3] = {
|
||||
static const PRUint32 SJIS_st [ 3] = {
|
||||
PCK4BITS(eError,eStart,eStart, 3,eError,eError,eError,eError),//00-07
|
||||
PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//08-0f
|
||||
PCK4BITS(eItsMe,eItsMe,eError,eError,eStart,eStart,eStart,eStart) //10-17
|
||||
@ -426,129 +425,16 @@ PCK4BITS(eItsMe,eItsMe,eError,eError,eStart,eStart,eStart,eStart) //10-17
|
||||
|
||||
static const PRUint32 SJISCharLenTable[] = {0, 1, 1, 2, 0, 0};
|
||||
|
||||
SMModel SJISSMModel = {
|
||||
const SMModel SJISSMModel = {
|
||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, SJIS_cls },
|
||||
6,
|
||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, SJIS_st },
|
||||
SJISCharLenTable,
|
||||
CHARDET_ENCODING_SHIFT_JIS,
|
||||
"Shift_JIS",
|
||||
};
|
||||
|
||||
|
||||
static PRUint32 UCS2BE_cls [ 256 / 8 ] = {
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 00 - 07
|
||||
PCK4BITS(0,0,1,0,0,2,0,0), // 08 - 0f
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 10 - 17
|
||||
PCK4BITS(0,0,0,3,0,0,0,0), // 18 - 1f
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 20 - 27
|
||||
PCK4BITS(0,3,3,3,3,3,0,0), // 28 - 2f
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 30 - 37
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 38 - 3f
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 40 - 47
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 48 - 4f
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 50 - 57
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 58 - 5f
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 60 - 67
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 68 - 6f
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 70 - 77
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 78 - 7f
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 80 - 87
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 88 - 8f
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 90 - 97
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 98 - 9f
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // a0 - a7
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // a8 - af
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // b0 - b7
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // b8 - bf
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // c0 - c7
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // c8 - cf
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // d0 - d7
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // d8 - df
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // e0 - e7
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // e8 - ef
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // f0 - f7
|
||||
PCK4BITS(0,0,0,0,0,0,4,5) // f8 - ff
|
||||
};
|
||||
|
||||
|
||||
static PRUint32 UCS2BE_st [ 7] = {
|
||||
PCK4BITS( 5, 7, 7,eError, 4, 3,eError,eError),//00-07
|
||||
PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//08-0f
|
||||
PCK4BITS(eItsMe,eItsMe, 6, 6, 6, 6,eError,eError),//10-17
|
||||
PCK4BITS( 6, 6, 6, 6, 6,eItsMe, 6, 6),//18-1f
|
||||
PCK4BITS( 6, 6, 6, 6, 5, 7, 7,eError),//20-27
|
||||
PCK4BITS( 5, 8, 6, 6,eError, 6, 6, 6),//28-2f
|
||||
PCK4BITS( 6, 6, 6, 6,eError,eError,eStart,eStart) //30-37
|
||||
};
|
||||
|
||||
static const PRUint32 UCS2BECharLenTable[] = {2, 2, 2, 0, 2, 2};
|
||||
|
||||
SMModel UCS2BESMModel = {
|
||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UCS2BE_cls },
|
||||
6,
|
||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UCS2BE_st },
|
||||
UCS2BECharLenTable,
|
||||
CHARDET_ENCODING_UTF_16BE,
|
||||
};
|
||||
|
||||
static PRUint32 UCS2LE_cls [ 256 / 8 ] = {
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 00 - 07
|
||||
PCK4BITS(0,0,1,0,0,2,0,0), // 08 - 0f
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 10 - 17
|
||||
PCK4BITS(0,0,0,3,0,0,0,0), // 18 - 1f
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 20 - 27
|
||||
PCK4BITS(0,3,3,3,3,3,0,0), // 28 - 2f
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 30 - 37
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 38 - 3f
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 40 - 47
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 48 - 4f
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 50 - 57
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 58 - 5f
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 60 - 67
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 68 - 6f
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 70 - 77
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 78 - 7f
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 80 - 87
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 88 - 8f
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 90 - 97
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 98 - 9f
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // a0 - a7
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // a8 - af
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // b0 - b7
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // b8 - bf
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // c0 - c7
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // c8 - cf
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // d0 - d7
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // d8 - df
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // e0 - e7
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // e8 - ef
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // f0 - f7
|
||||
PCK4BITS(0,0,0,0,0,0,4,5) // f8 - ff
|
||||
};
|
||||
|
||||
|
||||
static PRUint32 UCS2LE_st [ 7] = {
|
||||
PCK4BITS( 6, 6, 7, 6, 4, 3,eError,eError),//00-07
|
||||
PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//08-0f
|
||||
PCK4BITS(eItsMe,eItsMe, 5, 5, 5,eError,eItsMe,eError),//10-17
|
||||
PCK4BITS( 5, 5, 5,eError, 5,eError, 6, 6),//18-1f
|
||||
PCK4BITS( 7, 6, 8, 8, 5, 5, 5,eError),//20-27
|
||||
PCK4BITS( 5, 5, 5,eError,eError,eError, 5, 5),//28-2f
|
||||
PCK4BITS( 5, 5, 5,eError, 5,eError,eStart,eStart) //30-37
|
||||
};
|
||||
|
||||
static const PRUint32 UCS2LECharLenTable[] = {2, 2, 2, 2, 2, 2};
|
||||
|
||||
SMModel UCS2LESMModel = {
|
||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UCS2LE_cls },
|
||||
6,
|
||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UCS2LE_st },
|
||||
UCS2LECharLenTable,
|
||||
CHARDET_ENCODING_UTF_16LE,
|
||||
};
|
||||
|
||||
|
||||
static PRUint32 UTF8_cls [ 256 / 8 ] = {
|
||||
static const PRUint32 UTF8_cls [ 256 / 8 ] = {
|
||||
//PCK4BITS(0,1,1,1,1,1,1,1), // 00 - 07
|
||||
PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07 //allow 0x00 as a legal value
|
||||
PCK4BITS(1,1,1,1,1,1,0,0), // 08 - 0f
|
||||
@ -585,7 +471,7 @@ PCK4BITS(12,13,13,13,14,15,0,0) // f8 - ff
|
||||
};
|
||||
|
||||
|
||||
static PRUint32 UTF8_st [ 26] = {
|
||||
static const PRUint32 UTF8_st [ 26] = {
|
||||
PCK4BITS(eError,eStart,eError,eError,eError,eError, 12, 10),//00-07
|
||||
PCK4BITS( 9, 11, 8, 7, 6, 5, 4, 3),//08-0f
|
||||
PCK4BITS(eError,eError,eError,eError,eError,eError,eError,eError),//10-17
|
||||
@ -617,11 +503,11 @@ PCK4BITS(eError,eError,eError,eError,eError,eError,eError,eError) //c8-cf
|
||||
static const PRUint32 UTF8CharLenTable[] = {0, 1, 0, 0, 0, 0, 2, 3,
|
||||
3, 3, 4, 4, 5, 5, 6, 6 };
|
||||
|
||||
SMModel UTF8SMModel = {
|
||||
const SMModel UTF8SMModel = {
|
||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UTF8_cls },
|
||||
16,
|
||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UTF8_st },
|
||||
UTF8CharLenTable,
|
||||
CHARDET_ENCODING_UTF_8,
|
||||
"UTF-8",
|
||||
};
|
||||
|
||||
|
||||
@ -68,7 +68,7 @@ typedef struct nsPkgInt {
|
||||
nsSftMsk sftmsk;
|
||||
nsBitSft bitsft;
|
||||
nsUnitMsk unitmsk;
|
||||
PRUint32 *data;
|
||||
const PRUint32* const data;
|
||||
} nsPkgInt;
|
||||
|
||||
|
||||
|
||||
@ -56,21 +56,22 @@ nsSBCSGroupProber::nsSBCSGroupProber()
|
||||
mProbers[7] = new nsSingleByteCharSetProber(&Win1253Model);
|
||||
mProbers[8] = new nsSingleByteCharSetProber(&Latin5BulgarianModel);
|
||||
mProbers[9] = new nsSingleByteCharSetProber(&Win1251BulgarianModel);
|
||||
mProbers[10] = new nsSingleByteCharSetProber(&TIS620ThaiModel);
|
||||
|
||||
nsHebrewProber *hebprober = new nsHebrewProber();
|
||||
// Notice: Any change in these indexes - 10,11,12 must be reflected
|
||||
// in the code below as well.
|
||||
mProbers[10] = hebprober;
|
||||
mProbers[11] = new nsSingleByteCharSetProber(&Win1255Model, PR_FALSE, hebprober); // Logical Hebrew
|
||||
mProbers[12] = new nsSingleByteCharSetProber(&Win1255Model, PR_TRUE, hebprober); // Visual Hebrew
|
||||
mProbers[11] = hebprober;
|
||||
mProbers[12] = new nsSingleByteCharSetProber(&Win1255Model, PR_FALSE, hebprober); // Logical Hebrew
|
||||
mProbers[13] = new nsSingleByteCharSetProber(&Win1255Model, PR_TRUE, hebprober); // Visual Hebrew
|
||||
// Tell the Hebrew prober about the logical and visual probers
|
||||
if (mProbers[10] && mProbers[11] && mProbers[12]) // all are not null
|
||||
if (mProbers[11] && mProbers[12] && mProbers[13]) // all are not null
|
||||
{
|
||||
hebprober->SetModelProbers(mProbers[11], mProbers[12]);
|
||||
hebprober->SetModelProbers(mProbers[12], mProbers[13]);
|
||||
}
|
||||
else // One or more is null. avoid any Hebrew probing, null them all
|
||||
{
|
||||
for (PRUint32 i = 10; i <= 12; ++i)
|
||||
for (PRUint32 i = 11; i <= 13; ++i)
|
||||
{
|
||||
delete mProbers[i];
|
||||
mProbers[i] = 0;
|
||||
|
||||
@ -40,7 +40,7 @@
|
||||
#define nsSBCSGroupProber_h__
|
||||
|
||||
|
||||
#define NUM_OF_SBCS_PROBERS 13
|
||||
#define NUM_OF_SBCS_PROBERS 14
|
||||
|
||||
class nsCharSetProber;
|
||||
class nsSBCSGroupProber: public nsCharSetProber {
|
||||
|
||||
@ -51,19 +51,19 @@
|
||||
|
||||
typedef struct
|
||||
{
|
||||
unsigned char *charToOrderMap; // [256] table use to find a char's order
|
||||
char *precedenceMatrix; // [SAMPLE_SIZE][SAMPLE_SIZE]; table to find a 2-char sequence's frequency
|
||||
const unsigned char* const charToOrderMap; // [256] table use to find a char's order
|
||||
const PRUint8* const precedenceMatrix; // [SAMPLE_SIZE][SAMPLE_SIZE]; table to find a 2-char sequence's frequency
|
||||
float mTypicalPositiveRatio; // = freqSeqs / totalSeqs
|
||||
PRBool keepEnglishLetter; // says if this script contains English characters (not implemented)
|
||||
const char* charsetName;
|
||||
const char* const charsetName;
|
||||
} SequenceModel;
|
||||
|
||||
|
||||
class nsSingleByteCharSetProber : public nsCharSetProber{
|
||||
public:
|
||||
nsSingleByteCharSetProber(SequenceModel *model)
|
||||
nsSingleByteCharSetProber(const SequenceModel *model)
|
||||
:mModel(model), mReversed(PR_FALSE), mNameProber(0) { Reset(); }
|
||||
nsSingleByteCharSetProber(SequenceModel *model, PRBool reversed, nsCharSetProber* nameProber)
|
||||
nsSingleByteCharSetProber(const SequenceModel *model, PRBool reversed, nsCharSetProber* nameProber)
|
||||
:mModel(model), mReversed(reversed), mNameProber(nameProber) { Reset(); }
|
||||
|
||||
virtual const char* GetCharSetName();
|
||||
@ -87,7 +87,7 @@ public:
|
||||
|
||||
protected:
|
||||
nsProbingState mState;
|
||||
const SequenceModel *mModel;
|
||||
const SequenceModel* const mModel;
|
||||
const PRBool mReversed; // PR_TRUE if we need to reverse every pair in the model lookup
|
||||
|
||||
//char order of last character
|
||||
@ -106,19 +106,20 @@ protected:
|
||||
};
|
||||
|
||||
|
||||
extern SequenceModel Koi8rModel;
|
||||
extern SequenceModel Win1251Model;
|
||||
extern SequenceModel Latin5Model;
|
||||
extern SequenceModel MacCyrillicModel;
|
||||
extern SequenceModel Ibm866Model;
|
||||
extern SequenceModel Ibm855Model;
|
||||
extern SequenceModel Latin7Model;
|
||||
extern SequenceModel Win1253Model;
|
||||
extern SequenceModel Latin5BulgarianModel;
|
||||
extern SequenceModel Win1251BulgarianModel;
|
||||
extern SequenceModel Latin2HungarianModel;
|
||||
extern SequenceModel Win1250HungarianModel;
|
||||
extern SequenceModel Win1255Model;
|
||||
extern const SequenceModel Koi8rModel;
|
||||
extern const SequenceModel Win1251Model;
|
||||
extern const SequenceModel Latin5Model;
|
||||
extern const SequenceModel MacCyrillicModel;
|
||||
extern const SequenceModel Ibm866Model;
|
||||
extern const SequenceModel Ibm855Model;
|
||||
extern const SequenceModel Latin7Model;
|
||||
extern const SequenceModel Win1253Model;
|
||||
extern const SequenceModel Latin5BulgarianModel;
|
||||
extern const SequenceModel Win1251BulgarianModel;
|
||||
extern const SequenceModel Latin2HungarianModel;
|
||||
extern const SequenceModel Win1250HungarianModel;
|
||||
extern const SequenceModel Win1255Model;
|
||||
extern const SequenceModel TIS620ThaiModel;
|
||||
|
||||
#endif /* nsSingleByteCharSetProber_h__ */
|
||||
|
||||
|
||||
@ -46,8 +46,8 @@ void nsSJISProber::Reset(void)
|
||||
{
|
||||
mCodingSM->Reset();
|
||||
mState = eDetecting;
|
||||
mContextAnalyser.Reset();
|
||||
mDistributionAnalyser.Reset();
|
||||
mContextAnalyser.Reset(mIsPreferredLanguage);
|
||||
mDistributionAnalyser.Reset(mIsPreferredLanguage);
|
||||
}
|
||||
|
||||
nsProbingState nsSJISProber::HandleData(const char* aBuf, PRUint32 aLen)
|
||||
@ -57,11 +57,6 @@ nsProbingState nsSJISProber::HandleData(const char* aBuf, PRUint32 aLen)
|
||||
for (PRUint32 i = 0; i < aLen; i++)
|
||||
{
|
||||
codingState = mCodingSM->NextState(aBuf[i]);
|
||||
if (codingState == eError)
|
||||
{
|
||||
mState = eNotMe;
|
||||
break;
|
||||
}
|
||||
if (codingState == eItsMe)
|
||||
{
|
||||
mState = eFoundIt;
|
||||
|
||||
@ -43,7 +43,6 @@
|
||||
#ifndef nsSJISProber_h__
|
||||
#define nsSJISProber_h__
|
||||
|
||||
#include "uchardetDefine.h"
|
||||
#include "nsCharSetProber.h"
|
||||
#include "nsCodingStateMachine.h"
|
||||
#include "JpCntx.h"
|
||||
@ -52,11 +51,13 @@
|
||||
|
||||
class nsSJISProber: public nsCharSetProber {
|
||||
public:
|
||||
nsSJISProber(void){mCodingSM = new nsCodingStateMachine(&SJISSMModel);
|
||||
Reset();}
|
||||
nsSJISProber(PRBool aIsPreferredLanguage)
|
||||
:mIsPreferredLanguage(aIsPreferredLanguage)
|
||||
{mCodingSM = new nsCodingStateMachine(&SJISSMModel);
|
||||
Reset();}
|
||||
virtual ~nsSJISProber(void){delete mCodingSM;}
|
||||
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
||||
const char* GetCharSetName() {return CHARDET_ENCODING_SHIFT_JIS;}
|
||||
const char* GetCharSetName() {return "Shift_JIS";}
|
||||
nsProbingState GetState(void) {return mState;}
|
||||
void Reset(void);
|
||||
float GetConfidence(void);
|
||||
@ -70,6 +71,7 @@ protected:
|
||||
SJISDistributionAnalysis mDistributionAnalyser;
|
||||
|
||||
char mLastChar[2];
|
||||
PRBool mIsPreferredLanguage;
|
||||
|
||||
};
|
||||
|
||||
|
||||
@ -51,11 +51,6 @@ nsProbingState nsUTF8Prober::HandleData(const char* aBuf, PRUint32 aLen)
|
||||
for (PRUint32 i = 0; i < aLen; i++)
|
||||
{
|
||||
codingState = mCodingSM->NextState(aBuf[i]);
|
||||
if (codingState == eError)
|
||||
{
|
||||
mState = eNotMe;
|
||||
break;
|
||||
}
|
||||
if (codingState == eItsMe)
|
||||
{
|
||||
mState = eFoundIt;
|
||||
|
||||
@ -38,7 +38,6 @@
|
||||
#ifndef nsUTF8Prober_h__
|
||||
#define nsUTF8Prober_h__
|
||||
|
||||
#include "uchardetDefine.h"
|
||||
#include "nsCharSetProber.h"
|
||||
#include "nsCodingStateMachine.h"
|
||||
|
||||
@ -49,7 +48,7 @@ public:
|
||||
Reset(); }
|
||||
virtual ~nsUTF8Prober(){delete mCodingSM;}
|
||||
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
||||
const char* GetCharSetName() {return CHARDET_ENCODING_UTF_8;}
|
||||
const char* GetCharSetName() {return "UTF-8";}
|
||||
nsProbingState GetState(void) {return mState;}
|
||||
void Reset(void);
|
||||
float GetConfidence(void);
|
||||
|
||||
@ -38,7 +38,6 @@
|
||||
|
||||
#include "nscore.h"
|
||||
|
||||
#include "uchardetDefine.h"
|
||||
#include "nsUniversalDetector.h"
|
||||
|
||||
#include "nsMBCSGroupProber.h"
|
||||
@ -46,7 +45,7 @@
|
||||
#include "nsEscCharsetProber.h"
|
||||
#include "nsLatin1Prober.h"
|
||||
|
||||
nsUniversalDetector::nsUniversalDetector()
|
||||
nsUniversalDetector::nsUniversalDetector(PRUint32 aLanguageFilter)
|
||||
{
|
||||
mDone = PR_FALSE;
|
||||
mBestGuess = -1; //illegal value as signal
|
||||
@ -58,6 +57,7 @@ nsUniversalDetector::nsUniversalDetector()
|
||||
mGotData = PR_FALSE;
|
||||
mInputState = ePureAscii;
|
||||
mLastChar = '\0';
|
||||
mLanguageFilter = aLanguageFilter;
|
||||
|
||||
PRUint32 i;
|
||||
for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
|
||||
@ -67,10 +67,9 @@ nsUniversalDetector::nsUniversalDetector()
|
||||
nsUniversalDetector::~nsUniversalDetector()
|
||||
{
|
||||
for (PRInt32 i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
|
||||
if (mCharSetProbers[i])
|
||||
delete mCharSetProbers[i];
|
||||
if (mEscCharSetProber)
|
||||
delete mEscCharSetProber;
|
||||
delete mCharSetProbers[i];
|
||||
|
||||
delete mEscCharSetProber;
|
||||
}
|
||||
|
||||
void
|
||||
@ -111,37 +110,23 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
|
||||
if (mStart)
|
||||
{
|
||||
mStart = PR_FALSE;
|
||||
if (aLen > 3)
|
||||
if (aLen > 2)
|
||||
switch (aBuf[0])
|
||||
{
|
||||
case '\xEF':
|
||||
if (('\xBB' == aBuf[1]) && ('\xBF' == aBuf[2]))
|
||||
// EF BB BF UTF-8 encoded BOM
|
||||
mDetectedCharset = CHARDET_ENCODING_UTF_8;
|
||||
mDetectedCharset = "UTF-8";
|
||||
break;
|
||||
case '\xFE':
|
||||
if (('\xFF' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3]))
|
||||
// FE FF 00 00 UCS-4, unusual octet order BOM (3412)
|
||||
mDetectedCharset = CHARDET_ENCODING_X_ISO_10646_UCS_4_3412;
|
||||
else if ('\xFF' == aBuf[1])
|
||||
if ('\xFF' == aBuf[1])
|
||||
// FE FF UTF-16, big endian BOM
|
||||
mDetectedCharset = CHARDET_ENCODING_UTF_16BE;
|
||||
break;
|
||||
case '\x00':
|
||||
if (('\x00' == aBuf[1]) && ('\xFE' == aBuf[2]) && ('\xFF' == aBuf[3]))
|
||||
// 00 00 FE FF UTF-32, big-endian BOM
|
||||
mDetectedCharset = CHARDET_ENCODING_UTF_32BE;
|
||||
else if (('\x00' == aBuf[1]) && ('\xFF' == aBuf[2]) && ('\xFE' == aBuf[3]))
|
||||
// 00 00 FF FE UCS-4, unusual octet order BOM (2143)
|
||||
mDetectedCharset = CHARDET_ENCODING_X_ISO_10646_UCS_4_2143;
|
||||
mDetectedCharset = "UTF-16";
|
||||
break;
|
||||
case '\xFF':
|
||||
if (('\xFE' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3]))
|
||||
// FF FE 00 00 UTF-32, little-endian BOM
|
||||
mDetectedCharset = CHARDET_ENCODING_UTF_32LE;
|
||||
else if ('\xFE' == aBuf[1])
|
||||
if ('\xFE' == aBuf[1])
|
||||
// FF FE UTF-16, little endian BOM
|
||||
mDetectedCharset = CHARDET_ENCODING_UTF_16LE;
|
||||
mDetectedCharset = "UTF-16";
|
||||
break;
|
||||
} // switch
|
||||
|
||||
@ -172,16 +157,24 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
|
||||
|
||||
//start multibyte and singlebyte charset prober
|
||||
if (nsnull == mCharSetProbers[0])
|
||||
mCharSetProbers[0] = new nsMBCSGroupProber;
|
||||
if (nsnull == mCharSetProbers[1])
|
||||
mCharSetProbers[1] = new nsSBCSGroupProber;
|
||||
if (nsnull == mCharSetProbers[2])
|
||||
mCharSetProbers[2] = new nsLatin1Prober;
|
||||
|
||||
if ((nsnull == mCharSetProbers[0]) ||
|
||||
(nsnull == mCharSetProbers[1]) ||
|
||||
(nsnull == mCharSetProbers[2]))
|
||||
{
|
||||
mCharSetProbers[0] = new nsMBCSGroupProber(mLanguageFilter);
|
||||
if (nsnull == mCharSetProbers[0])
|
||||
return NS_ERROR_OUT_OF_MEMORY;
|
||||
}
|
||||
if (nsnull == mCharSetProbers[1] &&
|
||||
(mLanguageFilter & NS_FILTER_NON_CJK))
|
||||
{
|
||||
mCharSetProbers[1] = new nsSBCSGroupProber;
|
||||
if (nsnull == mCharSetProbers[1])
|
||||
return NS_ERROR_OUT_OF_MEMORY;
|
||||
}
|
||||
if (nsnull == mCharSetProbers[2])
|
||||
{
|
||||
mCharSetProbers[2] = new nsLatin1Prober;
|
||||
if (nsnull == mCharSetProbers[2])
|
||||
return NS_ERROR_OUT_OF_MEMORY;
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
@ -202,7 +195,7 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
|
||||
{
|
||||
case eEscAscii:
|
||||
if (nsnull == mEscCharSetProber) {
|
||||
mEscCharSetProber = new nsEscCharSetProber;
|
||||
mEscCharSetProber = new nsEscCharSetProber(mLanguageFilter);
|
||||
if (nsnull == mEscCharSetProber)
|
||||
return NS_ERROR_OUT_OF_MEMORY;
|
||||
}
|
||||
@ -216,12 +209,15 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
|
||||
case eHighbyte:
|
||||
for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
|
||||
{
|
||||
st = mCharSetProbers[i]->HandleData(aBuf, aLen);
|
||||
if (st == eFoundIt)
|
||||
if (mCharSetProbers[i])
|
||||
{
|
||||
mDone = PR_TRUE;
|
||||
mDetectedCharset = mCharSetProbers[i]->GetCharSetName();
|
||||
return NS_OK;
|
||||
st = mCharSetProbers[i]->HandleData(aBuf, aLen);
|
||||
if (st == eFoundIt)
|
||||
{
|
||||
mDone = PR_TRUE;
|
||||
mDetectedCharset = mCharSetProbers[i]->GetCharSetName();
|
||||
return NS_OK;
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
@ -260,11 +256,14 @@ void nsUniversalDetector::DataEnd()
|
||||
|
||||
for (PRInt32 i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
|
||||
{
|
||||
proberConfidence = mCharSetProbers[i]->GetConfidence();
|
||||
if (proberConfidence > maxProberConfidence)
|
||||
if (mCharSetProbers[i])
|
||||
{
|
||||
maxProberConfidence = proberConfidence;
|
||||
maxProber = i;
|
||||
proberConfidence = mCharSetProbers[i]->GetConfidence();
|
||||
if (proberConfidence > maxProberConfidence)
|
||||
{
|
||||
maxProberConfidence = proberConfidence;
|
||||
maxProber = i;
|
||||
}
|
||||
}
|
||||
}
|
||||
//do not report anything because we are not confident of it, that's in fact a negative answer
|
||||
|
||||
@ -48,9 +48,22 @@ typedef enum {
|
||||
eHighbyte = 2
|
||||
} nsInputState;
|
||||
|
||||
#define NS_FILTER_CHINESE_SIMPLIFIED 0x01
|
||||
#define NS_FILTER_CHINESE_TRADITIONAL 0x02
|
||||
#define NS_FILTER_JAPANESE 0x04
|
||||
#define NS_FILTER_KOREAN 0x08
|
||||
#define NS_FILTER_NON_CJK 0x10
|
||||
#define NS_FILTER_ALL 0x1F
|
||||
#define NS_FILTER_CHINESE (NS_FILTER_CHINESE_SIMPLIFIED | \
|
||||
NS_FILTER_CHINESE_TRADITIONAL)
|
||||
#define NS_FILTER_CJK (NS_FILTER_CHINESE_SIMPLIFIED | \
|
||||
NS_FILTER_CHINESE_TRADITIONAL | \
|
||||
NS_FILTER_JAPANESE | \
|
||||
NS_FILTER_KOREAN)
|
||||
|
||||
class nsUniversalDetector {
|
||||
public:
|
||||
nsUniversalDetector();
|
||||
nsUniversalDetector(PRUint32 aLanguageFilter);
|
||||
virtual ~nsUniversalDetector();
|
||||
virtual nsresult HandleData(const char* aBuf, PRUint32 aLen);
|
||||
virtual void DataEnd(void);
|
||||
@ -66,6 +79,7 @@ protected:
|
||||
char mLastChar;
|
||||
const char * mDetectedCharset;
|
||||
PRInt32 mBestGuess;
|
||||
PRUint32 mLanguageFilter;
|
||||
|
||||
nsCharSetProber *mCharSetProbers[NUM_OF_CHARSET_PROBERS];
|
||||
nsCharSetProber *mEscCharSetProber;
|
||||
|
||||
@ -42,6 +42,8 @@ typedef int PRInt32;
|
||||
typedef unsigned int PRUint32;
|
||||
typedef short PRInt16;
|
||||
typedef unsigned short PRUint16;
|
||||
typedef signed char PRInt8;
|
||||
typedef unsigned char PRUint8;
|
||||
|
||||
#define PR_FALSE false
|
||||
#define PR_TRUE true
|
||||
|
||||
@ -44,38 +44,18 @@
|
||||
#ifndef VERSION
|
||||
#define VERSION "Unknown"
|
||||
#endif
|
||||
#define BUFFER_SIZE 32768
|
||||
#define BUFFER_SIZE 65536
|
||||
|
||||
char buffer[BUFFER_SIZE];
|
||||
|
||||
void detect(FILE * fp)
|
||||
{
|
||||
uchardet_t handle = uchardet_new();
|
||||
|
||||
size_t size = BUFFER_SIZE;
|
||||
char * buffer_in = (char *) malloc(size * sizeof(char));
|
||||
|
||||
while (fgets(buffer_in, size, fp) != NULL)
|
||||
while (!feof(fp))
|
||||
{
|
||||
size_t freesize = size;
|
||||
|
||||
char * buffer_in_p = buffer_in;
|
||||
size_t line_length = strlen(buffer_in_p);
|
||||
while (line_length + 1 == freesize && buffer_in_p[line_length - 2] != '\n')
|
||||
{
|
||||
buffer_in_p += size - 1;
|
||||
freesize = size + 1;
|
||||
size += size;
|
||||
size_t offset = buffer_in_p - buffer_in;
|
||||
buffer_in = (char *) realloc(buffer_in, size * sizeof(char));
|
||||
buffer_in_p = buffer_in + offset;
|
||||
|
||||
if (fgets(buffer_in_p, freesize, fp) == NULL)
|
||||
break;
|
||||
|
||||
line_length = strlen(buffer_in_p);
|
||||
}
|
||||
|
||||
int retval = uchardet_handle_data(handle, buffer_in, strlen(buffer_in));
|
||||
|
||||
size_t len = fread(buffer, 1, BUFFER_SIZE, fp);
|
||||
int retval = uchardet_handle_data(handle, buffer, len);
|
||||
if (retval != 0)
|
||||
{
|
||||
fprintf(stderr, "Handle data error.\n");
|
||||
@ -84,10 +64,10 @@ void detect(FILE * fp)
|
||||
}
|
||||
uchardet_data_end(handle);
|
||||
|
||||
printf("%s\n", uchardet_get_charset(handle));
|
||||
const char * charset = uchardet_get_charset(handle);
|
||||
printf("%s\n", charset);
|
||||
|
||||
uchardet_delete(handle);
|
||||
free(buffer_in);
|
||||
}
|
||||
|
||||
void show_version()
|
||||
|
||||
@ -37,67 +37,69 @@
|
||||
#include "uchardet.h"
|
||||
#include "nscore.h"
|
||||
#include "nsUniversalDetector.h"
|
||||
#include <string.h>
|
||||
#include <string>
|
||||
|
||||
class DllDetector : public nsUniversalDetector
|
||||
using std::string;
|
||||
|
||||
class HandleUniversalDetector : public nsUniversalDetector
|
||||
{
|
||||
protected:
|
||||
char charset_[256];
|
||||
string m_charset;
|
||||
|
||||
public:
|
||||
DllDetector()
|
||||
: nsUniversalDetector()
|
||||
HandleUniversalDetector()
|
||||
: nsUniversalDetector(NS_FILTER_ALL)
|
||||
{
|
||||
*charset_=0;
|
||||
m_charset = "";
|
||||
}
|
||||
|
||||
virtual ~DllDetector()
|
||||
virtual ~HandleUniversalDetector()
|
||||
{}
|
||||
|
||||
virtual void Report(const char* charset)
|
||||
{
|
||||
strncpy( charset_ , charset , sizeof(charset_) );
|
||||
m_charset = charset;
|
||||
}
|
||||
|
||||
virtual void Reset()
|
||||
{
|
||||
nsUniversalDetector::Reset();
|
||||
*charset_=0;
|
||||
m_charset = "";
|
||||
}
|
||||
|
||||
const char* GetCharset() const
|
||||
{
|
||||
return charset_;
|
||||
return m_charset.c_str();
|
||||
}
|
||||
};
|
||||
|
||||
uchardet_t uchardet_new()
|
||||
{
|
||||
return reinterpret_cast<uchardet_t> (new DllDetector());
|
||||
return reinterpret_cast<uchardet_t> (new HandleUniversalDetector());
|
||||
}
|
||||
|
||||
void uchardet_delete(uchardet_t ud)
|
||||
{
|
||||
delete reinterpret_cast<DllDetector*>(ud);
|
||||
delete reinterpret_cast<HandleUniversalDetector*>(ud);
|
||||
}
|
||||
|
||||
int uchardet_handle_data(uchardet_t ud, const char * data, size_t len)
|
||||
{
|
||||
nsresult ret = reinterpret_cast<DllDetector*>(ud)->HandleData(data, (PRUint32)len);
|
||||
nsresult ret = reinterpret_cast<HandleUniversalDetector*>(ud)->HandleData(data, (PRUint32)len);
|
||||
return (ret != NS_OK);
|
||||
}
|
||||
|
||||
void uchardet_data_end(uchardet_t ud)
|
||||
{
|
||||
reinterpret_cast<DllDetector*>(ud)->DataEnd();
|
||||
reinterpret_cast<HandleUniversalDetector*>(ud)->DataEnd();
|
||||
}
|
||||
|
||||
void uchardet_reset(uchardet_t ud)
|
||||
{
|
||||
reinterpret_cast<DllDetector*>(ud)->Reset();
|
||||
reinterpret_cast<HandleUniversalDetector*>(ud)->Reset();
|
||||
}
|
||||
|
||||
const char* uchardet_get_charset(uchardet_t ud)
|
||||
{
|
||||
return reinterpret_cast<DllDetector*>(ud)->GetCharset();
|
||||
return reinterpret_cast<HandleUniversalDetector*>(ud)->GetCharset();
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user