mirror of
https://gitlab.freedesktop.org/uchardet/uchardet.git
synced 2026-02-16 23:30:00 +08:00
nsSBCharSetProber: replace the fixed 64 SAMPLE_SIZE...
... with per-language model "frequent character" count.
This commit is contained in:
parent
b64831ff89
commit
dbb4c1d2ff
@ -230,6 +230,7 @@ const SequenceModel Latin5BulgarianModel =
|
|||||||
{
|
{
|
||||||
Latin5_BulgarianCharToOrderMap,
|
Latin5_BulgarianCharToOrderMap,
|
||||||
BulgarianLangModel,
|
BulgarianLangModel,
|
||||||
|
64,
|
||||||
(float)0.969392,
|
(float)0.969392,
|
||||||
PR_FALSE,
|
PR_FALSE,
|
||||||
"ISO-8859-5"
|
"ISO-8859-5"
|
||||||
@ -239,6 +240,7 @@ const SequenceModel Win1251BulgarianModel =
|
|||||||
{
|
{
|
||||||
win1251BulgarianCharToOrderMap,
|
win1251BulgarianCharToOrderMap,
|
||||||
BulgarianLangModel,
|
BulgarianLangModel,
|
||||||
|
64,
|
||||||
(float)0.969392,
|
(float)0.969392,
|
||||||
PR_FALSE,
|
PR_FALSE,
|
||||||
"WINDOWS-1251"
|
"WINDOWS-1251"
|
||||||
|
|||||||
@ -304,6 +304,7 @@ const SequenceModel Koi8rCyrillicModel =
|
|||||||
{
|
{
|
||||||
KOI8R_CharToOrderMap,
|
KOI8R_CharToOrderMap,
|
||||||
RussianLangModel,
|
RussianLangModel,
|
||||||
|
64,
|
||||||
(float)0.976601,
|
(float)0.976601,
|
||||||
PR_FALSE,
|
PR_FALSE,
|
||||||
"KOI8-R"
|
"KOI8-R"
|
||||||
@ -313,6 +314,7 @@ const SequenceModel Win1251CyrillicModel =
|
|||||||
{
|
{
|
||||||
win1251_CharToOrderMap,
|
win1251_CharToOrderMap,
|
||||||
RussianLangModel,
|
RussianLangModel,
|
||||||
|
64,
|
||||||
(float)0.976601,
|
(float)0.976601,
|
||||||
PR_FALSE,
|
PR_FALSE,
|
||||||
"WINDOWS-1251"
|
"WINDOWS-1251"
|
||||||
@ -322,6 +324,7 @@ const SequenceModel Latin5CyrillicModel =
|
|||||||
{
|
{
|
||||||
latin5_CharToOrderMap,
|
latin5_CharToOrderMap,
|
||||||
RussianLangModel,
|
RussianLangModel,
|
||||||
|
64,
|
||||||
(float)0.976601,
|
(float)0.976601,
|
||||||
PR_FALSE,
|
PR_FALSE,
|
||||||
"ISO-8859-5"
|
"ISO-8859-5"
|
||||||
@ -331,6 +334,7 @@ const SequenceModel MacCyrillicModel =
|
|||||||
{
|
{
|
||||||
macCyrillic_CharToOrderMap,
|
macCyrillic_CharToOrderMap,
|
||||||
RussianLangModel,
|
RussianLangModel,
|
||||||
|
64,
|
||||||
(float)0.976601,
|
(float)0.976601,
|
||||||
PR_FALSE,
|
PR_FALSE,
|
||||||
"MAC-CYRILLIC"
|
"MAC-CYRILLIC"
|
||||||
@ -340,6 +344,7 @@ const SequenceModel Ibm866CyrillicModel =
|
|||||||
{
|
{
|
||||||
IBM866_CharToOrderMap,
|
IBM866_CharToOrderMap,
|
||||||
RussianLangModel,
|
RussianLangModel,
|
||||||
|
64,
|
||||||
(float)0.976601,
|
(float)0.976601,
|
||||||
PR_FALSE,
|
PR_FALSE,
|
||||||
"IBM866"
|
"IBM866"
|
||||||
@ -349,6 +354,7 @@ const SequenceModel Ibm855CyrillicModel =
|
|||||||
{
|
{
|
||||||
IBM855_CharToOrderMap,
|
IBM855_CharToOrderMap,
|
||||||
RussianLangModel,
|
RussianLangModel,
|
||||||
|
64,
|
||||||
(float)0.976601,
|
(float)0.976601,
|
||||||
PR_FALSE,
|
PR_FALSE,
|
||||||
"IBM855"
|
"IBM855"
|
||||||
|
|||||||
@ -229,6 +229,7 @@ const SequenceModel Iso_8859_15FrenchModel =
|
|||||||
{
|
{
|
||||||
Iso_8859_15_CharToOrderMap,
|
Iso_8859_15_CharToOrderMap,
|
||||||
FrenchLangModel,
|
FrenchLangModel,
|
||||||
|
64,
|
||||||
(float)0.98,
|
(float)0.98,
|
||||||
PR_TRUE,
|
PR_TRUE,
|
||||||
"ISO-8859-15"
|
"ISO-8859-15"
|
||||||
@ -238,6 +239,7 @@ const SequenceModel Iso_8859_1FrenchModel =
|
|||||||
{
|
{
|
||||||
Iso_8859_1_CharToOrderMap,
|
Iso_8859_1_CharToOrderMap,
|
||||||
FrenchLangModel,
|
FrenchLangModel,
|
||||||
|
64,
|
||||||
(float)0.99,
|
(float)0.99,
|
||||||
PR_TRUE,
|
PR_TRUE,
|
||||||
"ISO-8859-1"
|
"ISO-8859-1"
|
||||||
|
|||||||
@ -229,6 +229,7 @@ const SequenceModel Latin7GreekModel =
|
|||||||
{
|
{
|
||||||
Latin7_CharToOrderMap,
|
Latin7_CharToOrderMap,
|
||||||
GreekLangModel,
|
GreekLangModel,
|
||||||
|
64,
|
||||||
(float)0.982851,
|
(float)0.982851,
|
||||||
PR_FALSE,
|
PR_FALSE,
|
||||||
"ISO-8859-7"
|
"ISO-8859-7"
|
||||||
@ -238,6 +239,7 @@ const SequenceModel Win1253GreekModel =
|
|||||||
{
|
{
|
||||||
win1253_CharToOrderMap,
|
win1253_CharToOrderMap,
|
||||||
GreekLangModel,
|
GreekLangModel,
|
||||||
|
64,
|
||||||
(float)0.982851,
|
(float)0.982851,
|
||||||
PR_FALSE,
|
PR_FALSE,
|
||||||
"WINDOWS-1253"
|
"WINDOWS-1253"
|
||||||
|
|||||||
@ -212,6 +212,7 @@ const SequenceModel Win1255Model =
|
|||||||
{
|
{
|
||||||
win1255_CharToOrderMap,
|
win1255_CharToOrderMap,
|
||||||
HebrewLangModel,
|
HebrewLangModel,
|
||||||
|
64,
|
||||||
(float)0.984004,
|
(float)0.984004,
|
||||||
PR_FALSE,
|
PR_FALSE,
|
||||||
"WINDOWS-1255"
|
"WINDOWS-1255"
|
||||||
|
|||||||
@ -227,6 +227,7 @@ const SequenceModel Latin2HungarianModel =
|
|||||||
{
|
{
|
||||||
Latin2_HungarianCharToOrderMap,
|
Latin2_HungarianCharToOrderMap,
|
||||||
HungarianLangModel,
|
HungarianLangModel,
|
||||||
|
64,
|
||||||
(float)0.947368,
|
(float)0.947368,
|
||||||
PR_TRUE,
|
PR_TRUE,
|
||||||
"ISO-8859-2"
|
"ISO-8859-2"
|
||||||
@ -236,6 +237,7 @@ const SequenceModel Win1250HungarianModel =
|
|||||||
{
|
{
|
||||||
win1250HungarianCharToOrderMap,
|
win1250HungarianCharToOrderMap,
|
||||||
HungarianLangModel,
|
HungarianLangModel,
|
||||||
|
64,
|
||||||
(float)0.947368,
|
(float)0.947368,
|
||||||
PR_TRUE,
|
PR_TRUE,
|
||||||
"WINDOWS-1250"
|
"WINDOWS-1250"
|
||||||
|
|||||||
@ -215,6 +215,7 @@ const SequenceModel TIS620ThaiModel =
|
|||||||
{
|
{
|
||||||
TIS620CharToOrderMap,
|
TIS620CharToOrderMap,
|
||||||
ThaiLangModel,
|
ThaiLangModel,
|
||||||
|
64,
|
||||||
(float)0.926386,
|
(float)0.926386,
|
||||||
PR_FALSE,
|
PR_FALSE,
|
||||||
"TIS-620"
|
"TIS-620"
|
||||||
|
|||||||
@ -48,17 +48,17 @@ nsProbingState nsSingleByteCharSetProber::HandleData(const char* aBuf, PRUint32
|
|||||||
|
|
||||||
if (order < SYMBOL_CAT_ORDER)
|
if (order < SYMBOL_CAT_ORDER)
|
||||||
mTotalChar++;
|
mTotalChar++;
|
||||||
if (order < SAMPLE_SIZE)
|
if (order < mModel->freqCharCount)
|
||||||
{
|
{
|
||||||
mFreqChar++;
|
mFreqChar++;
|
||||||
|
|
||||||
if (mLastOrder < SAMPLE_SIZE)
|
if (mLastOrder < mModel->freqCharCount)
|
||||||
{
|
{
|
||||||
mTotalSeqs++;
|
mTotalSeqs++;
|
||||||
if (!mReversed)
|
if (!mReversed)
|
||||||
++(mSeqCounters[mModel->precedenceMatrix[mLastOrder*SAMPLE_SIZE+order]]);
|
++(mSeqCounters[mModel->precedenceMatrix[mLastOrder*mModel->freqCharCount+order]]);
|
||||||
else // reverse the order of the letters in the lookup
|
else // reverse the order of the letters in the lookup
|
||||||
++(mSeqCounters[mModel->precedenceMatrix[order*SAMPLE_SIZE+mLastOrder]]);
|
++(mSeqCounters[mModel->precedenceMatrix[order*mModel->freqCharCount+mLastOrder]]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
mLastOrder = order;
|
mLastOrder = order;
|
||||||
|
|||||||
@ -53,7 +53,6 @@
|
|||||||
/* Numbers 0-9. */
|
/* Numbers 0-9. */
|
||||||
#define NUM 251
|
#define NUM 251
|
||||||
|
|
||||||
#define SAMPLE_SIZE 64
|
|
||||||
#define SB_ENOUGH_REL_THRESHOLD 1024
|
#define SB_ENOUGH_REL_THRESHOLD 1024
|
||||||
#define POSITIVE_SHORTCUT_THRESHOLD (float)0.95
|
#define POSITIVE_SHORTCUT_THRESHOLD (float)0.95
|
||||||
#define NEGATIVE_SHORTCUT_THRESHOLD (float)0.05
|
#define NEGATIVE_SHORTCUT_THRESHOLD (float)0.05
|
||||||
@ -64,8 +63,12 @@
|
|||||||
|
|
||||||
typedef struct
|
typedef struct
|
||||||
{
|
{
|
||||||
const unsigned char* const charToOrderMap; // [256] table use to find a char's order
|
/* [256] table mapping codepoints to character orders. */
|
||||||
const PRUint8* const precedenceMatrix; // [SAMPLE_SIZE][SAMPLE_SIZE]; table to find a 2-char sequence's frequency
|
const unsigned char* const charToOrderMap;
|
||||||
|
/* freqCharCount x freqCharCount table of 2-char sequence's frequencies. */
|
||||||
|
const PRUint8* const precedenceMatrix;
|
||||||
|
/* The count of frequent characters. */
|
||||||
|
int freqCharCount;
|
||||||
float mTypicalPositiveRatio; // = freqSeqs / totalSeqs
|
float mTypicalPositiveRatio; // = freqSeqs / totalSeqs
|
||||||
PRBool keepEnglishLetter; // says if this script contains English characters (not implemented)
|
PRBool keepEnglishLetter; // says if this script contains English characters (not implemented)
|
||||||
const char* const charsetName;
|
const char* const charsetName;
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user