nsSBCharSetProber: replace the fixed 64 SAMPLE_SIZE...

... with per-language model "frequent character" count.
This commit is contained in:
Jehan 2015-11-29 23:51:55 +01:00
parent b64831ff89
commit dbb4c1d2ff
9 changed files with 27 additions and 8 deletions

View File

@ -230,6 +230,7 @@ const SequenceModel Latin5BulgarianModel =
{ {
Latin5_BulgarianCharToOrderMap, Latin5_BulgarianCharToOrderMap,
BulgarianLangModel, BulgarianLangModel,
64,
(float)0.969392, (float)0.969392,
PR_FALSE, PR_FALSE,
"ISO-8859-5" "ISO-8859-5"
@ -239,6 +240,7 @@ const SequenceModel Win1251BulgarianModel =
{ {
win1251BulgarianCharToOrderMap, win1251BulgarianCharToOrderMap,
BulgarianLangModel, BulgarianLangModel,
64,
(float)0.969392, (float)0.969392,
PR_FALSE, PR_FALSE,
"WINDOWS-1251" "WINDOWS-1251"

View File

@ -304,6 +304,7 @@ const SequenceModel Koi8rCyrillicModel =
{ {
KOI8R_CharToOrderMap, KOI8R_CharToOrderMap,
RussianLangModel, RussianLangModel,
64,
(float)0.976601, (float)0.976601,
PR_FALSE, PR_FALSE,
"KOI8-R" "KOI8-R"
@ -313,6 +314,7 @@ const SequenceModel Win1251CyrillicModel =
{ {
win1251_CharToOrderMap, win1251_CharToOrderMap,
RussianLangModel, RussianLangModel,
64,
(float)0.976601, (float)0.976601,
PR_FALSE, PR_FALSE,
"WINDOWS-1251" "WINDOWS-1251"
@ -322,6 +324,7 @@ const SequenceModel Latin5CyrillicModel =
{ {
latin5_CharToOrderMap, latin5_CharToOrderMap,
RussianLangModel, RussianLangModel,
64,
(float)0.976601, (float)0.976601,
PR_FALSE, PR_FALSE,
"ISO-8859-5" "ISO-8859-5"
@ -331,6 +334,7 @@ const SequenceModel MacCyrillicModel =
{ {
macCyrillic_CharToOrderMap, macCyrillic_CharToOrderMap,
RussianLangModel, RussianLangModel,
64,
(float)0.976601, (float)0.976601,
PR_FALSE, PR_FALSE,
"MAC-CYRILLIC" "MAC-CYRILLIC"
@ -340,6 +344,7 @@ const SequenceModel Ibm866CyrillicModel =
{ {
IBM866_CharToOrderMap, IBM866_CharToOrderMap,
RussianLangModel, RussianLangModel,
64,
(float)0.976601, (float)0.976601,
PR_FALSE, PR_FALSE,
"IBM866" "IBM866"
@ -349,6 +354,7 @@ const SequenceModel Ibm855CyrillicModel =
{ {
IBM855_CharToOrderMap, IBM855_CharToOrderMap,
RussianLangModel, RussianLangModel,
64,
(float)0.976601, (float)0.976601,
PR_FALSE, PR_FALSE,
"IBM855" "IBM855"

View File

@ -229,6 +229,7 @@ const SequenceModel Iso_8859_15FrenchModel =
{ {
Iso_8859_15_CharToOrderMap, Iso_8859_15_CharToOrderMap,
FrenchLangModel, FrenchLangModel,
64,
(float)0.98, (float)0.98,
PR_TRUE, PR_TRUE,
"ISO-8859-15" "ISO-8859-15"
@ -238,6 +239,7 @@ const SequenceModel Iso_8859_1FrenchModel =
{ {
Iso_8859_1_CharToOrderMap, Iso_8859_1_CharToOrderMap,
FrenchLangModel, FrenchLangModel,
64,
(float)0.99, (float)0.99,
PR_TRUE, PR_TRUE,
"ISO-8859-1" "ISO-8859-1"

View File

@ -229,6 +229,7 @@ const SequenceModel Latin7GreekModel =
{ {
Latin7_CharToOrderMap, Latin7_CharToOrderMap,
GreekLangModel, GreekLangModel,
64,
(float)0.982851, (float)0.982851,
PR_FALSE, PR_FALSE,
"ISO-8859-7" "ISO-8859-7"
@ -238,6 +239,7 @@ const SequenceModel Win1253GreekModel =
{ {
win1253_CharToOrderMap, win1253_CharToOrderMap,
GreekLangModel, GreekLangModel,
64,
(float)0.982851, (float)0.982851,
PR_FALSE, PR_FALSE,
"WINDOWS-1253" "WINDOWS-1253"

View File

@ -212,6 +212,7 @@ const SequenceModel Win1255Model =
{ {
win1255_CharToOrderMap, win1255_CharToOrderMap,
HebrewLangModel, HebrewLangModel,
64,
(float)0.984004, (float)0.984004,
PR_FALSE, PR_FALSE,
"WINDOWS-1255" "WINDOWS-1255"

View File

@ -227,6 +227,7 @@ const SequenceModel Latin2HungarianModel =
{ {
Latin2_HungarianCharToOrderMap, Latin2_HungarianCharToOrderMap,
HungarianLangModel, HungarianLangModel,
64,
(float)0.947368, (float)0.947368,
PR_TRUE, PR_TRUE,
"ISO-8859-2" "ISO-8859-2"
@ -236,6 +237,7 @@ const SequenceModel Win1250HungarianModel =
{ {
win1250HungarianCharToOrderMap, win1250HungarianCharToOrderMap,
HungarianLangModel, HungarianLangModel,
64,
(float)0.947368, (float)0.947368,
PR_TRUE, PR_TRUE,
"WINDOWS-1250" "WINDOWS-1250"

View File

@ -215,6 +215,7 @@ const SequenceModel TIS620ThaiModel =
{ {
TIS620CharToOrderMap, TIS620CharToOrderMap,
ThaiLangModel, ThaiLangModel,
64,
(float)0.926386, (float)0.926386,
PR_FALSE, PR_FALSE,
"TIS-620" "TIS-620"

View File

@ -48,17 +48,17 @@ nsProbingState nsSingleByteCharSetProber::HandleData(const char* aBuf, PRUint32
if (order < SYMBOL_CAT_ORDER) if (order < SYMBOL_CAT_ORDER)
mTotalChar++; mTotalChar++;
if (order < SAMPLE_SIZE) if (order < mModel->freqCharCount)
{ {
mFreqChar++; mFreqChar++;
if (mLastOrder < SAMPLE_SIZE) if (mLastOrder < mModel->freqCharCount)
{ {
mTotalSeqs++; mTotalSeqs++;
if (!mReversed) if (!mReversed)
++(mSeqCounters[mModel->precedenceMatrix[mLastOrder*SAMPLE_SIZE+order]]); ++(mSeqCounters[mModel->precedenceMatrix[mLastOrder*mModel->freqCharCount+order]]);
else // reverse the order of the letters in the lookup else // reverse the order of the letters in the lookup
++(mSeqCounters[mModel->precedenceMatrix[order*SAMPLE_SIZE+mLastOrder]]); ++(mSeqCounters[mModel->precedenceMatrix[order*mModel->freqCharCount+mLastOrder]]);
} }
} }
mLastOrder = order; mLastOrder = order;

View File

@ -53,7 +53,6 @@
/* Numbers 0-9. */ /* Numbers 0-9. */
#define NUM 251 #define NUM 251
#define SAMPLE_SIZE 64
#define SB_ENOUGH_REL_THRESHOLD 1024 #define SB_ENOUGH_REL_THRESHOLD 1024
#define POSITIVE_SHORTCUT_THRESHOLD (float)0.95 #define POSITIVE_SHORTCUT_THRESHOLD (float)0.95
#define NEGATIVE_SHORTCUT_THRESHOLD (float)0.05 #define NEGATIVE_SHORTCUT_THRESHOLD (float)0.05
@ -64,9 +63,13 @@
typedef struct typedef struct
{ {
const unsigned char* const charToOrderMap; // [256] table use to find a char's order /* [256] table mapping codepoints to character orders. */
const PRUint8* const precedenceMatrix; // [SAMPLE_SIZE][SAMPLE_SIZE]; table to find a 2-char sequence's frequency const unsigned char* const charToOrderMap;
float mTypicalPositiveRatio; // = freqSeqs / totalSeqs /* freqCharCount x freqCharCount table of 2-char sequence's frequencies. */
const PRUint8* const precedenceMatrix;
/* The count of frequent characters. */
int freqCharCount;
float mTypicalPositiveRatio; // = freqSeqs / totalSeqs
PRBool keepEnglishLetter; // says if this script contains English characters (not implemented) PRBool keepEnglishLetter; // says if this script contains English characters (not implemented)
const char* const charsetName; const char* const charsetName;
} SequenceModel; } SequenceModel;