From dbb4c1d2ff44ecde86caee024764d04a1d1e1c63 Mon Sep 17 00:00:00 2001 From: Jehan Date: Sun, 29 Nov 2015 23:51:55 +0100 Subject: [PATCH] nsSBCharSetProber: replace the fixed 64 SAMPLE_SIZE... ... with per-language model "frequent character" count. --- src/LangModels/LangBulgarianModel.cpp | 2 ++ src/LangModels/LangCyrillicModel.cpp | 6 ++++++ src/LangModels/LangFrenchModel.cpp | 2 ++ src/LangModels/LangGreekModel.cpp | 2 ++ src/LangModels/LangHebrewModel.cpp | 1 + src/LangModels/LangHungarianModel.cpp | 2 ++ src/LangModels/LangThaiModel.cpp | 1 + src/nsSBCharSetProber.cpp | 8 ++++---- src/nsSBCharSetProber.h | 11 +++++++---- 9 files changed, 27 insertions(+), 8 deletions(-) diff --git a/src/LangModels/LangBulgarianModel.cpp b/src/LangModels/LangBulgarianModel.cpp index 283f35e..d8dcae4 100644 --- a/src/LangModels/LangBulgarianModel.cpp +++ b/src/LangModels/LangBulgarianModel.cpp @@ -230,6 +230,7 @@ const SequenceModel Latin5BulgarianModel = { Latin5_BulgarianCharToOrderMap, BulgarianLangModel, + 64, (float)0.969392, PR_FALSE, "ISO-8859-5" @@ -239,6 +240,7 @@ const SequenceModel Win1251BulgarianModel = { win1251BulgarianCharToOrderMap, BulgarianLangModel, + 64, (float)0.969392, PR_FALSE, "WINDOWS-1251" diff --git a/src/LangModels/LangCyrillicModel.cpp b/src/LangModels/LangCyrillicModel.cpp index 678a6d1..2b51a6c 100644 --- a/src/LangModels/LangCyrillicModel.cpp +++ b/src/LangModels/LangCyrillicModel.cpp @@ -304,6 +304,7 @@ const SequenceModel Koi8rCyrillicModel = { KOI8R_CharToOrderMap, RussianLangModel, + 64, (float)0.976601, PR_FALSE, "KOI8-R" @@ -313,6 +314,7 @@ const SequenceModel Win1251CyrillicModel = { win1251_CharToOrderMap, RussianLangModel, + 64, (float)0.976601, PR_FALSE, "WINDOWS-1251" @@ -322,6 +324,7 @@ const SequenceModel Latin5CyrillicModel = { latin5_CharToOrderMap, RussianLangModel, + 64, (float)0.976601, PR_FALSE, "ISO-8859-5" @@ -331,6 +334,7 @@ const SequenceModel MacCyrillicModel = { macCyrillic_CharToOrderMap, RussianLangModel, + 64, 
(float)0.976601, PR_FALSE, "MAC-CYRILLIC" @@ -340,6 +344,7 @@ const SequenceModel Ibm866CyrillicModel = { IBM866_CharToOrderMap, RussianLangModel, + 64, (float)0.976601, PR_FALSE, "IBM866" @@ -349,6 +354,7 @@ const SequenceModel Ibm855CyrillicModel = { IBM855_CharToOrderMap, RussianLangModel, + 64, (float)0.976601, PR_FALSE, "IBM855" diff --git a/src/LangModels/LangFrenchModel.cpp b/src/LangModels/LangFrenchModel.cpp index c4df4c0..693c78e 100644 --- a/src/LangModels/LangFrenchModel.cpp +++ b/src/LangModels/LangFrenchModel.cpp @@ -229,6 +229,7 @@ const SequenceModel Iso_8859_15FrenchModel = { Iso_8859_15_CharToOrderMap, FrenchLangModel, + 64, (float)0.98, PR_TRUE, "ISO-8859-15" @@ -238,6 +239,7 @@ const SequenceModel Iso_8859_1FrenchModel = { Iso_8859_1_CharToOrderMap, FrenchLangModel, + 64, (float)0.99, PR_TRUE, "ISO-8859-1" diff --git a/src/LangModels/LangGreekModel.cpp b/src/LangModels/LangGreekModel.cpp index b71f10b..cf61ea3 100644 --- a/src/LangModels/LangGreekModel.cpp +++ b/src/LangModels/LangGreekModel.cpp @@ -229,6 +229,7 @@ const SequenceModel Latin7GreekModel = { Latin7_CharToOrderMap, GreekLangModel, + 64, (float)0.982851, PR_FALSE, "ISO-8859-7" @@ -238,6 +239,7 @@ const SequenceModel Win1253GreekModel = { win1253_CharToOrderMap, GreekLangModel, + 64, (float)0.982851, PR_FALSE, "WINDOWS-1253" diff --git a/src/LangModels/LangHebrewModel.cpp b/src/LangModels/LangHebrewModel.cpp index c808d0e..6377e5c 100644 --- a/src/LangModels/LangHebrewModel.cpp +++ b/src/LangModels/LangHebrewModel.cpp @@ -212,6 +212,7 @@ const SequenceModel Win1255Model = { win1255_CharToOrderMap, HebrewLangModel, + 64, (float)0.984004, PR_FALSE, "WINDOWS-1255" diff --git a/src/LangModels/LangHungarianModel.cpp b/src/LangModels/LangHungarianModel.cpp index 5c9b10f..daddca6 100644 --- a/src/LangModels/LangHungarianModel.cpp +++ b/src/LangModels/LangHungarianModel.cpp @@ -227,6 +227,7 @@ const SequenceModel Latin2HungarianModel = { Latin2_HungarianCharToOrderMap, HungarianLangModel, + 
64, (float)0.947368, PR_TRUE, "ISO-8859-2" @@ -236,6 +237,7 @@ const SequenceModel Win1250HungarianModel = { win1250HungarianCharToOrderMap, HungarianLangModel, + 64, (float)0.947368, PR_TRUE, "WINDOWS-1250" diff --git a/src/LangModels/LangThaiModel.cpp b/src/LangModels/LangThaiModel.cpp index a8bfbdb..c20269a 100644 --- a/src/LangModels/LangThaiModel.cpp +++ b/src/LangModels/LangThaiModel.cpp @@ -215,6 +215,7 @@ const SequenceModel TIS620ThaiModel = { TIS620CharToOrderMap, ThaiLangModel, + 64, (float)0.926386, PR_FALSE, "TIS-620" diff --git a/src/nsSBCharSetProber.cpp b/src/nsSBCharSetProber.cpp index 2a59fd7..f333454 100644 --- a/src/nsSBCharSetProber.cpp +++ b/src/nsSBCharSetProber.cpp @@ -48,17 +48,17 @@ nsProbingState nsSingleByteCharSetProber::HandleData(const char* aBuf, PRUint32 if (order < SYMBOL_CAT_ORDER) mTotalChar++; - if (order < SAMPLE_SIZE) + if (order < mModel->freqCharCount) { mFreqChar++; - if (mLastOrder < SAMPLE_SIZE) + if (mLastOrder < mModel->freqCharCount) { mTotalSeqs++; if (!mReversed) - ++(mSeqCounters[mModel->precedenceMatrix[mLastOrder*SAMPLE_SIZE+order]]); + ++(mSeqCounters[mModel->precedenceMatrix[mLastOrder*mModel->freqCharCount+order]]); else // reverse the order of the letters in the lookup - ++(mSeqCounters[mModel->precedenceMatrix[order*SAMPLE_SIZE+mLastOrder]]); + ++(mSeqCounters[mModel->precedenceMatrix[order*mModel->freqCharCount+mLastOrder]]); } } mLastOrder = order; diff --git a/src/nsSBCharSetProber.h b/src/nsSBCharSetProber.h index 50c6e56..20e392e 100644 --- a/src/nsSBCharSetProber.h +++ b/src/nsSBCharSetProber.h @@ -53,7 +53,6 @@ /* Numbers 0-9. 
*/ #define NUM 251 -#define SAMPLE_SIZE 64 #define SB_ENOUGH_REL_THRESHOLD 1024 #define POSITIVE_SHORTCUT_THRESHOLD (float)0.95 #define NEGATIVE_SHORTCUT_THRESHOLD (float)0.05 @@ -64,9 +63,13 @@ typedef struct { - const unsigned char* const charToOrderMap; // [256] table use to find a char's order - const PRUint8* const precedenceMatrix; // [SAMPLE_SIZE][SAMPLE_SIZE]; table to find a 2-char sequence's frequency - float mTypicalPositiveRatio; // = freqSeqs / totalSeqs + /* [256] table mapping codepoints to character orders. */ + const unsigned char* const charToOrderMap; + /* freqCharCount x freqCharCount table of 2-char sequence's frequencies. */ + const PRUint8* const precedenceMatrix; + /* The count of frequent characters. */ + int freqCharCount; + float mTypicalPositiveRatio; // = freqSeqs / totalSeqs PRBool keepEnglishLetter; // says if this script contains English characters (not implemented) const char* const charsetName; } SequenceModel;