diff --git a/src/LangModels/LangArabicModel.cpp b/src/LangModels/LangArabicModel.cpp index 6ac80f3..0a6d654 100644 --- a/src/LangModels/LangArabicModel.cpp +++ b/src/LangModels/LangArabicModel.cpp @@ -251,7 +251,8 @@ const SequenceModel Iso_8859_6ArabicModel = 64, (float)0.9696025116913417, PR_FALSE, - "ISO-8859-6" + "ISO-8859-6", + "ar" }; const SequenceModel Windows_1256ArabicModel = @@ -261,5 +262,6 @@ const SequenceModel Windows_1256ArabicModel = 64, (float)0.9696025116913417, PR_FALSE, - "WINDOWS-1256" + "WINDOWS-1256", + "ar" }; diff --git a/src/LangModels/LangBulgarianModel.cpp b/src/LangModels/LangBulgarianModel.cpp index 18c58ee..1120054 100644 --- a/src/LangModels/LangBulgarianModel.cpp +++ b/src/LangModels/LangBulgarianModel.cpp @@ -233,7 +233,8 @@ const SequenceModel Latin5BulgarianModel = 64, (float)0.969392, PR_FALSE, - "ISO-8859-5" + "ISO-8859-5", + "bg" }; const SequenceModel Win1251BulgarianModel = @@ -243,5 +244,6 @@ const SequenceModel Win1251BulgarianModel = 64, (float)0.969392, PR_FALSE, - "WINDOWS-1251" + "WINDOWS-1251", + "bg" }; diff --git a/src/LangModels/LangCroatianModel.cpp b/src/LangModels/LangCroatianModel.cpp index 58f882e..961bd0e 100644 --- a/src/LangModels/LangCroatianModel.cpp +++ b/src/LangModels/LangCroatianModel.cpp @@ -238,7 +238,8 @@ const SequenceModel Windows_1250CroatianModel = 31, (float)0.9989731099787131, PR_TRUE, - "WINDOWS-1250" + "WINDOWS-1250", + "hr" }; const SequenceModel Iso_8859_2CroatianModel = @@ -248,7 +249,8 @@ const SequenceModel Iso_8859_2CroatianModel = 31, (float)0.9989731099787131, PR_TRUE, - "ISO-8859-2" + "ISO-8859-2", + "hr" }; const SequenceModel Iso_8859_16CroatianModel = @@ -258,7 +260,8 @@ const SequenceModel Iso_8859_16CroatianModel = 31, (float)0.9989731099787131, PR_TRUE, - "ISO-8859-16" + "ISO-8859-16", + "hr" }; const SequenceModel Mac_CentraleuropeCroatianModel = @@ -268,7 +271,8 @@ const SequenceModel Mac_CentraleuropeCroatianModel = 31, (float)0.9989731099787131, PR_TRUE, - "MAC-CENTRALEUROPE" + "MAC-CENTRALEUROPE", + "hr" }; const SequenceModel Iso_8859_13CroatianModel = @@ -278,7 +282,8 @@ const SequenceModel Iso_8859_13CroatianModel = 31, (float)0.9989731099787131, PR_TRUE, - "ISO-8859-13" + "ISO-8859-13", + "hr" }; const SequenceModel Ibm852CroatianModel = @@ -288,5 +293,6 @@ const SequenceModel Ibm852CroatianModel = 31, (float)0.9989731099787131, PR_TRUE, - "IBM852" + "IBM852", + "hr" }; diff --git a/src/LangModels/LangCzechModel.cpp b/src/LangModels/LangCzechModel.cpp index 2557376..c12c07e 100644 --- a/src/LangModels/LangCzechModel.cpp +++ b/src/LangModels/LangCzechModel.cpp @@ -247,7 +247,8 @@ const SequenceModel Windows_1250CzechModel = 41, (float)0.9786035192432675, PR_TRUE, - "WINDOWS-1250" + "WINDOWS-1250", + "cs" }; const SequenceModel Mac_CentraleuropeCzechModel = @@ -257,7 +258,8 @@ const SequenceModel Mac_CentraleuropeCzechModel = 41, (float)0.9786035192432675, PR_TRUE, - "MAC-CENTRALEUROPE" + "MAC-CENTRALEUROPE", + "cs" }; const SequenceModel Ibm852CzechModel = @@ -267,7 +269,8 @@ const SequenceModel Ibm852CzechModel = 41, (float)0.9786035192432675, PR_TRUE, - "IBM852" + "IBM852", + "cs" }; const SequenceModel Iso_8859_2CzechModel = @@ -277,5 +280,6 @@ const SequenceModel Iso_8859_2CzechModel = 41, (float)0.9786035192432675, PR_TRUE, - "ISO-8859-2" + "ISO-8859-2", + "cs" }; diff --git a/src/LangModels/LangDanishModel.cpp b/src/LangModels/LangDanishModel.cpp index c60e7b2..cb99e9b 100644 --- a/src/LangModels/LangDanishModel.cpp +++ b/src/LangModels/LangDanishModel.cpp @@ -174,7 +174,8 @@ const SequenceModel Iso_8859_15DanishModel = 30, (float)0.9968082796759031, PR_TRUE, - "ISO-8859-15" + "ISO-8859-15", + "da" }; const SequenceModel Iso_8859_1DanishModel = @@ -184,7 +185,8 @@ const SequenceModel Iso_8859_1DanishModel = 30, (float)0.9968082796759031, PR_TRUE, - "ISO-8859-1" + "ISO-8859-1", + "da" }; const SequenceModel Windows_1252DanishModel = @@ -194,5 +196,6 @@ const SequenceModel Windows_1252DanishModel = 30, (float)0.9968082796759031, PR_TRUE, - "WINDOWS-1252" + "WINDOWS-1252", + "da" }; diff --git a/src/LangModels/LangEsperantoModel.cpp b/src/LangModels/LangEsperantoModel.cpp index 0884cd2..1d55ec7 100644 --- a/src/LangModels/LangEsperantoModel.cpp +++ b/src/LangModels/LangEsperantoModel.cpp @@ -137,5 +137,6 @@ const SequenceModel Iso_8859_3EsperantoModel = 35, (float)0.9942980632768038, PR_FALSE, - "ISO-8859-3" + "ISO-8859-3", + "eo" }; diff --git a/src/LangModels/LangEstonianModel.cpp b/src/LangModels/LangEstonianModel.cpp index c5fa9b3..71d9c66 100644 --- a/src/LangModels/LangEstonianModel.cpp +++ b/src/LangModels/LangEstonianModel.cpp @@ -219,7 +219,8 @@ const SequenceModel Iso_8859_4EstonianModel = 33, (float)0.9972721312183132, PR_TRUE, - "ISO-8859-4" + "ISO-8859-4", + "et" }; const SequenceModel Windows_1252EstonianModel = @@ -229,7 +230,8 @@ const SequenceModel Windows_1252EstonianModel = 33, (float)0.9972721312183132, PR_TRUE, - "WINDOWS-1252" + "WINDOWS-1252", + "et" }; const SequenceModel Iso_8859_15EstonianModel = @@ -239,7 +241,8 @@ const SequenceModel Iso_8859_15EstonianModel = 33, (float)0.9972721312183132, PR_TRUE, - "ISO-8859-15" + "ISO-8859-15", + "et" }; const SequenceModel Iso_8859_13EstonianModel = @@ -249,7 +252,8 @@ const SequenceModel Iso_8859_13EstonianModel = 33, (float)0.9972721312183132, PR_TRUE, - "ISO-8859-13" + "ISO-8859-13", + "et" }; const SequenceModel Windows_1257EstonianModel = @@ -259,5 +263,6 @@ const SequenceModel Windows_1257EstonianModel = 33, (float)0.9972721312183132, PR_TRUE, - "WINDOWS-1257" + "WINDOWS-1257", + "et" }; diff --git a/src/LangModels/LangFinnishModel.cpp b/src/LangModels/LangFinnishModel.cpp index ee91e14..cbc9528 100644 --- a/src/LangModels/LangFinnishModel.cpp +++ b/src/LangModels/LangFinnishModel.cpp @@ -237,7 +237,8 @@ const SequenceModel Iso_8859_15FinnishModel = 30, (float)0.9985378147555799, PR_TRUE, - "ISO-8859-15" + "ISO-8859-15", + "fi" }; const SequenceModel Windows_1252FinnishModel = @@ -247,7 +248,8 @@ const SequenceModel Windows_1252FinnishModel = 30, (float)0.9985378147555799, PR_TRUE, - "WINDOWS-1252" + "WINDOWS-1252", + "fi" }; const SequenceModel Iso_8859_4FinnishModel = @@ -257,7 +259,8 @@ const SequenceModel Iso_8859_4FinnishModel = 30, (float)0.9985378147555799, PR_TRUE, - "ISO-8859-4" + "ISO-8859-4", + "fi" }; const SequenceModel Iso_8859_13FinnishModel = @@ -267,7 +270,8 @@ const SequenceModel Iso_8859_13FinnishModel = 30, (float)0.9985378147555799, PR_TRUE, - "ISO-8859-13" + "ISO-8859-13", + "fi" }; const SequenceModel Iso_8859_9FinnishModel = @@ -277,7 +281,8 @@ const SequenceModel Iso_8859_9FinnishModel = 30, (float)0.9985378147555799, PR_TRUE, - "ISO-8859-9" + "ISO-8859-9", + "fi" }; const SequenceModel Iso_8859_1FinnishModel = @@ -287,5 +292,6 @@ const SequenceModel Iso_8859_1FinnishModel = 30, (float)0.9985378147555799, PR_TRUE, - "ISO-8859-1" + "ISO-8859-1", + "fi" }; diff --git a/src/LangModels/LangFrenchModel.cpp b/src/LangModels/LangFrenchModel.cpp index cd458cb..5baaf31 100644 --- a/src/LangModels/LangFrenchModel.cpp +++ b/src/LangModels/LangFrenchModel.cpp @@ -182,7 +182,8 @@ const SequenceModel Windows_1252FrenchModel = 38, (float)0.997057879992383, PR_TRUE, - "WINDOWS-1252" + "WINDOWS-1252", + "fr" }; const SequenceModel Iso_8859_1FrenchModel = @@ -192,7 +193,8 @@ const SequenceModel Iso_8859_1FrenchModel = 38, (float)0.997057879992383, PR_TRUE, - "ISO-8859-1" + "ISO-8859-1", + "fr" }; const SequenceModel Iso_8859_15FrenchModel = @@ -202,5 +204,6 @@ const SequenceModel Iso_8859_15FrenchModel = 38, (float)0.997057879992383, PR_TRUE, - "ISO-8859-15" + "ISO-8859-15", + "fr" }; diff --git a/src/LangModels/LangGermanModel.cpp b/src/LangModels/LangGermanModel.cpp index feeda8e..dd4228c 100644 --- a/src/LangModels/LangGermanModel.cpp +++ b/src/LangModels/LangGermanModel.cpp @@ -154,7 +154,8 @@ const SequenceModel Windows_1252GermanModel = 31, (float)0.9934041448127945, PR_TRUE, - "WINDOWS-1252" + "WINDOWS-1252", + "de" }; const SequenceModel Iso_8859_1GermanModel = @@ -164,5 +165,6 @@ const SequenceModel Iso_8859_1GermanModel = 31, (float)0.9934041448127945, PR_TRUE, - "ISO-8859-1" + "ISO-8859-1", + "de" }; diff --git a/src/LangModels/LangGreekModel.cpp b/src/LangModels/LangGreekModel.cpp index 499affe..28951e6 100644 --- a/src/LangModels/LangGreekModel.cpp +++ b/src/LangModels/LangGreekModel.cpp @@ -215,7 +215,8 @@ const SequenceModel Windows_1253GreekModel = 46, (float)0.958419074626211, PR_FALSE, - "WINDOWS-1253" + "WINDOWS-1253", + "el" }; const SequenceModel Iso_8859_7GreekModel = @@ -225,5 +226,6 @@ const SequenceModel Iso_8859_7GreekModel = 46, (float)0.958419074626211, PR_FALSE, - "ISO-8859-7" + "ISO-8859-7", + "el" }; diff --git a/src/LangModels/LangHebrewModel.cpp b/src/LangModels/LangHebrewModel.cpp index af9ac2b..811c048 100644 --- a/src/LangModels/LangHebrewModel.cpp +++ b/src/LangModels/LangHebrewModel.cpp @@ -215,6 +215,6 @@ const SequenceModel Win1255Model = 64, (float)0.984004, PR_FALSE, - "WINDOWS-1255" + "WINDOWS-1255", + "he" }; - diff --git a/src/LangModels/LangHungarianModel.cpp b/src/LangModels/LangHungarianModel.cpp index 83e6eaa..22f0de6 100644 --- a/src/LangModels/LangHungarianModel.cpp +++ b/src/LangModels/LangHungarianModel.cpp @@ -155,7 +155,8 @@ const SequenceModel Iso_8859_2HungarianModel = 32, (float)0.9748272224933486, PR_FALSE, - "ISO-8859-2" + "ISO-8859-2", + "hu" }; const SequenceModel Windows_1250HungarianModel = @@ -165,5 +166,6 @@ const SequenceModel Windows_1250HungarianModel = 32, (float)0.9748272224933486, PR_FALSE, - "WINDOWS-1250" + "WINDOWS-1250", + "hu" }; diff --git a/src/LangModels/LangIrishModel.cpp b/src/LangModels/LangIrishModel.cpp index af3a16d..bbd9500 100644 --- a/src/LangModels/LangIrishModel.cpp +++ b/src/LangModels/LangIrishModel.cpp @@ -196,7 +196,8 @@ const SequenceModel Iso_8859_1IrishModel = 31, (float)0.9974076651249096, PR_TRUE, - "ISO-8859-1" + "ISO-8859-1", + "ga" }; const SequenceModel Windows_1252IrishModel = @@ -206,7 +207,8 @@ const SequenceModel Windows_1252IrishModel = 31, (float)0.9974076651249096, PR_TRUE, - "WINDOWS-1252" + "WINDOWS-1252", + "ga" }; const SequenceModel Iso_8859_15IrishModel = @@ -216,7 +218,8 @@ const SequenceModel Iso_8859_15IrishModel = 31, (float)0.9974076651249096, PR_TRUE, - "ISO-8859-15" + "ISO-8859-15", + "ga" }; const SequenceModel Iso_8859_9IrishModel = @@ -226,5 +229,6 @@ const SequenceModel Iso_8859_9IrishModel = 31, (float)0.9974076651249096, PR_TRUE, - "ISO-8859-9" + "ISO-8859-9", + "ga" }; diff --git a/src/LangModels/LangItalianModel.cpp b/src/LangModels/LangItalianModel.cpp index 0a9565c..4bb5dc5 100644 --- a/src/LangModels/LangItalianModel.cpp +++ b/src/LangModels/LangItalianModel.cpp @@ -220,7 +220,8 @@ const SequenceModel Iso_8859_3ItalianModel = 34, (float)0.9989484485502651, PR_TRUE, - "ISO-8859-3" + "ISO-8859-3", + "it" }; const SequenceModel Iso_8859_15ItalianModel = @@ -230,7 +231,8 @@ const SequenceModel Iso_8859_15ItalianModel = 34, (float)0.9989484485502651, PR_TRUE, - "ISO-8859-15" + "ISO-8859-15", + "it" }; const SequenceModel Iso_8859_9ItalianModel = @@ -240,7 +242,8 @@ const SequenceModel Iso_8859_9ItalianModel = 34, (float)0.9989484485502651, PR_TRUE, - "ISO-8859-9" + "ISO-8859-9", + "it" }; const SequenceModel Iso_8859_1ItalianModel = @@ -250,7 +253,8 @@ const SequenceModel Iso_8859_1ItalianModel = 34, (float)0.9989484485502651, PR_TRUE, - "ISO-8859-1" + "ISO-8859-1", + "it" }; const SequenceModel Windows_1252ItalianModel = @@ -260,5 +264,6 @@ const SequenceModel Windows_1252ItalianModel = 34, (float)0.9989484485502651, PR_TRUE, - "WINDOWS-1252" + "WINDOWS-1252", + "it" }; diff --git a/src/LangModels/LangLatvianModel.cpp b/src/LangModels/LangLatvianModel.cpp index b62d414..fcccc82 100644 --- a/src/LangModels/LangLatvianModel.cpp +++ b/src/LangModels/LangLatvianModel.cpp @@ -183,7 +183,8 @@ const SequenceModel Iso_8859_4LatvianModel = 39, (float)0.9904102202220861, PR_TRUE, - "ISO-8859-4" + "ISO-8859-4", + "lv" }; const SequenceModel Iso_8859_10LatvianModel = @@ -193,7 +194,8 @@ const SequenceModel Iso_8859_10LatvianModel = 39, (float)0.9904102202220861, PR_TRUE, - "ISO-8859-10" + "ISO-8859-10", + "lv" }; const SequenceModel Iso_8859_13LatvianModel = @@ -203,5 +205,6 @@ const SequenceModel Iso_8859_13LatvianModel = 39, (float)0.9904102202220861, PR_TRUE, - "ISO-8859-13" + "ISO-8859-13", + "lv" }; diff --git a/src/LangModels/LangLithuanianModel.cpp b/src/LangModels/LangLithuanianModel.cpp index af65db3..686014a 100644 --- a/src/LangModels/LangLithuanianModel.cpp +++ b/src/LangModels/LangLithuanianModel.cpp @@ -182,7 +182,8 @@ const SequenceModel Iso_8859_10LithuanianModel = 38, (float)0.9928710196247589, PR_TRUE, - "ISO-8859-10" + "ISO-8859-10", + "lt" }; const SequenceModel Iso_8859_4LithuanianModel = @@ -192,7 +193,8 @@ const SequenceModel Iso_8859_4LithuanianModel = 38, (float)0.9928710196247589, PR_TRUE, - "ISO-8859-4" + "ISO-8859-4", + "lt" }; const SequenceModel Iso_8859_13LithuanianModel = @@ -202,5 +204,6 @@ const SequenceModel Iso_8859_13LithuanianModel = 38, (float)0.9928710196247589, PR_TRUE, - "ISO-8859-13" + "ISO-8859-13", + "lt" }; diff --git a/src/LangModels/LangMalteseModel.cpp b/src/LangModels/LangMalteseModel.cpp index dd82ef6..e253539 100644 --- a/src/LangModels/LangMalteseModel.cpp +++ b/src/LangModels/LangMalteseModel.cpp @@ -133,5 +133,6 @@ const SequenceModel Iso_8859_3MalteseModel = 31, (float)0.9959115850692665, PR_TRUE, - "ISO-8859-3" + "ISO-8859-3", + "mt" }; diff --git a/src/LangModels/LangPolishModel.cpp b/src/LangModels/LangPolishModel.cpp index cb62bdc..38791de 100644 --- a/src/LangModels/LangPolishModel.cpp +++ b/src/LangModels/LangPolishModel.cpp @@ -244,7 +244,8 @@ const SequenceModel Ibm852PolishModel = 37, (float)0.9894531815946438, PR_TRUE, - "IBM852" + "IBM852", + "pl" }; const SequenceModel Iso_8859_16PolishModel = @@ -254,7 +255,8 @@ const SequenceModel Iso_8859_16PolishModel = 37, (float)0.9894531815946438, PR_TRUE, - "ISO-8859-16" + "ISO-8859-16", + "pl" }; const SequenceModel Iso_8859_2PolishModel = @@ -264,7 +266,8 @@ const SequenceModel Iso_8859_2PolishModel = 37, (float)0.9894531815946438, PR_TRUE, - "ISO-8859-2" + "ISO-8859-2", + "pl" }; const SequenceModel Mac_CentraleuropePolishModel = @@ -274,7 +277,8 @@ const SequenceModel Mac_CentraleuropePolishModel = 37, (float)0.9894531815946438, PR_TRUE, - "MAC-CENTRALEUROPE" + "MAC-CENTRALEUROPE", + "pl" }; const SequenceModel Iso_8859_13PolishModel = @@ -284,7 +288,8 @@ const SequenceModel Iso_8859_13PolishModel = 37, (float)0.9894531815946438, PR_TRUE, - "ISO-8859-13" + "ISO-8859-13", + "pl" }; const SequenceModel Windows_1250PolishModel = @@ -294,5 +299,6 @@ const SequenceModel Windows_1250PolishModel = 37, (float)0.9894531815946438, PR_TRUE, - "WINDOWS-1250" + "WINDOWS-1250", + "pl" }; diff --git a/src/LangModels/LangPortugueseModel.cpp b/src/LangModels/LangPortugueseModel.cpp index 8d4bc4a..0b2dd1b 100644 --- a/src/LangModels/LangPortugueseModel.cpp +++ b/src/LangModels/LangPortugueseModel.cpp @@ -203,7 +203,8 @@ const SequenceModel Iso_8859_1PortugueseModel = 38, (float)0.9953179582313172, PR_TRUE, - "ISO-8859-1" + "ISO-8859-1", + "pt" }; const SequenceModel Iso_8859_9PortugueseModel = @@ -213,7 +214,8 @@ const SequenceModel Iso_8859_9PortugueseModel = 38, (float)0.9953179582313172, PR_TRUE, - "ISO-8859-9" + "ISO-8859-9", + "pt" }; const SequenceModel Iso_8859_15PortugueseModel = @@ -223,7 +225,8 @@ const SequenceModel Iso_8859_15PortugueseModel = 38, (float)0.9953179582313172, PR_TRUE, - "ISO-8859-15" + "ISO-8859-15", + "pt" }; const SequenceModel Windows_1252PortugueseModel = @@ -233,5 +236,6 @@ const SequenceModel Windows_1252PortugueseModel = 38, (float)0.9953179582313172, PR_TRUE, - "WINDOWS-1252" + "WINDOWS-1252", + "pt" }; diff --git a/src/LangModels/LangRomanianModel.cpp b/src/LangModels/LangRomanianModel.cpp index 154c03f..cfb1b8d 100644 --- a/src/LangModels/LangRomanianModel.cpp +++ b/src/LangModels/LangRomanianModel.cpp @@ -198,7 +198,8 @@ const SequenceModel Iso_8859_16RomanianModel = 33, (float)0.997762564143313, PR_TRUE, - "ISO-8859-16" + "ISO-8859-16", + "ro" }; const SequenceModel Iso_8859_2RomanianModel = @@ -208,7 +209,8 @@ const SequenceModel Iso_8859_2RomanianModel = 33, (float)0.997762564143313, PR_TRUE, - "ISO-8859-2" + "ISO-8859-2", + "ro" }; const SequenceModel Windows_1250RomanianModel = @@ -218,7 +220,8 @@ const SequenceModel Windows_1250RomanianModel = 33, (float)0.997762564143313, PR_TRUE, - "WINDOWS-1250" + "WINDOWS-1250", + "ro" }; const SequenceModel Ibm852RomanianModel = @@ -228,5 +231,6 @@ const SequenceModel Ibm852RomanianModel = 33, (float)0.997762564143313, PR_TRUE, - "IBM852" + "IBM852", + "ro" }; diff --git a/src/LangModels/LangRussianModel.cpp b/src/LangModels/LangRussianModel.cpp index a532049..50631df 100644 --- a/src/LangModels/LangRussianModel.cpp +++ b/src/LangModels/LangRussianModel.cpp @@ -307,7 +307,8 @@ const SequenceModel Koi8rRussianModel = 64, (float)0.976601, PR_FALSE, - "KOI8-R" + "KOI8-R", + "ru" }; const SequenceModel Win1251RussianModel = @@ -317,7 +318,8 @@ const SequenceModel Win1251RussianModel = 64, (float)0.976601, PR_FALSE, - "WINDOWS-1251" + "WINDOWS-1251", + "ru" }; const SequenceModel Latin5RussianModel = @@ -327,7 +329,8 @@ const SequenceModel Latin5RussianModel = 64, (float)0.976601, PR_FALSE, - "ISO-8859-5" + "ISO-8859-5", + "ru" }; const SequenceModel MacCyrillicRussianModel = @@ -337,7 +340,8 @@ const SequenceModel MacCyrillicRussianModel = 64, (float)0.976601, PR_FALSE, - "MAC-CYRILLIC" + "MAC-CYRILLIC", + "ru" }; const SequenceModel Ibm866RussianModel = @@ -347,7 +351,8 @@ const SequenceModel Ibm866RussianModel = 64, (float)0.976601, PR_FALSE, - "IBM866" + "IBM866", + "ru" }; const SequenceModel Ibm855RussianModel = @@ -357,5 +362,6 @@ const SequenceModel Ibm855RussianModel = 64, (float)0.976601, PR_FALSE, - "IBM855" + "IBM855", + "ru" }; diff --git a/src/LangModels/LangSlovakModel.cpp b/src/LangModels/LangSlovakModel.cpp index cfa94aa..480b4b5 100644 --- a/src/LangModels/LangSlovakModel.cpp +++ b/src/LangModels/LangSlovakModel.cpp @@ -255,7 +255,8 @@ const SequenceModel Ibm852SlovakModel = 45, (float)0.9733303573968434, PR_TRUE, - "IBM852" + "IBM852", + "sk" }; const SequenceModel Iso_8859_2SlovakModel = @@ -265,7 +266,8 @@ const SequenceModel Iso_8859_2SlovakModel = 45, (float)0.9733303573968434, PR_TRUE, - "ISO-8859-2" + "ISO-8859-2", + "sk" }; const SequenceModel Mac_CentraleuropeSlovakModel = @@ -275,7 +277,8 @@ const SequenceModel Mac_CentraleuropeSlovakModel = 45, (float)0.9733303573968434, PR_TRUE, - "MAC-CENTRALEUROPE" + "MAC-CENTRALEUROPE", + "sk" }; const SequenceModel Windows_1250SlovakModel = @@ -285,5 +288,6 @@ const SequenceModel Windows_1250SlovakModel = 45, (float)0.9733303573968434, PR_TRUE, - "WINDOWS-1250" + "WINDOWS-1250", + "sk" }; diff --git a/src/LangModels/LangSloveneModel.cpp b/src/LangModels/LangSloveneModel.cpp index da28d86..160f054 100644 --- a/src/LangModels/LangSloveneModel.cpp +++ b/src/LangModels/LangSloveneModel.cpp @@ -215,7 +215,8 @@ const SequenceModel Iso_8859_2SloveneModel = 29, (float)0.9983524317161332, PR_TRUE, - "ISO-8859-2" + "ISO-8859-2", + "sl" }; const SequenceModel Iso_8859_16SloveneModel = @@ -225,7 +226,8 @@ const SequenceModel Iso_8859_16SloveneModel = 29, (float)0.9983524317161332, PR_TRUE, - "ISO-8859-16" + "ISO-8859-16", + "sl" }; const SequenceModel Windows_1250SloveneModel = @@ -235,7 +237,8 @@ const SequenceModel Windows_1250SloveneModel = 29, (float)0.9983524317161332, PR_TRUE, - "WINDOWS-1250" + "WINDOWS-1250", + "sl" }; const SequenceModel Mac_CentraleuropeSloveneModel = @@ -245,7 +248,8 @@ const SequenceModel Mac_CentraleuropeSloveneModel = 29, (float)0.9983524317161332, PR_TRUE, - "MAC-CENTRALEUROPE" + "MAC-CENTRALEUROPE", + "sl" }; const SequenceModel Ibm852SloveneModel = @@ -255,5 +259,6 @@ const SequenceModel Ibm852SloveneModel = 29, (float)0.9983524317161332, PR_TRUE, - "IBM852" + "IBM852", + "sl" }; diff --git a/src/LangModels/LangSpanishModel.cpp b/src/LangModels/LangSpanishModel.cpp index 18c400a..6c3f3a9 100644 --- a/src/LangModels/LangSpanishModel.cpp +++ b/src/LangModels/LangSpanishModel.cpp @@ -177,7 +177,8 @@ const SequenceModel Iso_8859_1SpanishModel = 33, (float)0.9970385677528184, PR_TRUE, - "ISO-8859-1" + "ISO-8859-1", + "es" }; const SequenceModel Iso_8859_15SpanishModel = @@ -187,7 +188,8 @@ const SequenceModel Iso_8859_15SpanishModel = 33, (float)0.9970385677528184, PR_TRUE, - "ISO-8859-15" + "ISO-8859-15", + "es" }; const SequenceModel Windows_1252SpanishModel = @@ -197,5 +199,6 @@ const SequenceModel Windows_1252SpanishModel = 33, (float)0.9970385677528184, PR_TRUE, - "WINDOWS-1252" + "WINDOWS-1252", + "es" }; diff --git a/src/LangModels/LangSwedishModel.cpp b/src/LangModels/LangSwedishModel.cpp index 0d2dadf..3dca8e8 100644 --- a/src/LangModels/LangSwedishModel.cpp +++ b/src/LangModels/LangSwedishModel.cpp @@ -217,7 +217,8 @@ const SequenceModel Windows_1252SwedishModel = 31, (float)0.997323508584682, PR_TRUE, - "WINDOWS-1252" + "WINDOWS-1252", + "sv" }; const SequenceModel Iso_8859_9SwedishModel = @@ -227,7 +228,8 @@ const SequenceModel Iso_8859_9SwedishModel = 31, (float)0.997323508584682, PR_TRUE, - "ISO-8859-9" + "ISO-8859-9", + "sv" }; const SequenceModel Iso_8859_1SwedishModel = @@ -237,7 +239,8 @@ const SequenceModel Iso_8859_1SwedishModel = 31, (float)0.997323508584682, PR_TRUE, - "ISO-8859-1" + "ISO-8859-1", + "sv" }; const SequenceModel Iso_8859_4SwedishModel = @@ -247,7 +250,8 @@ const SequenceModel Iso_8859_4SwedishModel = 31, (float)0.997323508584682, PR_TRUE, - "ISO-8859-4" + "ISO-8859-4", + "sv" }; const SequenceModel Iso_8859_15SwedishModel = @@ -257,5 +261,6 @@ const SequenceModel Iso_8859_15SwedishModel = 31, (float)0.997323508584682, PR_TRUE, - "ISO-8859-15" + "ISO-8859-15", + "sv" }; diff --git a/src/LangModels/LangThaiModel.cpp b/src/LangModels/LangThaiModel.cpp index 091fb8d..9880e09 100644 --- a/src/LangModels/LangThaiModel.cpp +++ b/src/LangModels/LangThaiModel.cpp @@ -251,7 +251,8 @@ const SequenceModel Tis_620ThaiModel = 64, (float)0.8815720594354438, PR_FALSE, - "TIS-620" + "TIS-620", + "th" }; const SequenceModel Iso_8859_11ThaiModel = @@ -261,5 +262,6 @@ const SequenceModel Iso_8859_11ThaiModel = 64, (float)0.8815720594354438, PR_FALSE, - "ISO-8859-11" + "ISO-8859-11", + "th" }; diff --git a/src/LangModels/LangTurkishModel.cpp b/src/LangModels/LangTurkishModel.cpp index 71d72c5..16c133f 100644 --- a/src/LangModels/LangTurkishModel.cpp +++ b/src/LangModels/LangTurkishModel.cpp @@ -159,7 +159,8 @@ const SequenceModel Iso_8859_3TurkishModel = 36, (float)0.991865243864388, PR_FALSE, - "ISO-8859-3" + "ISO-8859-3", + "tr" }; const SequenceModel Iso_8859_9TurkishModel = @@ -169,5 +170,6 @@ const SequenceModel Iso_8859_9TurkishModel = 36, (float)0.991865243864388, PR_FALSE, - "ISO-8859-9" + "ISO-8859-9", + "tr" }; diff --git a/src/LangModels/LangVietnameseModel.cpp b/src/LangModels/LangVietnameseModel.cpp index 288a525..0569887 100644 --- a/src/LangModels/LangVietnameseModel.cpp +++ b/src/LangModels/LangVietnameseModel.cpp @@ -233,7 +233,8 @@ const SequenceModel Windows_1258VietnameseModel = 55, (float)0.9321889118082535, PR_FALSE, - "WINDOWS-1258" + "WINDOWS-1258", + "vi" }; const SequenceModel VisciiVietnameseModel = @@ -243,5 +244,6 @@ const SequenceModel VisciiVietnameseModel = 55, (float)0.9321889118082535, PR_FALSE, - "VISCII" + "VISCII", + "vi" }; diff --git a/src/nsBig5Prober.h b/src/nsBig5Prober.h index 7d13be8..4b5d9fa 100644 --- a/src/nsBig5Prober.h +++ b/src/nsBig5Prober.h @@ -51,6 +51,7 @@ public: virtual ~nsBig5Prober(void){delete mCodingSM;} nsProbingState HandleData(const char* aBuf, PRUint32 aLen); const char* GetCharSetName() {return "BIG5";} + const char* GetLanguage() {return "zh";} nsProbingState GetState(void) {return mState;} void Reset(void); float GetConfidence(void); diff --git a/src/nsCharSetProber.h b/src/nsCharSetProber.h index c078ccf..c13afb8 100644 --- a/src/nsCharSetProber.h +++ b/src/nsCharSetProber.h @@ -54,6 +54,7 @@ class nsCharSetProber { public: virtual ~nsCharSetProber() {} virtual const char* GetCharSetName() = 0; + virtual const char* GetLanguage() = 0; virtual nsProbingState HandleData(const char* aBuf, PRUint32 aLen) = 0; virtual nsProbingState GetState(void) = 0; virtual void Reset(void) = 0; diff --git a/src/nsEUCJPProber.h b/src/nsEUCJPProber.h index a7a2f51..a74c779 100644 --- a/src/nsEUCJPProber.h +++ b/src/nsEUCJPProber.h @@ -57,6 +57,7 @@ public: virtual ~nsEUCJPProber(void){delete mCodingSM;} nsProbingState HandleData(const char* aBuf, PRUint32 aLen); const char* GetCharSetName() {return "EUC-JP";} + const char* GetLanguage() {return "ja";} nsProbingState GetState(void) {return mState;} void Reset(void); float GetConfidence(void); diff --git a/src/nsEUCKRProber.h b/src/nsEUCKRProber.h index 954c038..8ce9eb2 100644 --- a/src/nsEUCKRProber.h +++ b/src/nsEUCKRProber.h @@ -57,6 +57,7 @@ public: * Korean documents are actually created with this character set. */ const char* GetCharSetName() {return "UHC";} + const char* GetLanguage() {return "ko";} nsProbingState GetState(void) {return mState;} void Reset(void); float GetConfidence(void); diff --git a/src/nsEUCTWProber.h b/src/nsEUCTWProber.h index ee6376e..6701027 100644 --- a/src/nsEUCTWProber.h +++ b/src/nsEUCTWProber.h @@ -51,6 +51,7 @@ public: virtual ~nsEUCTWProber(void){delete mCodingSM;} nsProbingState HandleData(const char* aBuf, PRUint32 aLen); const char* GetCharSetName() {return "EUC-TW";} + const char* GetLanguage() {return "zh";} nsProbingState GetState(void) {return mState;} void Reset(void); float GetConfidence(void); diff --git a/src/nsEscCharsetProber.h b/src/nsEscCharsetProber.h index 4b648e0..eab3080 100644 --- a/src/nsEscCharsetProber.h +++ b/src/nsEscCharsetProber.h @@ -38,6 +38,8 @@ #ifndef nsEscCharSetProber_h__ #define nsEscCharSetProber_h__ +#include + #include "nsCharSetProber.h" #include "nsCodingStateMachine.h" @@ -49,6 +51,7 @@ public: virtual ~nsEscCharSetProber(void); nsProbingState HandleData(const char* aBuf, PRUint32 aLen); const char* GetCharSetName() {return mDetectedCharset;} + const char* GetLanguage() {return NULL;} nsProbingState GetState(void) {return mState;} void Reset(void); float GetConfidence(void){return (float)0.99;} diff --git a/src/nsGB2312Prober.h b/src/nsGB2312Prober.h index 26ebf84..a35e585 100644 --- a/src/nsGB2312Prober.h +++ b/src/nsGB2312Prober.h @@ -53,6 +53,7 @@ public: virtual ~nsGB18030Prober(void){delete mCodingSM;} nsProbingState HandleData(const char* aBuf, PRUint32 aLen); const char* GetCharSetName() {return "GB18030";} + const char* GetLanguage() {return "zh";} nsProbingState GetState(void) {return mState;} void Reset(void); float GetConfidence(void); diff --git a/src/nsHebrewProber.h b/src/nsHebrewProber.h index eedfed4..8442aab 100644 --- a/src/nsHebrewProber.h +++ b/src/nsHebrewProber.h @@ -49,7 +49,8 @@ public: virtual ~nsHebrewProber(void) {} virtual nsProbingState HandleData(const char* aBuf, PRUint32 aLen); - virtual const char* GetCharSetName(); + virtual const char *GetCharSetName(); + virtual const char *GetLanguage(void) { return "he"; } virtual void Reset(void); virtual nsProbingState GetState(void); diff --git a/src/nsLatin1Prober.h b/src/nsLatin1Prober.h index 59118a7..bd3a9d5 100644 --- a/src/nsLatin1Prober.h +++ b/src/nsLatin1Prober.h @@ -39,6 +39,8 @@ #ifndef nsLatin1Prober_h__ #define nsLatin1Prober_h__ +#include + #include "nsCharSetProber.h" #define FREQ_CAT_NUM 4 @@ -49,6 +51,7 @@ public: virtual ~nsLatin1Prober(void){} nsProbingState HandleData(const char* aBuf, PRUint32 aLen); const char* GetCharSetName() {return "WINDOWS-1252";} + const char* GetLanguage() {return NULL;} nsProbingState GetState(void) {return mState;} void Reset(void); float GetConfidence(void); diff --git a/src/nsMBCSGroupProber.cpp b/src/nsMBCSGroupProber.cpp index 057ddb1..68c896a 100644 --- a/src/nsMBCSGroupProber.cpp +++ b/src/nsMBCSGroupProber.cpp @@ -97,6 +97,18 @@ const char* nsMBCSGroupProber::GetCharSetName() return mProbers[mBestGuess]->GetCharSetName(); } +const char* nsMBCSGroupProber::GetLanguage(void) +{ + if (mBestGuess == -1) + { + GetConfidence(); + } + if (mBestGuess == -1) + return NULL; + else + return mProbers[mBestGuess]->GetLanguage(); +} + void nsMBCSGroupProber::Reset(void) { mActiveNum = 0; diff --git a/src/nsMBCSGroupProber.h b/src/nsMBCSGroupProber.h index c4e9964..0e55221 100644 --- a/src/nsMBCSGroupProber.h +++ b/src/nsMBCSGroupProber.h @@ -55,6 +55,7 @@ public: virtual ~nsMBCSGroupProber(); nsProbingState HandleData(const char* aBuf, PRUint32 aLen); const char* GetCharSetName(); + const char* GetLanguage(); nsProbingState GetState(void) {return mState;} void Reset(void); float GetConfidence(void); diff --git a/src/nsSBCSGroupProber.cpp b/src/nsSBCSGroupProber.cpp index f956d25..6a3ef4f 100644 --- a/src/nsSBCSGroupProber.cpp +++ b/src/nsSBCSGroupProber.cpp @@ -217,6 +217,17 @@ const char* nsSBCSGroupProber::GetCharSetName() return mProbers[mBestGuess]->GetCharSetName(); } +const char* nsSBCSGroupProber::GetLanguage() +{ + if (mBestGuess == -1) + { + GetConfidence(); + if (mBestGuess == -1) + mBestGuess = 0; + } + return mProbers[mBestGuess]->GetLanguage(); +} + void nsSBCSGroupProber::Reset(void) { mActiveNum = 0; diff --git a/src/nsSBCSGroupProber.h b/src/nsSBCSGroupProber.h index ec72324..d07e16f 100644 --- a/src/nsSBCSGroupProber.h +++ b/src/nsSBCSGroupProber.h @@ -49,6 +49,7 @@ public: virtual ~nsSBCSGroupProber(); nsProbingState HandleData(const char* aBuf, PRUint32 aLen); const char* GetCharSetName(); + const char* GetLanguage(); nsProbingState GetState(void) {return mState;} void Reset(void); float GetConfidence(void); diff --git a/src/nsSBCharSetProber.cpp b/src/nsSBCharSetProber.cpp index 001529f..7832f11 100644 --- a/src/nsSBCharSetProber.cpp +++ b/src/nsSBCharSetProber.cpp @@ -145,6 +145,13 @@ const char* nsSingleByteCharSetProber::GetCharSetName() return mNameProber->GetCharSetName(); } +const char* nsSingleByteCharSetProber::GetLanguage() +{ + if (!mNameProber) + return mModel->langName; + return mNameProber->GetLanguage(); +} + #ifdef DEBUG_chardet void nsSingleByteCharSetProber::DumpStatus() { diff --git a/src/nsSBCharSetProber.h b/src/nsSBCharSetProber.h index 42d21b2..2cd4409 100644 --- a/src/nsSBCharSetProber.h +++ b/src/nsSBCharSetProber.h @@ -75,6 +75,7 @@ typedef struct float mTypicalPositiveRatio; // = freqSeqs / totalSeqs PRBool keepEnglishLetter; // says if this script contains English characters (not implemented) const char* const charsetName; + const char* const langName; } SequenceModel; @@ -86,6 +87,7 @@ public: :mModel(model), mReversed(reversed), mNameProber(nameProber) { Reset(); } virtual const char* GetCharSetName(); + virtual const char* GetLanguage(); virtual nsProbingState HandleData(const char* aBuf, PRUint32 aLen); virtual nsProbingState GetState(void) {return mState;} virtual void Reset(void); diff --git a/src/nsSJISProber.h b/src/nsSJISProber.h index f326ded..61e6352 100644 --- a/src/nsSJISProber.h +++ b/src/nsSJISProber.h @@ -58,6 +58,7 @@ public: virtual ~nsSJISProber(void){delete mCodingSM;} nsProbingState HandleData(const char* aBuf, PRUint32 aLen); const char* GetCharSetName() {return "SHIFT_JIS";} + const char* GetLanguage() {return "ja";} nsProbingState GetState(void) {return mState;} void Reset(void); float GetConfidence(void); diff --git a/src/nsUTF8Prober.h b/src/nsUTF8Prober.h index 21c91c4..a2cf4ee 100644 --- a/src/nsUTF8Prober.h +++ b/src/nsUTF8Prober.h @@ -38,6 +38,7 @@ #ifndef nsUTF8Prober_h__ #define nsUTF8Prober_h__ +#include #include "nsCharSetProber.h" #include "nsCodingStateMachine.h" @@ -49,6 +50,7 @@ public: virtual ~nsUTF8Prober(){delete mCodingSM;} nsProbingState HandleData(const char* aBuf, PRUint32 aLen); const char* GetCharSetName() {return "UTF-8";} + const char* GetLanguage() {return NULL;} nsProbingState GetState(void) {return mState;} void Reset(void); float GetConfidence(void); diff --git a/src/nsUniversalDetector.cpp b/src/nsUniversalDetector.cpp index 2da4b4b..bc9e9b2 100644 --- a/src/nsUniversalDetector.cpp +++ b/src/nsUniversalDetector.cpp @@ -305,7 +305,7 @@ void nsUniversalDetector::DataEnd() * when finding them. */ mDone = PR_TRUE; - Report(mDetectedCharset, 1.0); + Report(mDetectedCharset, NULL, 1.0); return; } @@ -323,7 +323,9 @@ void nsUniversalDetector::DataEnd() if (proberConfidence > MINIMUM_THRESHOLD) /* Only report what we are confident in. */ - Report(mCharSetProbers[i]->GetCharSetName(), proberConfidence); + Report(mCharSetProbers[i]->GetCharSetName(), + mCharSetProbers[i]->GetLanguage(), + proberConfidence); } } } diff --git a/src/nsUniversalDetector.h b/src/nsUniversalDetector.h index eecdea6..702a9fe 100644 --- a/src/nsUniversalDetector.h +++ b/src/nsUniversalDetector.h @@ -69,7 +69,8 @@ public: virtual void DataEnd(void); protected: - virtual void Report(const char* aCharset, + virtual void Report(const char *encoding, + const char *language, float confidence) = 0; virtual void Reset(); nsInputState mInputState; diff --git a/src/symbols.cmake b/src/symbols.cmake index a6690ff..e66bfa0 100644 --- a/src/symbols.cmake +++ b/src/symbols.cmake @@ -9,6 +9,7 @@ set( uchardet_get_candidates uchardet_get_encoding uchardet_get_confidence + uchardet_get_language ) set (LINK_FLAGS "") diff --git a/src/uchardet.cpp b/src/uchardet.cpp index f5391ea..19a73f0 100644 --- a/src/uchardet.cpp +++ b/src/uchardet.cpp @@ -65,6 +65,7 @@ public: } virtual void Report(const char *encoding, + const char *language, float confidence) { std::vector::iterator it; @@ -72,7 +73,8 @@ public: for (it = candidates.begin(); it != candidates.end(); it++) { - if (strcmp(it->encoding, encoding) == 0) + if (strcmp(it->encoding, encoding) == 0 && + it->language && language && strcmp(it->language, language) == 0) { /* Already reported. Bail out or update the confidence * when needed. @@ -91,6 +93,7 @@ public: candidate = UChardetCandidate(); candidate.encoding = strdup(encoding); + candidate.language = language ? strdup(language) : NULL; candidate.confidence = confidence; for (it = candidates.begin(); it != candidates.end(); it++) @@ -107,7 +110,11 @@ public: nsUniversalDetector::Reset(); for (it = candidates.begin(); it != candidates.end(); it++) + { free(it->encoding); + if (it->language) + free(it->language); + } candidates.clear(); } @@ -125,6 +132,12 @@ public: { return (candidates.size() > i) ? candidates[i].confidence : 0.0; } + + const char* GetLanguage(size_t i) const + { + return (candidates.size() > i) ? candidates[i].language : NULL; + } + }; uchardet_t uchardet_new(void) @@ -178,3 +191,9 @@ const char * uchardet_get_encoding (uchardet_t ud, { return reinterpret_cast(ud)->GetCharset(candidate); } + +const char * uchardet_get_language (uchardet_t ud, + size_t candidate) +{ + return reinterpret_cast(ud)->GetLanguage(candidate); +} diff --git a/src/uchardet.h b/src/uchardet.h index c452a69..df1387e 100644 --- a/src/uchardet.h +++ b/src/uchardet.h @@ -120,6 +120,8 @@ UCHARDET_INTERFACE float uchardet_get_confidence (uchardet_t ud, size_t candidate); UCHARDET_INTERFACE const char * uchardet_get_encoding (uchardet_t ud, size_t candidate); +UCHARDET_INTERFACE const char * uchardet_get_language (uchardet_t ud, + size_t candidate); #ifdef __cplusplus