src: new API to get the detected language.

This doesn't work for all probers yet, in particular not for the most
generic probers (such as UTF-8) or WINDOWS-1252. These will return NULL.
It's still a good first step.

Right now, it returns the 2-character language code from ISO 639-1. A
downstream project could easily get the English language name from the
XML/JSON files provided by the iso-codes project. That project also
makes it easy to localize the language name into other languages through
gettext (this is what we do in GIMP, for instance). I don't add any
dependency though, and leave it to downstream projects to implement this.

I was also wondering if we want to support region information for cases
when it would make sense. I especially wondered about it for Chinese
encodings as some of them seem quite specific to a region (according to
Wikipedia at least). For the time being though, these just return "zh".
We'll see later if it makes sense to be more accurate (maybe depending
on reports?).
This commit is contained in:
Jehan 2020-04-23 18:24:12 +02:00
parent 94736d1565
commit 4e967c9e88
51 changed files with 276 additions and 104 deletions

View File

@ -251,7 +251,8 @@ const SequenceModel Iso_8859_6ArabicModel =
64,
(float)0.9696025116913417,
PR_FALSE,
"ISO-8859-6"
"ISO-8859-6",
"ar"
};
const SequenceModel Windows_1256ArabicModel =
@ -261,5 +262,6 @@ const SequenceModel Windows_1256ArabicModel =
64,
(float)0.9696025116913417,
PR_FALSE,
"WINDOWS-1256"
"WINDOWS-1256",
"ar"
};

View File

@ -233,7 +233,8 @@ const SequenceModel Latin5BulgarianModel =
64,
(float)0.969392,
PR_FALSE,
"ISO-8859-5"
"ISO-8859-5",
"bg"
};
const SequenceModel Win1251BulgarianModel =
@ -243,5 +244,6 @@ const SequenceModel Win1251BulgarianModel =
64,
(float)0.969392,
PR_FALSE,
"WINDOWS-1251"
"WINDOWS-1251",
"bg"
};

View File

@ -238,7 +238,8 @@ const SequenceModel Windows_1250CroatianModel =
31,
(float)0.9989731099787131,
PR_TRUE,
"WINDOWS-1250"
"WINDOWS-1250",
"hr"
};
const SequenceModel Iso_8859_2CroatianModel =
@ -248,7 +249,8 @@ const SequenceModel Iso_8859_2CroatianModel =
31,
(float)0.9989731099787131,
PR_TRUE,
"ISO-8859-2"
"ISO-8859-2",
"hr"
};
const SequenceModel Iso_8859_16CroatianModel =
@ -258,7 +260,8 @@ const SequenceModel Iso_8859_16CroatianModel =
31,
(float)0.9989731099787131,
PR_TRUE,
"ISO-8859-16"
"ISO-8859-16",
"hr"
};
const SequenceModel Mac_CentraleuropeCroatianModel =
@ -268,7 +271,8 @@ const SequenceModel Mac_CentraleuropeCroatianModel =
31,
(float)0.9989731099787131,
PR_TRUE,
"MAC-CENTRALEUROPE"
"MAC-CENTRALEUROPE",
"hr"
};
const SequenceModel Iso_8859_13CroatianModel =
@ -278,7 +282,8 @@ const SequenceModel Iso_8859_13CroatianModel =
31,
(float)0.9989731099787131,
PR_TRUE,
"ISO-8859-13"
"ISO-8859-13",
"hr"
};
const SequenceModel Ibm852CroatianModel =
@ -288,5 +293,6 @@ const SequenceModel Ibm852CroatianModel =
31,
(float)0.9989731099787131,
PR_TRUE,
"IBM852"
"IBM852",
"hr"
};

View File

@ -247,7 +247,8 @@ const SequenceModel Windows_1250CzechModel =
41,
(float)0.9786035192432675,
PR_TRUE,
"WINDOWS-1250"
"WINDOWS-1250",
"cs"
};
const SequenceModel Mac_CentraleuropeCzechModel =
@ -257,7 +258,8 @@ const SequenceModel Mac_CentraleuropeCzechModel =
41,
(float)0.9786035192432675,
PR_TRUE,
"MAC-CENTRALEUROPE"
"MAC-CENTRALEUROPE",
"cs"
};
const SequenceModel Ibm852CzechModel =
@ -267,7 +269,8 @@ const SequenceModel Ibm852CzechModel =
41,
(float)0.9786035192432675,
PR_TRUE,
"IBM852"
"IBM852",
"cs"
};
const SequenceModel Iso_8859_2CzechModel =
@ -277,5 +280,6 @@ const SequenceModel Iso_8859_2CzechModel =
41,
(float)0.9786035192432675,
PR_TRUE,
"ISO-8859-2"
"ISO-8859-2",
"cs"
};

View File

@ -174,7 +174,8 @@ const SequenceModel Iso_8859_15DanishModel =
30,
(float)0.9968082796759031,
PR_TRUE,
"ISO-8859-15"
"ISO-8859-15",
"da"
};
const SequenceModel Iso_8859_1DanishModel =
@ -184,7 +185,8 @@ const SequenceModel Iso_8859_1DanishModel =
30,
(float)0.9968082796759031,
PR_TRUE,
"ISO-8859-1"
"ISO-8859-1",
"da"
};
const SequenceModel Windows_1252DanishModel =
@ -194,5 +196,6 @@ const SequenceModel Windows_1252DanishModel =
30,
(float)0.9968082796759031,
PR_TRUE,
"WINDOWS-1252"
"WINDOWS-1252",
"da"
};

View File

@ -137,5 +137,6 @@ const SequenceModel Iso_8859_3EsperantoModel =
35,
(float)0.9942980632768038,
PR_FALSE,
"ISO-8859-3"
"ISO-8859-3",
"eo"
};

View File

@ -219,7 +219,8 @@ const SequenceModel Iso_8859_4EstonianModel =
33,
(float)0.9972721312183132,
PR_TRUE,
"ISO-8859-4"
"ISO-8859-4",
"et"
};
const SequenceModel Windows_1252EstonianModel =
@ -229,7 +230,8 @@ const SequenceModel Windows_1252EstonianModel =
33,
(float)0.9972721312183132,
PR_TRUE,
"WINDOWS-1252"
"WINDOWS-1252",
"et"
};
const SequenceModel Iso_8859_15EstonianModel =
@ -239,7 +241,8 @@ const SequenceModel Iso_8859_15EstonianModel =
33,
(float)0.9972721312183132,
PR_TRUE,
"ISO-8859-15"
"ISO-8859-15",
"et"
};
const SequenceModel Iso_8859_13EstonianModel =
@ -249,7 +252,8 @@ const SequenceModel Iso_8859_13EstonianModel =
33,
(float)0.9972721312183132,
PR_TRUE,
"ISO-8859-13"
"ISO-8859-13",
"et"
};
const SequenceModel Windows_1257EstonianModel =
@ -259,5 +263,6 @@ const SequenceModel Windows_1257EstonianModel =
33,
(float)0.9972721312183132,
PR_TRUE,
"WINDOWS-1257"
"WINDOWS-1257",
"et"
};

View File

@ -237,7 +237,8 @@ const SequenceModel Iso_8859_15FinnishModel =
30,
(float)0.9985378147555799,
PR_TRUE,
"ISO-8859-15"
"ISO-8859-15",
"fi"
};
const SequenceModel Windows_1252FinnishModel =
@ -247,7 +248,8 @@ const SequenceModel Windows_1252FinnishModel =
30,
(float)0.9985378147555799,
PR_TRUE,
"WINDOWS-1252"
"WINDOWS-1252",
"fi"
};
const SequenceModel Iso_8859_4FinnishModel =
@ -257,7 +259,8 @@ const SequenceModel Iso_8859_4FinnishModel =
30,
(float)0.9985378147555799,
PR_TRUE,
"ISO-8859-4"
"ISO-8859-4",
"fi"
};
const SequenceModel Iso_8859_13FinnishModel =
@ -267,7 +270,8 @@ const SequenceModel Iso_8859_13FinnishModel =
30,
(float)0.9985378147555799,
PR_TRUE,
"ISO-8859-13"
"ISO-8859-13",
"fi"
};
const SequenceModel Iso_8859_9FinnishModel =
@ -277,7 +281,8 @@ const SequenceModel Iso_8859_9FinnishModel =
30,
(float)0.9985378147555799,
PR_TRUE,
"ISO-8859-9"
"ISO-8859-9",
"fi"
};
const SequenceModel Iso_8859_1FinnishModel =
@ -287,5 +292,6 @@ const SequenceModel Iso_8859_1FinnishModel =
30,
(float)0.9985378147555799,
PR_TRUE,
"ISO-8859-1"
"ISO-8859-1",
"fi"
};

View File

@ -182,7 +182,8 @@ const SequenceModel Windows_1252FrenchModel =
38,
(float)0.997057879992383,
PR_TRUE,
"WINDOWS-1252"
"WINDOWS-1252",
"fr"
};
const SequenceModel Iso_8859_1FrenchModel =
@ -192,7 +193,8 @@ const SequenceModel Iso_8859_1FrenchModel =
38,
(float)0.997057879992383,
PR_TRUE,
"ISO-8859-1"
"ISO-8859-1",
"fr"
};
const SequenceModel Iso_8859_15FrenchModel =
@ -202,5 +204,6 @@ const SequenceModel Iso_8859_15FrenchModel =
38,
(float)0.997057879992383,
PR_TRUE,
"ISO-8859-15"
"ISO-8859-15",
"fr"
};

View File

@ -154,7 +154,8 @@ const SequenceModel Windows_1252GermanModel =
31,
(float)0.9934041448127945,
PR_TRUE,
"WINDOWS-1252"
"WINDOWS-1252",
"de"
};
const SequenceModel Iso_8859_1GermanModel =
@ -164,5 +165,6 @@ const SequenceModel Iso_8859_1GermanModel =
31,
(float)0.9934041448127945,
PR_TRUE,
"ISO-8859-1"
"ISO-8859-1",
"de"
};

View File

@ -215,7 +215,8 @@ const SequenceModel Windows_1253GreekModel =
46,
(float)0.958419074626211,
PR_FALSE,
"WINDOWS-1253"
"WINDOWS-1253",
"el"
};
const SequenceModel Iso_8859_7GreekModel =
@ -225,5 +226,6 @@ const SequenceModel Iso_8859_7GreekModel =
46,
(float)0.958419074626211,
PR_FALSE,
"ISO-8859-7"
"ISO-8859-7",
"el"
};

View File

@ -215,6 +215,6 @@ const SequenceModel Win1255Model =
64,
(float)0.984004,
PR_FALSE,
"WINDOWS-1255"
"WINDOWS-1255",
"he"
};

View File

@ -155,7 +155,8 @@ const SequenceModel Iso_8859_2HungarianModel =
32,
(float)0.9748272224933486,
PR_FALSE,
"ISO-8859-2"
"ISO-8859-2",
"hu"
};
const SequenceModel Windows_1250HungarianModel =
@ -165,5 +166,6 @@ const SequenceModel Windows_1250HungarianModel =
32,
(float)0.9748272224933486,
PR_FALSE,
"WINDOWS-1250"
"WINDOWS-1250",
"hu"
};

View File

@ -196,7 +196,8 @@ const SequenceModel Iso_8859_1IrishModel =
31,
(float)0.9974076651249096,
PR_TRUE,
"ISO-8859-1"
"ISO-8859-1",
"ga"
};
const SequenceModel Windows_1252IrishModel =
@ -206,7 +207,8 @@ const SequenceModel Windows_1252IrishModel =
31,
(float)0.9974076651249096,
PR_TRUE,
"WINDOWS-1252"
"WINDOWS-1252",
"ga"
};
const SequenceModel Iso_8859_15IrishModel =
@ -216,7 +218,8 @@ const SequenceModel Iso_8859_15IrishModel =
31,
(float)0.9974076651249096,
PR_TRUE,
"ISO-8859-15"
"ISO-8859-15",
"ga"
};
const SequenceModel Iso_8859_9IrishModel =
@ -226,5 +229,6 @@ const SequenceModel Iso_8859_9IrishModel =
31,
(float)0.9974076651249096,
PR_TRUE,
"ISO-8859-9"
"ISO-8859-9",
"ga"
};

View File

@ -220,7 +220,8 @@ const SequenceModel Iso_8859_3ItalianModel =
34,
(float)0.9989484485502651,
PR_TRUE,
"ISO-8859-3"
"ISO-8859-3",
"it"
};
const SequenceModel Iso_8859_15ItalianModel =
@ -230,7 +231,8 @@ const SequenceModel Iso_8859_15ItalianModel =
34,
(float)0.9989484485502651,
PR_TRUE,
"ISO-8859-15"
"ISO-8859-15",
"it"
};
const SequenceModel Iso_8859_9ItalianModel =
@ -240,7 +242,8 @@ const SequenceModel Iso_8859_9ItalianModel =
34,
(float)0.9989484485502651,
PR_TRUE,
"ISO-8859-9"
"ISO-8859-9",
"it"
};
const SequenceModel Iso_8859_1ItalianModel =
@ -250,7 +253,8 @@ const SequenceModel Iso_8859_1ItalianModel =
34,
(float)0.9989484485502651,
PR_TRUE,
"ISO-8859-1"
"ISO-8859-1",
"it"
};
const SequenceModel Windows_1252ItalianModel =
@ -260,5 +264,6 @@ const SequenceModel Windows_1252ItalianModel =
34,
(float)0.9989484485502651,
PR_TRUE,
"WINDOWS-1252"
"WINDOWS-1252",
"it"
};

View File

@ -183,7 +183,8 @@ const SequenceModel Iso_8859_4LatvianModel =
39,
(float)0.9904102202220861,
PR_TRUE,
"ISO-8859-4"
"ISO-8859-4",
"lv"
};
const SequenceModel Iso_8859_10LatvianModel =
@ -193,7 +194,8 @@ const SequenceModel Iso_8859_10LatvianModel =
39,
(float)0.9904102202220861,
PR_TRUE,
"ISO-8859-10"
"ISO-8859-10",
"lv"
};
const SequenceModel Iso_8859_13LatvianModel =
@ -203,5 +205,6 @@ const SequenceModel Iso_8859_13LatvianModel =
39,
(float)0.9904102202220861,
PR_TRUE,
"ISO-8859-13"
"ISO-8859-13",
"lv"
};

View File

@ -182,7 +182,8 @@ const SequenceModel Iso_8859_10LithuanianModel =
38,
(float)0.9928710196247589,
PR_TRUE,
"ISO-8859-10"
"ISO-8859-10",
"lt"
};
const SequenceModel Iso_8859_4LithuanianModel =
@ -192,7 +193,8 @@ const SequenceModel Iso_8859_4LithuanianModel =
38,
(float)0.9928710196247589,
PR_TRUE,
"ISO-8859-4"
"ISO-8859-4",
"lt"
};
const SequenceModel Iso_8859_13LithuanianModel =
@ -202,5 +204,6 @@ const SequenceModel Iso_8859_13LithuanianModel =
38,
(float)0.9928710196247589,
PR_TRUE,
"ISO-8859-13"
"ISO-8859-13",
"lt"
};

View File

@ -133,5 +133,6 @@ const SequenceModel Iso_8859_3MalteseModel =
31,
(float)0.9959115850692665,
PR_TRUE,
"ISO-8859-3"
"ISO-8859-3",
"mt"
};

View File

@ -244,7 +244,8 @@ const SequenceModel Ibm852PolishModel =
37,
(float)0.9894531815946438,
PR_TRUE,
"IBM852"
"IBM852",
"pl"
};
const SequenceModel Iso_8859_16PolishModel =
@ -254,7 +255,8 @@ const SequenceModel Iso_8859_16PolishModel =
37,
(float)0.9894531815946438,
PR_TRUE,
"ISO-8859-16"
"ISO-8859-16",
"pl"
};
const SequenceModel Iso_8859_2PolishModel =
@ -264,7 +266,8 @@ const SequenceModel Iso_8859_2PolishModel =
37,
(float)0.9894531815946438,
PR_TRUE,
"ISO-8859-2"
"ISO-8859-2",
"pl"
};
const SequenceModel Mac_CentraleuropePolishModel =
@ -274,7 +277,8 @@ const SequenceModel Mac_CentraleuropePolishModel =
37,
(float)0.9894531815946438,
PR_TRUE,
"MAC-CENTRALEUROPE"
"MAC-CENTRALEUROPE",
"pl"
};
const SequenceModel Iso_8859_13PolishModel =
@ -284,7 +288,8 @@ const SequenceModel Iso_8859_13PolishModel =
37,
(float)0.9894531815946438,
PR_TRUE,
"ISO-8859-13"
"ISO-8859-13",
"pl"
};
const SequenceModel Windows_1250PolishModel =
@ -294,5 +299,6 @@ const SequenceModel Windows_1250PolishModel =
37,
(float)0.9894531815946438,
PR_TRUE,
"WINDOWS-1250"
"WINDOWS-1250",
"pl"
};

View File

@ -203,7 +203,8 @@ const SequenceModel Iso_8859_1PortugueseModel =
38,
(float)0.9953179582313172,
PR_TRUE,
"ISO-8859-1"
"ISO-8859-1",
"pt"
};
const SequenceModel Iso_8859_9PortugueseModel =
@ -213,7 +214,8 @@ const SequenceModel Iso_8859_9PortugueseModel =
38,
(float)0.9953179582313172,
PR_TRUE,
"ISO-8859-9"
"ISO-8859-9",
"pt"
};
const SequenceModel Iso_8859_15PortugueseModel =
@ -223,7 +225,8 @@ const SequenceModel Iso_8859_15PortugueseModel =
38,
(float)0.9953179582313172,
PR_TRUE,
"ISO-8859-15"
"ISO-8859-15",
"pt"
};
const SequenceModel Windows_1252PortugueseModel =
@ -233,5 +236,6 @@ const SequenceModel Windows_1252PortugueseModel =
38,
(float)0.9953179582313172,
PR_TRUE,
"WINDOWS-1252"
"WINDOWS-1252",
"pt"
};

View File

@ -198,7 +198,8 @@ const SequenceModel Iso_8859_16RomanianModel =
33,
(float)0.997762564143313,
PR_TRUE,
"ISO-8859-16"
"ISO-8859-16",
"ro"
};
const SequenceModel Iso_8859_2RomanianModel =
@ -208,7 +209,8 @@ const SequenceModel Iso_8859_2RomanianModel =
33,
(float)0.997762564143313,
PR_TRUE,
"ISO-8859-2"
"ISO-8859-2",
"ro"
};
const SequenceModel Windows_1250RomanianModel =
@ -218,7 +220,8 @@ const SequenceModel Windows_1250RomanianModel =
33,
(float)0.997762564143313,
PR_TRUE,
"WINDOWS-1250"
"WINDOWS-1250",
"ro"
};
const SequenceModel Ibm852RomanianModel =
@ -228,5 +231,6 @@ const SequenceModel Ibm852RomanianModel =
33,
(float)0.997762564143313,
PR_TRUE,
"IBM852"
"IBM852",
"ro"
};

View File

@ -307,7 +307,8 @@ const SequenceModel Koi8rRussianModel =
64,
(float)0.976601,
PR_FALSE,
"KOI8-R"
"KOI8-R",
"ru"
};
const SequenceModel Win1251RussianModel =
@ -317,7 +318,8 @@ const SequenceModel Win1251RussianModel =
64,
(float)0.976601,
PR_FALSE,
"WINDOWS-1251"
"WINDOWS-1251",
"ru"
};
const SequenceModel Latin5RussianModel =
@ -327,7 +329,8 @@ const SequenceModel Latin5RussianModel =
64,
(float)0.976601,
PR_FALSE,
"ISO-8859-5"
"ISO-8859-5",
"ru"
};
const SequenceModel MacCyrillicRussianModel =
@ -337,7 +340,8 @@ const SequenceModel MacCyrillicRussianModel =
64,
(float)0.976601,
PR_FALSE,
"MAC-CYRILLIC"
"MAC-CYRILLIC",
"ru"
};
const SequenceModel Ibm866RussianModel =
@ -347,7 +351,8 @@ const SequenceModel Ibm866RussianModel =
64,
(float)0.976601,
PR_FALSE,
"IBM866"
"IBM866",
"ru"
};
const SequenceModel Ibm855RussianModel =
@ -357,5 +362,6 @@ const SequenceModel Ibm855RussianModel =
64,
(float)0.976601,
PR_FALSE,
"IBM855"
"IBM855",
"ru"
};

View File

@ -255,7 +255,8 @@ const SequenceModel Ibm852SlovakModel =
45,
(float)0.9733303573968434,
PR_TRUE,
"IBM852"
"IBM852",
"sk"
};
const SequenceModel Iso_8859_2SlovakModel =
@ -265,7 +266,8 @@ const SequenceModel Iso_8859_2SlovakModel =
45,
(float)0.9733303573968434,
PR_TRUE,
"ISO-8859-2"
"ISO-8859-2",
"sk"
};
const SequenceModel Mac_CentraleuropeSlovakModel =
@ -275,7 +277,8 @@ const SequenceModel Mac_CentraleuropeSlovakModel =
45,
(float)0.9733303573968434,
PR_TRUE,
"MAC-CENTRALEUROPE"
"MAC-CENTRALEUROPE",
"sk"
};
const SequenceModel Windows_1250SlovakModel =
@ -285,5 +288,6 @@ const SequenceModel Windows_1250SlovakModel =
45,
(float)0.9733303573968434,
PR_TRUE,
"WINDOWS-1250"
"WINDOWS-1250",
"sk"
};

View File

@ -215,7 +215,8 @@ const SequenceModel Iso_8859_2SloveneModel =
29,
(float)0.9983524317161332,
PR_TRUE,
"ISO-8859-2"
"ISO-8859-2",
"sl"
};
const SequenceModel Iso_8859_16SloveneModel =
@ -225,7 +226,8 @@ const SequenceModel Iso_8859_16SloveneModel =
29,
(float)0.9983524317161332,
PR_TRUE,
"ISO-8859-16"
"ISO-8859-16",
"sl"
};
const SequenceModel Windows_1250SloveneModel =
@ -235,7 +237,8 @@ const SequenceModel Windows_1250SloveneModel =
29,
(float)0.9983524317161332,
PR_TRUE,
"WINDOWS-1250"
"WINDOWS-1250",
"sl"
};
const SequenceModel Mac_CentraleuropeSloveneModel =
@ -245,7 +248,8 @@ const SequenceModel Mac_CentraleuropeSloveneModel =
29,
(float)0.9983524317161332,
PR_TRUE,
"MAC-CENTRALEUROPE"
"MAC-CENTRALEUROPE",
"sl"
};
const SequenceModel Ibm852SloveneModel =
@ -255,5 +259,6 @@ const SequenceModel Ibm852SloveneModel =
29,
(float)0.9983524317161332,
PR_TRUE,
"IBM852"
"IBM852",
"sl"
};

View File

@ -177,7 +177,8 @@ const SequenceModel Iso_8859_1SpanishModel =
33,
(float)0.9970385677528184,
PR_TRUE,
"ISO-8859-1"
"ISO-8859-1",
"es"
};
const SequenceModel Iso_8859_15SpanishModel =
@ -187,7 +188,8 @@ const SequenceModel Iso_8859_15SpanishModel =
33,
(float)0.9970385677528184,
PR_TRUE,
"ISO-8859-15"
"ISO-8859-15",
"es"
};
const SequenceModel Windows_1252SpanishModel =
@ -197,5 +199,6 @@ const SequenceModel Windows_1252SpanishModel =
33,
(float)0.9970385677528184,
PR_TRUE,
"WINDOWS-1252"
"WINDOWS-1252",
"es"
};

View File

@ -217,7 +217,8 @@ const SequenceModel Windows_1252SwedishModel =
31,
(float)0.997323508584682,
PR_TRUE,
"WINDOWS-1252"
"WINDOWS-1252",
"sv"
};
const SequenceModel Iso_8859_9SwedishModel =
@ -227,7 +228,8 @@ const SequenceModel Iso_8859_9SwedishModel =
31,
(float)0.997323508584682,
PR_TRUE,
"ISO-8859-9"
"ISO-8859-9",
"sv"
};
const SequenceModel Iso_8859_1SwedishModel =
@ -237,7 +239,8 @@ const SequenceModel Iso_8859_1SwedishModel =
31,
(float)0.997323508584682,
PR_TRUE,
"ISO-8859-1"
"ISO-8859-1",
"sv"
};
const SequenceModel Iso_8859_4SwedishModel =
@ -247,7 +250,8 @@ const SequenceModel Iso_8859_4SwedishModel =
31,
(float)0.997323508584682,
PR_TRUE,
"ISO-8859-4"
"ISO-8859-4",
"sv"
};
const SequenceModel Iso_8859_15SwedishModel =
@ -257,5 +261,6 @@ const SequenceModel Iso_8859_15SwedishModel =
31,
(float)0.997323508584682,
PR_TRUE,
"ISO-8859-15"
"ISO-8859-15",
"sv"
};

View File

@ -251,7 +251,8 @@ const SequenceModel Tis_620ThaiModel =
64,
(float)0.8815720594354438,
PR_FALSE,
"TIS-620"
"TIS-620",
"th"
};
const SequenceModel Iso_8859_11ThaiModel =
@ -261,5 +262,6 @@ const SequenceModel Iso_8859_11ThaiModel =
64,
(float)0.8815720594354438,
PR_FALSE,
"ISO-8859-11"
"ISO-8859-11",
"th"
};

View File

@ -159,7 +159,8 @@ const SequenceModel Iso_8859_3TurkishModel =
36,
(float)0.991865243864388,
PR_FALSE,
"ISO-8859-3"
"ISO-8859-3",
"tr"
};
const SequenceModel Iso_8859_9TurkishModel =
@ -169,5 +170,6 @@ const SequenceModel Iso_8859_9TurkishModel =
36,
(float)0.991865243864388,
PR_FALSE,
"ISO-8859-9"
"ISO-8859-9",
"tr"
};

View File

@ -233,7 +233,8 @@ const SequenceModel Windows_1258VietnameseModel =
55,
(float)0.9321889118082535,
PR_FALSE,
"WINDOWS-1258"
"WINDOWS-1258",
"vi"
};
const SequenceModel VisciiVietnameseModel =
@ -243,5 +244,6 @@ const SequenceModel VisciiVietnameseModel =
55,
(float)0.9321889118082535,
PR_FALSE,
"VISCII"
"VISCII",
"vi"
};

View File

@ -51,6 +51,7 @@ public:
virtual ~nsBig5Prober(void){delete mCodingSM;}
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
const char* GetCharSetName() {return "BIG5";}
/* BIG5 text is always reported as Chinese ("zh", ISO 639-1);
 * no region information is attached. */
const char* GetLanguage()
{
  return "zh";
}
nsProbingState GetState(void) {return mState;}
void Reset(void);
float GetConfidence(void);

View File

@ -54,6 +54,7 @@ class nsCharSetProber {
public:
virtual ~nsCharSetProber() {}
virtual const char* GetCharSetName() = 0;
/* Language matching the detected charset, as a 2-character ISO 639-1
 * code. Implementations which cannot tell the language (generic
 * probers) return NULL, so callers must handle a NULL result. */
virtual const char* GetLanguage() = 0;
virtual nsProbingState HandleData(const char* aBuf, PRUint32 aLen) = 0;
virtual nsProbingState GetState(void) = 0;
virtual void Reset(void) = 0;

View File

@ -57,6 +57,7 @@ public:
virtual ~nsEUCJPProber(void){delete mCodingSM;}
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
const char* GetCharSetName() {return "EUC-JP";}
/* EUC-JP text is always reported as Japanese ("ja", ISO 639-1). */
const char* GetLanguage()
{
  return "ja";
}
nsProbingState GetState(void) {return mState;}
void Reset(void);
float GetConfidence(void);

View File

@ -57,6 +57,7 @@ public:
* Korean documents are actually created with this character set.
*/
const char* GetCharSetName() {return "UHC";}
/* Text matched by this prober (reported as "UHC") is always
 * Korean ("ko", ISO 639-1). */
const char* GetLanguage()
{
  return "ko";
}
nsProbingState GetState(void) {return mState;}
void Reset(void);
float GetConfidence(void);

View File

@ -51,6 +51,7 @@ public:
virtual ~nsEUCTWProber(void){delete mCodingSM;}
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
const char* GetCharSetName() {return "EUC-TW";}
/* EUC-TW text is always reported as Chinese ("zh", ISO 639-1);
 * no region information is attached. */
const char* GetLanguage()
{
  return "zh";
}
nsProbingState GetState(void) {return mState;}
void Reset(void);
float GetConfidence(void);

View File

@ -38,6 +38,8 @@
#ifndef nsEscCharSetProber_h__
#define nsEscCharSetProber_h__
#include <cstddef>
#include "nsCharSetProber.h"
#include "nsCodingStateMachine.h"
@ -49,6 +51,7 @@ public:
virtual ~nsEscCharSetProber(void);
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
const char* GetCharSetName() {return mDetectedCharset;}
/* This prober does not report any language: always NULL. */
const char* GetLanguage()
{
  return NULL;
}
nsProbingState GetState(void) {return mState;}
void Reset(void);
float GetConfidence(void){return (float)0.99;}

View File

@ -53,6 +53,7 @@ public:
virtual ~nsGB18030Prober(void){delete mCodingSM;}
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
const char* GetCharSetName() {return "GB18030";}
/* GB18030 text is always reported as Chinese ("zh", ISO 639-1);
 * no region information is attached. */
const char* GetLanguage()
{
  return "zh";
}
nsProbingState GetState(void) {return mState;}
void Reset(void);
float GetConfidence(void);

View File

@ -49,7 +49,8 @@ public:
virtual ~nsHebrewProber(void) {}
virtual nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
virtual const char* GetCharSetName();
virtual const char *GetCharSetName();
/* The Hebrew prober always reports Hebrew ("he", ISO 639-1). */
virtual const char *GetLanguage(void)
{
  return "he";
}
virtual void Reset(void);
virtual nsProbingState GetState(void);

View File

@ -39,6 +39,8 @@
#ifndef nsLatin1Prober_h__
#define nsLatin1Prober_h__
#include <cstddef>
#include "nsCharSetProber.h"
#define FREQ_CAT_NUM 4
@ -49,6 +51,7 @@ public:
virtual ~nsLatin1Prober(void){}
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
const char* GetCharSetName() {return "WINDOWS-1252";}
/* This generic WINDOWS-1252 prober cannot tell the language:
 * always NULL. */
const char* GetLanguage()
{
  return NULL;
}
nsProbingState GetState(void) {return mState;}
void Reset(void);
float GetConfidence(void);

View File

@ -97,6 +97,18 @@ const char* nsMBCSGroupProber::GetCharSetName()
return mProbers[mBestGuess]->GetCharSetName();
}
/* Language reported by the best-guess prober of the group, or NULL
 * when there is still no best guess after asking for the confidence. */
const char* nsMBCSGroupProber::GetLanguage(void)
{
  /* No best guess yet: calling GetConfidence() presumably elects one
   * (mBestGuess is checked again right after). */
  if (mBestGuess == -1)
    GetConfidence();

  return (mBestGuess != -1) ? mProbers[mBestGuess]->GetLanguage() : NULL;
}
void nsMBCSGroupProber::Reset(void)
{
mActiveNum = 0;

View File

@ -55,6 +55,7 @@ public:
virtual ~nsMBCSGroupProber();
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
const char* GetCharSetName();
const char* GetLanguage();
nsProbingState GetState(void) {return mState;}
void Reset(void);
float GetConfidence(void);

View File

@ -217,6 +217,17 @@ const char* nsSBCSGroupProber::GetCharSetName()
return mProbers[mBestGuess]->GetCharSetName();
}
/* Language reported by the best-guess prober of the group. When no
 * best guess exists even after asking for the confidence, the first
 * prober becomes the best guess and its language is returned. */
const char* nsSBCSGroupProber::GetLanguage()
{
  if (mBestGuess != -1)
    return mProbers[mBestGuess]->GetLanguage();

  /* No answer yet: give GetConfidence() a chance to pick one. */
  GetConfidence();

  /* Still nothing: fall back on the first prober. */
  if (mBestGuess == -1)
    mBestGuess = 0;

  return mProbers[mBestGuess]->GetLanguage();
}
void nsSBCSGroupProber::Reset(void)
{
mActiveNum = 0;

View File

@ -49,6 +49,7 @@ public:
virtual ~nsSBCSGroupProber();
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
const char* GetCharSetName();
const char* GetLanguage();
nsProbingState GetState(void) {return mState;}
void Reset(void);
float GetConfidence(void);

View File

@ -145,6 +145,13 @@ const char* nsSingleByteCharSetProber::GetCharSetName()
return mNameProber->GetCharSetName();
}
/* Language of this prober: delegated to the name prober when one was
 * given, otherwise read from the sequence model. */
const char* nsSingleByteCharSetProber::GetLanguage()
{
  return mNameProber ? mNameProber->GetLanguage() : mModel->langName;
}
#ifdef DEBUG_chardet
void nsSingleByteCharSetProber::DumpStatus()
{

View File

@ -75,6 +75,7 @@ typedef struct
float mTypicalPositiveRatio; // = freqSeqs / totalSeqs
PRBool keepEnglishLetter; // says if this script contains English characters (not implemented)
const char* const charsetName;
const char* const langName;
} SequenceModel;
@ -86,6 +87,7 @@ public:
:mModel(model), mReversed(reversed), mNameProber(nameProber) { Reset(); }
virtual const char* GetCharSetName();
virtual const char* GetLanguage();
virtual nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
virtual nsProbingState GetState(void) {return mState;}
virtual void Reset(void);

View File

@ -58,6 +58,7 @@ public:
virtual ~nsSJISProber(void){delete mCodingSM;}
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
const char* GetCharSetName() {return "SHIFT_JIS";}
/* SHIFT_JIS text is always reported as Japanese ("ja", ISO 639-1). */
const char* GetLanguage()
{
  return "ja";
}
nsProbingState GetState(void) {return mState;}
void Reset(void);
float GetConfidence(void);

View File

@ -38,6 +38,7 @@
#ifndef nsUTF8Prober_h__
#define nsUTF8Prober_h__
#include <cstddef>
#include "nsCharSetProber.h"
#include "nsCodingStateMachine.h"
@ -49,6 +50,7 @@ public:
virtual ~nsUTF8Prober(){delete mCodingSM;}
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
const char* GetCharSetName() {return "UTF-8";}
/* UTF-8 is not tied to any language: always NULL. */
const char* GetLanguage()
{
  return NULL;
}
nsProbingState GetState(void) {return mState;}
void Reset(void);
float GetConfidence(void);

View File

@ -305,7 +305,7 @@ void nsUniversalDetector::DataEnd()
* when finding them.
*/
mDone = PR_TRUE;
Report(mDetectedCharset, 1.0);
Report(mDetectedCharset, NULL, 1.0);
return;
}
@ -323,7 +323,9 @@ void nsUniversalDetector::DataEnd()
if (proberConfidence > MINIMUM_THRESHOLD)
/* Only report what we are confident in. */
Report(mCharSetProbers[i]->GetCharSetName(), proberConfidence);
Report(mCharSetProbers[i]->GetCharSetName(),
mCharSetProbers[i]->GetLanguage(),
proberConfidence);
}
}
}

View File

@ -69,7 +69,8 @@ public:
virtual void DataEnd(void);
protected:
virtual void Report(const char* aCharset,
virtual void Report(const char *encoding,
const char *language,
float confidence) = 0;
virtual void Reset();
nsInputState mInputState;

View File

@ -9,6 +9,7 @@ set(
uchardet_get_candidates
uchardet_get_encoding
uchardet_get_confidence
uchardet_get_language
)
set (LINK_FLAGS "")

View File

@ -65,6 +65,7 @@ public:
}
virtual void Report(const char *encoding,
const char *language,
float confidence)
{
std::vector<UChardetCandidate>::iterator it;
@ -72,7 +73,8 @@ public:
for (it = candidates.begin(); it != candidates.end(); it++)
{
if (strcmp(it->encoding, encoding) == 0)
if (strcmp(it->encoding, encoding) == 0 &&
    /* Languages match when both are unset (NULL) or both are equal.
     * Without the NULL/NULL case, a candidate reported without any
     * language would never be recognized as already reported and
     * would be appended again. */
    ((!it->language && !language) ||
     (it->language && language && strcmp(it->language, language) == 0)))
{
/* Already reported. Bail out or update the confidence
* when needed.
@ -91,6 +93,7 @@ public:
candidate = UChardetCandidate();
candidate.encoding = strdup(encoding);
candidate.language = language ? strdup(language) : NULL;
candidate.confidence = confidence;
for (it = candidates.begin(); it != candidates.end(); it++)
@ -107,7 +110,11 @@ public:
nsUniversalDetector::Reset();
for (it = candidates.begin(); it != candidates.end(); it++)
{
free(it->encoding);
if (it->language)
free(it->language);
}
candidates.clear();
}
@ -125,6 +132,12 @@ public:
{
return (candidates.size() > i) ? candidates[i].confidence : 0.0;
}
/* Language of the i-th candidate. NULL when `i` is out of range;
 * the stored language may itself be NULL. */
const char* GetLanguage(size_t i) const
{
    if (i >= candidates.size())
        return NULL;

    return candidates[i].language;
}
};
uchardet_t uchardet_new(void)
@ -178,3 +191,9 @@ const char * uchardet_get_encoding (uchardet_t ud,
{
return reinterpret_cast<HandleUniversalDetector*>(ud)->GetCharset(candidate);
}
/* C entry point: forwards to HandleUniversalDetector::GetLanguage()
 * on the detector hidden behind the opaque handle. */
const char * uchardet_get_language (uchardet_t ud,
                                    size_t     candidate)
{
    HandleUniversalDetector *detector =
        reinterpret_cast<HandleUniversalDetector*>(ud);

    return detector->GetLanguage(candidate);
}

View File

@ -120,6 +120,8 @@ UCHARDET_INTERFACE float uchardet_get_confidence (uchardet_t ud,
size_t candidate);
UCHARDET_INTERFACE const char * uchardet_get_encoding (uchardet_t ud,
size_t candidate);
/**
 * Retrieve the language detected for a candidate, as a 2-character
 * ISO 639-1 code (e.g. "fr").
 * The returned string is owned by the detector (it is released when
 * the detector is reset): do not free it.
 * @param ud [in] handle of an instance of uchardet
 * @param candidate [in] index of the candidate
 * @return the language code, or NULL when the candidate index is out
 *         of range or when no language is known for this candidate.
 */
UCHARDET_INTERFACE const char * uchardet_get_language (uchardet_t ud,
size_t candidate);
#ifdef __cplusplus