uchardet/src/LangModels/LangLatvianModel.cpp
Jehan 5c3a2e8037 src, script: regenerate all existing language models.
Now making sure that we have a generic language model working with UTF-8
for all 26 supported models which had single-byte encoding support until
now.
2021-03-17 02:07:17 +01:00

238 lines
12 KiB
C++

/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is Mozilla Communicator client code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 1998
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
#include "../nsSBCharSetProber.h"
#include "../nsLanguageDetector.h"
/********* Language model for: Latvian *********/
/**
* Generated by BuildLangModel.py
* On: 2021-03-16 19:30:28.293047
**/
/* Character Mapping Table:
* ILL: illegal character.
* CTR: control character specific to the charset.
* RET: carriage return or line feed (bytes 0x0D and 0x0A).
* SYM: symbol (punctuation) that does not belong to word.
* NUM: 0 - 9.
*
* Other characters are ordered by probabilities
* (0 is the most common character in the language).
*
* Orders are generic to a language. So the codepoint with order X in
* CHARSET1 maps to the same character as the codepoint with the same
* order X in CHARSET2 for the same language.
* As such, some orders may be missing from a given charset's table. For
* instance the ligature of 'o' and 'e' exists in ISO-8859-15 but not in
* ISO-8859-1 even though they are both used for French. Same for the euro sign.
*/
/* Byte-to-order table for ISO-8859-4: 256 entries indexed by byte value,
 * written as 16 rows of 16 (the trailing comment on each row gives the high
 * nibble, the column is the low nibble).
 * Rows 4X/5X and 6X/7X carry identical orders for the letter positions, so
 * upper- and lower-case ASCII letters share an order.
 * CTR, RET, SYM and NUM are class markers that are not defined in this file
 * (presumably they come from ../nsSBCharSetProber.h - TODO confirm). */
static const unsigned char Iso_8859_4_CharToOrderMap[] =
{
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
SYM, 0, 17, 22, 13, 3, 25, 19, 28, 1, 15, 11, 9, 12, 7, 10, /* 4X */
16, 39, 5, 2, 4, 6, 14, 34, 35, 33, 20,SYM,SYM,SYM,SYM,SYM, /* 5X */
SYM, 0, 17, 22, 13, 3, 25, 19, 28, 1, 15, 11, 9, 12, 7, 10, /* 6X */
16, 39, 5, 2, 4, 6, 14, 34, 35, 33, 20,SYM,SYM,SYM,SYM,CTR, /* 7X */
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */
SYM, 55, 56, 57,SYM, 58, 26,SYM,SYM, 23, 21, 31, 59,SYM, 29,SYM, /* AX */
SYM, 60,SYM, 61,SYM, 62, 26,SYM,SYM, 23, 21, 31, 63, 49, 29, 49, /* BX */
8, 40, 64, 65, 41, 54, 42, 66, 32, 36, 67, 43, 46, 47, 44, 18, /* CX */
68, 24, 53, 30, 69, 70, 37,SYM, 71, 72, 73, 74, 38, 75, 27, 48, /* DX */
8, 40, 76, 77, 41, 54, 42, 78, 32, 36, 79, 43, 46, 47, 44, 18, /* EX */
80, 24, 53, 30, 81, 82, 37,SYM, 83, 84, 85, 86, 38, 87, 27,SYM, /* FX */
};
/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
/* Byte-to-order table for ISO-8859-10: 256 entries indexed by byte value,
 * written as 16 rows of 16 (the trailing comment on each row gives the high
 * nibble, the column is the low nibble).
 * The lower half (rows 0X-7X) is the plain ASCII layout; only rows AX-FX are
 * specific to this charset.
 * CTR, RET, SYM and NUM are class markers that are not defined in this file
 * (presumably they come from ../nsSBCharSetProber.h - TODO confirm). */
static const unsigned char Iso_8859_10_CharToOrderMap[] =
{
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
SYM, 0, 17, 22, 13, 3, 25, 19, 28, 1, 15, 11, 9, 12, 7, 10, /* 4X */
16, 39, 5, 2, 4, 6, 14, 34, 35, 33, 20,SYM,SYM,SYM,SYM,SYM, /* 5X */
SYM, 0, 17, 22, 13, 3, 25, 19, 28, 1, 15, 11, 9, 12, 7, 10, /* 6X */
16, 39, 5, 2, 4, 6, 14, 34, 35, 33, 20,SYM,SYM,SYM,SYM,CTR, /* 7X */
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */
SYM, 88, 21, 31, 18, 89, 30,SYM, 26, 90, 23, 91, 29,SYM, 27, 49, /* AX */
SYM, 92, 21, 31, 18, 93, 30,SYM, 26, 94, 23, 95, 29, 96, 27, 49, /* BX */
8, 40, 97, 98, 41, 54, 42, 99, 32, 36,100, 43, 46, 47, 44,101, /* CX */
52, 24, 53, 45,102,103, 37,104,105,106,107,108, 38,109, 51, 48, /* DX */
8, 40,110,111, 41, 54, 42,112, 32, 36,113, 43, 46, 47, 44,114, /* EX */
52, 24, 53, 45,115,116, 37,117,118,119,120,121, 38,122, 51,123, /* FX */
};
/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
/* Byte-to-order table for ISO-8859-13: 256 entries indexed by byte value,
 * written as 16 rows of 16 (the trailing comment on each row gives the high
 * nibble, the column is the low nibble).
 * The lower half (rows 0X-7X) is the plain ASCII layout; only rows AX-FX are
 * specific to this charset.
 * CTR, RET, SYM and NUM are class markers that are not defined in this file
 * (presumably they come from ../nsSBCharSetProber.h - TODO confirm). */
static const unsigned char Iso_8859_13_CharToOrderMap[] =
{
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
SYM, 0, 17, 22, 13, 3, 25, 19, 28, 1, 15, 11, 9, 12, 7, 10, /* 4X */
16, 39, 5, 2, 4, 6, 14, 34, 35, 33, 20,SYM,SYM,SYM,SYM,SYM, /* 5X */
SYM, 0, 17, 22, 13, 3, 25, 19, 28, 1, 15, 11, 9, 12, 7, 10, /* 6X */
16, 39, 5, 2, 4, 6, 14, 34, 35, 33, 20,SYM,SYM,SYM,SYM,CTR, /* 7X */
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,124,SYM,125,SYM,SYM,SYM,SYM, 42, /* AX */
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,126,SYM,127,SYM,SYM,SYM,SYM, 42, /* BX */
128,129, 8,130, 41, 54,131, 21, 32, 36,132, 46, 31, 30, 18, 26, /* CX */
23,133, 24, 45, 53,134, 37,SYM,135, 50,136, 27, 38,137, 29, 48, /* DX */
138,139, 8,140, 41, 54,141, 21, 32, 36,142, 46, 31, 30, 18, 26, /* EX */
23,143, 24, 45, 53,144, 37,SYM,145, 50,146, 27, 38,147, 29,SYM, /* FX */
};
/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
/* Number of (code point, order) pairs stored in Unicode_CharOrder. */
static const int Unicode_Char_size = 80;

/* Flattened list of (Unicode code point, order) pairs, sorted by ascending
 * code point: element 2*i is the code point, element 2*i + 1 its order.
 * Upper- and lower-case forms of a letter share the same order. */
static const unsigned int Unicode_CharOrder[] =
{
  /* 'A' .. 'Z' (U+0041 .. U+005A) */
  65, 0,    66, 17,   67, 22,   68, 13,
  69, 3,    70, 25,   71, 19,   72, 28,
  73, 1,    74, 15,   75, 11,   76, 9,
  77, 12,   78, 7,    79, 10,   80, 16,
  81, 39,   82, 5,    83, 2,    84, 4,
  85, 6,    86, 14,   87, 34,   88, 35,
  89, 33,   90, 20,

  /* 'a' .. 'z' (U+0061 .. U+007A) */
  97, 0,    98, 17,   99, 22,   100, 13,
  101, 3,   102, 25,  103, 19,  104, 28,
  105, 1,   106, 15,  107, 11,  108, 9,
  109, 12,  110, 7,   111, 10,  112, 16,
  113, 39,  114, 5,   115, 2,   116, 4,
  117, 6,   118, 14,  119, 34,  120, 35,
  121, 33,  122, 20,

  /* U+00C9, U+00D6, U+00DC and their lower-case forms U+00E9, U+00F6, U+00FC */
  201, 36,  214, 37,  220, 38,
  233, 36,  246, 37,  252, 38,

  /* U+0100 .. U+017E: one upper/lower pair per row */
  256, 8,   257, 8,
  268, 32,  269, 32,
  274, 21,  275, 21,
  290, 31,  291, 31,
  298, 18,  299, 18,
  310, 30,  311, 30,
  315, 26,  316, 26,
  325, 24,  326, 24,
  352, 23,  353, 23,
  362, 27,  363, 27,
  381, 29,  382, 29,
};
/* Model Table:
* Total sequences: 982
* First 512 sequences: 0.9904642991017133
* Next 512 sequences (512-1024): 0.009535700898286757
* Rest: -5.377642775528102e-17 (floating-point rounding residue; effectively 0)
* Negative sequences: TODO
*/
/* Sequence table shared by every Latvian model in this file.
 * It is written as 40 rows of 40 entries (1600 values), matching the count of
 * 40 passed next to it in the model definitions below. The values 0, 2 and 3
 * appear in it.
 * NOTE(review): presumably cell [a * 40 + b] classifies how likely the
 * character of order a is to be followed by the character of order b -
 * confirm against the prober that consumes it. */
static const PRUint8 LatvianLangModel[] =
{
2,3,3,3,3,3,3,3,2,3,2,3,3,3,3,3,3,3,2,3,3,2,3,3,3,3,3,0,3,3,2,2,3,2,2,2,0,0,0,0,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,2,3,3,3,3,3,2,3,3,3,2,3,0,0,2,2,0,0,2,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,2,2,3,2,0,2,2,2,3,2,2,0,0,0,2,2,0,0,0,2,2,
3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,2,3,3,2,3,3,3,3,3,0,3,3,2,3,2,2,2,2,0,0,0,0,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,3,3,3,0,0,2,0,2,2,0,0,0,0,0,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,2,2,3,3,2,3,2,2,2,2,0,2,2,2,2,
3,3,3,2,3,3,2,3,3,3,2,3,3,3,3,3,3,3,2,3,3,2,3,3,3,0,3,0,2,2,2,2,3,2,0,0,2,0,0,0,
3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,2,2,3,3,3,2,3,3,3,0,3,0,2,2,2,0,0,3,2,0,0,2,0,0,2,
2,2,3,2,3,3,2,3,0,3,0,3,3,3,3,3,3,3,0,2,3,0,3,3,3,3,3,0,0,2,0,2,2,0,0,0,0,0,0,0,
3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,0,3,2,2,0,2,2,2,2,2,2,0,0,0,
3,2,3,2,3,3,3,3,2,3,2,3,3,3,3,3,3,3,0,3,3,2,3,3,3,3,3,0,2,3,2,3,2,2,2,2,0,0,0,0,
3,3,3,3,3,3,3,3,3,3,3,2,3,2,3,2,2,2,2,2,2,2,3,3,2,0,3,2,2,0,0,0,0,2,0,0,2,2,2,0,
3,3,3,3,3,2,3,3,3,2,3,3,2,3,2,3,3,3,3,2,0,3,2,2,0,2,0,3,0,0,0,2,0,2,0,2,2,0,2,0,
3,3,3,3,2,3,3,3,3,2,3,2,3,0,3,2,2,2,3,2,3,3,2,2,2,0,0,3,0,3,0,0,0,0,2,0,2,0,2,0,
3,3,3,3,2,2,3,2,3,2,3,2,2,2,2,3,3,2,3,2,2,3,2,0,2,2,0,2,0,0,0,0,0,2,0,0,0,0,0,0,
3,3,3,3,0,2,3,3,3,2,3,2,2,2,2,0,2,0,2,2,0,3,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,3,3,3,3,2,3,2,3,2,2,2,2,2,0,2,2,0,0,0,0,2,0,0,0,
3,3,3,3,2,3,3,2,3,3,3,2,2,2,2,2,2,2,3,0,2,3,2,2,0,0,2,3,2,0,0,0,2,2,0,0,0,2,2,0,
0,0,3,0,3,3,0,3,0,3,0,3,3,3,3,3,3,3,0,3,3,0,3,3,3,2,2,0,0,2,2,0,2,0,0,0,0,0,0,0,
3,3,3,3,3,3,3,2,3,3,3,2,2,2,2,0,2,0,0,2,2,2,0,3,0,2,3,3,2,2,0,0,0,2,0,0,2,0,2,0,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,2,3,3,2,3,0,0,2,2,0,0,0,0,2,2,0,0,0,0,0,
2,0,3,0,3,3,2,3,0,3,0,3,3,3,3,3,2,2,0,3,2,0,3,3,2,2,3,0,0,2,2,3,0,0,0,0,0,0,0,0,
3,3,3,3,2,2,3,2,3,2,3,3,2,2,2,0,2,2,3,0,2,3,2,2,0,0,0,2,3,0,0,2,0,2,0,0,0,0,0,0,
3,3,3,3,3,2,3,3,3,3,3,2,2,2,3,2,2,2,3,2,2,3,0,0,2,0,2,2,0,0,3,0,2,0,0,0,0,0,0,0,
3,3,2,3,0,0,3,2,3,0,3,0,2,2,2,2,2,2,0,2,0,3,2,3,0,0,0,2,0,0,2,2,0,0,0,0,0,0,0,0,
3,3,3,3,2,3,3,2,2,3,3,2,0,0,0,0,0,0,2,2,0,2,0,2,0,2,0,2,0,0,0,0,0,0,0,0,2,0,2,0,
3,3,2,3,0,2,3,2,3,2,3,2,2,2,2,2,2,0,2,0,0,2,2,2,2,0,2,2,0,0,2,3,0,0,0,0,0,0,0,0,
0,2,3,0,3,3,0,3,0,3,2,3,2,3,3,2,3,0,0,2,3,0,3,2,0,2,0,0,2,2,0,0,0,0,0,0,0,0,0,0,
3,3,2,3,2,2,2,3,2,2,3,2,2,0,0,2,0,0,2,0,0,2,0,0,0,0,0,2,2,0,0,0,0,2,2,0,0,0,0,0,
3,3,2,3,0,2,3,2,3,2,3,2,2,0,2,0,0,0,2,0,2,2,0,0,2,0,0,2,0,0,2,2,0,0,0,0,0,0,0,0,
3,3,2,3,2,0,2,0,2,0,2,0,0,0,0,0,0,0,3,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2,3,0,3,0,0,2,0,0,0,2,0,0,0,0,0,0,0,2,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,2,2,3,0,0,3,2,2,0,2,2,2,0,0,0,2,0,2,0,0,0,0,0,2,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,
2,2,2,2,0,2,2,2,0,2,2,2,2,2,0,0,0,0,0,0,0,0,2,0,0,2,0,0,0,0,0,0,0,0,2,0,0,0,0,0,
2,2,2,2,2,0,0,0,0,2,2,0,0,0,0,2,2,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,2,0,0,0,0,0,
2,2,0,0,0,0,2,0,0,0,2,0,0,0,2,0,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,
0,0,2,0,2,2,0,2,0,2,2,0,0,2,0,0,2,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,2,0,0,0,0,2,0,2,0,0,2,2,0,0,0,0,0,0,0,0,0,0,0,2,0,0,2,0,0,0,0,0,0,0,0,0,0,0,
0,0,2,0,0,2,0,2,0,2,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
};
/* Latvian model for ISO-8859-4: pairs that charset's byte-to-order table with
 * the shared LatvianLangModel sequence table.
 * The SequenceModel declaration is not visible in this file, so the per-field
 * notes below are inferred from the values - TODO confirm against the header. */
const SequenceModel Iso_8859_4LatvianModel =
{
Iso_8859_4_CharToOrderMap,
LatvianLangModel,
40, /* matches the 40 x 40 layout of LatvianLangModel */
(float)0.9904642991017133, /* same figure as "First 512 sequences" in the Model Table comment */
PR_TRUE, /* NOTE(review): boolean flag whose meaning is defined by SequenceModel */
"ISO-8859-4", /* charset name */
"lv" /* language code */
};
/* Latvian model for ISO-8859-10: pairs that charset's byte-to-order table with
 * the shared LatvianLangModel sequence table.
 * The SequenceModel declaration is not visible in this file, so the per-field
 * notes below are inferred from the values - TODO confirm against the header. */
const SequenceModel Iso_8859_10LatvianModel =
{
Iso_8859_10_CharToOrderMap,
LatvianLangModel,
40, /* matches the 40 x 40 layout of LatvianLangModel */
(float)0.9904642991017133, /* same figure as "First 512 sequences" in the Model Table comment */
PR_TRUE, /* NOTE(review): boolean flag whose meaning is defined by SequenceModel */
"ISO-8859-10", /* charset name */
"lv" /* language code */
};
/* Latvian model for ISO-8859-13: pairs that charset's byte-to-order table with
 * the shared LatvianLangModel sequence table.
 * The SequenceModel declaration is not visible in this file, so the per-field
 * notes below are inferred from the values - TODO confirm against the header. */
const SequenceModel Iso_8859_13LatvianModel =
{
Iso_8859_13_CharToOrderMap,
LatvianLangModel,
40, /* matches the 40 x 40 layout of LatvianLangModel */
(float)0.9904642991017133, /* same figure as "First 512 sequences" in the Model Table comment */
PR_TRUE, /* NOTE(review): boolean flag whose meaning is defined by SequenceModel */
"ISO-8859-13", /* charset name */
"lv" /* language code */
};
/* Charset-independent Latvian model: uses the Unicode code point table
 * instead of a single-byte table, together with the shared LatvianLangModel
 * sequence table.
 * The LanguageModel declaration is not visible in this file, so the per-field
 * notes below are inferred from the values - TODO confirm against the header. */
const LanguageModel LatvianModel =
{
"lv", /* language code */
Unicode_CharOrder,
80, /* NOTE(review): equals Unicode_Char_size, which is not referenced in the lines visible here; the generator could emit the named constant instead */
LatvianLangModel,
40, /* matches the 40 x 40 layout of LatvianLangModel */
(float)0.9904642991017133, /* same figure as "First 512 sequences" in the Model Table comment */
};