diff --git a/README.md b/README.md index c20b575..8aa7a7b 100644 --- a/README.md +++ b/README.md @@ -46,6 +46,12 @@ Techniques used by universalchardet are described at http://www.mozilla.org/proj * ASCII * Esperanto * ISO-8859-3 + * Estonian + * ISO-8859-4 + * ISO-8859-13 + * ISO-8859-15 + * Windows-1252 + * Windows-1257 * Finnish * ISO-8859-1 * ISO-8859-4 diff --git a/script/BuildLangModelLogs/LangEstonianModel.log b/script/BuildLangModelLogs/LangEstonianModel.log new file mode 100644 index 0000000..f1095eb --- /dev/null +++ b/script/BuildLangModelLogs/LangEstonianModel.log @@ -0,0 +1,159 @@ += Logs of language model for Estonian (et) = + +- Generated by BuildLangModel.py +- Started: 2016-09-26 23:45:22.351942 +- Maximum depth: 5 +- Max number of pages: 100 + +== Parsed pages == + +Harilik pohl (revision 4248853) +A-vitamiin (revision 4330862) +Aasta keskmine sademete hulk (revision 4266801) +Aasta keskmine õhutemperatuur (revision 3902142) +Ahm (revision 4343671) +Ain Raal (revision 4464651) +Alalehed (revision 2892741) +Alamliik (revision 3522810) +Alaska (revision 4216575) +Aleksander Heintalu (revision 4445156) +Aleuudid (revision 4335893) +Ameerika jänes (revision 4325220) +Ameerika valgejänes (revision 4355263) +Anneli Sihvart (revision 4211078) +Arbutiin (revision 4451788) +Baribal (revision 4268462) +Bensoehape (revision 3810308) +Binaarne nomenklatuur (revision 3970950) +C-vitamiin (revision 4444353) +Droog (revision 4352968) +E-vitamiin (revision 4336726) +Eesti (revision 4474984) +Eesti Entsüklopeediakirjastus (revision 4012421) +Eesti köök (revision 4314947) +Ellips (revision 4272113) +Emakakael (botaanika) (revision 3521516) +Euraasia (revision 3710768) +Fenoloogia (revision 3512905) +Folaadid (revision 4266628) +Fosfor (revision 4270122) +Fotosüntees (revision 4380600) +Fruktoos (revision 4285660) +Glükoos (revision 4047315) +Gneiss (revision 4333338) +Graniit (revision 4435351) +Gröönimaa (revision 4331557) +Halljänes (revision 4051603) +Haned 
(revision 4127680) +Happeline keskkond (revision 2966453) +Heilongjiang (revision 4342364) +Hendrik Relve (revision 4342591) +Hiina (revision 4448121) +Holland (revision 4307885) +Hunt (revision 4427752) +Hõimkond (revision 3489569) +Hüdrofiilsus (revision 4309797) +Ida-Euroopa (revision 4337624) +Ida-sinilind (revision 4248853) +Ida-vöötorav (revision 3520679) +Igihaljus (revision 3536500) +Ilves (revision 4404632) +Imetaja (revision 4289188) +Indiaanlased (revision 4479868) +Indrek Rohtmets (revision 4218674) +Itaalia (revision 4404119) +Jaapan (revision 4465542) +Jilin (revision 3894473) +Jood (revision 4025060) +Juurestik (revision 3341159) +Jääkaru (revision 4372399) +Jõhvikas (revision 4391549) +Kaalium (revision 4486067) +Kaheidulehelised (revision 4031352) +Kaheli õiekate (revision 3063362) +Kahesuguline õis (revision 3383221) +Kaitsestaatus (revision 3527096) +Kajakas (revision 4456839) +Kalorsus (revision 3843290) +Kaltsium (revision 4339861) +Kanada (revision 4434682) +Kanalised (revision 3616579) +Kanarbikulaadsed (revision 4318215) +Kanarbikulised (revision 3534760) +Karboksüülhapped (revision 3659011) +Karoteen (revision 4347634) +Kasvuperiood (revision 4231717) +Katteseemnetaimed (revision 4176294) +Kaukasus (revision 4476003) +Kesk-Euroopa (revision 3580746) +Kimalane (revision 4261145) +Kiudained (toit) (revision 3538655) +Klass (bioloogia) (revision 3489567) +Kliima (revision 4160781) +Korea (revision 4329396) +Kroom (revision 4030460) +Kroonlehed (revision 3543291) +Kuusepüü (revision 4028988) +Kvertsetiin (revision 4448461) +Laanemets (revision 4001157) +Laanepüü (revision 4475093) +Laiuskraad (revision 3990366) +Leesikas (revision 4420533) +Lehed (revision 4471821) +Leheroots (revision 3595351) +Liik (bioloogia) (revision 4320981) +Liiv (revision 4399494) +Liivakivi (revision 4330598) +Linnaeus (revision 4276836) +Linnud (revision 4479668) + +== End of Parsed pages == + +- Wikipedia parsing ended at: 2016-09-26 23:47:54.476445 + +55 characters 
appeared 433559 times. + +First 33 characters: +[ 0] Char a: 12.486881831538499 % +[ 1] Char i: 10.26503889897338 % +[ 2] Char e: 10.177622884082673 % +[ 3] Char s: 8.710233209320991 % +[ 4] Char t: 6.56634967789851 % +[ 5] Char l: 6.051540851418146 % +[ 6] Char u: 5.423944607308348 % +[ 7] Char n: 5.131020230233947 % +[ 8] Char k: 4.663033174262327 % +[ 9] Char o: 4.526950195936424 % +[10] Char d: 4.167368224393911 % +[11] Char r: 3.6740097656835635 % +[12] Char m: 3.552688330769284 % +[13] Char v: 2.4700213811730354 % +[14] Char p: 1.9229216784797456 % +[15] Char g: 1.865259399528092 % +[16] Char h: 1.8043680329551455 % +[17] Char j: 1.6860450365463524 % +[18] Char ä: 1.0247740215287884 % +[19] Char b: 0.9255949017319443 % +[20] Char õ: 0.9246723052687178 % +[21] Char ü: 0.6536595941959457 % +[22] Char f: 0.37342091849090897 % +[23] Char c: 0.34851081398379463 % +[24] Char ö: 0.24333481717597835 % +[25] Char y: 0.1287022066200909 % +[26] Char x: 0.06781084004714467 % +[27] Char w: 0.04082489349777078 % +[28] Char q: 0.020989069538401926 % +[29] Char š: 0.018913227496142396 % +[30] Char z: 0.017529332801302706 % +[31] Char ō: 0.010379210211297655 % +[32] Char ž: 0.009687262863877812 % + +The first 33 characters have an accumulated ratio of 0.9995410082595447. + +853 sequences found. + +First 512 (typical positive ratio): 0.9972721312183132 +Next 512 (512-1024): 9.687262863877811e-05 +Rest: -5.204170427930421e-18 + +- Processing end: 2016-09-26 23:47:54.561846 diff --git a/script/charsets/windows-1257.py b/script/charsets/windows-1257.py new file mode 100644 index 0000000..160473b --- /dev/null +++ b/script/charsets/windows-1257.py @@ -0,0 +1,72 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +# ##### BEGIN LICENSE BLOCK ##### +# Version: MPL 1.1/GPL 2.0/LGPL 2.1 +# +# The contents of this file are subject to the Mozilla Public License Version +# 1.1 (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License +# for the specific language governing rights and limitations under the +# License. +# +# The Original Code is Mozilla Universal charset detector code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 2001 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Jehan +# +# Alternatively, the contents of this file may be used under the terms of +# either the GNU General Public License Version 2 or later (the "GPL"), or +# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +# in which case the provisions of the GPL or the LGPL are applicable instead +# of those above. If you wish to allow use of your version of this file only +# under the terms of either the GPL or the LGPL, and not to allow others to +# use your version of this file under the terms of the MPL, indicate your +# decision by deleting the provisions above and replace them with the notice +# and other provisions required by the GPL or the LGPL. If you do not delete +# the provisions above, a recipient may use your version of this file under +# the terms of any one of the MPL, the GPL or the LGPL. +# +# ##### END LICENSE BLOCK ##### + +from codepoints import * + +name = 'WINDOWS-1257' +aliases = ['CP-1257'] + +language = \ +{ + # Designed to support the Estonian, Latvian and Lithuanian languages. 
+ 'complete': [ 'et', 'lv', 'lt' ], + 'incomplete': [] +} + +# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF # +charmap = \ +[ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X + SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X + SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X + SYM,ILL,SYM,ILL,SYM,SYM,SYM,SYM,ILL,SYM,ILL,SYM,ILL,SYM,SYM,SYM, # 8X + ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,ILL,SYM,ILL,SYM,ILL,SYM,SYM,ILL, # 9X + SYM,ILL,SYM,SYM,SYM,ILL,SYM,SYM,LET,SYM,LET,SYM,SYM,SYM,SYM,LET, # AX + SYM,SYM,SYM,SYM,SYM,LET,SYM,SYM,LET,SYM,LET,SYM,SYM,SYM,SYM,LET, # BX + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # CX + LET,LET,LET,LET,LET,LET,LET,SYM,LET,LET,LET,LET,LET,LET,LET,LET, # DX + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # EX + LET,LET,LET,LET,LET,LET,LET,SYM,LET,LET,LET,LET,LET,LET,LET,SYM, # FX +] diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 1fbed29..1b4773e 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -12,6 +12,7 @@ set( LangModels/LangCroatianModel.cpp LangModels/LangCzechModel.cpp LangModels/LangEsperantoModel.cpp + LangModels/LangEstonianModel.cpp LangModels/LangFinnishModel.cpp LangModels/LangFrenchModel.cpp LangModels/LangDanishModel.cpp diff --git a/src/LangModels/LangEstonianModel.cpp b/src/LangModels/LangEstonianModel.cpp new file mode 100644 index 0000000..c5fa9b3 --- /dev/null +++ b/src/LangModels/LangEstonianModel.cpp @@ -0,0 +1,263 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* ***** BEGIN LICENSE BLOCK ***** + * 
Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is Mozilla Communicator client code. + * + * The Initial Developer of the Original Code is + * Netscape Communications Corporation. + * Portions created by the Initial Developer are Copyright (C) 1998 + * the Initial Developer. All Rights Reserved. + * + * Contributor(s): + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. + * + * ***** END LICENSE BLOCK ***** */ + +#include "../nsSBCharSetProber.h" + +/********* Language model for: Estonian *********/ + +/** + * Generated by BuildLangModel.py + * On: 2016-09-26 23:47:54.476870 + **/ + +/* Character Mapping Table: + * ILL: illegal character. + * CTR: control character specific to the charset. 
+ * RET: carriage/return. + * SYM: symbol (punctuation) that does not belong to word. + * NUM: 0 - 9. + * + * Other characters are ordered by probabilities + * (0 is the most common character in the language). + * + * Orders are generic to a language. So the codepoint with order X in + * CHARSET1 maps to the same character as the codepoint with the same + * order X in CHARSET2 for the same language. + * As such, it is possible to get missing order. For instance the + * ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 + * even though they are both used for French. Same for the euro sign. + */ +static const unsigned char Iso_8859_4_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 19, 23, 10, 2, 22, 15, 16, 1, 17, 8, 5, 12, 7, 9, /* 4X */ + 14, 28, 11, 3, 4, 6, 13, 27, 26, 25, 30,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 19, 23, 10, 2, 22, 15, 16, 1, 17, 8, 5, 12, 7, 9, /* 6X */ + 14, 28, 11, 3, 4, 6, 13, 27, 26, 25, 30,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM, 55, 56, 57,SYM, 58, 59,SYM,SYM, 29, 45, 60, 61,SYM, 32,SYM, /* AX */ + SYM, 62,SYM, 63,SYM, 64, 65,SYM,SYM, 29, 45, 66, 67, 68, 32, 69, /* BX */ + 37, 43, 70, 71, 18, 44, 47, 72, 73, 33, 74, 75, 76, 36, 77, 39, /* CX */ + 78, 79, 31, 80, 81, 20, 24,SYM, 38, 82, 52, 83, 21, 84, 34, 85, /* DX */ + 37, 43, 86, 87, 18, 44, 47, 88, 89, 33, 90, 91, 92, 36, 93, 39, /* EX */ + 94, 95, 31, 96, 97, 20, 24,SYM, 38, 98, 52, 99, 21,100, 34,SYM, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Windows_1252_CharToOrderMap[] = +{ + 
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 19, 23, 10, 2, 22, 15, 16, 1, 17, 8, 5, 12, 7, 9, /* 4X */ + 14, 28, 11, 3, 4, 6, 13, 27, 26, 25, 30,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 19, 23, 10, 2, 22, 15, 16, 1, 17, 8, 5, 12, 7, 9, /* 6X */ + 14, 28, 11, 3, 4, 6, 13, 27, 26, 25, 30,SYM,SYM,SYM,SYM,CTR, /* 7X */ + SYM,ILL,SYM,101,SYM,SYM,SYM,SYM,SYM,SYM, 29,SYM,102,ILL, 32,ILL, /* 8X */ + ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 29,SYM,103,ILL, 32,104, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM,SYM, 50,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ + 40, 43,105,106, 18, 44, 47, 48, 41, 33,107,108, 35, 36,109,110, /* CX */ + 46,111, 53, 42,112, 20, 24,SYM, 38, 54, 52,113, 21,114,115,116, /* DX */ + 40, 43,117,118, 18, 44, 47, 48, 41, 33,119,120, 35, 36,121,122, /* EX */ + 46,123, 53, 42,124, 20, 24,SYM, 38, 54, 52,125, 21,126,127,128, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Iso_8859_15_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 19, 23, 10, 2, 22, 15, 16, 1, 17, 8, 5, 12, 7, 9, /* 4X */ + 14, 28, 11, 3, 4, 6, 13, 27, 26, 25, 30,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 19, 23, 10, 2, 22, 15, 16, 1, 17, 8, 5, 12, 7, 9, /* 6X */ + 14, 28, 11, 3, 4, 6, 13, 27, 26, 25, 30,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + 
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM, 29,SYM, 29,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM, 32, 50,SYM,SYM, 32,SYM,SYM,SYM,129,130,131,SYM, /* BX */ + 40, 43,132,133, 18, 44, 47, 48, 41, 33,134,135, 35, 36,136,137, /* CX */ + 46,138, 53, 42,139, 20, 24,SYM, 38, 54, 52,140, 21,141,142,143, /* DX */ + 40, 43,144,145, 18, 44, 47, 48, 41, 33,146,147, 35, 36,148,149, /* EX */ + 46,150, 53, 42,151, 20, 24,SYM, 38, 54, 52,152, 21,153,154,155, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Iso_8859_13_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 19, 23, 10, 2, 22, 15, 16, 1, 17, 8, 5, 12, 7, 9, /* 4X */ + 14, 28, 11, 3, 4, 6, 13, 27, 26, 25, 30,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 19, 23, 10, 2, 22, 15, 16, 1, 17, 8, 5, 12, 7, 9, /* 6X */ + 14, 28, 11, 3, 4, 6, 13, 27, 26, 25, 30,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 38,SYM,156,SYM,SYM,SYM,SYM, 47, /* AX */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 38,SYM,157,SYM,SYM,SYM,SYM, 47, /* BX */ + 158,159, 37,160, 18, 44,161, 45,162, 33,163,164,165,166, 39,167, /* CX */ + 29,168,169, 42, 31, 20, 24,SYM,170, 51,171, 34, 21, 49, 32,172, /* DX */ + 173,174, 37,175, 18, 44,176, 45,177, 33,178,179,180,181, 39,182, /* EX */ + 29,183,184, 42, 31, 20, 24,SYM,185, 51,186, 34, 21, 49, 32,SYM, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Windows_1257_CharToOrderMap[] = +{ + 
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 19, 23, 10, 2, 22, 15, 16, 1, 17, 8, 5, 12, 7, 9, /* 4X */ + 14, 28, 11, 3, 4, 6, 13, 27, 26, 25, 30,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 19, 23, 10, 2, 22, 15, 16, 1, 17, 8, 5, 12, 7, 9, /* 6X */ + 14, 28, 11, 3, 4, 6, 13, 27, 26, 25, 30,SYM,SYM,SYM,SYM,CTR, /* 7X */ + SYM,ILL,SYM,ILL,SYM,SYM,SYM,SYM,ILL,SYM,ILL,SYM,ILL,SYM,SYM,SYM, /* 8X */ + ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,ILL,SYM,ILL,SYM,ILL,SYM,SYM,ILL, /* 9X */ + SYM,ILL,SYM,SYM,SYM,ILL,SYM,SYM, 38,SYM,187,SYM,SYM,SYM,SYM, 47, /* AX */ + SYM,SYM,SYM,SYM,SYM, 50,SYM,SYM, 38,SYM,188,SYM,SYM,SYM,SYM, 47, /* BX */ + 189,190, 37,191, 18, 44,192, 45,193, 33,194,195,196,197, 39,198, /* CX */ + 29,199,200, 42, 31, 20, 24,SYM,201, 51,202, 34, 21, 49, 32,203, /* DX */ + 204,205, 37,206, 18, 44,207, 45,208, 33,209,210,211,212, 39,213, /* EX */ + 29,214,215, 42, 31, 20, 24,SYM,216, 51,217, 34, 21, 49, 32,SYM, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + + +/* Model Table: + * Total sequences: 853 + * First 512 sequences: 0.9972721312183132 + * Next 512 sequences (512-1024): 0.0027278687816868537 + * Rest: -5.204170427930421e-18 + * Negative sequences: TODO + */ +static const PRUint8 EstonianLangModel[] = +{ + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,2,3,3,0,3,3,3,3,2,2,0,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,2,3,3,2,2,3,3,2,2,2,0,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,2,3,3,0,3,3,3,2,0,2,0,2, + 3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,2,2,2,0,2,2,0, + 3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,2,3,3,3,0,3,3,2,3,3,3,2,2,0,3,2,2,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,3,0,0,0,2,2,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,2,2,3,3,0,0,2,2,2,2,0,0,0, + 
3,3,3,3,3,3,3,3,3,3,3,2,2,3,2,3,3,3,3,2,3,3,3,3,2,3,3,0,2,2,2,0,2, + 3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,0,3,3,2,2,3,3,0,2,0,0,0,2,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,0,2,3,3,0,3,3,3,2,2,2,0,0, + 3,3,3,3,3,3,3,3,3,3,2,3,3,3,2,2,2,3,0,2,0,3,0,0,0,2,2,2,0,0,0,3,3, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,2,0,2,0,0,0, + 3,3,3,3,3,3,3,3,3,3,2,2,3,2,3,3,3,2,3,3,3,3,3,2,3,3,0,2,0,2,2,0,0, + 3,3,3,3,2,3,3,3,3,3,2,2,2,2,2,2,2,2,3,0,3,2,0,2,3,2,0,0,0,0,0,0,0, + 3,3,3,3,3,3,3,3,3,3,2,3,3,2,3,2,3,2,3,0,3,3,0,2,3,3,0,0,0,0,0,0,0, + 3,3,3,3,3,3,3,3,3,3,2,3,3,2,3,3,3,3,0,2,2,2,2,2,0,3,2,0,2,0,2,0,0, + 3,3,3,3,3,3,3,3,3,3,0,3,3,3,0,0,3,3,3,0,3,3,3,2,0,3,0,2,0,0,0,2,0, + 3,3,3,2,3,0,3,3,0,3,0,2,3,0,3,0,0,0,3,0,3,3,0,0,2,0,0,0,0,0,0,0,0, + 2,3,3,3,3,3,0,3,3,3,3,3,3,3,3,3,3,0,3,3,0,0,0,0,0,0,2,0,0,0,0,0,0, + 3,3,3,3,2,3,3,3,2,3,0,3,2,0,0,0,2,3,0,2,0,2,0,2,0,2,2,0,0,0,0,0,0, + 0,3,3,3,3,3,3,3,2,0,3,3,3,3,3,3,3,3,0,3,3,0,0,0,0,0,0,0,0,0,2,0,0, + 3,0,2,3,3,3,0,3,3,3,3,3,3,3,3,3,3,0,0,3,0,3,2,0,0,0,0,0,0,0,0,0,0, + 3,3,3,2,3,3,3,2,0,3,2,3,0,0,0,2,0,2,2,0,0,3,3,3,2,0,0,0,0,0,0,0,0, + 3,3,3,3,3,3,3,3,3,3,2,3,3,2,2,2,3,0,0,2,0,0,2,3,0,3,0,0,2,0,0,0,0, + 2,3,3,3,3,3,0,3,3,2,3,3,2,3,3,3,2,2,0,3,0,0,0,0,3,0,0,0,0,0,0,0,0, + 3,3,3,3,3,3,3,3,2,3,2,3,2,0,3,3,0,0,0,0,0,0,0,3,2,0,2,0,0,0,2,3,0, + 3,3,2,2,2,2,2,2,2,2,0,0,0,0,0,0,0,2,0,0,0,0,0,2,0,2,0,0,0,0,0,0,0, + 3,3,3,2,2,2,2,2,2,3,0,2,0,0,0,2,2,0,0,0,0,0,2,0,0,2,0,2,0,0,0,0,0, + 3,3,2,0,0,0,3,0,0,2,0,2,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0, + 2,3,3,0,0,2,3,2,2,2,0,0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0, + 2,3,2,2,0,2,2,2,2,3,2,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0, + 0,0,0,2,2,2,2,2,2,0,0,0,2,0,0,2,2,2,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0, + 2,3,0,0,0,0,3,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +}; + + +const SequenceModel Iso_8859_4EstonianModel = +{ + Iso_8859_4_CharToOrderMap, + EstonianLangModel, + 33, + (float)0.9972721312183132, + PR_TRUE, + "ISO-8859-4" +}; + +const SequenceModel 
Windows_1252EstonianModel = +{ + Windows_1252_CharToOrderMap, + EstonianLangModel, + 33, + (float)0.9972721312183132, + PR_TRUE, + "WINDOWS-1252" +}; + +const SequenceModel Iso_8859_15EstonianModel = +{ + Iso_8859_15_CharToOrderMap, + EstonianLangModel, + 33, + (float)0.9972721312183132, + PR_TRUE, + "ISO-8859-15" +}; + +const SequenceModel Iso_8859_13EstonianModel = +{ + Iso_8859_13_CharToOrderMap, + EstonianLangModel, + 33, + (float)0.9972721312183132, + PR_TRUE, + "ISO-8859-13" +}; + +const SequenceModel Windows_1257EstonianModel = +{ + Windows_1257_CharToOrderMap, + EstonianLangModel, + 33, + (float)0.9972721312183132, + PR_TRUE, + "WINDOWS-1257" +}; diff --git a/src/nsSBCSGroupProber.cpp b/src/nsSBCSGroupProber.cpp index 95c98e2..d77a9fc 100644 --- a/src/nsSBCSGroupProber.cpp +++ b/src/nsSBCSGroupProber.cpp @@ -163,6 +163,12 @@ nsSBCSGroupProber::nsSBCSGroupProber() mProbers[74] = new nsSingleByteCharSetProber(&Mac_CentraleuropeCroatianModel); mProbers[75] = new nsSingleByteCharSetProber(&Ibm852CroatianModel); + mProbers[76] = new nsSingleByteCharSetProber(&Windows_1252EstonianModel); + mProbers[77] = new nsSingleByteCharSetProber(&Windows_1257EstonianModel); + mProbers[78] = new nsSingleByteCharSetProber(&Iso_8859_4EstonianModel); + mProbers[79] = new nsSingleByteCharSetProber(&Iso_8859_13EstonianModel); + mProbers[80] = new nsSingleByteCharSetProber(&Iso_8859_15EstonianModel); + Reset(); } diff --git a/src/nsSBCSGroupProber.h b/src/nsSBCSGroupProber.h index 6617f9e..143e4cb 100644 --- a/src/nsSBCSGroupProber.h +++ b/src/nsSBCSGroupProber.h @@ -40,7 +40,7 @@ #define nsSBCSGroupProber_h__ -#define NUM_OF_SBCS_PROBERS 76 +#define NUM_OF_SBCS_PROBERS 81 class nsCharSetProber; class nsSBCSGroupProber: public nsCharSetProber { diff --git a/src/nsSBCharSetProber.h b/src/nsSBCharSetProber.h index 5092c8d..863bac0 100644 --- a/src/nsSBCharSetProber.h +++ b/src/nsSBCharSetProber.h @@ -224,5 +224,11 @@ extern const SequenceModel Iso_8859_16CroatianModel; extern const 
SequenceModel Ibm852CroatianModel; extern const SequenceModel Mac_CentraleuropeCroatianModel; +extern const SequenceModel Windows_1252EstonianModel; +extern const SequenceModel Windows_1257EstonianModel; +extern const SequenceModel Iso_8859_4EstonianModel; +extern const SequenceModel Iso_8859_13EstonianModel; +extern const SequenceModel Iso_8859_15EstonianModel; + #endif /* nsSingleByteCharSetProber_h__ */ diff --git a/test/et/iso-8859-13.txt b/test/et/iso-8859-13.txt new file mode 100644 index 0000000..b4c8124 --- /dev/null +++ b/test/et/iso-8859-13.txt @@ -0,0 +1,6 @@ +Anton Pavlovit Tehhov oli vene nite- ja novellikirjanik ning praktiseeriv arst. + +Tehhov on eelkige tuntud oma novellide poolest. Tema jutustuste tavaliseks +tegevuspaigaks olid vene vikeasulad ja need ksitlesid hingeksildust, raisatud +nne jms. Tuntud on ka tema pshholoogilised nidendid, kus valitseb kurb ja +lootusetu meeleolu. diff --git a/test/et/iso-8859-15.txt b/test/et/iso-8859-15.txt new file mode 100644 index 0000000..fc0509a --- /dev/null +++ b/test/et/iso-8859-15.txt @@ -0,0 +1,6 @@ +Anton Pavlovit Tehhov oli vene nite- ja novellikirjanik ning praktiseeriv arst. + +Tehhov on eelkige tuntud oma novellide poolest. Tema jutustuste tavaliseks +tegevuspaigaks olid vene vikeasulad ja need ksitlesid hingeksildust, raisatud +nne jms. Tuntud on ka tema pshholoogilised nidendid, kus valitseb kurb ja +lootusetu meeleolu. diff --git a/test/et/iso-8859-4.txt b/test/et/iso-8859-4.txt new file mode 100644 index 0000000..d5532b0 --- /dev/null +++ b/test/et/iso-8859-4.txt @@ -0,0 +1,6 @@ +Anton Pavlovit Tehhov oli vene nite- ja novellikirjanik ning praktiseeriv arst. + +Tehhov on eelkige tuntud oma novellide poolest. Tema jutustuste tavaliseks +tegevuspaigaks olid vene vikeasulad ja need ksitlesid hingeksildust, raisatud +nne jms. Tuntud on ka tema pshholoogilised nidendid, kus valitseb kurb ja +lootusetu meeleolu. 
diff --git a/test/et/utf-8.txt b/test/et/utf-8.txt new file mode 100644 index 0000000..d68c9d3 --- /dev/null +++ b/test/et/utf-8.txt @@ -0,0 +1,6 @@ +Anton Pavlovitš Tšehhov oli vene näite- ja novellikirjanik ning praktiseeriv arst. + +Tšehhov on eelkõige tuntud oma novellide poolest. Tema jutustuste tavaliseks +tegevuspaigaks olid vene väikeasulad ja need käsitlesid hingeüksildust, raisatud +õnne jms. Tuntud on ka tema psühholoogilised näidendid, kus valitseb kurb ja +lootusetu meeleolu. diff --git a/test/et/windows-1252.txt b/test/et/windows-1252.txt new file mode 100644 index 0000000..597e28e --- /dev/null +++ b/test/et/windows-1252.txt @@ -0,0 +1,6 @@ +Anton Pavlovit Tehhov oli vene nite- ja novellikirjanik ning praktiseeriv arst. + +Tehhov on eelkige tuntud oma novellide poolest. Tema jutustuste tavaliseks +tegevuspaigaks olid vene vikeasulad ja need ksitlesid hingeksildust, raisatud +nne jms. Tuntud on ka tema pshholoogilised nidendid, kus valitseb kurb ja +lootusetu meeleolu. diff --git a/test/et/windows-1257.txt b/test/et/windows-1257.txt new file mode 100644 index 0000000..64d3327 --- /dev/null +++ b/test/et/windows-1257.txt @@ -0,0 +1,6 @@ +Anton Pavlovit Tehhov oli vene nite- ja novellikirjanik ning praktiseeriv arst. + +Tehhov on eelkige tuntud oma novellide poolest. Tema jutustuste tavaliseks +tegevuspaigaks olid vene vikeasulad ja need ksitlesid hingeksildust, raisatud +nne jms. Tuntud on ka tema pshholoogilised nidendid, kus valitseb kurb ja +lootusetu meeleolu.