diff --git a/script/BuildLangModelLogs/LangLatvianModel.log b/script/BuildLangModelLogs/LangLatvianModel.log index 956f1a6..4dd7a21 100644 --- a/script/BuildLangModelLogs/LangLatvianModel.log +++ b/script/BuildLangModelLogs/LangLatvianModel.log @@ -1,7 +1,7 @@ = Logs of language model for Latvian (lv) = - Generated by BuildLangModel.py -- Started: 2016-09-20 23:16:39.184579 +- Started: 2016-09-21 00:16:33.485953 - Maximum depth: 5 - Max number of pages: 100 @@ -106,57 +106,57 @@ Eduards Radziņš (revision 2564393) == End of Parsed pages == -- Wikipedia parsing ended at: 2016-09-20 23:23:02.592168 +- Wikipedia parsing ended at: 2016-09-21 00:19:18.361533 -48 characters appeared 354730 times. +55 characters appeared 354745 times. First 39 characters: -[ 0] Char a: 11.905674738533532 % -[ 1] Char i: 9.398133791898063 % -[ 2] Char s: 8.224565162236066 % -[ 3] Char e: 6.367378005807234 % -[ 4] Char r: 5.854311730048205 % -[ 5] Char t: 5.831477461731457 % -[ 6] Char u: 4.939813379189807 % -[ 7] Char n: 4.463958503650664 % -[ 8] Char ā: 3.950046514250275 % -[ 9] Char l: 3.8031742452005752 % -[10] Char o: 3.6298029487215633 % -[11] Char k: 3.5249344571927943 % -[12] Char m: 3.2740394102556873 % -[13] Char d: 3.17790995968765 % -[14] Char v: 3.0048205677557576 % -[15] Char p: 2.8272207030699406 % -[16] Char j: 2.8167902348264877 % -[17] Char b: 2.0280213119837622 % -[18] Char ī: 1.885659515687988 % -[19] Char g: 1.6147492459053363 % -[20] Char z: 1.5344064499760381 % -[21] Char ē: 1.4594198404420264 % -[22] Char c: 1.2231838299551772 % -[23] Char š: 0.8877174188819666 % -[24] Char ņ: 0.4659882163899304 % -[25] Char f: 0.42031967975643447 % -[26] Char ļ: 0.34702449750514475 % -[27] Char ū: 0.3016378654187692 % -[28] Char h: 0.20071603754968567 % -[29] Char ž: 0.1877484283821498 % -[30] Char ķ: 0.1420798917486539 % -[31] Char ģ: 0.12685704620415528 % -[32] Char č: 0.08287993685338144 % -[33] Char w: 0.03241902291883968 % -[34] Char y: 0.02734474107067347 % -[35] Char x: 
0.015786654638739323 % -[36] Char ö: 0.005074281848166211 % -[37] Char é: 0.003946663659684831 % -[38] Char q: 0.0031009500183237955 % +[ 0] Char a: 11.905171320244119 % +[ 1] Char i: 9.3977364022044 % +[ 2] Char s: 8.224217395594017 % +[ 3] Char e: 6.367108768270166 % +[ 4] Char r: 5.854064186951191 % +[ 5] Char t: 5.831230884156225 % +[ 6] Char u: 4.939604504644181 % +[ 7] Char n: 4.463769750102186 % +[ 8] Char ā: 3.9498794909019157 % +[ 9] Char l: 3.8030134321836813 % +[10] Char o: 3.6296494665182033 % +[11] Char k: 3.524785409237621 % +[12] Char m: 3.2739009711201 % +[13] Char d: 3.177775585279567 % +[14] Char v: 3.0046935122411873 % +[15] Char p: 2.827101157169234 % +[16] Char j: 2.8166711299665956 % +[17] Char b: 2.0279355593454453 % +[18] Char ī: 1.8855797826607845 % +[19] Char g: 1.6146809680192813 % +[20] Char z: 1.5343415692962552 % +[21] Char ē: 1.4593581304880971 % +[22] Char c: 1.2231321089796898 % +[23] Char š: 0.8876798827326671 % +[24] Char ņ: 0.46596851259355315 % +[25] Char f: 0.4203019070036223 % +[26] Char ļ: 0.34700982395805435 % +[27] Char ū: 0.30162511099522193 % +[28] Char h: 0.20070755049401684 % +[29] Char ž: 0.18774048964749326 % +[30] Char ķ: 0.14207388405756247 % +[31] Char ģ: 0.1268516821942522 % +[32] Char č: 0.08287643236691145 % +[33] Char w: 0.0324176521163089 % +[34] Char y: 0.02734358482853881 % +[35] Char x: 0.015785987117506943 % +[36] Char ö: 0.005074067287770088 % +[37] Char é: 0.003946496779376736 % +[38] Char q: 0.0031008188980817205 % -The first 39 characters have an accumulated ratio of 0.9999013334085078. +The first 39 characters have an accumulated ratio of 0.9998590536864506. -956 sequences found. +970 sequences found. 
-First 512 (typical positive ratio): 0.9904728616367904 -Next 512 (512-1024): 0.001877484283821498 -Rest: -4.683753385137379e-17 +First 512 (typical positive ratio): 0.9904102202220861 +Next 512 (512-1024): 0.0018774048964749328 +Rest: -1.734723475976807e-17 -- Processing end: 2016-09-20 23:23:02.695068 +- Processing end: 2016-09-21 00:19:18.484318 diff --git a/script/BuildLangModelLogs/LangLithuanianModel.log b/script/BuildLangModelLogs/LangLithuanianModel.log index 7e04157..9ea0467 100644 --- a/script/BuildLangModelLogs/LangLithuanianModel.log +++ b/script/BuildLangModelLogs/LangLithuanianModel.log @@ -1,7 +1,7 @@ = Logs of language model for Lithuanian (lt) = - Generated by BuildLangModel.py -- Started: 2016-09-20 22:53:23.311784 +- Started: 2016-09-21 00:23:03.857157 - Maximum depth: 5 - Max number of pages: 100 @@ -107,56 +107,56 @@ Jurgis Mikalojus Tiškevičius (revision 4939554) == End of Parsed pages == -- Wikipedia parsing ended at: 2016-09-20 22:57:33.076907 +- Wikipedia parsing ended at: 2016-09-21 00:25:34.773941 -53 characters appeared 353013 times. +60 characters appeared 353051 times. 
First 38 characters: -[ 0] Char i: 13.033797622183885 % -[ 1] Char a: 11.1684272250597 % -[ 2] Char s: 8.587502443252799 % -[ 3] Char o: 7.01957151719625 % -[ 4] Char e: 5.52642537243671 % -[ 5] Char r: 5.469770235090492 % -[ 6] Char n: 5.143153368289553 % -[ 7] Char t: 5.1063275290145125 % -[ 8] Char u: 4.270947528844546 % -[ 9] Char k: 3.9621770303076653 % -[10] Char l: 3.905521892961449 % -[11] Char m: 3.360216196004113 % -[12] Char d: 3.037565188817409 % -[13] Char v: 2.727378311846872 % -[14] Char j: 2.447501933356562 % -[15] Char p: 2.3293759719897 % -[16] Char g: 1.942987935288502 % -[17] Char ė: 1.56594799624943 % -[18] Char b: 1.5075932047828267 % -[19] Char y: 1.223750966678281 % -[20] Char ų: 1.1818261650420805 % -[21] Char š: 0.9631373348856841 % -[22] Char ž: 0.8172503562191761 % -[23] Char c: 0.5960120448821998 % -[24] Char č: 0.48015228900918666 % -[25] Char f: 0.42831283833739836 % -[26] Char h: 0.42519680578335645 % -[27] Char z: 0.40111837241121434 % -[28] Char ū: 0.3685416684371397 % -[29] Char ą: 0.352678229980199 % -[30] Char į: 0.29007430321262956 % -[31] Char ę: 0.1481531841603567 % -[32] Char x: 0.08753218719990481 % -[33] Char w: 0.05920461852679646 % -[34] Char ō: 0.018129643950789347 % -[35] Char ö: 0.00878154628866359 % -[36] Char é: 0.007648443541739256 % -[37] Char q: 0.0073651678550081725 % +[ 0] Char i: 13.032394753165974 % +[ 1] Char a: 11.167225131779828 % +[ 2] Char s: 8.586578143101137 % +[ 3] Char o: 7.018815978428046 % +[ 4] Char e: 5.525830545728521 % +[ 5] Char r: 5.469181506354606 % +[ 6] Char n: 5.142599794363987 % +[ 7] Char t: 5.105777918770942 % +[ 8] Char u: 4.270487833202568 % +[ 9] Char k: 3.9617505686147325 % +[10] Char l: 3.9051015292408184 % +[11] Char m: 3.359854525266888 % +[12] Char d: 3.0372382460324427 % +[13] Char v: 2.7270847554602593 % +[14] Char j: 2.4472385009531203 % +[15] Char p: 2.329125253858508 % +[16] Char g: 1.9427788053284087 % +[17] Char ė: 1.5657794482950054 % +[18] Char b: 1.5074309377398734 % 
+[19] Char y: 1.2236192504765602 % +[20] Char ų: 1.181698961339863 % +[21] Char š: 0.9630336693565519 % +[22] Char ž: 0.8171623929687212 % +[23] Char c: 0.5959478942135839 % +[24] Char č: 0.48010060869392807 % +[25] Char f: 0.428266737666796 % +[26] Char h: 0.42515104050123065 % +[27] Char z: 0.4010751987673169 % +[28] Char ū: 0.3685020011273159 % +[29] Char ą: 0.3526402701026197 % +[30] Char į: 0.29004308159444386 % +[31] Char ę: 0.14813723796278724 % +[32] Char x: 0.08752276583269838 % +[33] Char w: 0.059198246145740985 % +[34] Char ō: 0.01812769259965274 % +[35] Char ö: 0.008780601102956797 % +[36] Char é: 0.0076476203154785 % +[37] Char q: 0.007364375118608926 % -The first 38 characters have an accumulated ratio of 0.9997705466937479. +The first 38 characters have an accumulated ratio of 0.9996629382157253. -976 sequences found. +1016 sequences found. -First 512 (typical positive ratio): 0.9930868640383149 -Next 512 (512-1024): 0.008172503562191761 -Rest: -2.688821387764051e-17 +First 512 (typical positive ratio): 0.9928710196247589 +Next 512 (512-1024): 0.008171623929687212 +Rest: -4.85722573273506e-17 -- Processing end: 2016-09-20 22:57:33.185223 +- Processing end: 2016-09-21 00:25:34.935858 diff --git a/script/charsets/iso-8859-10.py b/script/charsets/iso-8859-10.py new file mode 100644 index 0000000..e17d5b6 --- /dev/null +++ b/script/charsets/iso-8859-10.py @@ -0,0 +1,73 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +# ##### BEGIN LICENSE BLOCK ##### +# Version: MPL 1.1/GPL 2.0/LGPL 2.1 +# +# The contents of this file are subject to the Mozilla Public License Version +# 1.1 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, either express or implied. 
See the License +# for the specific language governing rights and limitations under the +# License. +# +# The Original Code is Mozilla Universal charset detector code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 2001 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Jehan +# +# Alternatively, the contents of this file may be used under the terms of +# either the GNU General Public License Version 2 or later (the "GPL"), or +# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +# in which case the provisions of the GPL or the LGPL are applicable instead +# of those above. If you wish to allow use of your version of this file only +# under the terms of either the GPL or the LGPL, and not to allow others to +# use your version of this file under the terms of the MPL, indicate your +# decision by deleting the provisions above and replace them with the notice +# and other provisions required by the GPL or the LGPL. If you do not delete +# the provisions above, a recipient may use your version of this file under +# the terms of any one of the MPL, the GPL or the LGPL. +# +# ##### END LICENSE BLOCK ##### + +from codepoints import * + +name = 'ISO-8859-10' +aliases = ['ISO_8859-10:1992', 'ISO_8859-10', 'iso-ir-157', + 'csISOLatin6', 'latin6', 'l6'] + +language = \ +{ + # Nordic languages. Supersedes ISO-8859-4. 
+ 'complete': [ 'et', 'lv', 'lt', 'kl', 'saam1281' ], + 'incomplete': [] +} + +# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF # +charmap = \ +[ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X + SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X + SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 8X + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 9X + SYM,LET,LET,LET,LET,LET,LET,SYM,LET,LET,LET,LET,LET,SYM,LET,LET, # AX + SYM,LET,LET,LET,LET,LET,LET,SYM,LET,LET,LET,LET,LET,LET,LET,LET, # BX + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # CX + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # DX + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # EX + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # FX +] diff --git a/script/charsets/iso-8859-4.py b/script/charsets/iso-8859-4.py new file mode 100644 index 0000000..d9b3f25 --- /dev/null +++ b/script/charsets/iso-8859-4.py @@ -0,0 +1,73 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +# ##### BEGIN LICENSE BLOCK ##### +# Version: MPL 1.1/GPL 2.0/LGPL 2.1 +# +# The contents of this file are subject to the Mozilla Public License Version +# 1.1 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, either express or implied. 
See the License +# for the specific language governing rights and limitations under the +# License. +# +# The Original Code is Mozilla Universal charset detector code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 2001 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Jehan +# +# Alternatively, the contents of this file may be used under the terms of +# either the GNU General Public License Version 2 or later (the "GPL"), or +# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +# in which case the provisions of the GPL or the LGPL are applicable instead +# of those above. If you wish to allow use of your version of this file only +# under the terms of either the GPL or the LGPL, and not to allow others to +# use your version of this file under the terms of the MPL, indicate your +# decision by deleting the provisions above and replace them with the notice +# and other provisions required by the GPL or the LGPL. If you do not delete +# the provisions above, a recipient may use your version of this file under +# the terms of any one of the MPL, the GPL or the LGPL. +# +# ##### END LICENSE BLOCK ##### + +from codepoints import * + +name = 'ISO-8859-4' +aliases = ['ISO_8859-4:1988', 'ISO_8859-4', 'iso-ir-110', + 'csISOLatin4', 'latin4', 'l4'] + +language = \ +{ + # Nordic languages. Largely superseded by ISO-8859-10. 
+ 'complete': [ 'et', 'lv', 'lt', 'kl', 'saam1281' ], + 'incomplete': [] +} + +# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF # +charmap = \ +[ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X + SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X + SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 8X + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 9X + SYM,LET,LET,LET,SYM,LET,LET,SYM,SYM,LET,LET,LET,LET,SYM,LET,SYM, # AX + SYM,LET,SYM,LET,SYM,LET,LET,SYM,SYM,LET,LET,LET,LET,LET,LET,LET, # BX + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # CX + LET,LET,LET,LET,LET,LET,LET,SYM,LET,LET,LET,LET,LET,LET,LET,LET, # DX + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # EX + LET,LET,LET,LET,LET,LET,LET,SYM,LET,LET,LET,LET,LET,LET,LET,SYM, # FX +] diff --git a/script/langs/lt.py b/script/langs/lt.py index 2f030c8..dbd3297 100644 --- a/script/langs/lt.py +++ b/script/langs/lt.py @@ -50,7 +50,7 @@ code = 'lt' # ASCII characters are also used. use_ascii = True # The charsets we want to support and create data for. -charsets = ['ISO-8859-13'] +charsets = ['ISO-8859-4', 'ISO-8859-10', 'ISO-8859-13', ] ## Optional Properties ## diff --git a/script/langs/lv.py b/script/langs/lv.py index 85d1b39..dad6adf 100644 --- a/script/langs/lv.py +++ b/script/langs/lv.py @@ -50,7 +50,7 @@ code = 'lv' # ASCII characters are also used. use_ascii = True # The charsets we want to support and create data for. 
-charsets = ['ISO-8859-13'] +charsets = ['ISO-8859-4', 'ISO-8859-10', 'ISO-8859-13'] ## Optional Properties ## diff --git a/src/LangModels/LangLatvianModel.cpp b/src/LangModels/LangLatvianModel.cpp index fd60884..9f71799 100644 --- a/src/LangModels/LangLatvianModel.cpp +++ b/src/LangModels/LangLatvianModel.cpp @@ -41,7 +41,7 @@ /** * Generated by BuildLangModel.py - * On: 2016-09-20 23:23:02.592930 + * On: 2016-09-21 00:19:18.362275 **/ /* Character Mapping Table: @@ -61,6 +61,48 @@ * ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 * even though they are both used for French. Same for the euro sign. */ +static const unsigned char Iso_8859_4_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 17, 22, 13, 3, 25, 19, 28, 1, 16, 11, 9, 12, 7, 10, /* 4X */ + 15, 38, 4, 2, 5, 6, 14, 33, 35, 34, 20,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 17, 22, 13, 3, 25, 19, 28, 1, 16, 11, 9, 12, 7, 10, /* 6X */ + 15, 38, 4, 2, 5, 6, 14, 33, 35, 34, 20,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM, 55, 56, 57,SYM, 58, 26,SYM,SYM, 23, 21, 31, 59,SYM, 29,SYM, /* AX */ + SYM, 60,SYM, 61,SYM, 62, 26,SYM,SYM, 23, 21, 31, 63, 48, 29, 48, /* BX */ + 8, 42, 64, 65, 40, 52, 53, 66, 32, 37, 67, 43, 46, 45, 49, 18, /* CX */ + 68, 24, 51, 30, 69, 70, 36,SYM, 71, 72, 73, 74, 39, 75, 27, 44, /* DX */ + 8, 42, 76, 77, 40, 52, 53, 78, 32, 37, 79, 43, 46, 45, 49, 18, /* EX */ + 80, 24, 51, 30, 81, 82, 36,SYM, 83, 84, 85, 86, 39, 87, 27,SYM, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Iso_8859_10_CharToOrderMap[] = 
+{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 17, 22, 13, 3, 25, 19, 28, 1, 16, 11, 9, 12, 7, 10, /* 4X */ + 15, 38, 4, 2, 5, 6, 14, 33, 35, 34, 20,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 17, 22, 13, 3, 25, 19, 28, 1, 16, 11, 9, 12, 7, 10, /* 6X */ + 15, 38, 4, 2, 5, 6, 14, 33, 35, 34, 20,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM, 88, 21, 31, 18, 89, 30,SYM, 26, 90, 23, 91, 29,SYM, 27, 48, /* AX */ + SYM, 92, 21, 31, 18, 93, 30,SYM, 26, 94, 23, 95, 29, 96, 27, 48, /* BX */ + 8, 42, 97, 98, 40, 52, 53, 99, 32, 37,100, 43, 46, 45, 49,101, /* CX */ + 50, 24, 51, 47,102,103, 36,104,105,106,107,108, 39,109, 54, 44, /* DX */ + 8, 42,110,111, 40, 52, 53,112, 32, 37,113, 43, 46, 45, 49,114, /* EX */ + 50, 24, 51, 47,115,116, 36,117,118,119,120,121, 39,122, 54,123, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + static const unsigned char Iso_8859_13_CharToOrderMap[] = { CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ @@ -73,21 +115,21 @@ static const unsigned char Iso_8859_13_CharToOrderMap[] = 15, 38, 4, 2, 5, 6, 14, 33, 35, 34, 20,SYM,SYM,SYM,SYM,CTR, /* 7X */ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ - SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 48,SYM, 49,SYM,SYM,SYM,SYM, 47, /* AX */ - SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 50,SYM, 51,SYM,SYM,SYM,SYM, 47, /* BX */ - 52, 53, 8, 54, 40, 46, 55, 21, 32, 37, 56, 43, 31, 30, 18, 26, /* CX */ - 23, 57, 24, 44, 45, 58, 36,SYM, 59, 41, 60, 27, 39, 61, 29, 42, /* DX */ - 62, 63, 8, 
64, 40, 46, 65, 21, 32, 37, 66, 43, 31, 30, 18, 26, /* EX */ - 23, 67, 24, 44, 45, 68, 36,SYM, 69, 41, 70, 27, 39, 71, 29,SYM, /* FX */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,124,SYM,125,SYM,SYM,SYM,SYM, 53, /* AX */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,126,SYM,127,SYM,SYM,SYM,SYM, 53, /* BX */ + 128,129, 8,130, 40, 52,131, 21, 32, 37,132, 46, 31, 30, 18, 26, /* CX */ + 23,133, 24, 47, 51,134, 36,SYM,135, 41,136, 27, 39,137, 29, 44, /* DX */ + 138,139, 8,140, 40, 52,141, 21, 32, 37,142, 46, 31, 30, 18, 26, /* EX */ + 23,143, 24, 47, 51,144, 36,SYM,145, 41,146, 27, 39,147, 29,SYM, /* FX */ }; /*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ /* Model Table: - * Total sequences: 956 - * First 512 sequences: 0.9904728616367904 - * Next 512 sequences (512-1024): 0.009527138363209666 - * Rest: -4.683753385137379e-17 + * Total sequences: 970 + * First 512 sequences: 0.9904102202220861 + * Next 512 sequences (512-1024): 0.009589779777913882 + * Rest: -1.734723475976807e-17 * Negative sequences: TODO */ static const PRUint8 LatvianLangModel[] = @@ -134,12 +176,32 @@ static const PRUint8 LatvianLangModel[] = }; +const SequenceModel Iso_8859_4LatvianModel = +{ + Iso_8859_4_CharToOrderMap, + LatvianLangModel, + 39, + (float)0.9904102202220861, + PR_TRUE, + "ISO-8859-4" +}; + +const SequenceModel Iso_8859_10LatvianModel = +{ + Iso_8859_10_CharToOrderMap, + LatvianLangModel, + 39, + (float)0.9904102202220861, + PR_TRUE, + "ISO-8859-10" +}; + const SequenceModel Iso_8859_13LatvianModel = { Iso_8859_13_CharToOrderMap, LatvianLangModel, 39, - (float)0.9904728616367904, + (float)0.9904102202220861, PR_TRUE, "ISO-8859-13" -}; +}; \ No newline at end of file diff --git a/src/LangModels/LangLithuanianModel.cpp b/src/LangModels/LangLithuanianModel.cpp index d77d741..280f8a5 100644 --- a/src/LangModels/LangLithuanianModel.cpp +++ b/src/LangModels/LangLithuanianModel.cpp @@ -41,7 +41,7 @@ /** * Generated by BuildLangModel.py - * On: 2016-09-20 22:57:33.077635 + * On: 2016-09-21 
00:25:34.775158 **/ /* Character Mapping Table: @@ -61,6 +61,48 @@ * ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 * even though they are both used for French. Same for the euro sign. */ +static const unsigned char Iso_8859_10_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 1, 18, 23, 12, 4, 25, 16, 26, 0, 14, 9, 10, 11, 6, 3, /* 4X */ + 15, 37, 5, 2, 7, 8, 13, 33, 32, 19, 27,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 1, 18, 23, 12, 4, 25, 16, 26, 0, 14, 9, 10, 11, 6, 3, /* 6X */ + 15, 37, 5, 2, 7, 8, 13, 33, 32, 19, 27,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM, 29, 50, 60, 47, 61, 62,SYM, 56, 55, 21, 63, 22,SYM, 28, 64, /* AX */ + SYM, 29, 50, 65, 47, 66, 67,SYM, 56, 55, 21, 68, 22, 69, 28, 70, /* BX */ + 41, 39, 71, 53, 38, 43, 72, 30, 24, 36, 31, 73, 17, 40, 74, 46, /* CX */ + 75, 57, 34, 44, 59, 76, 35, 77, 48, 20, 54, 78, 45, 79, 80, 52, /* DX */ + 41, 39, 81, 53, 38, 43, 82, 30, 24, 36, 31, 83, 17, 40, 84, 46, /* EX */ + 85, 57, 34, 44, 59, 86, 35, 87, 48, 20, 54, 88, 45, 89, 90, 91, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Iso_8859_4_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 1, 18, 23, 12, 4, 25, 16, 26, 0, 14, 9, 10, 11, 6, 3, /* 4X */ + 15, 37, 5, 2, 7, 8, 13, 33, 32, 19, 
27,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 1, 18, 23, 12, 4, 25, 16, 26, 0, 14, 9, 10, 11, 6, 3, /* 6X */ + 15, 37, 5, 2, 7, 8, 13, 33, 32, 19, 27,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM, 29, 92, 93,SYM, 94, 56,SYM,SYM, 21, 50, 95, 96,SYM, 22,SYM, /* AX */ + SYM, 29,SYM, 97,SYM, 98, 56,SYM,SYM, 21, 50, 99,100,101, 22,102, /* BX */ + 41, 39,103, 53, 38, 43,104, 30, 24, 36, 31,105, 17, 40,106, 47, /* CX */ + 55, 57, 34,107, 59,108, 35,SYM, 48, 20, 54,109, 45,110, 28, 52, /* DX */ + 41, 39,111, 53, 38, 43,112, 30, 24, 36, 31,113, 17, 40,114, 47, /* EX */ + 55, 57, 34,115, 59,116, 35,SYM, 48, 20, 54,117, 45,118, 28,SYM, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + static const unsigned char Iso_8859_13_CharToOrderMap[] = { CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ @@ -73,21 +115,21 @@ static const unsigned char Iso_8859_13_CharToOrderMap[] = 15, 37, 5, 2, 7, 8, 13, 33, 32, 19, 27,SYM,SYM,SYM,SYM,CTR, /* 7X */ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ - SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 45,SYM, 53,SYM,SYM,SYM,SYM, 54, /* AX */ - SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 45,SYM, 55,SYM,SYM,SYM,SYM, 56, /* BX */ - 29, 30, 39, 46, 38, 41, 31, 47, 24, 36, 57, 17, 58, 59, 44, 50, /* CX */ - 21, 48, 51, 42, 34, 60, 35,SYM, 20, 40, 52, 28, 43, 61, 22, 49, /* DX */ - 29, 30, 39, 46, 38, 41, 31, 47, 24, 36, 62, 17, 63, 64, 44, 50, /* EX */ - 21, 48, 51, 42, 34, 65, 35,SYM, 20, 40, 52, 28, 43, 66, 22,SYM, /* FX */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 48,SYM,119,SYM,SYM,SYM,SYM,120, /* AX */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 48,SYM,121,SYM,SYM,SYM,SYM,122, /* BX */ + 29, 30, 41, 49, 38, 43, 31, 50, 24, 36,123, 17,124,125, 47, 56, /* CX */ + 21, 51, 57, 44, 34,126, 35,SYM, 20, 42, 58, 28, 
45,127, 22, 52, /* DX */ + 29, 30, 41, 49, 38, 43, 31, 50, 24, 36,128, 17,129,130, 47, 56, /* EX */ + 21, 51, 57, 44, 34,131, 35,SYM, 20, 42, 58, 28, 45,132, 22,SYM, /* FX */ }; /*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ /* Model Table: - * Total sequences: 976 - * First 512 sequences: 0.9930868640383149 - * Next 512 sequences (512-1024): 0.0069131359616851065 - * Rest: -2.688821387764051e-17 + * Total sequences: 1016 + * First 512 sequences: 0.9928710196247589 + * Next 512 sequences (512-1024): 0.0071289803752411715 + * Rest: -4.85722573273506e-17 * Negative sequences: TODO */ static const PRUint8 LithuanianLangModel[] = @@ -133,12 +175,32 @@ static const PRUint8 LithuanianLangModel[] = }; +const SequenceModel Iso_8859_10LithuanianModel = +{ + Iso_8859_10_CharToOrderMap, + LithuanianLangModel, + 38, + (float)0.9928710196247589, + PR_TRUE, + "ISO-8859-10" +}; + +const SequenceModel Iso_8859_4LithuanianModel = +{ + Iso_8859_4_CharToOrderMap, + LithuanianLangModel, + 38, + (float)0.9928710196247589, + PR_TRUE, + "ISO-8859-4" +}; + const SequenceModel Iso_8859_13LithuanianModel = { Iso_8859_13_CharToOrderMap, LithuanianLangModel, 38, - (float)0.9930868640383149, + (float)0.9928710196247589, PR_TRUE, "ISO-8859-13" }; \ No newline at end of file diff --git a/src/nsSBCSGroupProber.cpp b/src/nsSBCSGroupProber.cpp index 57dbafe..e90902f 100644 --- a/src/nsSBCSGroupProber.cpp +++ b/src/nsSBCSGroupProber.cpp @@ -112,13 +112,17 @@ nsSBCSGroupProber::nsSBCSGroupProber() mProbers[34] = new nsSingleByteCharSetProber(&Windows_1252DanishModel); mProbers[35] = new nsSingleByteCharSetProber(&Iso_8859_13LithuanianModel); + mProbers[36] = new nsSingleByteCharSetProber(&Iso_8859_10LithuanianModel); + mProbers[37] = new nsSingleByteCharSetProber(&Iso_8859_4LithuanianModel); - mProbers[36] = new nsSingleByteCharSetProber(&Iso_8859_13LatvianModel); + mProbers[38] = new nsSingleByteCharSetProber(&Iso_8859_13LatvianModel); + mProbers[39] = new 
nsSingleByteCharSetProber(&Iso_8859_10LatvianModel); + mProbers[40] = new nsSingleByteCharSetProber(&Iso_8859_4LatvianModel); - mProbers[37] = new nsSingleByteCharSetProber(&Iso_8859_1PortugueseModel); - mProbers[38] = new nsSingleByteCharSetProber(&Iso_8859_9PortugueseModel); - mProbers[39] = new nsSingleByteCharSetProber(&Iso_8859_15PortugueseModel); - mProbers[40] = new nsSingleByteCharSetProber(&Windows_1252PortugueseModel); + mProbers[41] = new nsSingleByteCharSetProber(&Iso_8859_1PortugueseModel); + mProbers[42] = new nsSingleByteCharSetProber(&Iso_8859_9PortugueseModel); + mProbers[43] = new nsSingleByteCharSetProber(&Iso_8859_15PortugueseModel); + mProbers[44] = new nsSingleByteCharSetProber(&Windows_1252PortugueseModel); Reset(); } diff --git a/src/nsSBCSGroupProber.h b/src/nsSBCSGroupProber.h index 9cf7ea4..48444de 100644 --- a/src/nsSBCSGroupProber.h +++ b/src/nsSBCSGroupProber.h @@ -40,7 +40,7 @@ #define nsSBCSGroupProber_h__ -#define NUM_OF_SBCS_PROBERS 41 +#define NUM_OF_SBCS_PROBERS 45 class nsCharSetProber; class nsSBCSGroupProber: public nsCharSetProber { diff --git a/src/nsSBCharSetProber.h b/src/nsSBCharSetProber.h index 6000838..5b40db7 100644 --- a/src/nsSBCharSetProber.h +++ b/src/nsSBCharSetProber.h @@ -173,8 +173,12 @@ extern const SequenceModel Iso_8859_1DanishModel; extern const SequenceModel Windows_1252DanishModel; extern const SequenceModel Iso_8859_13LithuanianModel; +extern const SequenceModel Iso_8859_10LithuanianModel; +extern const SequenceModel Iso_8859_4LithuanianModel; extern const SequenceModel Iso_8859_13LatvianModel; +extern const SequenceModel Iso_8859_10LatvianModel; +extern const SequenceModel Iso_8859_4LatvianModel; extern const SequenceModel Iso_8859_1PortugueseModel; extern const SequenceModel Iso_8859_9PortugueseModel; diff --git a/test/lt/iso-8859-10.txt b/test/lt/iso-8859-10.txt new file mode 100644 index 0000000..d005822 --- /dev/null +++ b/test/lt/iso-8859-10.txt @@ -0,0 +1,3 @@ +Vincentas van Gogas (ol. 
Vincent van Gogh, 1853 m. kovo 30 d. Grot Zunderte, +Nyderlandai - 1890 m. liepos 29 d. Overe prie Uazos, Pranczija) - oland +tapytojas ir grafikas, postimpresionistas. diff --git a/test/lt/iso-8859-4.txt b/test/lt/iso-8859-4.txt new file mode 100644 index 0000000..d5ee32a --- /dev/null +++ b/test/lt/iso-8859-4.txt @@ -0,0 +1,3 @@ +Vincentas van Gogas (ol. Vincent van Gogh, 1853 m. kovo 30 d. Grot Zunderte, +Nyderlandai - 1890 m. liepos 29 d. Overe prie Uazos, Pranczija) - oland +tapytojas ir grafikas, postimpresionistas. diff --git a/test/lv/iso-8859-10.txt b/test/lv/iso-8859-10.txt new file mode 100644 index 0000000..6afb063 --- /dev/null +++ b/test/lv/iso-8859-10.txt @@ -0,0 +1,6 @@ +Vinsents Villems van Gogs (nderlandieu: Vincent Willem van Gogh, dzimis 1853. +gada 30. mart, miris 1890. gada 29. jlij) bija nderlandieu gleznotjs, +postimpresionisma prstvis. Kopum van Gogs radja vairk nek 2000 darbu, to +skait 900 gleznu un 1100 zmjumu un skiu. Savus slavenkos darbus vi radja +pdjo divu dzves gadu laik. Tiek uzskatts, ka van Gogs btiski ir ietekmjis +20. gadsimta mkslu, tostarp ekspresionismu un fovismu. diff --git a/test/lv/iso-8859-4.txt b/test/lv/iso-8859-4.txt new file mode 100644 index 0000000..7fd134d --- /dev/null +++ b/test/lv/iso-8859-4.txt @@ -0,0 +1,6 @@ +Vinsents Villems van Gogs (nderlandieu: Vincent Willem van Gogh, dzimis 1853. +gada 30. mart, miris 1890. gada 29. jlij) bija nderlandieu gleznotjs, +postimpresionisma prstvis. Kopum van Gogs radja vairk nek 2000 darbu, to +skait 900 gleznu un 1100 zmjumu un skiu. Savus slavenkos darbus vi radja +pdjo divu dzves gadu laik. Tiek uzskatts, ka van Gogs btiski ir ietekmjis +20. gadsimta mkslu, tostarp ekspresionismu un fovismu.