From b7aebfdfda84f9cad6795af6b5cdeaa7e9000420 Mon Sep 17 00:00:00 2001 From: Jehan Date: Wed, 21 Sep 2016 00:27:16 +0200 Subject: [PATCH] LangModels: add support for Latvian | Lithuanian / ISO-8859-4 | ISO-8859-10. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Just realizing that these 2 languages can also be encoded with these charsets (even though ISO-8859-13 would appear to be more common… maybe?). Anyway now the models are updated and can recognize texts using these encodings for these languages. Added some test files as well, which work great. --- .../BuildLangModelLogs/LangLatvianModel.log | 96 +++++++++---------- .../LangLithuanianModel.log | 94 +++++++++--------- script/charsets/iso-8859-10.py | 73 ++++++++++++++ script/charsets/iso-8859-4.py | 73 ++++++++++++++ script/langs/lt.py | 2 +- script/langs/lv.py | 2 +- src/LangModels/LangLatvianModel.cpp | 88 ++++++++++++++--- src/LangModels/LangLithuanianModel.cpp | 86 ++++++++++++++--- src/nsSBCSGroupProber.cpp | 14 ++- src/nsSBCSGroupProber.h | 2 +- src/nsSBCharSetProber.h | 4 + test/lt/iso-8859-10.txt | 3 + test/lt/iso-8859-4.txt | 3 + test/lv/iso-8859-10.txt | 6 ++ test/lv/iso-8859-4.txt | 6 ++ 15 files changed, 424 insertions(+), 128 deletions(-) create mode 100644 script/charsets/iso-8859-10.py create mode 100644 script/charsets/iso-8859-4.py create mode 100644 test/lt/iso-8859-10.txt create mode 100644 test/lt/iso-8859-4.txt create mode 100644 test/lv/iso-8859-10.txt create mode 100644 test/lv/iso-8859-4.txt diff --git a/script/BuildLangModelLogs/LangLatvianModel.log b/script/BuildLangModelLogs/LangLatvianModel.log index 956f1a6..4dd7a21 100644 --- a/script/BuildLangModelLogs/LangLatvianModel.log +++ b/script/BuildLangModelLogs/LangLatvianModel.log @@ -1,7 +1,7 @@ = Logs of language model for Latvian (lv) = - Generated by BuildLangModel.py -- Started: 2016-09-20 23:16:39.184579 +- Started: 2016-09-21 00:16:33.485953 - Maximum depth: 5 - Max number of pages: 100 @@ 
-106,57 +106,57 @@ Eduards Radziņš (revision 2564393) == End of Parsed pages == -- Wikipedia parsing ended at: 2016-09-20 23:23:02.592168 +- Wikipedia parsing ended at: 2016-09-21 00:19:18.361533 -48 characters appeared 354730 times. +55 characters appeared 354745 times. First 39 characters: -[ 0] Char a: 11.905674738533532 % -[ 1] Char i: 9.398133791898063 % -[ 2] Char s: 8.224565162236066 % -[ 3] Char e: 6.367378005807234 % -[ 4] Char r: 5.854311730048205 % -[ 5] Char t: 5.831477461731457 % -[ 6] Char u: 4.939813379189807 % -[ 7] Char n: 4.463958503650664 % -[ 8] Char ā: 3.950046514250275 % -[ 9] Char l: 3.8031742452005752 % -[10] Char o: 3.6298029487215633 % -[11] Char k: 3.5249344571927943 % -[12] Char m: 3.2740394102556873 % -[13] Char d: 3.17790995968765 % -[14] Char v: 3.0048205677557576 % -[15] Char p: 2.8272207030699406 % -[16] Char j: 2.8167902348264877 % -[17] Char b: 2.0280213119837622 % -[18] Char ī: 1.885659515687988 % -[19] Char g: 1.6147492459053363 % -[20] Char z: 1.5344064499760381 % -[21] Char ē: 1.4594198404420264 % -[22] Char c: 1.2231838299551772 % -[23] Char š: 0.8877174188819666 % -[24] Char ņ: 0.4659882163899304 % -[25] Char f: 0.42031967975643447 % -[26] Char ļ: 0.34702449750514475 % -[27] Char ū: 0.3016378654187692 % -[28] Char h: 0.20071603754968567 % -[29] Char ž: 0.1877484283821498 % -[30] Char ķ: 0.1420798917486539 % -[31] Char ģ: 0.12685704620415528 % -[32] Char č: 0.08287993685338144 % -[33] Char w: 0.03241902291883968 % -[34] Char y: 0.02734474107067347 % -[35] Char x: 0.015786654638739323 % -[36] Char ö: 0.005074281848166211 % -[37] Char é: 0.003946663659684831 % -[38] Char q: 0.0031009500183237955 % +[ 0] Char a: 11.905171320244119 % +[ 1] Char i: 9.3977364022044 % +[ 2] Char s: 8.224217395594017 % +[ 3] Char e: 6.367108768270166 % +[ 4] Char r: 5.854064186951191 % +[ 5] Char t: 5.831230884156225 % +[ 6] Char u: 4.939604504644181 % +[ 7] Char n: 4.463769750102186 % +[ 8] Char ā: 3.9498794909019157 % +[ 9] Char l: 
3.8030134321836813 % +[10] Char o: 3.6296494665182033 % +[11] Char k: 3.524785409237621 % +[12] Char m: 3.2739009711201 % +[13] Char d: 3.177775585279567 % +[14] Char v: 3.0046935122411873 % +[15] Char p: 2.827101157169234 % +[16] Char j: 2.8166711299665956 % +[17] Char b: 2.0279355593454453 % +[18] Char ī: 1.8855797826607845 % +[19] Char g: 1.6146809680192813 % +[20] Char z: 1.5343415692962552 % +[21] Char ē: 1.4593581304880971 % +[22] Char c: 1.2231321089796898 % +[23] Char š: 0.8876798827326671 % +[24] Char ņ: 0.46596851259355315 % +[25] Char f: 0.4203019070036223 % +[26] Char ļ: 0.34700982395805435 % +[27] Char ū: 0.30162511099522193 % +[28] Char h: 0.20070755049401684 % +[29] Char ž: 0.18774048964749326 % +[30] Char ķ: 0.14207388405756247 % +[31] Char ģ: 0.1268516821942522 % +[32] Char č: 0.08287643236691145 % +[33] Char w: 0.0324176521163089 % +[34] Char y: 0.02734358482853881 % +[35] Char x: 0.015785987117506943 % +[36] Char ö: 0.005074067287770088 % +[37] Char é: 0.003946496779376736 % +[38] Char q: 0.0031008188980817205 % -The first 39 characters have an accumulated ratio of 0.9999013334085078. +The first 39 characters have an accumulated ratio of 0.9998590536864506. -956 sequences found. +970 sequences found. 
-First 512 (typical positive ratio): 0.9904728616367904 -Next 512 (512-1024): 0.001877484283821498 -Rest: -4.683753385137379e-17 +First 512 (typical positive ratio): 0.9904102202220861 +Next 512 (512-1024): 0.0018774048964749328 +Rest: -1.734723475976807e-17 -- Processing end: 2016-09-20 23:23:02.695068 +- Processing end: 2016-09-21 00:19:18.484318 diff --git a/script/BuildLangModelLogs/LangLithuanianModel.log b/script/BuildLangModelLogs/LangLithuanianModel.log index 7e04157..9ea0467 100644 --- a/script/BuildLangModelLogs/LangLithuanianModel.log +++ b/script/BuildLangModelLogs/LangLithuanianModel.log @@ -1,7 +1,7 @@ = Logs of language model for Lithuanian (lt) = - Generated by BuildLangModel.py -- Started: 2016-09-20 22:53:23.311784 +- Started: 2016-09-21 00:23:03.857157 - Maximum depth: 5 - Max number of pages: 100 @@ -107,56 +107,56 @@ Jurgis Mikalojus Tiškevičius (revision 4939554) == End of Parsed pages == -- Wikipedia parsing ended at: 2016-09-20 22:57:33.076907 +- Wikipedia parsing ended at: 2016-09-21 00:25:34.773941 -53 characters appeared 353013 times. +60 characters appeared 353051 times. 
First 38 characters: -[ 0] Char i: 13.033797622183885 % -[ 1] Char a: 11.1684272250597 % -[ 2] Char s: 8.587502443252799 % -[ 3] Char o: 7.01957151719625 % -[ 4] Char e: 5.52642537243671 % -[ 5] Char r: 5.469770235090492 % -[ 6] Char n: 5.143153368289553 % -[ 7] Char t: 5.1063275290145125 % -[ 8] Char u: 4.270947528844546 % -[ 9] Char k: 3.9621770303076653 % -[10] Char l: 3.905521892961449 % -[11] Char m: 3.360216196004113 % -[12] Char d: 3.037565188817409 % -[13] Char v: 2.727378311846872 % -[14] Char j: 2.447501933356562 % -[15] Char p: 2.3293759719897 % -[16] Char g: 1.942987935288502 % -[17] Char ė: 1.56594799624943 % -[18] Char b: 1.5075932047828267 % -[19] Char y: 1.223750966678281 % -[20] Char ų: 1.1818261650420805 % -[21] Char š: 0.9631373348856841 % -[22] Char ž: 0.8172503562191761 % -[23] Char c: 0.5960120448821998 % -[24] Char č: 0.48015228900918666 % -[25] Char f: 0.42831283833739836 % -[26] Char h: 0.42519680578335645 % -[27] Char z: 0.40111837241121434 % -[28] Char ū: 0.3685416684371397 % -[29] Char ą: 0.352678229980199 % -[30] Char į: 0.29007430321262956 % -[31] Char ę: 0.1481531841603567 % -[32] Char x: 0.08753218719990481 % -[33] Char w: 0.05920461852679646 % -[34] Char ō: 0.018129643950789347 % -[35] Char ö: 0.00878154628866359 % -[36] Char é: 0.007648443541739256 % -[37] Char q: 0.0073651678550081725 % +[ 0] Char i: 13.032394753165974 % +[ 1] Char a: 11.167225131779828 % +[ 2] Char s: 8.586578143101137 % +[ 3] Char o: 7.018815978428046 % +[ 4] Char e: 5.525830545728521 % +[ 5] Char r: 5.469181506354606 % +[ 6] Char n: 5.142599794363987 % +[ 7] Char t: 5.105777918770942 % +[ 8] Char u: 4.270487833202568 % +[ 9] Char k: 3.9617505686147325 % +[10] Char l: 3.9051015292408184 % +[11] Char m: 3.359854525266888 % +[12] Char d: 3.0372382460324427 % +[13] Char v: 2.7270847554602593 % +[14] Char j: 2.4472385009531203 % +[15] Char p: 2.329125253858508 % +[16] Char g: 1.9427788053284087 % +[17] Char ė: 1.5657794482950054 % +[18] Char b: 1.5074309377398734 % 
+[19] Char y: 1.2236192504765602 % +[20] Char ų: 1.181698961339863 % +[21] Char š: 0.9630336693565519 % +[22] Char ž: 0.8171623929687212 % +[23] Char c: 0.5959478942135839 % +[24] Char č: 0.48010060869392807 % +[25] Char f: 0.428266737666796 % +[26] Char h: 0.42515104050123065 % +[27] Char z: 0.4010751987673169 % +[28] Char ū: 0.3685020011273159 % +[29] Char ą: 0.3526402701026197 % +[30] Char į: 0.29004308159444386 % +[31] Char ę: 0.14813723796278724 % +[32] Char x: 0.08752276583269838 % +[33] Char w: 0.059198246145740985 % +[34] Char ō: 0.01812769259965274 % +[35] Char ö: 0.008780601102956797 % +[36] Char é: 0.0076476203154785 % +[37] Char q: 0.007364375118608926 % -The first 38 characters have an accumulated ratio of 0.9997705466937479. +The first 38 characters have an accumulated ratio of 0.9996629382157253. -976 sequences found. +1016 sequences found. -First 512 (typical positive ratio): 0.9930868640383149 -Next 512 (512-1024): 0.008172503562191761 -Rest: -2.688821387764051e-17 +First 512 (typical positive ratio): 0.9928710196247589 +Next 512 (512-1024): 0.008171623929687212 +Rest: -4.85722573273506e-17 -- Processing end: 2016-09-20 22:57:33.185223 +- Processing end: 2016-09-21 00:25:34.935858 diff --git a/script/charsets/iso-8859-10.py b/script/charsets/iso-8859-10.py new file mode 100644 index 0000000..e17d5b6 --- /dev/null +++ b/script/charsets/iso-8859-10.py @@ -0,0 +1,73 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +# ##### BEGIN LICENSE BLOCK ##### +# Version: MPL 1.1/GPL 2.0/LGPL 2.1 +# +# The contents of this file are subject to the Mozilla Public License Version +# 1.1 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, either express or implied. 
See the License +# for the specific language governing rights and limitations under the +# License. +# +# The Original Code is Mozilla Universal charset detector code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 2001 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Jehan +# +# Alternatively, the contents of this file may be used under the terms of +# either the GNU General Public License Version 2 or later (the "GPL"), or +# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +# in which case the provisions of the GPL or the LGPL are applicable instead +# of those above. If you wish to allow use of your version of this file only +# under the terms of either the GPL or the LGPL, and not to allow others to +# use your version of this file under the terms of the MPL, indicate your +# decision by deleting the provisions above and replace them with the notice +# and other provisions required by the GPL or the LGPL. If you do not delete +# the provisions above, a recipient may use your version of this file under +# the terms of any one of the MPL, the GPL or the LGPL. +# +# ##### END LICENSE BLOCK ##### + +from codepoints import * + +name = 'ISO-8859-10' +aliases = ['ISO_8859-10:1992', 'ISO_8859-10', 'iso-ir-157', + 'csISOLatin6', 'latin6', 'l6'] + +language = \ +{ + # Nordic languages. Supersedes ISO-8859-4. 
+ 'complete': [ 'et', 'lv', 'lt', 'kl', 'saam1281' ], + 'incomplete': [] +} + +# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF # +charmap = \ +[ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X + SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X + SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 8X + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 9X + SYM,LET,LET,LET,LET,LET,LET,SYM,LET,LET,LET,LET,LET,SYM,LET,LET, # AX + SYM,LET,LET,LET,LET,LET,LET,SYM,LET,LET,LET,LET,LET,LET,LET,LET, # BX + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # CX + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # DX + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # EX + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # FX +] diff --git a/script/charsets/iso-8859-4.py b/script/charsets/iso-8859-4.py new file mode 100644 index 0000000..d9b3f25 --- /dev/null +++ b/script/charsets/iso-8859-4.py @@ -0,0 +1,73 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +# ##### BEGIN LICENSE BLOCK ##### +# Version: MPL 1.1/GPL 2.0/LGPL 2.1 +# +# The contents of this file are subject to the Mozilla Public License Version +# 1.1 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, either express or implied. 
See the License +# for the specific language governing rights and limitations under the +# License. +# +# The Original Code is Mozilla Universal charset detector code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 2001 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Jehan +# +# Alternatively, the contents of this file may be used under the terms of +# either the GNU General Public License Version 2 or later (the "GPL"), or +# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +# in which case the provisions of the GPL or the LGPL are applicable instead +# of those above. If you wish to allow use of your version of this file only +# under the terms of either the GPL or the LGPL, and not to allow others to +# use your version of this file under the terms of the MPL, indicate your +# decision by deleting the provisions above and replace them with the notice +# and other provisions required by the GPL or the LGPL. If you do not delete +# the provisions above, a recipient may use your version of this file under +# the terms of any one of the MPL, the GPL or the LGPL. +# +# ##### END LICENSE BLOCK ##### + +from codepoints import * + +name = 'ISO-8859-4' +aliases = ['ISO_8859-4:1988', 'ISO_8859-4', 'iso-ir-110', + 'csISOLatin4', 'latin4', 'l4'] + +language = \ +{ + # Nordic languages. Largely superseded by ISO-8859-10. 
+ 'complete': [ 'et', 'lv', 'lt', 'kl', 'saam1281' ], + 'incomplete': [] +} + +# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF # +charmap = \ +[ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X + SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X + SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 8X + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 9X + SYM,LET,LET,LET,SYM,LET,LET,SYM,SYM,LET,LET,LET,LET,SYM,LET,SYM, # AX + SYM,LET,SYM,LET,SYM,LET,LET,SYM,SYM,LET,LET,LET,LET,LET,LET,LET, # BX + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # CX + LET,LET,LET,LET,LET,LET,LET,SYM,LET,LET,LET,LET,LET,LET,LET,LET, # DX + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # EX + LET,LET,LET,LET,LET,LET,LET,SYM,LET,LET,LET,LET,LET,LET,LET,SYM, # FX +] diff --git a/script/langs/lt.py b/script/langs/lt.py index 2f030c8..dbd3297 100644 --- a/script/langs/lt.py +++ b/script/langs/lt.py @@ -50,7 +50,7 @@ code = 'lt' # ASCII characters are also used. use_ascii = True # The charsets we want to support and create data for. -charsets = ['ISO-8859-13'] +charsets = ['ISO-8859-4', 'ISO-8859-10', 'ISO-8859-13', ] ## Optional Properties ## diff --git a/script/langs/lv.py b/script/langs/lv.py index 85d1b39..dad6adf 100644 --- a/script/langs/lv.py +++ b/script/langs/lv.py @@ -50,7 +50,7 @@ code = 'lv' # ASCII characters are also used. use_ascii = True # The charsets we want to support and create data for. 
-charsets = ['ISO-8859-13'] +charsets = ['ISO-8859-4', 'ISO-8859-10', 'ISO-8859-13'] ## Optional Properties ## diff --git a/src/LangModels/LangLatvianModel.cpp b/src/LangModels/LangLatvianModel.cpp index fd60884..9f71799 100644 --- a/src/LangModels/LangLatvianModel.cpp +++ b/src/LangModels/LangLatvianModel.cpp @@ -41,7 +41,7 @@ /** * Generated by BuildLangModel.py - * On: 2016-09-20 23:23:02.592930 + * On: 2016-09-21 00:19:18.362275 **/ /* Character Mapping Table: @@ -61,6 +61,48 @@ * ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 * even though they are both used for French. Same for the euro sign. */ +static const unsigned char Iso_8859_4_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 17, 22, 13, 3, 25, 19, 28, 1, 16, 11, 9, 12, 7, 10, /* 4X */ + 15, 38, 4, 2, 5, 6, 14, 33, 35, 34, 20,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 17, 22, 13, 3, 25, 19, 28, 1, 16, 11, 9, 12, 7, 10, /* 6X */ + 15, 38, 4, 2, 5, 6, 14, 33, 35, 34, 20,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM, 55, 56, 57,SYM, 58, 26,SYM,SYM, 23, 21, 31, 59,SYM, 29,SYM, /* AX */ + SYM, 60,SYM, 61,SYM, 62, 26,SYM,SYM, 23, 21, 31, 63, 48, 29, 48, /* BX */ + 8, 42, 64, 65, 40, 52, 53, 66, 32, 37, 67, 43, 46, 45, 49, 18, /* CX */ + 68, 24, 51, 30, 69, 70, 36,SYM, 71, 72, 73, 74, 39, 75, 27, 44, /* DX */ + 8, 42, 76, 77, 40, 52, 53, 78, 32, 37, 79, 43, 46, 45, 49, 18, /* EX */ + 80, 24, 51, 30, 81, 82, 36,SYM, 83, 84, 85, 86, 39, 87, 27,SYM, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Iso_8859_10_CharToOrderMap[] = 
+{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 17, 22, 13, 3, 25, 19, 28, 1, 16, 11, 9, 12, 7, 10, /* 4X */ + 15, 38, 4, 2, 5, 6, 14, 33, 35, 34, 20,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 17, 22, 13, 3, 25, 19, 28, 1, 16, 11, 9, 12, 7, 10, /* 6X */ + 15, 38, 4, 2, 5, 6, 14, 33, 35, 34, 20,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM, 88, 21, 31, 18, 89, 30,SYM, 26, 90, 23, 91, 29,SYM, 27, 48, /* AX */ + SYM, 92, 21, 31, 18, 93, 30,SYM, 26, 94, 23, 95, 29, 96, 27, 48, /* BX */ + 8, 42, 97, 98, 40, 52, 53, 99, 32, 37,100, 43, 46, 45, 49,101, /* CX */ + 50, 24, 51, 47,102,103, 36,104,105,106,107,108, 39,109, 54, 44, /* DX */ + 8, 42,110,111, 40, 52, 53,112, 32, 37,113, 43, 46, 45, 49,114, /* EX */ + 50, 24, 51, 47,115,116, 36,117,118,119,120,121, 39,122, 54,123, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + static const unsigned char Iso_8859_13_CharToOrderMap[] = { CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ @@ -73,21 +115,21 @@ static const unsigned char Iso_8859_13_CharToOrderMap[] = 15, 38, 4, 2, 5, 6, 14, 33, 35, 34, 20,SYM,SYM,SYM,SYM,CTR, /* 7X */ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ - SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 48,SYM, 49,SYM,SYM,SYM,SYM, 47, /* AX */ - SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 50,SYM, 51,SYM,SYM,SYM,SYM, 47, /* BX */ - 52, 53, 8, 54, 40, 46, 55, 21, 32, 37, 56, 43, 31, 30, 18, 26, /* CX */ - 23, 57, 24, 44, 45, 58, 36,SYM, 59, 41, 60, 27, 39, 61, 29, 42, /* DX */ - 62, 63, 8, 
64, 40, 46, 65, 21, 32, 37, 66, 43, 31, 30, 18, 26, /* EX */ - 23, 67, 24, 44, 45, 68, 36,SYM, 69, 41, 70, 27, 39, 71, 29,SYM, /* FX */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,124,SYM,125,SYM,SYM,SYM,SYM, 53, /* AX */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,126,SYM,127,SYM,SYM,SYM,SYM, 53, /* BX */ + 128,129, 8,130, 40, 52,131, 21, 32, 37,132, 46, 31, 30, 18, 26, /* CX */ + 23,133, 24, 47, 51,134, 36,SYM,135, 41,136, 27, 39,137, 29, 44, /* DX */ + 138,139, 8,140, 40, 52,141, 21, 32, 37,142, 46, 31, 30, 18, 26, /* EX */ + 23,143, 24, 47, 51,144, 36,SYM,145, 41,146, 27, 39,147, 29,SYM, /* FX */ }; /*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ /* Model Table: - * Total sequences: 956 - * First 512 sequences: 0.9904728616367904 - * Next 512 sequences (512-1024): 0.009527138363209666 - * Rest: -4.683753385137379e-17 + * Total sequences: 970 + * First 512 sequences: 0.9904102202220861 + * Next 512 sequences (512-1024): 0.009589779777913882 + * Rest: -1.734723475976807e-17 * Negative sequences: TODO */ static const PRUint8 LatvianLangModel[] = @@ -134,12 +176,32 @@ static const PRUint8 LatvianLangModel[] = }; +const SequenceModel Iso_8859_4LatvianModel = +{ + Iso_8859_4_CharToOrderMap, + LatvianLangModel, + 39, + (float)0.9904102202220861, + PR_TRUE, + "ISO-8859-4" +}; + +const SequenceModel Iso_8859_10LatvianModel = +{ + Iso_8859_10_CharToOrderMap, + LatvianLangModel, + 39, + (float)0.9904102202220861, + PR_TRUE, + "ISO-8859-10" +}; + const SequenceModel Iso_8859_13LatvianModel = { Iso_8859_13_CharToOrderMap, LatvianLangModel, 39, - (float)0.9904728616367904, + (float)0.9904102202220861, PR_TRUE, "ISO-8859-13" -}; +}; \ No newline at end of file diff --git a/src/LangModels/LangLithuanianModel.cpp b/src/LangModels/LangLithuanianModel.cpp index d77d741..280f8a5 100644 --- a/src/LangModels/LangLithuanianModel.cpp +++ b/src/LangModels/LangLithuanianModel.cpp @@ -41,7 +41,7 @@ /** * Generated by BuildLangModel.py - * On: 2016-09-20 22:57:33.077635 + * On: 2016-09-21 
00:25:34.775158 **/ /* Character Mapping Table: @@ -61,6 +61,48 @@ * ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 * even though they are both used for French. Same for the euro sign. */ +static const unsigned char Iso_8859_10_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 1, 18, 23, 12, 4, 25, 16, 26, 0, 14, 9, 10, 11, 6, 3, /* 4X */ + 15, 37, 5, 2, 7, 8, 13, 33, 32, 19, 27,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 1, 18, 23, 12, 4, 25, 16, 26, 0, 14, 9, 10, 11, 6, 3, /* 6X */ + 15, 37, 5, 2, 7, 8, 13, 33, 32, 19, 27,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM, 29, 50, 60, 47, 61, 62,SYM, 56, 55, 21, 63, 22,SYM, 28, 64, /* AX */ + SYM, 29, 50, 65, 47, 66, 67,SYM, 56, 55, 21, 68, 22, 69, 28, 70, /* BX */ + 41, 39, 71, 53, 38, 43, 72, 30, 24, 36, 31, 73, 17, 40, 74, 46, /* CX */ + 75, 57, 34, 44, 59, 76, 35, 77, 48, 20, 54, 78, 45, 79, 80, 52, /* DX */ + 41, 39, 81, 53, 38, 43, 82, 30, 24, 36, 31, 83, 17, 40, 84, 46, /* EX */ + 85, 57, 34, 44, 59, 86, 35, 87, 48, 20, 54, 88, 45, 89, 90, 91, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Iso_8859_4_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 1, 18, 23, 12, 4, 25, 16, 26, 0, 14, 9, 10, 11, 6, 3, /* 4X */ + 15, 37, 5, 2, 7, 8, 13, 33, 32, 19, 
27,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 1, 18, 23, 12, 4, 25, 16, 26, 0, 14, 9, 10, 11, 6, 3, /* 6X */ + 15, 37, 5, 2, 7, 8, 13, 33, 32, 19, 27,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM, 29, 92, 93,SYM, 94, 56,SYM,SYM, 21, 50, 95, 96,SYM, 22,SYM, /* AX */ + SYM, 29,SYM, 97,SYM, 98, 56,SYM,SYM, 21, 50, 99,100,101, 22,102, /* BX */ + 41, 39,103, 53, 38, 43,104, 30, 24, 36, 31,105, 17, 40,106, 47, /* CX */ + 55, 57, 34,107, 59,108, 35,SYM, 48, 20, 54,109, 45,110, 28, 52, /* DX */ + 41, 39,111, 53, 38, 43,112, 30, 24, 36, 31,113, 17, 40,114, 47, /* EX */ + 55, 57, 34,115, 59,116, 35,SYM, 48, 20, 54,117, 45,118, 28,SYM, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + static const unsigned char Iso_8859_13_CharToOrderMap[] = { CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ @@ -73,21 +115,21 @@ static const unsigned char Iso_8859_13_CharToOrderMap[] = 15, 37, 5, 2, 7, 8, 13, 33, 32, 19, 27,SYM,SYM,SYM,SYM,CTR, /* 7X */ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ - SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 45,SYM, 53,SYM,SYM,SYM,SYM, 54, /* AX */ - SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 45,SYM, 55,SYM,SYM,SYM,SYM, 56, /* BX */ - 29, 30, 39, 46, 38, 41, 31, 47, 24, 36, 57, 17, 58, 59, 44, 50, /* CX */ - 21, 48, 51, 42, 34, 60, 35,SYM, 20, 40, 52, 28, 43, 61, 22, 49, /* DX */ - 29, 30, 39, 46, 38, 41, 31, 47, 24, 36, 62, 17, 63, 64, 44, 50, /* EX */ - 21, 48, 51, 42, 34, 65, 35,SYM, 20, 40, 52, 28, 43, 66, 22,SYM, /* FX */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 48,SYM,119,SYM,SYM,SYM,SYM,120, /* AX */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 48,SYM,121,SYM,SYM,SYM,SYM,122, /* BX */ + 29, 30, 41, 49, 38, 43, 31, 50, 24, 36,123, 17,124,125, 47, 56, /* CX */ + 21, 51, 57, 44, 34,126, 35,SYM, 20, 42, 58, 28, 
45,127, 22, 52, /* DX */ + 29, 30, 41, 49, 38, 43, 31, 50, 24, 36,128, 17,129,130, 47, 56, /* EX */ + 21, 51, 57, 44, 34,131, 35,SYM, 20, 42, 58, 28, 45,132, 22,SYM, /* FX */ }; /*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ /* Model Table: - * Total sequences: 976 - * First 512 sequences: 0.9930868640383149 - * Next 512 sequences (512-1024): 0.0069131359616851065 - * Rest: -2.688821387764051e-17 + * Total sequences: 1016 + * First 512 sequences: 0.9928710196247589 + * Next 512 sequences (512-1024): 0.0071289803752411715 + * Rest: -4.85722573273506e-17 * Negative sequences: TODO */ static const PRUint8 LithuanianLangModel[] = @@ -133,12 +175,32 @@ static const PRUint8 LithuanianLangModel[] = }; +const SequenceModel Iso_8859_10LithuanianModel = +{ + Iso_8859_10_CharToOrderMap, + LithuanianLangModel, + 38, + (float)0.9928710196247589, + PR_TRUE, + "ISO-8859-10" +}; + +const SequenceModel Iso_8859_4LithuanianModel = +{ + Iso_8859_4_CharToOrderMap, + LithuanianLangModel, + 38, + (float)0.9928710196247589, + PR_TRUE, + "ISO-8859-4" +}; + const SequenceModel Iso_8859_13LithuanianModel = { Iso_8859_13_CharToOrderMap, LithuanianLangModel, 38, - (float)0.9930868640383149, + (float)0.9928710196247589, PR_TRUE, "ISO-8859-13" }; \ No newline at end of file diff --git a/src/nsSBCSGroupProber.cpp b/src/nsSBCSGroupProber.cpp index 57dbafe..e90902f 100644 --- a/src/nsSBCSGroupProber.cpp +++ b/src/nsSBCSGroupProber.cpp @@ -112,13 +112,17 @@ nsSBCSGroupProber::nsSBCSGroupProber() mProbers[34] = new nsSingleByteCharSetProber(&Windows_1252DanishModel); mProbers[35] = new nsSingleByteCharSetProber(&Iso_8859_13LithuanianModel); + mProbers[36] = new nsSingleByteCharSetProber(&Iso_8859_10LithuanianModel); + mProbers[37] = new nsSingleByteCharSetProber(&Iso_8859_4LithuanianModel); - mProbers[36] = new nsSingleByteCharSetProber(&Iso_8859_13LatvianModel); + mProbers[38] = new nsSingleByteCharSetProber(&Iso_8859_13LatvianModel); + mProbers[39] = new 
nsSingleByteCharSetProber(&Iso_8859_10LatvianModel); + mProbers[40] = new nsSingleByteCharSetProber(&Iso_8859_4LatvianModel); - mProbers[37] = new nsSingleByteCharSetProber(&Iso_8859_1PortugueseModel); - mProbers[38] = new nsSingleByteCharSetProber(&Iso_8859_9PortugueseModel); - mProbers[39] = new nsSingleByteCharSetProber(&Iso_8859_15PortugueseModel); - mProbers[40] = new nsSingleByteCharSetProber(&Windows_1252PortugueseModel); + mProbers[41] = new nsSingleByteCharSetProber(&Iso_8859_1PortugueseModel); + mProbers[42] = new nsSingleByteCharSetProber(&Iso_8859_9PortugueseModel); + mProbers[43] = new nsSingleByteCharSetProber(&Iso_8859_15PortugueseModel); + mProbers[44] = new nsSingleByteCharSetProber(&Windows_1252PortugueseModel); Reset(); } diff --git a/src/nsSBCSGroupProber.h b/src/nsSBCSGroupProber.h index 9cf7ea4..48444de 100644 --- a/src/nsSBCSGroupProber.h +++ b/src/nsSBCSGroupProber.h @@ -40,7 +40,7 @@ #define nsSBCSGroupProber_h__ -#define NUM_OF_SBCS_PROBERS 41 +#define NUM_OF_SBCS_PROBERS 45 class nsCharSetProber; class nsSBCSGroupProber: public nsCharSetProber { diff --git a/src/nsSBCharSetProber.h b/src/nsSBCharSetProber.h index 6000838..5b40db7 100644 --- a/src/nsSBCharSetProber.h +++ b/src/nsSBCharSetProber.h @@ -173,8 +173,12 @@ extern const SequenceModel Iso_8859_1DanishModel; extern const SequenceModel Windows_1252DanishModel; extern const SequenceModel Iso_8859_13LithuanianModel; +extern const SequenceModel Iso_8859_10LithuanianModel; +extern const SequenceModel Iso_8859_4LithuanianModel; extern const SequenceModel Iso_8859_13LatvianModel; +extern const SequenceModel Iso_8859_10LatvianModel; +extern const SequenceModel Iso_8859_4LatvianModel; extern const SequenceModel Iso_8859_1PortugueseModel; extern const SequenceModel Iso_8859_9PortugueseModel; diff --git a/test/lt/iso-8859-10.txt b/test/lt/iso-8859-10.txt new file mode 100644 index 0000000..d005822 --- /dev/null +++ b/test/lt/iso-8859-10.txt @@ -0,0 +1,3 @@ +Vincentas van Gogas (ol. 
Vincent van Gogh, 1853 m. kovo 30 d. Grot Zunderte, +Nyderlandai - 1890 m. liepos 29 d. Overe prie Uazos, Pranczija) - oland +tapytojas ir grafikas, postimpresionistas. diff --git a/test/lt/iso-8859-4.txt b/test/lt/iso-8859-4.txt new file mode 100644 index 0000000..d5ee32a --- /dev/null +++ b/test/lt/iso-8859-4.txt @@ -0,0 +1,3 @@ +Vincentas van Gogas (ol. Vincent van Gogh, 1853 m. kovo 30 d. Grot Zunderte, +Nyderlandai - 1890 m. liepos 29 d. Overe prie Uazos, Pranczija) - oland +tapytojas ir grafikas, postimpresionistas. diff --git a/test/lv/iso-8859-10.txt b/test/lv/iso-8859-10.txt new file mode 100644 index 0000000..6afb063 --- /dev/null +++ b/test/lv/iso-8859-10.txt @@ -0,0 +1,6 @@ +Vinsents Villems van Gogs (nderlandieu: Vincent Willem van Gogh, dzimis 1853. +gada 30. mart, miris 1890. gada 29. jlij) bija nderlandieu gleznotjs, +postimpresionisma prstvis. Kopum van Gogs radja vairk nek 2000 darbu, to +skait 900 gleznu un 1100 zmjumu un skiu. Savus slavenkos darbus vi radja +pdjo divu dzves gadu laik. Tiek uzskatts, ka van Gogs btiski ir ietekmjis +20. gadsimta mkslu, tostarp ekspresionismu un fovismu. diff --git a/test/lv/iso-8859-4.txt b/test/lv/iso-8859-4.txt new file mode 100644 index 0000000..7fd134d --- /dev/null +++ b/test/lv/iso-8859-4.txt @@ -0,0 +1,6 @@ +Vinsents Villems van Gogs (nderlandieu: Vincent Willem van Gogh, dzimis 1853. +gada 30. mart, miris 1890. gada 29. jlij) bija nderlandieu gleznotjs, +postimpresionisma prstvis. Kopum van Gogs radja vairk nek 2000 darbu, to +skait 900 gleznu un 1100 zmjumu un skiu. Savus slavenkos darbus vi radja +pdjo divu dzves gadu laik. Tiek uzskatts, ka van Gogs btiski ir ietekmjis +20. gadsimta mkslu, tostarp ekspresionismu un fovismu.