From ad2f7212e2803373c39b20488b49cd4c58fb2197 Mon Sep 17 00:00:00 2001 From: Jehan Date: Sun, 13 Dec 2015 18:00:07 +0100 Subject: [PATCH] LangModels: retraining Greek models with my training script. This fixes our Greek/Windows-1253 test. --- script/BuildLangModelLogs/LangGreekModel.log | 117 ++++++ script/charsets/iso-8859-7.py | 73 ++++ script/charsets/windows-1253.py | 72 ++++ script/langs/el.py | 63 ++++ src/LangModels/LangGreekModel.cpp | 367 +++++++++---------- src/nsSBCSGroupProber.cpp | 4 +- src/nsSBCharSetProber.h | 4 +- test/CMakeLists.txt | 3 +- 8 files changed, 500 insertions(+), 203 deletions(-) create mode 100644 script/BuildLangModelLogs/LangGreekModel.log create mode 100644 script/charsets/iso-8859-7.py create mode 100644 script/charsets/windows-1253.py create mode 100644 script/langs/el.py diff --git a/script/BuildLangModelLogs/LangGreekModel.log b/script/BuildLangModelLogs/LangGreekModel.log new file mode 100644 index 0000000..c7b5e6f --- /dev/null +++ b/script/BuildLangModelLogs/LangGreekModel.log @@ -0,0 +1,117 @@ += Logs of language model for Greek (el) = + +- Generated by BuildLangModel.py +- Started: 2015-12-13 17:52:58.225697 +- Maximum depth: 2 +- Max number of pages: 50 + +== Parsed pages == + +Πύλη:Κύρια (revision 5511929) +13 Δεκεμβρίου (revision 5540654) +1545 (revision 5340059) +1937 (revision 5573231) +1943 (revision 5503673) +2007 (revision 5561663) +2009 (revision 5561693) +2012 (revision 5559036) +Sukhoi Su-24 (revision 5582048) +Wiki (revision 5481490) +Wikimedia (revision 5563126) +Αναμνηστικά κέρματα ευρώ €2 (revision 5578001) +Αφρική (revision 5485484) +Γερμανία (revision 5579724) +Εγκυκλοπαίδεια (revision 5566281) +Ελεύθερο περιεχόμενο (revision 5285700) +Ελλάδα (revision 5580388) +Ελληνική γλώσσα (revision 5545135) +Ευρωζώνη (revision 5453082) +Ευρωπαϊκή Ένωση (revision 5562182) +Ευρωπαϊκή Επιτροπή (revision 5535040) +Ευρωπαϊκή Κεντρική Τράπεζα (revision 5352451) +Ευρώ (revision 5535228) +Ιαπωνία (revision 5540508) +Κέρματα 
ευρώ (revision 5421943) +Κίνα (revision 5538381) +Καθολική Εκκλησία (revision 5345868) +Καλάβρυτα (revision 5562415) +Κεντροαφρικανική Δημοκρατία (revision 5583804) +Κλίμα (revision 5331688) +Ναντσίνγκ (revision 5460512) +Οικουμενικές σύνοδοι (revision 5377374) +ΠΓΔΜ (revision 5577102) +Πάπας Φραγκίσκος (revision 5565143) +Παρίσι (revision 5524991) +Προτεσταντισμός (revision 5564242) +Πρωθυπουργός της Πορτογαλίας (revision 4986657) +Σφαγή της Ναντσίνγκ (revision 5026948) +Σφαγή των Καλαβρύτων (revision 5491100) +Σύνοδος των Ηνωμένων Εθνών για το κλίμα (2015) (revision 5521523) +Τουρκική κατάρριψη ρωσικού Sukhoi Su-24 (revision 5582048) +Χρυσά και ασημένια συλλεκτικά νομίσματα Ευρώ (revision 4458078) +10 Δεκεμβρίου (revision 5556215) +1124 (revision 5556117) +11 Δεκεμβρίου (revision 5537830) +1204 (revision 5234676) +1250 (revision 5445111) +1294 (revision 5563589) +12 Δεκεμβρίου (revision 5539079) + +== End of Parsed pages == + +- Wikipedia parsing ended at: 2015-12-13 17:54:25.103854 + +62 characters appeared 551775 times. 
+ +First 41 characters: +[ 0] Char α: 9.098636219473518 % +[ 1] Char ο: 8.030447193149381 % +[ 2] Char τ: 7.717819763490554 % +[ 3] Char ι: 6.6942141271351545 % +[ 4] Char ε: 6.213764668569617 % +[ 5] Char ν: 5.920166734629151 % +[ 6] Char ρ: 4.645552988083911 % +[ 7] Char κ: 4.4978478546508995 % +[ 8] Char σ: 4.235422047030039 % +[ 9] Char η: 3.9173576185945356 % +[10] Char ς: 3.821666440125051 % +[11] Char π: 3.59023152553124 % +[12] Char μ: 3.2670925649041727 % +[13] Char υ: 3.258755833446604 % +[14] Char λ: 2.7634452448914866 % +[15] Char ί: 2.437406551583526 % +[16] Char ό: 2.070409134157945 % +[17] Char ά: 1.8300937882288977 % +[18] Char έ: 1.6805763218703276 % +[19] Char γ: 1.6767704227266547 % +[20] Char δ: 1.5888722758370715 % +[21] Char ω: 1.4756014679896698 % +[22] Char ή: 1.2963617416519415 % +[23] Char χ: 1.1928775316025553 % +[24] Char ύ: 0.9763037469983236 % +[25] Char θ: 0.8885868334012957 % +[26] Char ώ: 0.8104752843097277 % +[27] Char β: 0.7689728603144398 % +[28] Char φ: 0.6885052784196457 % +[29] Char ξ: 0.32549499343029314 % +[30] Char ζ: 0.3108150967332699 % +[31] Char i: 0.22273571655113045 % +[32] Char e: 0.2096869194871098 % +[33] Char a: 0.17742739341216981 % +[34] Char o: 0.14534910062978568 % +[35] Char n: 0.1428118345340039 % +[36] Char s: 0.12432603869330797 % +[37] Char r: 0.12305740564541706 % +[38] Char ϊ: 0.10819627565583799 % +[39] Char t: 0.10819627565583799 % +[40] Char ψ: 0.1040279099270536 % + +The first 41 characters have an accumulated ratio of 0.9915635902315255. + +1299 sequences found. 
+ +First 512 (typical positive ratio): 0.9690985257709991 +Next 512 (512-1024): 0.029851434797603802 +Rest: 0.0010500394313971116 + +- Processing end: 2015-12-13 17:54:25.303820 diff --git a/script/charsets/iso-8859-7.py b/script/charsets/iso-8859-7.py new file mode 100644 index 0000000..b5ad188 --- /dev/null +++ b/script/charsets/iso-8859-7.py @@ -0,0 +1,73 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +# ##### BEGIN LICENSE BLOCK ##### +# Version: MPL 1.1/GPL 2.0/LGPL 2.1 +# +# The contents of this file are subject to the Mozilla Public License Version +# 1.1 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License +# for the specific language governing rights and limitations under the +# License. +# +# The Original Code is Mozilla Universal charset detector code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 2001 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Jehan +# +# Alternatively, the contents of this file may be used under the terms of +# either the GNU General Public License Version 2 or later (the "GPL"), or +# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +# in which case the provisions of the GPL or the LGPL are applicable instead +# of those above. If you wish to allow use of your version of this file only +# under the terms of either the GPL or the LGPL, and not to allow others to +# use your version of this file under the terms of the MPL, indicate your +# decision by deleting the provisions above and replace them with the notice +# and other provisions required by the GPL or the LGPL. 
If you do not delete +# the provisions above, a recipient may use your version of this file under +# the terms of any one of the MPL, the GPL or the LGPL. +# +# ##### END LICENSE BLOCK ##### + +from codepoints import * + +name = 'ISO-8859-7' +aliases = ['ISO_8859-7:1987', 'ISO_8859-7', 'iso-ir-126', + 'ELOT_928', 'ECMA-118', 'greek', 'greek8', 'csISOLatinGreek'] + +language = \ +{ + # Dedicated to modern Greek. + 'complete': [ 'el' ], + 'incomplete': [] +} + +# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF # +charmap = \ +[ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X + SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X + SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 8X + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 9X + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,ILL,SYM, # AX + SYM,SYM,SYM,SYM,SYM,SYM,LET,SYM,LET,LET,LET,SYM,LET,SYM,LET,LET, # BX + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # CX + LET,LET,ILL,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # DX + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # EX + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,ILL, # FX +] diff --git a/script/charsets/windows-1253.py b/script/charsets/windows-1253.py new file mode 100644 index 0000000..b39513a --- /dev/null +++ b/script/charsets/windows-1253.py @@ -0,0 +1,72 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +# ##### BEGIN LICENSE BLOCK ##### +# Version: MPL 1.1/GPL 2.0/LGPL 2.1 +# 
+# The contents of this file are subject to the Mozilla Public License Version +# 1.1 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License +# for the specific language governing rights and limitations under the +# License. +# +# The Original Code is Mozilla Universal charset detector code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 2001 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Jehan +# +# Alternatively, the contents of this file may be used under the terms of +# either the GNU General Public License Version 2 or later (the "GPL"), or +# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +# in which case the provisions of the GPL or the LGPL are applicable instead +# of those above. If you wish to allow use of your version of this file only +# under the terms of either the GPL or the LGPL, and not to allow others to +# use your version of this file under the terms of the MPL, indicate your +# decision by deleting the provisions above and replace them with the notice +# and other provisions required by the GPL or the LGPL. If you do not delete +# the provisions above, a recipient may use your version of this file under +# the terms of any one of the MPL, the GPL or the LGPL. +# +# ##### END LICENSE BLOCK ##### + +from codepoints import * + +name = 'WINDOWS-1253' +aliases = ['cswindows1253'] + +language = \ +{ + # Greek support. 
+ 'complete': ['el'], + 'incomplete': [] +} + +# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF # +charmap = \ +[ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X + SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X + SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X + SYM,ILL,SYM,SYM,SYM,SYM,SYM,SYM,ILL,SYM,ILL,SYM,ILL,ILL,ILL,ILL, # 8X + ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,ILL,SYM,ILL,SYM,ILL,ILL,ILL,ILL, # 9X + SYM,SYM,LET,SYM,SYM,SYM,SYM,SYM,SYM,SYM,ILL,SYM,SYM,SYM,SYM,SYM, # AX + SYM,SYM,SYM,SYM,SYM,LET,SYM,SYM,LET,LET,LET,SYM,LET,SYM,LET,LET, # BX + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # CX + LET,LET,ILL,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # DX + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # EX + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,ILL, # FX +] diff --git a/script/langs/el.py b/script/langs/el.py new file mode 100644 index 0000000..7d12262 --- /dev/null +++ b/script/langs/el.py @@ -0,0 +1,63 @@ +#!/bin/python3 +# -*- coding: utf-8 -*- + +# ##### BEGIN LICENSE BLOCK ##### +# Version: MPL 1.1/GPL 2.0/LGPL 2.1 +# +# The contents of this file are subject to the Mozilla Public License Version +# 1.1 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, either express or implied. 
See the License +# for the specific language governing rights and limitations under the +# License. +# +# The Original Code is Mozilla Universal charset detector code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 2001 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Jehan +# +# Alternatively, the contents of this file may be used under the terms of +# either the GNU General Public License Version 2 or later (the "GPL"), or +# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +# in which case the provisions of the GPL or the LGPL are applicable instead +# of those above. If you wish to allow use of your version of this file only +# under the terms of either the GPL or the LGPL, and not to allow others to +# use your version of this file under the terms of the MPL, indicate your +# decision by deleting the provisions above and replace them with the notice +# and other provisions required by the GPL or the LGPL. If you do not delete +# the provisions above, a recipient may use your version of this file under +# the terms of any one of the MPL, the GPL or the LGPL. +# +# ##### END LICENSE BLOCK ##### + +import re + +## Mandatory Properties ## + +name = 'Greek' +code = 'el' +use_ascii = False +charsets = ['ISO-8859-7', 'WINDOWS-1253'] + +## Optional Properties ## + +alphabet = 'αβγδεζηθικλμνξοπρσςτυφχψω' +start_pages = ['Πύλη:Κύρια'] +wikipedia_code = code +case_mapping = True + +# A function to clean content returned by the `wikipedia` python lib, +# in case some unwanted data has been overlooked. 
+def clean_wikipedia_content(content): + cleaned = re.sub(r'(=+) *([^=]+) *Επεξεργασία \1', + r'\2', + content) + return cleaned diff --git a/src/LangModels/LangGreekModel.cpp b/src/LangModels/LangGreekModel.cpp index ed23e4d..5692e01 100644 --- a/src/LangModels/LangGreekModel.cpp +++ b/src/LangModels/LangGreekModel.cpp @@ -36,211 +36,184 @@ * ***** END LICENSE BLOCK ***** */ #include "../nsSBCharSetProber.h" -/**************************************************************** -CTR: Control characters that usually does not exist in any text -RET: Carriage/Return -SYM: symbol (punctuation) that does not belong to word -NUM: 0 - 9 -*****************************************************************/ +/********* Language model for: Greek *********/ -//Character Mapping Table: -static const unsigned char Latin7_CharToOrderMap[] = +/** + * Generated by BuildLangModel.py + * On: 2015-12-13 17:54:25.105295 + **/ + +/* Character Mapping Table: + * ILL: illegal character. + * CTR: control character specific to the charset. + * RET: carriage/return. + * SYM: symbol (punctuation) that does not belong to word. + * NUM: 0 - 9. + * + * Other characters are ordered by probabilities + * (0 is the most common character in the language). + * + * Orders are generic to a language. So the codepoint with order X in + * CHARSET1 maps to the same character as the codepoint with the same + * order X in CHARSET2 for the same language. + * As such, it is possible to get missing order. For instance the + * ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 + * even though they are both used for French. Same for the euro sign. 
+ */ +static const unsigned char Windows_1253_CharToOrderMap[] = { -CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, //00 -CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, //10 -SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, //20 -NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, //30 -SYM, 82,100,104, 94, 98,101,116,102,111,187,117, 92, 88,113, 85, //40 - 79,118,105, 83, 67,114,119, 95, 99,109,188,SYM,SYM,SYM,SYM,SYM, //50 -SYM, 72, 70, 80, 81, 60, 96, 93, 89, 68,120, 97, 77, 86, 69, 55, //60 - 78,115, 65, 66, 58, 76,106,103, 87,107,112,SYM,SYM,SYM,SYM,CTR, //70 -CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, //80 -CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, //90 -SYM,233, 90,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 74,ILL,SYM, //a0 -SYM,SYM,SYM,SYM,247,248, 61, 36, 46, 71, 73,SYM, 54,SYM,108,123, //b0 -110, 31, 51, 43, 41, 34, 91, 40, 52, 47, 44, 53, 38, 49, 59, 39, //c0 - 35, 48,ILL, 37, 33, 45, 56, 50, 84, 57,120,121, 17, 18, 22, 15, //d0 -124, 1, 29, 20, 21, 3, 32, 13, 25, 5, 11, 16, 10, 6, 30, 4, //e0 - 9, 8, 14, 7, 2, 12, 28, 23, 42, 24, 64, 75, 19, 26, 27,ILL, //f0 + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 33, 51, 41, 43, 32, 53, 50, 48, 31, 56, 45, 42, 46, 35, 34, /* 4X */ + 47, 60, 37, 36, 39, 44, 54, 49, 57, 52, 59,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 33, 51, 41, 43, 32, 53, 50, 48, 31, 56, 45, 42, 46, 35, 34, /* 6X */ + 47, 60, 37, 36, 39, 44, 54, 49, 57, 52, 59,SYM,SYM,SYM,SYM,CTR, /* 7X */ + SYM,ILL,SYM,SYM,SYM,SYM,SYM,SYM,ILL,SYM,ILL,SYM,ILL,ILL,ILL,ILL, /* 8X */ + ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,ILL,SYM,ILL,SYM,ILL,ILL,ILL,ILL, /* 9X */ + SYM,SYM, 
17,SYM,SYM,SYM,SYM,SYM,SYM,SYM,ILL,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM,SYM, 61,SYM,SYM, 18, 22, 15,SYM, 16,SYM, 24, 26, /* BX */ + 55, 0, 27, 19, 20, 4, 30, 9, 25, 3, 7, 14, 12, 5, 29, 1, /* CX */ + 11, 6,ILL, 8, 2, 13, 28, 23, 40, 21, 38, 58, 17, 18, 22, 15, /* DX */ + 62, 0, 27, 19, 20, 4, 30, 9, 25, 3, 7, 14, 12, 5, 29, 1, /* EX */ + 11, 6, 10, 8, 2, 13, 28, 23, 40, 21, 38, 58, 16, 24, 26,ILL, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Iso_8859_7_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 33, 51, 41, 43, 32, 53, 50, 48, 31, 56, 45, 42, 46, 35, 34, /* 4X */ + 47, 60, 37, 36, 39, 44, 54, 49, 57, 52, 59,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 33, 51, 41, 43, 32, 53, 50, 48, 31, 56, 45, 42, 46, 35, 34, /* 6X */ + 47, 60, 37, 36, 39, 44, 54, 49, 57, 52, 59,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,ILL,SYM, /* AX */ + SYM,SYM,SYM,SYM,SYM,SYM, 17,SYM, 18, 22, 15,SYM, 16,SYM, 24, 26, /* BX */ + 55, 0, 27, 19, 20, 4, 30, 9, 25, 3, 7, 14, 12, 5, 29, 1, /* CX */ + 11, 6,ILL, 8, 2, 13, 28, 23, 40, 21, 38, 58, 17, 18, 22, 15, /* DX */ + 62, 0, 27, 19, 20, 4, 30, 9, 25, 3, 7, 14, 12, 5, 29, 1, /* EX */ + 11, 6, 10, 8, 2, 13, 28, 23, 40, 21, 38, 58, 16, 24, 26,ILL, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + + +/* Model Table: + * Total sequences: 1299 + * First 512 sequences: 0.9690985257709991 + * Next 512 sequences (512-1024): 0.029851434797603802 + * Rest: 0.0010500394313971116 + * Negative sequences: TODO + 
*/ +static const PRUint8 GreekLangModel[] = +{ + 1,2,3,3,3,3,3,3,3,1,3,3,3,3,3,3,2,2,2,3, + 3,0,2,3,3,3,1,3,3,3,3,0,0,0,0,0,0,0,3,0,2, + 2,2,3,3,3,3,3,3,3,2,3,3,3,3,3,3,2,2,2,3, + 3,2,2,3,3,3,2,3,3,2,2,0,0,0,0,0,0,0,2,0,2, + 3,3,2,3,3,2,3,1,3,3,2,0,3,3,2,3,3,3,3,2, + 0,3,3,1,3,1,3,2,1,0,3,0,0,0,1,0,0,0,0,0,0, + 3,3,3,1,3,3,3,3,3,2,3,3,3,1,3,1,3,3,3,3, + 3,3,2,3,0,3,3,3,3,2,3,0,0,0,0,0,0,0,0,0,2, + 2,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,2,2,1,3, + 3,3,1,3,3,3,2,3,3,3,3,0,1,0,0,0,1,0,2,0,2, + 3,3,3,3,3,3,2,2,3,3,2,1,2,2,2,3,3,3,3,3, + 3,3,3,2,2,3,3,1,1,0,2,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,3,3,3,3,3,1,2,3,3,2,3,3,3,3,3, + 3,3,3,3,3,3,3,3,3,2,2,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,2,3,3,2,3,2,2,2,3,3,3,3,3,3,2, + 3,3,3,2,3,2,3,2,2,0,0,0,0,0,1,0,0,0,0,0,0, + 3,3,3,3,3,2,2,3,3,3,0,3,3,3,3,3,3,3,3,1, + 2,3,3,3,3,3,3,2,3,0,0,0,0,0,0,0,0,0,0,0,0, + 2,1,3,0,2,3,3,3,3,1,3,3,3,0,3,0,0,0,1,3, + 2,0,0,2,1,3,0,1,3,2,0,0,0,0,0,0,0,0,0,0,2, + 0,1,1,0,0,0,0,0,1,2,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,2,3,1,1,3,1,2,1,2,3,3,3,3,3,3, + 1,3,3,2,2,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,0,3,3,3,2,1,2,3,2,3,3,2,2,3,3,3,3,0, + 0,3,3,0,2,0,3,3,3,0,1,0,0,0,0,0,0,0,0,0,2, + 3,2,3,2,2,3,3,3,3,2,3,3,3,0,3,2,2,3,2,3, + 3,0,2,3,0,3,1,3,2,3,3,0,0,0,0,0,0,0,1,0,3, + 3,3,3,3,3,2,0,2,2,3,2,2,3,3,3,3,3,3,3,3, + 2,3,3,2,3,2,3,3,2,1,1,0,0,0,0,0,0,0,0,0,1, + 3,3,3,0,3,3,3,3,3,2,3,3,3,0,3,0,0,0,0,3, + 3,3,0,3,0,3,0,2,2,2,3,0,0,0,0,0,0,0,0,0,2, + 2,2,3,2,3,3,3,3,3,2,3,3,3,0,3,0,0,0,0,3, + 3,1,0,3,0,2,0,2,3,2,2,0,0,0,0,0,0,0,0,0,2, + 2,2,3,3,2,3,3,3,3,2,3,3,3,1,3,0,0,0,0,3, + 3,1,0,3,0,3,0,3,3,3,3,0,0,0,0,0,0,0,1,0,2, + 3,3,3,2,2,3,3,3,3,1,3,3,3,0,3,0,0,0,0,3, + 3,3,0,3,0,2,0,2,3,2,2,0,0,0,0,0,0,0,0,0,2, + 3,3,0,3,3,3,3,3,0,3,0,0,3,2,3,3,3,3,3,3, + 3,3,3,3,2,0,2,1,0,1,0,0,0,0,0,0,0,0,0,0,0, + 3,3,0,3,3,1,3,1,0,3,0,0,3,3,0,3,3,3,3,0, + 1,3,3,0,3,0,2,0,1,0,0,0,0,0,0,0,0,0,0,0,0, + 2,1,3,2,1,3,3,2,3,0,3,3,3,0,2,1,0,2,1,3, + 2,0,2,2,0,3,0,2,2,2,2,0,0,0,0,0,0,0,0,0,0, + 
0,0,3,0,1,3,3,3,3,0,3,2,3,1,3,0,0,0,0,3, + 2,0,0,2,0,3,0,1,2,2,1,0,0,0,0,0,0,0,0,0,2, + 3,3,2,3,3,3,3,0,1,3,1,0,2,3,2,3,2,3,3,0, + 0,3,3,0,2,3,3,0,1,0,0,0,0,0,0,0,0,0,0,0,0, + 2,3,3,2,3,3,3,2,3,2,3,3,3,0,3,0,0,0,0,3, + 2,2,0,2,0,3,0,2,2,2,2,0,0,0,0,0,0,0,0,0,2, + 3,3,0,3,3,3,3,0,0,3,0,0,3,3,2,2,3,3,3,0, + 0,2,3,0,2,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 2,2,3,2,0,3,3,2,3,2,3,3,3,0,2,0,0,0,0,2, + 2,2,0,2,0,3,0,2,2,2,2,0,0,0,0,0,0,0,0,0,0, + 3,3,0,3,3,1,3,1,2,3,0,0,1,2,3,3,3,3,3,2, + 2,2,2,0,2,0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,1,3,1,2,3,0,1,1,3,2,3,2,3,3,2, + 0,3,3,0,2,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,2,3,3,0,0,0,0,3,0,1,0,2,0,2,2,3,3,0, + 0,3,2,0,3,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,0,3,3,2,1,0,0,2,0,0,2,2,1,2,2,2,2,0, + 0,3,2,1,1,0,3,2,0,0,1,0,0,0,0,0,0,0,0,0,0, + 0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,1,2,3,2,3,3,2,0,2,0, + 0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,2,2,2,2,3,3,3,0,2,0, + 0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0, + 1,0,0,0,0,0,0,0,0,0,0,2,2,1,2,3,2,3,0,3,0, + 0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,2,1,1,2,3,2,3,0,2,0, + 0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,2,2,2,2,2,2,1,0,2,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,2,2,2,2,2,2,1,0,2,0, + 0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,2,3,2,3,2,2,2,0,2,0, + 2,0,2,0,1,2,2,3,2,0,1,1,2,0,2,0,2,1,0,2, + 1,0,0,1,0,0,0,2,0,0,1,0,0,0,0,0,0,0,0,0,0, + 0,1,1,0,0,0,0,0,1,2,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,3,2,2,2,0,2,2,0,2,0, + 2,2,0,2,3,0,0,0,0,3,0,0,0,2,0,2,2,1,1,0, + 0,1,2,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0, }; - -static const unsigned char win1253_CharToOrderMap[] = +const SequenceModel Windows_1253GreekModel = { -CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, //00 -CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, //10 -SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, //20 
-NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, //30 -SYM, 82,100,104, 94, 98,101,116,102,111,187,117, 92, 88,113, 85, //40 - 79,118,105, 83, 67,114,119, 95, 99,109,188,SYM,SYM,SYM,SYM,SYM, //50 -SYM, 72, 70, 80, 81, 60, 96, 93, 89, 68,120, 97, 77, 86, 69, 55, //60 - 78,115, 65, 66, 58, 76,106,103, 87,107,112,SYM,SYM,SYM,SYM,SYM, //70 -CTR,ILL,CTR,CTR,CTR,CTR,CTR,CTR,ILL,CTR,ILL,CTR,ILL,ILL,ILL,ILL, //80 -ILL,CTR,CTR,CTR,CTR,CTR,CTR,CTR,ILL,CTR,ILL,CTR,ILL,ILL,ILL,ILL, //90 -SYM,233, 61,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 74,SYM,SYM, //a0 -SYM,SYM,SYM,SYM,247,SYM,SYM, 36, 46, 71, 73,SYM, 54,SYM,108,123, //b0 -110, 31, 51, 43, 41, 34, 91, 40, 52, 47, 44, 53, 38, 49, 59, 39, //c0 - 35, 48,ILL, 37, 33, 45, 56, 50, 84, 57,120,121, 17, 18, 22, 15, //d0 -124, 1, 29, 20, 21, 3, 32, 13, 25, 5, 11, 16, 10, 6, 30, 4, //e0 - 9, 8, 14, 7, 2, 12, 28, 23, 42, 24, 64, 75, 19, 26, 27,ILL, //f0 -}; - -//Model Table: -//total sequences: 100% -//first 512 sequences: 98.2851% -//first 1024 sequences:1.7001% -//rest sequences: 0.0359% -//negative sequences: 0.0148% -static const PRUint8 GreekLangModel[] = -{ -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,0,3,2,2,3,3,3,3,3,3,3,3,1,3,3,3,0,2,2,3,3,0,3,0,3,2,0,3,3,3,0, -3,0,0,0,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,3,3,3,3,3,0,3,3,0,3,2,3,3,0,3,2,3,3,3,0,0,3,0,3,0,3,3,2,0,0,0, -2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0, -0,2,3,2,2,3,3,3,3,3,3,3,3,0,3,3,3,3,0,2,3,3,0,3,3,3,3,2,3,3,3,0, -2,0,0,0,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,2,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,0,2,1,3,3,3,3,2,3,3,2,3,3,2,0, -0,0,0,0,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,3,3,3,3,0,3,3,3,3,3,3,0,3,3,0,3,3,3,3,3,3,3,3,3,3,0,3,2,3,3,0, -2,0,1,0,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0, -0,3,3,3,3,3,2,3,0,0,0,0,3,3,0,3,1,3,3,3,0,3,3,0,3,3,3,3,0,0,0,0, 
-2,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,3,3,3,3,3,0,3,0,3,3,3,3,3,0,3,2,2,2,3,0,2,3,3,3,3,3,2,3,3,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,3,3,3,3,3,3,2,2,2,3,3,3,3,0,3,1,3,3,3,3,2,3,3,3,3,3,3,3,2,2,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,3,3,3,3,3,2,0,3,0,0,0,3,3,2,3,3,3,3,3,0,0,3,2,3,0,2,3,0,0,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,3,0,3,3,3,3,0,0,3,3,0,2,3,0,3,0,3,3,3,0,0,3,0,3,0,2,2,3,3,0,0, -0,0,1,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,3,3,3,3,3,2,0,3,2,3,3,3,3,0,3,3,3,3,3,0,3,3,2,3,2,3,3,2,0,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,3,3,2,3,2,3,3,3,3,3,3,0,2,3,2,3,2,2,2,3,2,3,3,2,3,0,2,2,2,3,0, -2,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,0,3,0,0,0,3,3,3,2,3,3,0,0,3,0,3,0,0,0,3,2,0,3,0,3,0,0,2,0,2,0, -0,0,0,0,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,3,3,3,3,0,3,3,3,3,3,3,0,3,3,0,3,0,0,0,3,3,0,3,3,3,0,0,1,2,3,0, -3,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,3,3,3,3,3,2,0,0,3,2,2,3,3,0,3,3,3,3,3,2,1,3,0,3,2,3,3,2,1,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,0,3,3,0,2,3,3,3,3,3,3,0,0,3,0,3,0,0,0,3,3,0,3,2,3,0,0,3,3,3,0, -3,0,0,0,2,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,3,3,3,3,0,3,3,3,3,3,3,0,0,3,0,3,0,0,0,3,2,0,3,2,3,0,0,3,2,3,0, -2,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,0,3,1,2,2,3,3,3,3,3,3,0,2,3,0,3,0,0,0,3,3,0,3,0,2,0,0,2,3,1,0, -2,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,3,0,3,3,3,3,0,3,0,3,3,2,3,0,3,3,3,3,3,3,0,3,3,3,0,2,3,0,0,3,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,3,0,3,3,3,0,0,3,0,0,0,3,3,0,3,0,2,3,3,0,0,3,0,3,0,3,3,0,0,0,0, 
-0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,0,3,0,0,0,3,3,3,3,3,3,0,0,3,0,2,0,0,0,3,3,0,3,0,3,0,0,2,0,2,0, -0,0,0,0,1,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,3,3,3,3,3,3,0,3,0,2,0,3,2,0,3,2,3,2,3,0,0,3,2,3,2,3,3,0,0,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,0,3,0,0,2,3,3,3,3,3,0,0,0,3,0,2,1,0,0,3,2,2,2,0,3,0,0,2,2,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,3,0,3,3,3,2,0,3,0,3,0,3,3,0,2,1,2,3,3,0,0,3,0,3,0,3,3,0,0,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,2,3,3,3,0,3,3,3,3,3,3,0,2,3,0,3,0,0,0,2,1,0,2,2,3,0,0,2,2,2,0, -0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,0,3,0,0,2,3,3,3,2,3,0,0,1,3,0,2,0,0,0,0,3,0,1,0,2,0,0,1,1,1,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,3,3,3,3,3,1,0,3,0,0,0,3,2,0,3,2,3,3,3,0,0,3,0,3,2,2,2,1,0,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,3,0,3,3,3,0,0,3,0,0,0,0,2,0,2,3,3,2,2,2,2,3,0,2,0,2,2,0,0,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,3,3,3,3,2,0,0,0,0,0,0,2,3,0,2,0,2,3,2,0,0,3,0,3,0,3,1,0,0,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,0,0,0,0,0,3,2,3,3,2,2,3,0,2,0,3,0,0,0,2,0,0,0,0,1,2,0,2,0,2,0, -0,2,0,2,0,2,2,0,0,1,0,2,2,2,0,2,2,2,0,2,2,2,0,0,2,0,0,1,0,0,0,0, -0,2,0,3,3,2,0,0,0,0,0,0,1,3,0,2,0,2,2,2,0,0,2,0,3,0,0,2,0,0,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,3,0,2,3,2,0,2,2,0,2,0,2,2,0,2,0,2,2,2,0,0,0,0,0,0,2,3,0,0,0,2, -0,1,2,0,0,0,0,2,2,0,0,0,2,1,0,2,2,0,0,0,0,0,0,1,0,2,0,0,0,0,0,0, -0,0,2,1,0,2,3,2,2,3,2,3,2,0,0,3,3,3,0,0,3,2,0,0,0,1,1,0,2,0,2,2, -0,2,0,2,0,2,2,0,0,2,0,2,2,2,0,2,2,2,2,0,0,2,0,0,0,2,0,1,0,0,0,0, -0,3,0,3,3,2,2,0,3,0,0,0,2,2,0,2,2,2,1,2,0,0,1,2,2,0,0,3,0,0,0,2, -0,1,2,0,0,0,1,2,0,0,0,0,0,0,0,2,2,0,1,0,0,2,0,0,0,2,0,0,0,0,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 
-0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,2,3,3,2,2,0,0,0,2,0,2,3,3,0,2,0,0,0,0,0,0,2,2,2,0,2,2,0,2,0,2, -0,2,2,0,0,2,2,2,2,1,0,0,2,2,0,2,0,0,2,0,0,0,0,0,0,2,0,0,0,0,0,0, -0,2,0,3,2,3,0,0,0,3,0,0,2,2,0,2,0,2,2,2,0,0,2,0,0,0,0,0,0,0,0,2, -0,0,2,2,0,0,2,2,2,0,0,0,0,0,0,2,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0, -0,0,2,0,0,3,2,0,2,2,2,2,2,0,0,0,2,0,0,0,0,2,0,1,0,0,2,0,1,0,0,0, -0,2,2,2,0,2,2,0,1,2,0,2,2,2,0,2,2,2,2,1,2,2,0,0,2,0,0,0,0,0,0,0, -0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0, -0,2,0,2,0,2,2,0,0,0,0,1,2,1,0,0,2,2,0,0,2,0,0,0,0,0,0,0,0,0,0,0, -0,0,0,3,2,3,0,0,2,0,0,0,2,2,0,2,0,0,0,1,0,0,2,0,2,0,2,2,0,0,0,0, -0,0,2,0,0,0,0,2,2,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0, -0,2,2,3,2,2,0,0,0,0,0,0,1,3,0,2,0,2,2,0,0,0,1,0,2,0,0,0,0,0,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,2,0,2,0,3,2,0,2,0,0,0,0,0,0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1, -0,0,2,0,0,0,0,1,1,0,0,2,1,2,0,2,2,0,1,0,0,1,0,0,0,2,0,0,0,0,0,0, -0,3,0,2,2,2,0,0,2,0,0,0,2,0,0,0,2,3,0,2,0,0,0,0,0,0,2,2,0,0,0,2, -0,1,2,0,0,0,1,2,2,1,0,0,0,2,0,0,2,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0, -0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,2,1,2,0,2,2,0,2,0,0,2,0,0,0,0,1,2,1,0,2,1,0,0,0,0,0,0,0,0,0,0, -0,0,2,0,0,0,3,1,2,2,0,2,0,0,0,0,2,0,0,0,2,0,0,3,0,0,0,0,2,2,2,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,2,1,0,2,0,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,1,0,0,0,0,0,0,2, -0,2,2,0,0,2,2,2,2,2,0,1,2,0,0,0,2,2,0,1,0,2,0,0,2,2,0,0,0,0,0,0, -0,0,0,0,1,0,0,0,0,0,0,0,3,0,0,2,0,0,0,0,0,0,0,0,2,0,2,0,0,0,0,2, -0,1,2,0,0,0,0,2,2,1,0,1,0,1,0,2,2,2,1,0,0,0,0,0,0,1,0,0,0,0,0,0, -0,2,0,1,2,0,0,0,0,0,0,0,0,0,0,2,0,0,2,2,0,0,0,0,1,0,0,0,0,0,0,2, -0,2,2,0,0,0,0,2,2,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,2,0,0,2,0,0,0, -0,2,2,2,2,0,0,0,3,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,2,0,0,0,0,0,0,1, -0,0,2,0,0,0,0,1,2,0,0,0,0,0,0,2,2,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0, -0,2,0,2,2,2,0,0,2,0,0,0,0,0,0,0,2,2,2,0,0,0,2,0,0,0,0,0,0,0,0,2, 
-0,0,1,0,0,0,0,2,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0, -0,3,0,2,0,0,0,0,0,0,0,0,2,0,0,0,0,0,2,0,0,0,0,0,0,0,2,0,0,0,0,2, -0,0,2,0,0,0,0,2,2,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,2,0,2,2,1,0,0,0,0,0,0,2,0,0,2,0,2,2,2,0,0,0,0,0,0,2,0,0,0,0,2, -0,0,2,0,0,2,0,2,2,0,0,0,0,2,0,2,0,0,0,0,0,2,0,0,0,2,0,0,0,0,0,0, -0,0,3,0,0,0,2,2,0,2,2,0,0,0,0,0,2,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,2,0,0,0,0,0, -0,2,2,2,2,2,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1, -0,0,0,0,0,0,0,2,1,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,0,0,0,0,0,0,2,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0, -0,2,0,0,0,2,0,0,0,0,0,1,0,0,0,0,2,2,0,0,0,1,0,0,0,0,0,0,0,0,0,0, -0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,2,0,0,0, -0,2,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,2,0,2,0,0,0, -0,0,0,0,0,0,0,0,2,1,0,0,0,0,0,0,2,0,0,0,1,2,0,0,0,0,0,0,0,0,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -}; - -const SequenceModel Latin7GreekModel = -{ - Latin7_CharToOrderMap, + Windows_1253_CharToOrderMap, GreekLangModel, - 64, - (float)0.982851, - PR_FALSE, - "ISO-8859-7" -}; - -const SequenceModel Win1253GreekModel = -{ - win1253_CharToOrderMap, - GreekLangModel, - 64, - (float)0.982851, + 41, + (float)0.9690985257709991, PR_FALSE, 
"WINDOWS-1253" }; + +const SequenceModel Iso_8859_7GreekModel = +{ + Iso_8859_7_CharToOrderMap, + GreekLangModel, + 41, + (float)0.9690985257709991, + PR_FALSE, + "ISO-8859-7" +}; diff --git a/src/nsSBCSGroupProber.cpp b/src/nsSBCSGroupProber.cpp index e150a50..58efecd 100644 --- a/src/nsSBCSGroupProber.cpp +++ b/src/nsSBCSGroupProber.cpp @@ -53,8 +53,8 @@ nsSBCSGroupProber::nsSBCSGroupProber() mProbers[4] = new nsSingleByteCharSetProber(&Ibm866RussianModel); mProbers[5] = new nsSingleByteCharSetProber(&Ibm855RussianModel); - mProbers[6] = new nsSingleByteCharSetProber(&Latin7GreekModel); - mProbers[7] = new nsSingleByteCharSetProber(&Win1253GreekModel); + mProbers[6] = new nsSingleByteCharSetProber(&Iso_8859_7GreekModel); + mProbers[7] = new nsSingleByteCharSetProber(&Windows_1253GreekModel); mProbers[8] = new nsSingleByteCharSetProber(&Latin5BulgarianModel); mProbers[9] = new nsSingleByteCharSetProber(&Win1251BulgarianModel); diff --git a/src/nsSBCharSetProber.h b/src/nsSBCharSetProber.h index 4c815cf..07da6ce 100644 --- a/src/nsSBCharSetProber.h +++ b/src/nsSBCharSetProber.h @@ -130,8 +130,8 @@ extern const SequenceModel MacCyrillicRussianModel; extern const SequenceModel Ibm866RussianModel; extern const SequenceModel Ibm855RussianModel; -extern const SequenceModel Latin7GreekModel; -extern const SequenceModel Win1253GreekModel; +extern const SequenceModel Iso_8859_7GreekModel; +extern const SequenceModel Windows_1253GreekModel; extern const SequenceModel Latin5BulgarianModel; extern const SequenceModel Win1251BulgarianModel; diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 8588fa1..a36a739 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -34,8 +34,7 @@ foreach(dir ${dirs}) # enough). We will have to take a closer look and fix these, but # there is no need to break the whole `make test` right now, # which may make actual regressions harder to notice. 
- if ("${lang}:${charset}" STREQUAL "el:windows-1253" OR - "${lang}:${charset}" STREQUAL "ja:utf-16le" OR + if ("${lang}:${charset}" STREQUAL "ja:utf-16le" OR "${lang}:${charset}" STREQUAL "ja:utf-16be" OR "${lang}:${charset}" STREQUAL "es:iso-8859-15" OR "${lang}:${charset}" STREQUAL "he:iso-8859-8")