From a7525b404d5f0fc6ec113024c865c39e110a0f4b Mon Sep 17 00:00:00 2001 From: Jehan Date: Tue, 27 Sep 2016 00:46:37 +0200 Subject: [PATCH] LangModels: added support for Irish Gaelic. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Encodings: ISO-8859-1, ISO-8859-9, ISO-8859-15 and WINDOWS-1252. Test text from: https://ga.wikipedia.org/wiki/Gluais_théarmaí_seoltóireachta --- README.md | 5 + script/BuildLangModelLogs/LangIrishModel.log | 156 +++++++++++++ script/langs/ga.py | 60 +++++ src/CMakeLists.txt | 1 + src/LangModels/LangIrishModel.cpp | 230 +++++++++++++++++++ src/nsSBCSGroupProber.cpp | 5 + src/nsSBCSGroupProber.h | 2 +- src/nsSBCharSetProber.h | 5 + test/ga/iso-8859-1.txt | 6 + test/ga/utf-8.txt | 6 + test/ga/windows-1252.txt | 6 + 11 files changed, 481 insertions(+), 1 deletion(-) create mode 100644 script/BuildLangModelLogs/LangIrishModel.log create mode 100644 script/langs/ga.py create mode 100644 src/LangModels/LangIrishModel.cpp create mode 100644 test/ga/iso-8859-1.txt create mode 100644 test/ga/utf-8.txt create mode 100644 test/ga/windows-1252.txt diff --git a/README.md b/README.md index 8aa7a7b..ba4d494 100644 --- a/README.md +++ b/README.md @@ -75,6 +75,11 @@ Techniques used by universalchardet are described at http://www.mozilla.org/proj * Hungarian: * ISO-8859-2 * WINDOWS-1250 + * Irish Gaelic + * ISO-8859-1 + * ISO-8859-9 + * ISO-8859-15 + * WINDOWS-1252 * Italian * ISO-8859-1 * ISO-8859-3 diff --git a/script/BuildLangModelLogs/LangIrishModel.log b/script/BuildLangModelLogs/LangIrishModel.log new file mode 100644 index 0000000..7bee9d8 --- /dev/null +++ b/script/BuildLangModelLogs/LangIrishModel.log @@ -0,0 +1,156 @@ += Logs of language model for Irish (ga) = + +- Generated by BuildLangModel.py +- Started: 2016-09-27 00:31:16.489602 +- Maximum depth: 5 +- Max number of pages: 100 + +== Parsed pages == + +Tracy Caldwell Dyson (revision 812158) +14 Lúnasa (revision 716575) +1969 (revision 810361) 
+California (revision 790976) +Ceimic (revision 759983) +Ceimic fhisiciúil (revision 656896) +NASA (revision 806394) +Rúisis (revision 771746) +SAM (revision 807668) +Spáinnis (revision 812323) +Stáisiún Idirnáisiúnta Spáis (revision 806394) +Tointeálaí spáis (revision 761309) +10 Lúnasa (revision 649045) +11 Lúnasa (revision 776455) +12 Lúnasa (revision 716531) +13 Lúnasa (revision 716546) +1598 (revision 703178) +15 Lúnasa (revision 776986) +16 Lúnasa (revision 648836) +1740 (revision 791225) +1771 (revision 776762) +17 Lúnasa (revision 777131) +1823 (revision 791774) +1832 (revision 794492) +1898 (revision 805176) +18 Lúnasa (revision 777242) +1911 (revision 801932) +1956 (revision 797081) +1962 (revision 801511) +1966 (revision 807415) +19 Lúnasa (revision 648524) +1 Lúnasa (revision 647726) +2001 (revision 801012) +2004 (revision 795759) +2016 (revision 812091) +20 Lúnasa (revision 777924) +21 Lúnasa (revision 647805) +22 Lúnasa (revision 778960) +23 Lúnasa (revision 778453) +24 Lúnasa (revision 778495) +25 Lúnasa (revision 778551) +26 Lúnasa (revision 649051) +27 Lúnasa (revision 778763) +28 Lúnasa (revision 778813) +29 Lúnasa (revision 778959) +2 Lúnasa (revision 774393) +30 Lúnasa (revision 648308) +31 Lúnasa (revision 649053) +3 Lúnasa (revision 647811) +4 Lúnasa (revision 786284) +5 Lúnasa (revision 776845) +6 Lúnasa (revision 647834) +7 Lúnasa (revision 775859) +8 Lúnasa (revision 648745) +9 Lúnasa (revision 648522) +AK Parti (revision 792248) +An Phacastáin (revision 759339) +An Tuirc (revision 811970) +Aoine (revision 717430) +Bertolt Brecht (revision 800584) +Czesław Miłosz (revision 780306) +Céadaoin (revision 717606) +Dan Boyle (revision 797926) +Domhnach (revision 717663) +Déardaoin (revision 647860) +Féilire (revision 648837) +Halle Berry (revision 759955) +Henry Bagenal (revision 716575) +Iúil (revision 647071) +Luan (revision 717791) +Lúnasa (revision 810265) +Meán Fómhair (revision 779166) +Pápa Pius VII (revision 758126) +Satharn (revision 
784525) +Walter Scott (revision 759029) +Áth Buí (revision 716575) +11 Márta (revision 716519) +17 Márta (revision 798614) +1882 (revision 801198) +1886 (revision 776624) +1890 (revision 801200) +1891 (revision 796677) +1903 (revision 812849) +1922 (revision 801227) +1930í (revision 740221) +1940í (revision 740219) +1950í (revision 740217) +1960í (revision 772724) +1967 (revision 796983) +1968 (revision 810926) +1970 (revision 812852) +1970í (revision 740213) +1971 (revision 809746) +1972 (revision 789490) +1980í (revision 740211) +1990í (revision 740208) +19ú haois (revision 739964) +1 Bealtaine (revision 647679) + +== End of Parsed pages == + +- Wikipedia parsing ended at: 2016-09-27 00:33:40.157338 + +44 characters appeared 183561 times. + +First 31 characters: +[ 0] Char a: 15.192769705983297 % +[ 1] Char i: 10.534372769814938 % +[ 2] Char n: 8.106297089250985 % +[ 3] Char h: 7.243368689427493 % +[ 4] Char r: 6.442544985045844 % +[ 5] Char e: 6.198484427520007 % +[ 6] Char s: 5.622654049607488 % +[ 7] Char t: 4.776068990689743 % +[ 8] Char c: 4.543448771797931 % +[ 9] Char l: 4.1953356105054995 % +[10] Char o: 3.9469168287381304 % +[11] Char d: 3.2169142682813887 % +[12] Char g: 2.811054635788648 % +[13] Char m: 2.6269196615838877 % +[14] Char á: 2.2749930540801153 % +[15] Char u: 2.1932763495513754 % +[16] Char b: 2.0478206154902185 % +[17] Char í: 1.6599386579938005 % +[18] Char é: 1.2829522611012143 % +[19] Char f: 1.1494816437042727 % +[20] Char ú: 1.0525111543301682 % +[21] Char p: 0.9059658642086281 % +[22] Char ó: 0.8890777452726886 % +[23] Char v: 0.2522322279787101 % +[24] Char y: 0.23479933101257894 % +[25] Char k: 0.18195586208399386 % +[26] Char w: 0.1688811893593955 % +[27] Char j: 0.09697048937410452 % +[28] Char z: 0.07735848028720697 % +[29] Char x: 0.0343210159020707 % +[30] Char q: 0.010895560603831969 % + +The first 31 characters have an accumulated ratio of 0.9997058198636966. + +701 sequences found. 
+ +First 512 (typical positive ratio): 0.9974076651249096 +Next 512 (512-1024): 5.447780301915984e-06 +Rest: -2.7755575615628914e-17 + +- Processing end: 2016-09-27 00:33:40.258886 diff --git a/script/langs/ga.py b/script/langs/ga.py new file mode 100644 index 0000000..5c2adb4 --- /dev/null +++ b/script/langs/ga.py @@ -0,0 +1,60 @@ +#!/bin/python3 +# -*- coding: utf-8 -*- + +# ##### BEGIN LICENSE BLOCK ##### +# Version: MPL 1.1/GPL 2.0/LGPL 2.1 +# +# The contents of this file are subject to the Mozilla Public License Version +# 1.1 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License +# for the specific language governing rights and limitations under the +# License. +# +# The Original Code is Mozilla Universal charset detector code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 2001 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Jehan +# +# Alternatively, the contents of this file may be used under the terms of +# either the GNU General Public License Version 2 or later (the "GPL"), or +# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +# in which case the provisions of the GPL or the LGPL are applicable instead +# of those above. If you wish to allow use of your version of this file only +# under the terms of either the GPL or the LGPL, and not to allow others to +# use your version of this file under the terms of the MPL, indicate your +# decision by deleting the provisions above and replace them with the notice +# and other provisions required by the GPL or the LGPL. 
If you do not delete +# the provisions above, a recipient may use your version of this file under +# the terms of any one of the MPL, the GPL or the LGPL. +# +# ##### END LICENSE BLOCK ##### + +import re + +## Mandatory Properties ## + +name = 'Irish' +code = 'ga' +# ASCII characters are also used in Irish. +use_ascii = True +# The charsets we want to support and create data for. +charsets = ['ISO-8859-15', 'ISO-8859-1', 'ISO-8859-9', 'WINDOWS-1252'] + +## Optional Properties ## + +# XXX: Irish Gaelic sometimes also uses the dotless 'i', but without any +# semantic difference from the dotted 'i'; it is purely stylistic. +# So I don't add it to the glyph list. +alphabet = 'áéíóú' +start_pages = ['Tracy Caldwell Dyson'] +wikipedia_code = code +case_mapping = True diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 1b4773e..74b3939 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -20,6 +20,7 @@ set( LangModels/LangGreekModel.cpp LangModels/LangHungarianModel.cpp LangModels/LangHebrewModel.cpp + LangModels/LangIrishModel.cpp LangModels/LangItalianModel.cpp LangModels/LangLithuanianModel.cpp LangModels/LangLatvianModel.cpp diff --git a/src/LangModels/LangIrishModel.cpp b/src/LangModels/LangIrishModel.cpp new file mode 100644 index 0000000..af3a16d --- /dev/null +++ b/src/LangModels/LangIrishModel.cpp @@ -0,0 +1,230 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied.
See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is Mozilla Communicator client code. + * + * The Initial Developer of the Original Code is + * Netscape Communications Corporation. + * Portions created by the Initial Developer are Copyright (C) 1998 + * the Initial Developer. All Rights Reserved. + * + * Contributor(s): + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. + * + * ***** END LICENSE BLOCK ***** */ + +#include "../nsSBCharSetProber.h" + +/********* Language model for: Irish *********/ + +/** + * Generated by BuildLangModel.py + * On: 2016-09-27 00:33:40.158624 + **/ + +/* Character Mapping Table: + * ILL: illegal character. + * CTR: control character specific to the charset. + * RET: carriage/return. + * SYM: symbol (punctuation) that does not belong to word. + * NUM: 0 - 9. + * + * Other characters are ordered by probabilities + * (0 is the most common character in the language). + * + * Orders are generic to a language. So the codepoint with order X in + * CHARSET1 maps to the same character as the codepoint with the same + * order X in CHARSET2 for the same language. 
+ * As such, it is possible to get missing order. For instance the + * ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 + * even though they are both used for French. Same for the euro sign. + */ +static const unsigned char Iso_8859_1_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 16, 8, 11, 5, 19, 12, 3, 1, 27, 25, 9, 13, 2, 10, /* 4X */ + 21, 30, 4, 6, 7, 15, 23, 26, 29, 24, 28,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 16, 8, 11, 5, 19, 12, 3, 1, 27, 25, 9, 13, 2, 10, /* 6X */ + 21, 30, 4, 6, 7, 15, 23, 26, 29, 24, 28,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM,SYM, 44,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ + 45, 14, 46, 47, 33, 48, 49, 39, 35, 18, 42, 37, 50, 17, 51, 40, /* CX */ + 52, 32, 43, 22, 53, 54, 38,SYM, 36, 55, 20, 56, 31, 57, 58, 59, /* DX */ + 60, 14, 61, 62, 33, 63, 64, 39, 35, 18, 42, 37, 65, 17, 66, 40, /* EX */ + 67, 32, 43, 22, 68, 69, 38,SYM, 36, 70, 20, 71, 31, 72, 73, 74, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Windows_1252_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 16, 8, 11, 5, 19, 12, 3, 1, 27, 25, 9, 13, 2, 10, /* 4X */ + 21, 30, 4, 6, 7, 15, 23, 26, 29, 
24, 28,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 16, 8, 11, 5, 19, 12, 3, 1, 27, 25, 9, 13, 2, 10, /* 6X */ + 21, 30, 4, 6, 7, 15, 23, 26, 29, 24, 28,SYM,SYM,SYM,SYM,CTR, /* 7X */ + SYM,ILL,SYM, 75,SYM,SYM,SYM,SYM,SYM,SYM, 34,SYM, 76,ILL, 77,ILL, /* 8X */ + ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 34,SYM, 78,ILL, 79, 80, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM,SYM, 81,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ + 82, 14, 83, 84, 33, 85, 86, 39, 35, 18, 42, 37, 87, 17, 88, 40, /* CX */ + 89, 32, 43, 22, 90, 91, 38,SYM, 36, 92, 20, 93, 31, 94, 95, 96, /* DX */ + 97, 14, 98, 99, 33,100,101, 39, 35, 18, 42, 37,102, 17,103, 40, /* EX */ + 104, 32, 43, 22,105,106, 38,SYM, 36,107, 20,108, 31,109,110,111, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Iso_8859_15_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 16, 8, 11, 5, 19, 12, 3, 1, 27, 25, 9, 13, 2, 10, /* 4X */ + 21, 30, 4, 6, 7, 15, 23, 26, 29, 24, 28,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 16, 8, 11, 5, 19, 12, 3, 1, 27, 25, 9, 13, 2, 10, /* 6X */ + 21, 30, 4, 6, 7, 15, 23, 26, 29, 24, 28,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM, 34,SYM, 34,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM,112,113,SYM,SYM,114,SYM,SYM,SYM,115,116,117,SYM, /* BX */ + 118, 14,119,120, 33,121,122, 39, 35, 18, 42, 37,123, 17,124, 40, /* CX */ + 125, 32, 43, 22,126,127, 38,SYM, 36,128, 20,129, 31,130,131,132, /* DX */ + 133, 14,134,135, 33,136,137, 39, 35, 18, 42, 
37,138, 17,139, 40, /* EX */ + 140, 32, 43, 22,141,142, 38,SYM, 36,143, 20,144, 31,145,146,147, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Iso_8859_9_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 16, 8, 11, 5, 19, 12, 3, 1, 27, 25, 9, 13, 2, 10, /* 4X */ + 21, 30, 4, 6, 7, 15, 23, 26, 29, 24, 28,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 16, 8, 11, 5, 19, 12, 3, 1, 27, 25, 9, 13, 2, 10, /* 6X */ + 21, 30, 4, 6, 7, 15, 23, 26, 29, 24, 28,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM,SYM,148,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ + 149, 14,150,151, 33,152,153, 39, 35, 18, 42, 37,154, 17,155, 40, /* CX */ + 156, 32, 43, 22,157,158, 38,SYM, 36,159, 20,160, 31,161,162,163, /* DX */ + 164, 14,165,166, 33,167,168, 39, 35, 18, 42, 37,169, 17,170, 40, /* EX */ + 171, 32, 43, 22,172,173, 38,SYM, 36,174, 20,175, 31, 41,176,177, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + + +/* Model Table: + * Total sequences: 701 + * First 512 sequences: 0.9974076651249096 + * Next 512 sequences (512-1024): 0.0025923348750903907 + * Rest: -2.7755575615628914e-17 + * Negative sequences: TODO + */ +static const PRUint8 IrishLangModel[] = +{ + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,0,3,0,3,0,3,3,3,3,2,3,3,2, + 3,3,3,2,3,3,3,3,3,3,3,3,3,3,2,3,3,0,2,3,3,3,3,3,3,3,0,3,3,3,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,0,2,3,0,2, + 3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,2,0,0,0,0, + 
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,2,3,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,0,3,0,3,3,3,3,3,3,2,3,3,0, + 3,3,3,3,3,3,3,3,3,3,3,2,2,3,3,3,3,3,3,3,3,3,3,2,3,3,3,0,3,0,0, + 3,3,3,3,3,3,3,3,3,3,3,0,2,3,3,3,3,3,3,2,3,0,3,3,3,3,2,2,3,0,0, + 3,3,3,3,3,3,3,3,3,3,3,2,0,3,3,3,2,3,3,3,3,2,3,0,3,3,2,0,3,0,2, + 3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,0,3,0,0, + 2,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,0,3,3,3,0,3,3,3,3,3,2,3,2, + 3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,2,3,3,3,3,0,3,2,3,2,3,2,2,0,0, + 3,3,3,3,3,3,3,3,3,3,3,2,2,3,3,3,2,3,3,2,3,0,3,0,2,0,2,0,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,2,0,3,3,3,3,3,3,2,3,3,3,0,3,0,0,0,2,2,0, + 0,3,3,0,3,2,3,3,3,3,0,3,3,3,0,0,3,3,0,3,0,3,0,2,0,0,0,0,2,0,0, + 3,3,3,2,3,3,3,3,3,3,2,3,3,3,3,0,3,3,2,2,0,3,0,2,2,2,0,2,3,2,0, + 3,3,3,3,3,3,3,2,2,3,3,2,0,0,3,3,3,3,3,2,3,3,3,0,2,0,0,2,0,0,0, + 2,0,3,0,3,0,3,3,3,3,3,3,3,2,0,0,3,0,0,0,3,0,0,2,0,0,0,0,0,0,0, + 3,3,3,0,2,2,3,3,0,2,3,2,0,2,0,0,2,0,0,2,2,2,0,2,0,0,0,0,0,0,0, + 3,3,0,3,3,3,2,3,2,3,3,0,3,2,3,3,2,3,3,3,0,0,3,2,2,0,0,0,0,0,0, + 2,3,3,0,3,0,3,3,3,3,0,3,2,2,0,0,3,0,0,0,0,3,0,0,0,0,0,0,0,0,0, + 3,3,0,3,3,3,3,3,2,3,3,0,0,2,3,3,0,3,3,0,2,3,3,0,2,0,0,0,0,0,0, + 0,3,3,0,3,0,3,3,3,3,0,3,3,3,0,0,3,0,0,2,3,3,0,2,0,0,0,0,2,0,0, + 3,3,2,0,3,3,3,2,0,2,3,0,2,0,3,2,0,3,3,0,0,0,3,2,2,0,0,0,0,0,0, + 3,0,3,0,2,3,3,2,3,3,3,2,0,3,0,3,2,0,0,2,0,0,0,0,2,0,3,0,0,0,0, + 3,3,3,3,3,3,3,0,0,3,3,0,0,2,2,3,2,0,2,0,0,2,0,2,3,2,2,0,0,0,0, + 3,3,3,3,3,3,3,2,0,2,3,2,0,2,0,2,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0, + 3,3,2,0,2,3,0,0,0,0,3,0,0,0,0,3,0,0,0,0,0,0,2,0,0,2,0,0,0,0,0, + 3,3,2,3,0,3,2,0,0,0,3,2,2,2,0,2,2,0,0,0,0,0,0,0,2,0,0,0,2,0,2, + 3,3,0,0,0,2,0,0,0,0,2,0,0,0,0,0,0,0,0,3,0,3,0,2,2,0,0,0,0,0,0, + 2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +}; + + +const SequenceModel Iso_8859_1IrishModel = +{ + Iso_8859_1_CharToOrderMap, + IrishLangModel, + 31, + (float)0.9974076651249096, + PR_TRUE, + "ISO-8859-1" +}; + +const SequenceModel Windows_1252IrishModel = +{ + 
Windows_1252_CharToOrderMap, + IrishLangModel, + 31, + (float)0.9974076651249096, + PR_TRUE, + "WINDOWS-1252" +}; + +const SequenceModel Iso_8859_15IrishModel = +{ + Iso_8859_15_CharToOrderMap, + IrishLangModel, + 31, + (float)0.9974076651249096, + PR_TRUE, + "ISO-8859-15" +}; + +const SequenceModel Iso_8859_9IrishModel = +{ + Iso_8859_9_CharToOrderMap, + IrishLangModel, + 31, + (float)0.9974076651249096, + PR_TRUE, + "ISO-8859-9" +}; diff --git a/src/nsSBCSGroupProber.cpp b/src/nsSBCSGroupProber.cpp index d77a9fc..037153b 100644 --- a/src/nsSBCSGroupProber.cpp +++ b/src/nsSBCSGroupProber.cpp @@ -169,6 +169,11 @@ nsSBCSGroupProber::nsSBCSGroupProber() mProbers[79] = new nsSingleByteCharSetProber(&Iso_8859_13EstonianModel); mProbers[80] = new nsSingleByteCharSetProber(&Iso_8859_15EstonianModel); + mProbers[81] = new nsSingleByteCharSetProber(&Iso_8859_1IrishModel); + mProbers[82] = new nsSingleByteCharSetProber(&Iso_8859_9IrishModel); + mProbers[83] = new nsSingleByteCharSetProber(&Iso_8859_15IrishModel); + mProbers[84] = new nsSingleByteCharSetProber(&Windows_1252IrishModel); + Reset(); } diff --git a/src/nsSBCSGroupProber.h b/src/nsSBCSGroupProber.h index 143e4cb..405e43c 100644 --- a/src/nsSBCSGroupProber.h +++ b/src/nsSBCSGroupProber.h @@ -40,7 +40,7 @@ #define nsSBCSGroupProber_h__ -#define NUM_OF_SBCS_PROBERS 81 +#define NUM_OF_SBCS_PROBERS 85 class nsCharSetProber; class nsSBCSGroupProber: public nsCharSetProber { diff --git a/src/nsSBCharSetProber.h b/src/nsSBCharSetProber.h index 863bac0..dc9ddd7 100644 --- a/src/nsSBCharSetProber.h +++ b/src/nsSBCharSetProber.h @@ -230,5 +230,10 @@ extern const SequenceModel Iso_8859_4EstonianModel; extern const SequenceModel Iso_8859_13EstonianModel; extern const SequenceModel Iso_8859_15EstonianModel; +extern const SequenceModel Iso_8859_15IrishModel; +extern const SequenceModel Iso_8859_9IrishModel; +extern const SequenceModel Iso_8859_1IrishModel; +extern const SequenceModel Windows_1252IrishModel; + #endif /* 
nsSingleByteCharSetProber_h__ */ diff --git a/test/ga/iso-8859-1.txt b/test/ga/iso-8859-1.txt new file mode 100644 index 0000000..f062a67 --- /dev/null +++ b/test/ga/iso-8859-1.txt @@ -0,0 +1,6 @@ +Ag seo tarma seoltireachta a bhaineann le longa adhmaid agus le bid. + +N bhodh de cheangal idir ire agus tortha eile ach na longa, agus t ire +fin ln de lochanna agus d'aibhneacha. Fgann seo go bhfuil an teanga breac le +tarmaocht seoltireachta agus loingseoireachta agus cuid di tugtha isteach n +Lochlainnis agus n mBarla tr lonnaitheoir n iasacht. diff --git a/test/ga/utf-8.txt b/test/ga/utf-8.txt new file mode 100644 index 0000000..33cc012 --- /dev/null +++ b/test/ga/utf-8.txt @@ -0,0 +1,6 @@ +Ag seo téarmaí seoltóireachta a bhaineann le longa adhmaid agus le báid. + +Ní bhíodh de cheangal idir Éire agus tíortha eile ach na longa, agus tá Éire +féin lán de lochanna agus d’aibhneacha. Fágann seo go bhfuil an teanga breac le +téarmaíocht seoltóireachta agus loingseoireachta agus cuid di tugtha isteach ón +Lochlainnis agus ón mBéarla trí lonnaitheoirí ón iasacht. diff --git a/test/ga/windows-1252.txt b/test/ga/windows-1252.txt new file mode 100644 index 0000000..1a97dae --- /dev/null +++ b/test/ga/windows-1252.txt @@ -0,0 +1,6 @@ +Ag seo tarma seoltireachta a bhaineann le longa adhmaid agus le bid. + +N bhodh de cheangal idir ire agus tortha eile ach na longa, agus t ire +fin ln de lochanna agus daibhneacha. Fgann seo go bhfuil an teanga breac le +tarmaocht seoltireachta agus loingseoireachta agus cuid di tugtha isteach n +Lochlainnis agus n mBarla tr lonnaitheoir n iasacht.