mirror of
https://gitlab.freedesktop.org/uchardet/uchardet.git
synced 2025-12-06 16:56:40 +08:00
LangModels: added support for Irish Gaelic.
Encodings: ISO-8859-1, ISO-8859-9, ISO-8859-15 and WINDOWS-1252. Test text from: https://ga.wikipedia.org/wiki/Gluais_théarmaí_seoltóireachta
This commit is contained in:
parent
a3a271dfd5
commit
a7525b404d
@ -75,6 +75,11 @@ Techniques used by universalchardet are described at http://www.mozilla.org/proj
|
|||||||
* Hungarian:
|
* Hungarian:
|
||||||
* ISO-8859-2
|
* ISO-8859-2
|
||||||
* WINDOWS-1250
|
* WINDOWS-1250
|
||||||
|
* Irish Gaelic
|
||||||
|
* ISO-8859-1
|
||||||
|
* ISO-8859-9
|
||||||
|
* ISO-8859-15
|
||||||
|
* WINDOWS-1252
|
||||||
* Italian
|
* Italian
|
||||||
* ISO-8859-1
|
* ISO-8859-1
|
||||||
* ISO-8859-3
|
* ISO-8859-3
|
||||||
|
|||||||
156
script/BuildLangModelLogs/LangIrishModel.log
Normal file
156
script/BuildLangModelLogs/LangIrishModel.log
Normal file
@ -0,0 +1,156 @@
|
|||||||
|
= Logs of language model for Irish (ga) =
|
||||||
|
|
||||||
|
- Generated by BuildLangModel.py
|
||||||
|
- Started: 2016-09-27 00:31:16.489602
|
||||||
|
- Maximum depth: 5
|
||||||
|
- Max number of pages: 100
|
||||||
|
|
||||||
|
== Parsed pages ==
|
||||||
|
|
||||||
|
Tracy Caldwell Dyson (revision 812158)
|
||||||
|
14 Lúnasa (revision 716575)
|
||||||
|
1969 (revision 810361)
|
||||||
|
California (revision 790976)
|
||||||
|
Ceimic (revision 759983)
|
||||||
|
Ceimic fhisiciúil (revision 656896)
|
||||||
|
NASA (revision 806394)
|
||||||
|
Rúisis (revision 771746)
|
||||||
|
SAM (revision 807668)
|
||||||
|
Spáinnis (revision 812323)
|
||||||
|
Stáisiún Idirnáisiúnta Spáis (revision 806394)
|
||||||
|
Tointeálaí spáis (revision 761309)
|
||||||
|
10 Lúnasa (revision 649045)
|
||||||
|
11 Lúnasa (revision 776455)
|
||||||
|
12 Lúnasa (revision 716531)
|
||||||
|
13 Lúnasa (revision 716546)
|
||||||
|
1598 (revision 703178)
|
||||||
|
15 Lúnasa (revision 776986)
|
||||||
|
16 Lúnasa (revision 648836)
|
||||||
|
1740 (revision 791225)
|
||||||
|
1771 (revision 776762)
|
||||||
|
17 Lúnasa (revision 777131)
|
||||||
|
1823 (revision 791774)
|
||||||
|
1832 (revision 794492)
|
||||||
|
1898 (revision 805176)
|
||||||
|
18 Lúnasa (revision 777242)
|
||||||
|
1911 (revision 801932)
|
||||||
|
1956 (revision 797081)
|
||||||
|
1962 (revision 801511)
|
||||||
|
1966 (revision 807415)
|
||||||
|
19 Lúnasa (revision 648524)
|
||||||
|
1 Lúnasa (revision 647726)
|
||||||
|
2001 (revision 801012)
|
||||||
|
2004 (revision 795759)
|
||||||
|
2016 (revision 812091)
|
||||||
|
20 Lúnasa (revision 777924)
|
||||||
|
21 Lúnasa (revision 647805)
|
||||||
|
22 Lúnasa (revision 778960)
|
||||||
|
23 Lúnasa (revision 778453)
|
||||||
|
24 Lúnasa (revision 778495)
|
||||||
|
25 Lúnasa (revision 778551)
|
||||||
|
26 Lúnasa (revision 649051)
|
||||||
|
27 Lúnasa (revision 778763)
|
||||||
|
28 Lúnasa (revision 778813)
|
||||||
|
29 Lúnasa (revision 778959)
|
||||||
|
2 Lúnasa (revision 774393)
|
||||||
|
30 Lúnasa (revision 648308)
|
||||||
|
31 Lúnasa (revision 649053)
|
||||||
|
3 Lúnasa (revision 647811)
|
||||||
|
4 Lúnasa (revision 786284)
|
||||||
|
5 Lúnasa (revision 776845)
|
||||||
|
6 Lúnasa (revision 647834)
|
||||||
|
7 Lúnasa (revision 775859)
|
||||||
|
8 Lúnasa (revision 648745)
|
||||||
|
9 Lúnasa (revision 648522)
|
||||||
|
AK Parti (revision 792248)
|
||||||
|
An Phacastáin (revision 759339)
|
||||||
|
An Tuirc (revision 811970)
|
||||||
|
Aoine (revision 717430)
|
||||||
|
Bertolt Brecht (revision 800584)
|
||||||
|
Czesław Miłosz (revision 780306)
|
||||||
|
Céadaoin (revision 717606)
|
||||||
|
Dan Boyle (revision 797926)
|
||||||
|
Domhnach (revision 717663)
|
||||||
|
Déardaoin (revision 647860)
|
||||||
|
Féilire (revision 648837)
|
||||||
|
Halle Berry (revision 759955)
|
||||||
|
Henry Bagenal (revision 716575)
|
||||||
|
Iúil (revision 647071)
|
||||||
|
Luan (revision 717791)
|
||||||
|
Lúnasa (revision 810265)
|
||||||
|
Meán Fómhair (revision 779166)
|
||||||
|
Pápa Pius VII (revision 758126)
|
||||||
|
Satharn (revision 784525)
|
||||||
|
Walter Scott (revision 759029)
|
||||||
|
Áth Buí (revision 716575)
|
||||||
|
11 Márta (revision 716519)
|
||||||
|
17 Márta (revision 798614)
|
||||||
|
1882 (revision 801198)
|
||||||
|
1886 (revision 776624)
|
||||||
|
1890 (revision 801200)
|
||||||
|
1891 (revision 796677)
|
||||||
|
1903 (revision 812849)
|
||||||
|
1922 (revision 801227)
|
||||||
|
1930í (revision 740221)
|
||||||
|
1940í (revision 740219)
|
||||||
|
1950í (revision 740217)
|
||||||
|
1960í (revision 772724)
|
||||||
|
1967 (revision 796983)
|
||||||
|
1968 (revision 810926)
|
||||||
|
1970 (revision 812852)
|
||||||
|
1970í (revision 740213)
|
||||||
|
1971 (revision 809746)
|
||||||
|
1972 (revision 789490)
|
||||||
|
1980í (revision 740211)
|
||||||
|
1990í (revision 740208)
|
||||||
|
19ú haois (revision 739964)
|
||||||
|
1 Bealtaine (revision 647679)
|
||||||
|
|
||||||
|
== End of Parsed pages ==
|
||||||
|
|
||||||
|
- Wikipedia parsing ended at: 2016-09-27 00:33:40.157338
|
||||||
|
|
||||||
|
44 characters appeared 183561 times.
|
||||||
|
|
||||||
|
First 31 characters:
|
||||||
|
[ 0] Char a: 15.192769705983297 %
|
||||||
|
[ 1] Char i: 10.534372769814938 %
|
||||||
|
[ 2] Char n: 8.106297089250985 %
|
||||||
|
[ 3] Char h: 7.243368689427493 %
|
||||||
|
[ 4] Char r: 6.442544985045844 %
|
||||||
|
[ 5] Char e: 6.198484427520007 %
|
||||||
|
[ 6] Char s: 5.622654049607488 %
|
||||||
|
[ 7] Char t: 4.776068990689743 %
|
||||||
|
[ 8] Char c: 4.543448771797931 %
|
||||||
|
[ 9] Char l: 4.1953356105054995 %
|
||||||
|
[10] Char o: 3.9469168287381304 %
|
||||||
|
[11] Char d: 3.2169142682813887 %
|
||||||
|
[12] Char g: 2.811054635788648 %
|
||||||
|
[13] Char m: 2.6269196615838877 %
|
||||||
|
[14] Char á: 2.2749930540801153 %
|
||||||
|
[15] Char u: 2.1932763495513754 %
|
||||||
|
[16] Char b: 2.0478206154902185 %
|
||||||
|
[17] Char í: 1.6599386579938005 %
|
||||||
|
[18] Char é: 1.2829522611012143 %
|
||||||
|
[19] Char f: 1.1494816437042727 %
|
||||||
|
[20] Char ú: 1.0525111543301682 %
|
||||||
|
[21] Char p: 0.9059658642086281 %
|
||||||
|
[22] Char ó: 0.8890777452726886 %
|
||||||
|
[23] Char v: 0.2522322279787101 %
|
||||||
|
[24] Char y: 0.23479933101257894 %
|
||||||
|
[25] Char k: 0.18195586208399386 %
|
||||||
|
[26] Char w: 0.1688811893593955 %
|
||||||
|
[27] Char j: 0.09697048937410452 %
|
||||||
|
[28] Char z: 0.07735848028720697 %
|
||||||
|
[29] Char x: 0.0343210159020707 %
|
||||||
|
[30] Char q: 0.010895560603831969 %
|
||||||
|
|
||||||
|
The first 31 characters have an accumulated ratio of 0.9997058198636966.
|
||||||
|
|
||||||
|
701 sequences found.
|
||||||
|
|
||||||
|
First 512 (typical positive ratio): 0.9974076651249096
|
||||||
|
Next 512 (512-1024): 5.447780301915984e-06
|
||||||
|
Rest: -2.7755575615628914e-17
|
||||||
|
|
||||||
|
- Processing end: 2016-09-27 00:33:40.258886
|
||||||
60
script/langs/ga.py
Normal file
60
script/langs/ga.py
Normal file
@ -0,0 +1,60 @@
|
|||||||
|
#!/bin/python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
# ##### BEGIN LICENSE BLOCK #####
|
||||||
|
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||||
|
#
|
||||||
|
# The contents of this file are subject to the Mozilla Public License Version
|
||||||
|
# 1.1 (the "License"); you may not use this file except in compliance with
|
||||||
|
# the License. You may obtain a copy of the License at
|
||||||
|
# http://www.mozilla.org/MPL/
|
||||||
|
#
|
||||||
|
# Software distributed under the License is distributed on an "AS IS" basis,
|
||||||
|
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||||
|
# for the specific language governing rights and limitations under the
|
||||||
|
# License.
|
||||||
|
#
|
||||||
|
# The Original Code is Mozilla Universal charset detector code.
|
||||||
|
#
|
||||||
|
# The Initial Developer of the Original Code is
|
||||||
|
# Netscape Communications Corporation.
|
||||||
|
# Portions created by the Initial Developer are Copyright (C) 2001
|
||||||
|
# the Initial Developer. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Contributor(s):
|
||||||
|
# Jehan <jehan@girinstud.io>
|
||||||
|
#
|
||||||
|
# Alternatively, the contents of this file may be used under the terms of
|
||||||
|
# either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||||
|
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||||
|
# in which case the provisions of the GPL or the LGPL are applicable instead
|
||||||
|
# of those above. If you wish to allow use of your version of this file only
|
||||||
|
# under the terms of either the GPL or the LGPL, and not to allow others to
|
||||||
|
# use your version of this file under the terms of the MPL, indicate your
|
||||||
|
# decision by deleting the provisions above and replace them with the notice
|
||||||
|
# and other provisions required by the GPL or the LGPL. If you do not delete
|
||||||
|
# the provisions above, a recipient may use your version of this file under
|
||||||
|
# the terms of any one of the MPL, the GPL or the LGPL.
|
||||||
|
#
|
||||||
|
# ##### END LICENSE BLOCK #####
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
## Mandatory Properties ##
|
||||||
|
|
||||||
|
name = 'Irish'
|
||||||
|
code = 'ga'
|
||||||
|
# ASCII characters are also used in Irish.
|
||||||
|
use_ascii = True
|
||||||
|
# The charsets we want to support and create data for.
|
||||||
|
charsets = ['ISO-8859-15', 'ISO-8859-1', 'ISO-8859-9', 'WINDOWS-1252']
|
||||||
|
|
||||||
|
## Optional Properties ##
|
||||||
|
|
||||||
|
# XXX: Irish Gaelic sometimes also uses the dotless 'i' but without any
|
||||||
|
# semantic difference from the dotted 'i'. Only for stylistic reasons.
|
||||||
|
# So I don't add it in the glyph list.
|
||||||
|
alphabet = 'áéíóú'
|
||||||
|
start_pages = ['Tracy Caldwell Dyson']
|
||||||
|
wikipedia_code = code
|
||||||
|
case_mapping = True
|
||||||
@ -20,6 +20,7 @@ set(
|
|||||||
LangModels/LangGreekModel.cpp
|
LangModels/LangGreekModel.cpp
|
||||||
LangModels/LangHungarianModel.cpp
|
LangModels/LangHungarianModel.cpp
|
||||||
LangModels/LangHebrewModel.cpp
|
LangModels/LangHebrewModel.cpp
|
||||||
|
LangModels/LangIrishModel.cpp
|
||||||
LangModels/LangItalianModel.cpp
|
LangModels/LangItalianModel.cpp
|
||||||
LangModels/LangLithuanianModel.cpp
|
LangModels/LangLithuanianModel.cpp
|
||||||
LangModels/LangLatvianModel.cpp
|
LangModels/LangLatvianModel.cpp
|
||||||
|
|||||||
230
src/LangModels/LangIrishModel.cpp
Normal file
230
src/LangModels/LangIrishModel.cpp
Normal file
@ -0,0 +1,230 @@
|
|||||||
|
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
|
||||||
|
/* ***** BEGIN LICENSE BLOCK *****
|
||||||
|
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||||
|
*
|
||||||
|
* The contents of this file are subject to the Mozilla Public License Version
|
||||||
|
* 1.1 (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
* http://www.mozilla.org/MPL/
|
||||||
|
*
|
||||||
|
* Software distributed under the License is distributed on an "AS IS" basis,
|
||||||
|
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||||
|
* for the specific language governing rights and limitations under the
|
||||||
|
* License.
|
||||||
|
*
|
||||||
|
* The Original Code is Mozilla Communicator client code.
|
||||||
|
*
|
||||||
|
* The Initial Developer of the Original Code is
|
||||||
|
* Netscape Communications Corporation.
|
||||||
|
* Portions created by the Initial Developer are Copyright (C) 1998
|
||||||
|
* the Initial Developer. All Rights Reserved.
|
||||||
|
*
|
||||||
|
* Contributor(s):
|
||||||
|
*
|
||||||
|
* Alternatively, the contents of this file may be used under the terms of
|
||||||
|
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||||
|
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||||
|
* in which case the provisions of the GPL or the LGPL are applicable instead
|
||||||
|
* of those above. If you wish to allow use of your version of this file only
|
||||||
|
* under the terms of either the GPL or the LGPL, and not to allow others to
|
||||||
|
* use your version of this file under the terms of the MPL, indicate your
|
||||||
|
* decision by deleting the provisions above and replace them with the notice
|
||||||
|
* and other provisions required by the GPL or the LGPL. If you do not delete
|
||||||
|
* the provisions above, a recipient may use your version of this file under
|
||||||
|
* the terms of any one of the MPL, the GPL or the LGPL.
|
||||||
|
*
|
||||||
|
* ***** END LICENSE BLOCK ***** */
|
||||||
|
|
||||||
|
#include "../nsSBCharSetProber.h"
|
||||||
|
|
||||||
|
/********* Language model for: Irish *********/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Generated by BuildLangModel.py
|
||||||
|
* On: 2016-09-27 00:33:40.158624
|
||||||
|
**/
|
||||||
|
|
||||||
|
/* Character Mapping Table:
|
||||||
|
* ILL: illegal character.
|
||||||
|
* CTR: control character specific to the charset.
|
||||||
|
* RET: carriage return or line feed.
|
||||||
|
* SYM: symbol (punctuation) that does not belong to word.
|
||||||
|
* NUM: 0 - 9.
|
||||||
|
*
|
||||||
|
* Other characters are ordered by probabilities
|
||||||
|
* (0 is the most common character in the language).
|
||||||
|
*
|
||||||
|
* Orders are generic to a language. So the codepoint with order X in
|
||||||
|
* CHARSET1 maps to the same character as the codepoint with the same
|
||||||
|
* order X in CHARSET2 for the same language.
|
||||||
|
* As such, it is possible to get missing orders. For instance, the
|
||||||
|
* ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1
|
||||||
|
* even though they are both used for French. Same for the euro sign.
|
||||||
|
*/
|
||||||
|
static const unsigned char Iso_8859_1_CharToOrderMap[] =
|
||||||
|
{
|
||||||
|
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
|
||||||
|
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
|
||||||
|
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
|
||||||
|
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
|
||||||
|
SYM, 0, 16, 8, 11, 5, 19, 12, 3, 1, 27, 25, 9, 13, 2, 10, /* 4X */
|
||||||
|
21, 30, 4, 6, 7, 15, 23, 26, 29, 24, 28,SYM,SYM,SYM,SYM,SYM, /* 5X */
|
||||||
|
SYM, 0, 16, 8, 11, 5, 19, 12, 3, 1, 27, 25, 9, 13, 2, 10, /* 6X */
|
||||||
|
21, 30, 4, 6, 7, 15, 23, 26, 29, 24, 28,SYM,SYM,SYM,SYM,CTR, /* 7X */
|
||||||
|
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */
|
||||||
|
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */
|
||||||
|
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */
|
||||||
|
SYM,SYM,SYM,SYM,SYM, 44,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */
|
||||||
|
45, 14, 46, 47, 33, 48, 49, 39, 35, 18, 42, 37, 50, 17, 51, 40, /* CX */
|
||||||
|
52, 32, 43, 22, 53, 54, 38,SYM, 36, 55, 20, 56, 31, 57, 58, 59, /* DX */
|
||||||
|
60, 14, 61, 62, 33, 63, 64, 39, 35, 18, 42, 37, 65, 17, 66, 40, /* EX */
|
||||||
|
67, 32, 43, 22, 68, 69, 38,SYM, 36, 70, 20, 71, 31, 72, 73, 74, /* FX */
|
||||||
|
};
|
||||||
|
/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
|
||||||
|
|
||||||
|
static const unsigned char Windows_1252_CharToOrderMap[] =
|
||||||
|
{
|
||||||
|
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
|
||||||
|
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
|
||||||
|
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
|
||||||
|
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
|
||||||
|
SYM, 0, 16, 8, 11, 5, 19, 12, 3, 1, 27, 25, 9, 13, 2, 10, /* 4X */
|
||||||
|
21, 30, 4, 6, 7, 15, 23, 26, 29, 24, 28,SYM,SYM,SYM,SYM,SYM, /* 5X */
|
||||||
|
SYM, 0, 16, 8, 11, 5, 19, 12, 3, 1, 27, 25, 9, 13, 2, 10, /* 6X */
|
||||||
|
21, 30, 4, 6, 7, 15, 23, 26, 29, 24, 28,SYM,SYM,SYM,SYM,CTR, /* 7X */
|
||||||
|
SYM,ILL,SYM, 75,SYM,SYM,SYM,SYM,SYM,SYM, 34,SYM, 76,ILL, 77,ILL, /* 8X */
|
||||||
|
ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 34,SYM, 78,ILL, 79, 80, /* 9X */
|
||||||
|
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */
|
||||||
|
SYM,SYM,SYM,SYM,SYM, 81,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */
|
||||||
|
82, 14, 83, 84, 33, 85, 86, 39, 35, 18, 42, 37, 87, 17, 88, 40, /* CX */
|
||||||
|
89, 32, 43, 22, 90, 91, 38,SYM, 36, 92, 20, 93, 31, 94, 95, 96, /* DX */
|
||||||
|
97, 14, 98, 99, 33,100,101, 39, 35, 18, 42, 37,102, 17,103, 40, /* EX */
|
||||||
|
104, 32, 43, 22,105,106, 38,SYM, 36,107, 20,108, 31,109,110,111, /* FX */
|
||||||
|
};
|
||||||
|
/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
|
||||||
|
|
||||||
|
static const unsigned char Iso_8859_15_CharToOrderMap[] =
|
||||||
|
{
|
||||||
|
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
|
||||||
|
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
|
||||||
|
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
|
||||||
|
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
|
||||||
|
SYM, 0, 16, 8, 11, 5, 19, 12, 3, 1, 27, 25, 9, 13, 2, 10, /* 4X */
|
||||||
|
21, 30, 4, 6, 7, 15, 23, 26, 29, 24, 28,SYM,SYM,SYM,SYM,SYM, /* 5X */
|
||||||
|
SYM, 0, 16, 8, 11, 5, 19, 12, 3, 1, 27, 25, 9, 13, 2, 10, /* 6X */
|
||||||
|
21, 30, 4, 6, 7, 15, 23, 26, 29, 24, 28,SYM,SYM,SYM,SYM,CTR, /* 7X */
|
||||||
|
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */
|
||||||
|
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */
|
||||||
|
SYM,SYM,SYM,SYM,SYM,SYM, 34,SYM, 34,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */
|
||||||
|
SYM,SYM,SYM,SYM,112,113,SYM,SYM,114,SYM,SYM,SYM,115,116,117,SYM, /* BX */
|
||||||
|
118, 14,119,120, 33,121,122, 39, 35, 18, 42, 37,123, 17,124, 40, /* CX */
|
||||||
|
125, 32, 43, 22,126,127, 38,SYM, 36,128, 20,129, 31,130,131,132, /* DX */
|
||||||
|
133, 14,134,135, 33,136,137, 39, 35, 18, 42, 37,138, 17,139, 40, /* EX */
|
||||||
|
140, 32, 43, 22,141,142, 38,SYM, 36,143, 20,144, 31,145,146,147, /* FX */
|
||||||
|
};
|
||||||
|
/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
|
||||||
|
|
||||||
|
static const unsigned char Iso_8859_9_CharToOrderMap[] =
|
||||||
|
{
|
||||||
|
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
|
||||||
|
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
|
||||||
|
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
|
||||||
|
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
|
||||||
|
SYM, 0, 16, 8, 11, 5, 19, 12, 3, 1, 27, 25, 9, 13, 2, 10, /* 4X */
|
||||||
|
21, 30, 4, 6, 7, 15, 23, 26, 29, 24, 28,SYM,SYM,SYM,SYM,SYM, /* 5X */
|
||||||
|
SYM, 0, 16, 8, 11, 5, 19, 12, 3, 1, 27, 25, 9, 13, 2, 10, /* 6X */
|
||||||
|
21, 30, 4, 6, 7, 15, 23, 26, 29, 24, 28,SYM,SYM,SYM,SYM,CTR, /* 7X */
|
||||||
|
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */
|
||||||
|
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */
|
||||||
|
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */
|
||||||
|
SYM,SYM,SYM,SYM,SYM,148,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */
|
||||||
|
149, 14,150,151, 33,152,153, 39, 35, 18, 42, 37,154, 17,155, 40, /* CX */
|
||||||
|
156, 32, 43, 22,157,158, 38,SYM, 36,159, 20,160, 31,161,162,163, /* DX */
|
||||||
|
164, 14,165,166, 33,167,168, 39, 35, 18, 42, 37,169, 17,170, 40, /* EX */
|
||||||
|
171, 32, 43, 22,172,173, 38,SYM, 36,174, 20,175, 31, 41,176,177, /* FX */
|
||||||
|
};
|
||||||
|
/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
|
||||||
|
|
||||||
|
|
||||||
|
/* Model Table:
|
||||||
|
* Total sequences: 701
|
||||||
|
* First 512 sequences: 0.9974076651249096
|
||||||
|
* Next 512 sequences (512-1024): 0.0025923348750903907
|
||||||
|
* Rest: -2.7755575615628914e-17
|
||||||
|
* Negative sequences: TODO
|
||||||
|
*/
|
||||||
|
static const PRUint8 IrishLangModel[] =
|
||||||
|
{
|
||||||
|
3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,0,3,0,3,0,3,3,3,3,2,3,3,2,
|
||||||
|
3,3,3,2,3,3,3,3,3,3,3,3,3,3,2,3,3,0,2,3,3,3,3,3,3,3,0,3,3,3,0,
|
||||||
|
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,0,2,3,0,2,
|
||||||
|
3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,2,0,0,0,0,
|
||||||
|
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,2,3,2,
|
||||||
|
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,0,3,0,3,3,3,3,3,3,2,3,3,0,
|
||||||
|
3,3,3,3,3,3,3,3,3,3,3,2,2,3,3,3,3,3,3,3,3,3,3,2,3,3,3,0,3,0,0,
|
||||||
|
3,3,3,3,3,3,3,3,3,3,3,0,2,3,3,3,3,3,3,2,3,0,3,3,3,3,2,2,3,0,0,
|
||||||
|
3,3,3,3,3,3,3,3,3,3,3,2,0,3,3,3,2,3,3,3,3,2,3,0,3,3,2,0,3,0,2,
|
||||||
|
3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,0,3,0,0,
|
||||||
|
2,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,0,3,3,3,0,3,3,3,3,3,2,3,2,
|
||||||
|
3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,2,3,3,3,3,0,3,2,3,2,3,2,2,0,0,
|
||||||
|
3,3,3,3,3,3,3,3,3,3,3,2,2,3,3,3,2,3,3,2,3,0,3,0,2,0,2,0,0,0,0,
|
||||||
|
3,3,3,3,3,3,3,3,3,3,3,2,0,3,3,3,3,3,3,2,3,3,3,0,3,0,0,0,2,2,0,
|
||||||
|
0,3,3,0,3,2,3,3,3,3,0,3,3,3,0,0,3,3,0,3,0,3,0,2,0,0,0,0,2,0,0,
|
||||||
|
3,3,3,2,3,3,3,3,3,3,2,3,3,3,3,0,3,3,2,2,0,3,0,2,2,2,0,2,3,2,0,
|
||||||
|
3,3,3,3,3,3,3,2,2,3,3,2,0,0,3,3,3,3,3,2,3,3,3,0,2,0,0,2,0,0,0,
|
||||||
|
2,0,3,0,3,0,3,3,3,3,3,3,3,2,0,0,3,0,0,0,3,0,0,2,0,0,0,0,0,0,0,
|
||||||
|
3,3,3,0,2,2,3,3,0,2,3,2,0,2,0,0,2,0,0,2,2,2,0,2,0,0,0,0,0,0,0,
|
||||||
|
3,3,0,3,3,3,2,3,2,3,3,0,3,2,3,3,2,3,3,3,0,0,3,2,2,0,0,0,0,0,0,
|
||||||
|
2,3,3,0,3,0,3,3,3,3,0,3,2,2,0,0,3,0,0,0,0,3,0,0,0,0,0,0,0,0,0,
|
||||||
|
3,3,0,3,3,3,3,3,2,3,3,0,0,2,3,3,0,3,3,0,2,3,3,0,2,0,0,0,0,0,0,
|
||||||
|
0,3,3,0,3,0,3,3,3,3,0,3,3,3,0,0,3,0,0,2,3,3,0,2,0,0,0,0,2,0,0,
|
||||||
|
3,3,2,0,3,3,3,2,0,2,3,0,2,0,3,2,0,3,3,0,0,0,3,2,2,0,0,0,0,0,0,
|
||||||
|
3,0,3,0,2,3,3,2,3,3,3,2,0,3,0,3,2,0,0,2,0,0,0,0,2,0,3,0,0,0,0,
|
||||||
|
3,3,3,3,3,3,3,0,0,3,3,0,0,2,2,3,2,0,2,0,0,2,0,2,3,2,2,0,0,0,0,
|
||||||
|
3,3,3,3,3,3,3,2,0,2,3,2,0,2,0,2,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,
|
||||||
|
3,3,2,0,2,3,0,0,0,0,3,0,0,0,0,3,0,0,0,0,0,0,2,0,0,2,0,0,0,0,0,
|
||||||
|
3,3,2,3,0,3,2,0,0,0,3,2,2,2,0,2,2,0,0,0,0,0,0,0,2,0,0,0,2,0,2,
|
||||||
|
3,3,0,0,0,2,0,0,0,0,2,0,0,0,0,0,0,0,0,3,0,3,0,2,2,0,0,0,0,0,0,
|
||||||
|
2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
const SequenceModel Iso_8859_1IrishModel =
|
||||||
|
{
|
||||||
|
Iso_8859_1_CharToOrderMap,
|
||||||
|
IrishLangModel,
|
||||||
|
31,
|
||||||
|
(float)0.9974076651249096,
|
||||||
|
PR_TRUE,
|
||||||
|
"ISO-8859-1"
|
||||||
|
};
|
||||||
|
|
||||||
|
const SequenceModel Windows_1252IrishModel =
|
||||||
|
{
|
||||||
|
Windows_1252_CharToOrderMap,
|
||||||
|
IrishLangModel,
|
||||||
|
31,
|
||||||
|
(float)0.9974076651249096,
|
||||||
|
PR_TRUE,
|
||||||
|
"WINDOWS-1252"
|
||||||
|
};
|
||||||
|
|
||||||
|
const SequenceModel Iso_8859_15IrishModel =
|
||||||
|
{
|
||||||
|
Iso_8859_15_CharToOrderMap,
|
||||||
|
IrishLangModel,
|
||||||
|
31,
|
||||||
|
(float)0.9974076651249096,
|
||||||
|
PR_TRUE,
|
||||||
|
"ISO-8859-15"
|
||||||
|
};
|
||||||
|
|
||||||
|
const SequenceModel Iso_8859_9IrishModel =
|
||||||
|
{
|
||||||
|
Iso_8859_9_CharToOrderMap,
|
||||||
|
IrishLangModel,
|
||||||
|
31,
|
||||||
|
(float)0.9974076651249096,
|
||||||
|
PR_TRUE,
|
||||||
|
"ISO-8859-9"
|
||||||
|
};
|
||||||
@ -169,6 +169,11 @@ nsSBCSGroupProber::nsSBCSGroupProber()
|
|||||||
mProbers[79] = new nsSingleByteCharSetProber(&Iso_8859_13EstonianModel);
|
mProbers[79] = new nsSingleByteCharSetProber(&Iso_8859_13EstonianModel);
|
||||||
mProbers[80] = new nsSingleByteCharSetProber(&Iso_8859_15EstonianModel);
|
mProbers[80] = new nsSingleByteCharSetProber(&Iso_8859_15EstonianModel);
|
||||||
|
|
||||||
|
mProbers[81] = new nsSingleByteCharSetProber(&Iso_8859_1IrishModel);
|
||||||
|
mProbers[82] = new nsSingleByteCharSetProber(&Iso_8859_9IrishModel);
|
||||||
|
mProbers[83] = new nsSingleByteCharSetProber(&Iso_8859_15IrishModel);
|
||||||
|
mProbers[84] = new nsSingleByteCharSetProber(&Windows_1252IrishModel);
|
||||||
|
|
||||||
Reset();
|
Reset();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -40,7 +40,7 @@
|
|||||||
#define nsSBCSGroupProber_h__
|
#define nsSBCSGroupProber_h__
|
||||||
|
|
||||||
|
|
||||||
#define NUM_OF_SBCS_PROBERS 81
|
#define NUM_OF_SBCS_PROBERS 85
|
||||||
|
|
||||||
class nsCharSetProber;
|
class nsCharSetProber;
|
||||||
class nsSBCSGroupProber: public nsCharSetProber {
|
class nsSBCSGroupProber: public nsCharSetProber {
|
||||||
|
|||||||
@ -230,5 +230,10 @@ extern const SequenceModel Iso_8859_4EstonianModel;
|
|||||||
extern const SequenceModel Iso_8859_13EstonianModel;
|
extern const SequenceModel Iso_8859_13EstonianModel;
|
||||||
extern const SequenceModel Iso_8859_15EstonianModel;
|
extern const SequenceModel Iso_8859_15EstonianModel;
|
||||||
|
|
||||||
|
extern const SequenceModel Iso_8859_15IrishModel;
|
||||||
|
extern const SequenceModel Iso_8859_9IrishModel;
|
||||||
|
extern const SequenceModel Iso_8859_1IrishModel;
|
||||||
|
extern const SequenceModel Windows_1252IrishModel;
|
||||||
|
|
||||||
#endif /* nsSingleByteCharSetProber_h__ */
|
#endif /* nsSingleByteCharSetProber_h__ */
|
||||||
|
|
||||||
|
|||||||
6
test/ga/iso-8859-1.txt
Normal file
6
test/ga/iso-8859-1.txt
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
Ag seo téarmaí seoltóireachta a bhaineann le longa adhmaid agus le báid.
|
||||||
|
|
||||||
|
Ní bhíodh de cheangal idir Éire agus tíortha eile ach na longa, agus tá Éire
|
||||||
|
féin lán de lochanna agus d'aibhneacha. Fágann seo go bhfuil an teanga breac le
|
||||||
|
téarmaíocht seoltóireachta agus loingseoireachta agus cuid di tugtha isteach ón
|
||||||
|
Lochlainnis agus ón mBéarla trí lonnaitheoirí ón iasacht.
|
||||||
6
test/ga/utf-8.txt
Normal file
6
test/ga/utf-8.txt
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
Ag seo téarmaí seoltóireachta a bhaineann le longa adhmaid agus le báid.
|
||||||
|
|
||||||
|
Ní bhíodh de cheangal idir Éire agus tíortha eile ach na longa, agus tá Éire
|
||||||
|
féin lán de lochanna agus d’aibhneacha. Fágann seo go bhfuil an teanga breac le
|
||||||
|
téarmaíocht seoltóireachta agus loingseoireachta agus cuid di tugtha isteach ón
|
||||||
|
Lochlainnis agus ón mBéarla trí lonnaitheoirí ón iasacht.
|
||||||
6
test/ga/windows-1252.txt
Normal file
6
test/ga/windows-1252.txt
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
Ag seo téarmaí seoltóireachta a bhaineann le longa adhmaid agus le báid.
|
||||||
|
|
||||||
|
Ní bhíodh de cheangal idir Éire agus tíortha eile ach na longa, agus tá Éire
|
||||||
|
féin lán de lochanna agus d’aibhneacha. Fágann seo go bhfuil an teanga breac le
|
||||||
|
téarmaíocht seoltóireachta agus loingseoireachta agus cuid di tugtha isteach ón
|
||||||
|
Lochlainnis agus ón mBéarla trí lonnaitheoirí ón iasacht.
|
||||||
Loading…
x
Reference in New Issue
Block a user