diff --git a/README.md b/README.md index 3a5897e..d427c29 100644 --- a/README.md +++ b/README.md @@ -42,6 +42,8 @@ Techniques used by universalchardet are described at http://www.mozilla.org/proj * WINDOWS-1255 * Thai * TIS-620 + * Esperanto + * ISO-8859-3 * French * ISO-8859-1 * ISO-8859-15 diff --git a/script/BuildLangModelLogs/LangEsperantoModel.log b/script/BuildLangModelLogs/LangEsperantoModel.log new file mode 100644 index 0000000..5f020cd --- /dev/null +++ b/script/BuildLangModelLogs/LangEsperantoModel.log @@ -0,0 +1,110 @@ += Logs of language model for Esperanto (eo) = + +- Generated by BuildLangModel.py +- Started: 2015-12-04 01:22:51.466573 +- Maximum depth: 3 +- Max number of pages: 50 + +== Parsed pages == + +Vikipedio:Ĉefpaĝo (revision 5524911) +10-a de novembro (revision 5792999) +12-a de novembro (revision 5793854) +13-a de novembro (revision 5795088) +18-a de novembro (revision 5796972) +2-a de novembro (revision 5772615) +20-a de novembro (revision 5799664) +2015 (revision 5791963) +22-a de novembro (revision 5799355) +24-a de novembro (revision 5800563) +4-a de decembro (revision 5806422) +4-a de novembro (revision 5789811) +5-a de novembro (revision 5789774) +6-a de novembro (revision 5790336) +7-a de novembro (revision 5791066) +8-a de novembro (revision 5791337) +9-a de novembro (revision 5791916) +A Night at the Opera (Queen) (revision 5184272) +Abdelhamid Abaaoud (revision 5800134) +André Glucksmann (revision 5792591) +Anglio (revision 5693468) +Argentino (revision 5804665) +Atencoj de novembro 2015 en Parizo (revision 5800135) +Aung San Suu Kyi (revision 5791362) +Austin FX4 (revision 5583207) +Azilo (revision 5751210) +Aŭstrio (revision 5804014) +Bahio (revision 5773065) +Bamako (revision 5798202) +Bataclan (revision 5795605) +Bejruto (revision 5774306) +Birmo (revision 5790386) +Blonda (revision 5441229) +Bohemian rhapsody (revision 5654078) +Cayetano Redondo (revision 5591025) +Ciro la 2-a (revision 5774667) +DJ Abdel (revision 5628860) 
+Daniela Mercury (revision 5764721) +Decembro de 2015 (revision 5626904) +Dilatkoeficiento (revision 5806460) +Eksproprietigo (revision 5586845) +Elektroniko (revision 5788966) +Elle s'appelait Sarah (filmo) (revision 5475154) +Esperanto (revision 5804190) +Federaciero (revision 5696168) +Fondaĵo Vikimedio (revision 5772681) +Francio (revision 5759775) +François Hollande (revision 5627721) + +== End of Parsed pages == + +- Wikipedia parsing ended at: 2015-12-04 01:27:38.176708 + +56 characters appeared 342524 times. + +First 35 characters: +[ 0] Char a: 12.557952143499435 % +[ 1] Char o: 9.84719318938235 % +[ 2] Char e: 9.10242785906973 % +[ 3] Char i: 8.362333734278474 % +[ 4] Char n: 7.6359612757062285 % +[ 5] Char r: 6.630192336887342 % +[ 6] Char t: 5.70821314710794 % +[ 7] Char l: 5.610409781504361 % +[ 8] Char s: 5.004320865107262 % +[ 9] Char k: 3.8855671427403626 % +[10] Char d: 3.7194473963868226 % +[11] Char j: 3.28531723324497 % +[12] Char u: 2.8465158645817517 % +[13] Char m: 2.787833845219605 % +[14] Char p: 2.6582078920017285 % +[15] Char g: 1.6825098387266293 % +[16] Char v: 1.4048650605505015 % +[17] Char c: 1.3823848839789328 % +[18] Char b: 1.1406499982482978 % +[19] Char f: 1.077296773364786 % +[20] Char z: 0.7342551178895493 % +[21] Char h: 0.6735294461118053 % +[22] Char ĝ: 0.53572888323154 % +[23] Char ŭ: 0.4268314045147202 % +[24] Char ĉ: 0.33545094650301877 % +[25] Char y: 0.17079095187490512 % +[26] Char ŝ: 0.15327393116978666 % +[27] Char w: 0.1442234704721421 % +[28] Char ĵ: 0.1039343228503696 % +[29] Char á: 0.0814541462788009 % +[30] Char ó: 0.05430276418586727 % +[31] Char é: 0.053718863495696656 % +[32] Char q: 0.04350060141771087 % +[33] Char x: 0.040873048311943105 % +[34] Char ĥ: 0.03824549520617533 % + +The first 35 characters have an accumulated ratio of 0.9991971365510156. + +989 sequences found. 
+ +First 512 (typical positive ratio): 0.9942980632768038 +Next 512 (512-1024): 0.0015327393116978665 +Rest: -5.0306980803327406e-17 + +- Processing end: 2015-12-04 01:27:38.307198 diff --git a/script/charsets/iso-8859-3.py b/script/charsets/iso-8859-3.py new file mode 100644 index 0000000..5ff27c9 --- /dev/null +++ b/script/charsets/iso-8859-3.py @@ -0,0 +1,75 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +# ##### BEGIN LICENSE BLOCK ##### +# Version: MPL 1.1/GPL 2.0/LGPL 2.1 +# +# The contents of this file are subject to the Mozilla Public License Version +# 1.1 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License +# for the specific language governing rights and limitations under the +# License. +# +# The Original Code is Mozilla Universal charset detector code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 2001 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Jehan +# +# Alternatively, the contents of this file may be used under the terms of +# either the GNU General Public License Version 2 or later (the "GPL"), or +# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +# in which case the provisions of the GPL or the LGPL are applicable instead +# of those above. If you wish to allow use of your version of this file only +# under the terms of either the GPL or the LGPL, and not to allow others to +# use your version of this file under the terms of the MPL, indicate your +# decision by deleting the provisions above and replace them with the notice +# and other provisions required by the GPL or the LGPL. 
If you do not delete +# the provisions above, a recipient may use your version of this file under +# the terms of any one of the MPL, the GPL or the LGPL. +# +# ##### END LICENSE BLOCK ##### + +from codepoints import * + +# ISO-8859-3 is the full 8-bit range, IANA-defined, superset of ISO/IEC 8859-3. +# It is basically the same as ISO/IEC 8859-3, but with control characters. +name = 'ISO-8859-3' +aliases = ['ISO_8859-3:1988', 'ISO_8859-3', 'iso-ir-109', + 'csISOLatin3', 'latin3', 'l3'] + +language = \ +{ + # Languages with complete coverage. + 'complete': [ 'eo', 'tr', 'mt' ], + 'incomplete': [] +} + +# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF # +charmap = \ +[ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X + SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X + SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 8X + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 9X + SYM,LET,SYM,SYM,SYM,ILL,LET,SYM,SYM,LET,LET,LET,LET,SYM,ILL,LET, # AX + SYM,LET,SYM,SYM,SYM,SYM,LET,SYM,SYM,LET,LET,LET,LET,SYM,ILL,LET, # BX + LET,LET,LET,ILL,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # CX + ILL,LET,LET,LET,LET,LET,LET,SYM,LET,LET,LET,LET,LET,LET,LET,LET, # DX + LET,LET,LET,ILL,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # EX + ILL,LET,LET,LET,LET,LET,LET,SYM,LET,LET,LET,LET,LET,LET,LET,SYM, # FX +] diff --git a/script/langs/eo.py b/script/langs/eo.py new file mode 100644 index 0000000..c593921 --- /dev/null +++ b/script/langs/eo.py @@ -0,0 +1,76 @@ 
+#!/bin/python3 +# -*- coding: utf-8 -*- + +# ##### BEGIN LICENSE BLOCK ##### +# Version: MPL 1.1/GPL 2.0/LGPL 2.1 +# +# The contents of this file are subject to the Mozilla Public License Version +# 1.1 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License +# for the specific language governing rights and limitations under the +# License. +# +# The Original Code is Mozilla Universal charset detector code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 2001 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Jehan +# +# Alternatively, the contents of this file may be used under the terms of +# either the GNU General Public License Version 2 or later (the "GPL"), or +# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +# in which case the provisions of the GPL or the LGPL are applicable instead +# of those above. If you wish to allow use of your version of this file only +# under the terms of either the GPL or the LGPL, and not to allow others to +# use your version of this file under the terms of the MPL, indicate your +# decision by deleting the provisions above and replace them with the notice +# and other provisions required by the GPL or the LGPL. If you do not delete +# the provisions above, a recipient may use your version of this file under +# the terms of any one of the MPL, the GPL or the LGPL. +# +# ##### END LICENSE BLOCK ##### + +import re + +## Mandatory Properties ## + +# The human name for the language, in English. +name = 'Esperanto' +# Use 2-letter ISO 639-1 if possible, 3-letter ISO code otherwise, +# or use another catalog as a last resort. 
+code = 'eo' +# Esperanto actually does use ASCII, but not q, w, x, or y. +# So I just use the alphabet variable below instead. +use_ascii = False +# The charsets we want to support and create data for. +charsets = ['ISO-8859-3'] + +## Optional Properties ## + +# Alphabet characters. +alphabet = 'abcĉdefgĝhĥijĵklmnoprsŝtuŭvz' +# The start page. Though optional, it is advised to choose one yourself. +start_pages = ['Vikipedio:Ĉefpaĝo'] +# Gives the possibility to select a different code for the Wikipedia URL. +wikipedia_code = code +# 'a' and 'A' will be considered the same character, and so on. +# This uses Python's algorithm to determine upper/lower-case of a given +# character. +case_mapping = True + +# A function to clean content returned by the `wikipedia` python lib, +# in case some unwanted data has been overlooked. +def clean_wikipedia_content(content): + # Get rid of title syntax: "=== Articles connexes ===" + cleaned = re.sub(r'(=+) *([^=]+) *\1', + r'\2', + content) + return cleaned diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index c8fae7b..7ab910b 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -10,6 +10,7 @@ set( JpCntx.cpp LangModels/LangBulgarianModel.cpp LangModels/LangCyrillicModel.cpp + LangModels/LangEsperantoModel.cpp LangModels/LangFrenchModel.cpp LangModels/LangGermanModel.cpp LangModels/LangGreekModel.cpp diff --git a/src/LangModels/LangEsperantoModel.cpp b/src/LangModels/LangEsperantoModel.cpp new file mode 100644 index 0000000..4993abc --- /dev/null +++ b/src/LangModels/LangEsperantoModel.cpp @@ -0,0 +1,141 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is Mozilla Communicator client code. + * + * The Initial Developer of the Original Code is + * Netscape Communications Corporation. + * Portions created by the Initial Developer are Copyright (C) 1998 + * the Initial Developer. All Rights Reserved. + * + * Contributor(s): + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. + * + * ***** END LICENSE BLOCK ***** */ + +#include "../nsSBCharSetProber.h" + +/********* Language model for: Esperanto *********/ + +/** + * Generated by BuildLangModel.py + * On: 2015-12-04 01:27:38.177516 + **/ + +/* Character Mapping Table: + * ILL: illegal character. + * CTR: control character specific to the charset. + * RET: carriage/return. + * SYM: symbol (punctuation) that does not belong to word. + * NUM: 0 - 9. + * + * Other characters are ordered by probabilities + * (0 is the most common character in the language). 
+ * + * Orders are generic to a language. So the codepoint with order X in + * CHARSET1 maps to the same character as the codepoint with the same + * order X in CHARSET2 for the same language. + * As such, it is possible for some orders to be missing. For instance the + * ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 + * even though they are both used for French. Same for the euro sign. + */ +static const unsigned char Iso_8859_3_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 18, 17, 10, 2, 19, 15, 21, 3, 11, 9, 7, 13, 4, 1, /* 4X */ + 14, 32, 5, 8, 6, 12, 16, 27, 33, 25, 20,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 18, 17, 10, 2, 19, 15, 21, 3, 11, 9, 7, 13, 4, 1, /* 6X */ + 14, 32, 5, 8, 6, 12, 16, 27, 33, 25, 20,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM, 56,SYM,SYM,SYM,ILL, 34,SYM,SYM, 57, 53, 58, 28,SYM,ILL, 40, /* AX */ + SYM, 59,SYM,SYM,SYM,SYM, 34,SYM,SYM, 60, 53, 61, 28,SYM,ILL, 40, /* BX */ + 44, 29, 46,ILL, 43, 62, 24, 38, 41, 31, 48, 50, 54, 35, 49, 52, /* CX */ + ILL, 42, 63, 30, 47, 64, 36,SYM, 22, 51, 39, 55, 37, 23, 26, 45, /* DX */ + 44, 29, 46,ILL, 43, 65, 24, 38, 41, 31, 48, 50, 54, 35, 49, 52, /* EX */ + ILL, 42, 66, 30, 47, 67, 36,SYM, 22, 51, 39, 55, 37, 23, 26,SYM, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + + +/* Model Table: + * Total sequences: 989 + * First 512 sequences: 0.9942980632768038 + * Next 512 sequences (512-1024): 0.0057019367231962385 + * Rest: -5.0306980803327406e-17 + * Negative sequences: TODO + */ +static const PRUint8 EsperantoLangModel[] = +{ + 
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,0,0,2,3,3, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,2,3,2,0,0,0,2,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,0,0,0,2,3,3, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,2,3,2,2,2,3,0,2,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,2,2,3,2,3,2,0,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,3,0,3,3,3,2,2,2, + 3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,2,3,2,3,3,3,3,0,0,2,3,2,2,2,3,3,2,0,2,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,2,3,2,2,2,3,2,2,0,3,3,3,2,0,2, + 3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,2,3,3,3,3,3,3,0,0,0,3,0,2,0,3,2,3,2,2,0, + 3,3,3,3,3,3,3,3,3,2,3,2,3,3,3,2,3,3,2,3,3,3,0,0,0,3,2,0,2,3,2,2,0,0,0, + 3,3,3,3,3,3,2,3,3,2,3,2,3,3,3,3,3,2,2,2,3,3,0,0,2,3,0,3,2,2,2,2,0,0,0, + 3,3,3,3,3,3,3,3,3,2,3,2,3,3,2,2,0,2,2,2,2,2,2,0,0,0,0,0,0,3,3,2,0,2,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,0,3,2,3,2,0,0,0,2,0,2,2, + 3,3,3,3,3,3,3,3,3,3,2,2,3,3,3,2,2,3,3,3,2,0,0,0,2,3,2,2,0,3,2,2,0,0,0, + 3,3,3,3,2,3,3,3,3,2,2,2,3,2,3,2,0,2,2,2,2,3,0,0,0,2,2,0,0,3,2,2,0,0,0, + 3,3,3,3,3,3,2,3,3,2,3,0,3,3,2,2,3,2,2,2,2,3,0,2,2,3,2,2,2,2,2,3,0,2,0, + 3,3,3,3,2,3,2,2,2,2,2,3,3,2,2,2,0,0,2,0,2,2,0,0,2,2,0,0,0,3,2,2,0,0,0, + 3,3,3,3,0,3,3,3,3,3,2,0,3,2,2,2,0,3,2,2,3,3,0,0,0,3,0,0,0,2,2,2,2,2,2, + 3,3,3,3,3,3,3,3,3,2,3,2,3,2,2,2,2,2,3,2,0,2,0,0,0,3,2,0,0,3,3,3,0,0,0, + 3,3,3,3,0,3,3,3,2,2,2,2,3,3,2,3,2,0,2,3,0,0,0,0,0,2,0,0,0,0,0,2,0,3,0, + 3,3,3,3,3,2,2,3,3,3,2,2,3,2,2,2,2,3,3,2,2,0,0,0,0,3,2,2,0,2,2,2,2,0,0, + 3,3,3,3,3,3,3,3,2,2,2,0,3,3,2,0,2,0,2,2,0,2,0,0,0,2,0,2,0,2,2,2,0,2,0, + 3,3,3,3,0,0,2,3,0,0,2,2,3,0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,2,3,3,2,3,3,3,3,3,3,2,2,2,2,3,2,0,2,2,3,2,0,0,2,0,3,0,0,0,0,0,0,0,0, + 3,3,3,3,0,0,2,2,0,2,3,2,3,3,0,0,2,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,3,2,3,3,2,3,2,2,2,2,2,2,2,2,2,0,0,0,0,0,0,0,2,0,2,0,2,0,0,0, + 3,3,3,3,2,2,3,2,0,2,0,2,3,2,2,0,3,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0, + 
3,3,3,3,2,2,2,2,3,2,0,0,2,0,0,0,0,0,0,2,0,2,0,0,0,2,0,3,0,0,2,0,0,0,0, + 3,3,2,2,2,2,0,2,0,2,0,0,3,0,0,2,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,2,0,3,3,3,3,3,2,3,0,0,2,2,2,2,3,2,2,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,3,3,2,2,2,2,2,2,0,0,2,2,2,0,2,2,3,0,0,0,0,0,0,3,0,0,0,0,0,0,0, + 2,2,2,0,3,3,3,3,3,2,2,0,2,2,2,2,2,2,2,2,2,0,0,0,0,0,0,0,0,0,0,0,2,0,0, + 2,0,0,2,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0, + 2,2,2,3,0,0,2,2,0,0,0,0,2,2,2,2,0,0,0,2,0,2,0,0,0,0,0,2,0,0,0,0,0,0,0, + 3,3,3,2,2,0,2,0,0,0,2,0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +}; + + +const SequenceModel Iso_8859_3EsperantoModel = +{ + Iso_8859_3_CharToOrderMap, + EsperantoLangModel, + 35, + (float)0.9942980632768038, + PR_FALSE, + "ISO-8859-3" +}; \ No newline at end of file diff --git a/src/nsSBCSGroupProber.cpp b/src/nsSBCSGroupProber.cpp index c4bd868..210831b 100644 --- a/src/nsSBCSGroupProber.cpp +++ b/src/nsSBCSGroupProber.cpp @@ -88,6 +88,8 @@ nsSBCSGroupProber::nsSBCSGroupProber() mProbers[19] = new nsSingleByteCharSetProber(&Iso_8859_1GermanModel); mProbers[20] = new nsSingleByteCharSetProber(&Windows_1252GermanModel); + mProbers[21] = new nsSingleByteCharSetProber(&Iso_8859_3EsperantoModel); + Reset(); } diff --git a/src/nsSBCSGroupProber.h b/src/nsSBCSGroupProber.h index c2a8768..dbe3650 100644 --- a/src/nsSBCSGroupProber.h +++ b/src/nsSBCSGroupProber.h @@ -40,7 +40,7 @@ #define nsSBCSGroupProber_h__ -#define NUM_OF_SBCS_PROBERS 21 +#define NUM_OF_SBCS_PROBERS 22 class nsCharSetProber; class nsSBCSGroupProber: public nsCharSetProber { diff --git a/src/nsSBCharSetProber.h b/src/nsSBCharSetProber.h index 63da429..63ce080 100644 --- a/src/nsSBCharSetProber.h +++ b/src/nsSBCharSetProber.h @@ -129,19 +129,28 @@ extern const SequenceModel Latin5CyrillicModel; extern const SequenceModel MacCyrillicModel; extern const SequenceModel Ibm866CyrillicModel; extern const SequenceModel Ibm855CyrillicModel; + extern const SequenceModel Latin7GreekModel; extern const 
SequenceModel Win1253GreekModel; + extern const SequenceModel Latin5BulgarianModel; extern const SequenceModel Win1251BulgarianModel; + extern const SequenceModel Latin2HungarianModel; extern const SequenceModel Win1250HungarianModel; + extern const SequenceModel Win1255Model; + extern const SequenceModel TIS620ThaiModel; + extern const SequenceModel Iso_8859_15FrenchModel; extern const SequenceModel Iso_8859_1FrenchModel; extern const SequenceModel Windows_1252FrenchModel; + extern const SequenceModel Iso_8859_1GermanModel; extern const SequenceModel Windows_1252GermanModel; +extern const SequenceModel Iso_8859_3EsperantoModel; + #endif /* nsSingleByteCharSetProber_h__ */ diff --git a/test/eo/iso-8859-3.txt b/test/eo/iso-8859-3.txt new file mode 100644 index 0000000..9f35d8a --- /dev/null +++ b/test/eo/iso-8859-3.txt @@ -0,0 +1,7 @@ +Esperanto (origine Lingvo Internacia) estas la plej disvastigita internacia +planlingvo.[3] La nomo venas de la kanomo "Dr-o Esperanto", sub kiu la juda +kuracisto Ludoviko Lazaro Zamenhofo en la jaro 1887 publikigis la bazon de la +lingvo. La unua versio, la rusa, ricevis la cenzuran permeson disvastii en la +26-a de julio; i tiun daton oni konsideras la naskitago de Esperanto[4][5]. Li +intencis krei facile lerneblan netralan lingvon, tagan por uzo en la +internacia komunikado, tamen ne anstataigi aliajn, naciajn lingvojn.