mirror of
https://gitlab.freedesktop.org/uchardet/uchardet.git
synced 2025-12-06 16:56:40 +08:00
LangModels: add Esperanto ISO-8859-3 language model.
This commit is contained in:
parent
a167bd5e42
commit
f0e122b506
@ -42,6 +42,8 @@ Techniques used by universalchardet are described at http://www.mozilla.org/proj
|
||||
* WINDOWS-1255
|
||||
* Thai
|
||||
* TIS-620
|
||||
* Esperanto
|
||||
* ISO-8859-3
|
||||
* French
|
||||
* ISO-8859-1
|
||||
* ISO-8859-15
|
||||
|
||||
110
script/BuildLangModelLogs/LangEsperantoModel.log
Normal file
110
script/BuildLangModelLogs/LangEsperantoModel.log
Normal file
@ -0,0 +1,110 @@
|
||||
= Logs of language model for Esperanto (eo) =
|
||||
|
||||
- Generated by BuildLangModel.py
|
||||
- Started: 2015-12-04 01:22:51.466573
|
||||
- Maximum depth: 3
|
||||
- Max number of pages: 50
|
||||
|
||||
== Parsed pages ==
|
||||
|
||||
Vikipedio:Ĉefpaĝo (revision 5524911)
|
||||
10-a de novembro (revision 5792999)
|
||||
12-a de novembro (revision 5793854)
|
||||
13-a de novembro (revision 5795088)
|
||||
18-a de novembro (revision 5796972)
|
||||
2-a de novembro (revision 5772615)
|
||||
20-a de novembro (revision 5799664)
|
||||
2015 (revision 5791963)
|
||||
22-a de novembro (revision 5799355)
|
||||
24-a de novembro (revision 5800563)
|
||||
4-a de decembro (revision 5806422)
|
||||
4-a de novembro (revision 5789811)
|
||||
5-a de novembro (revision 5789774)
|
||||
6-a de novembro (revision 5790336)
|
||||
7-a de novembro (revision 5791066)
|
||||
8-a de novembro (revision 5791337)
|
||||
9-a de novembro (revision 5791916)
|
||||
A Night at the Opera (Queen) (revision 5184272)
|
||||
Abdelhamid Abaaoud (revision 5800134)
|
||||
André Glucksmann (revision 5792591)
|
||||
Anglio (revision 5693468)
|
||||
Argentino (revision 5804665)
|
||||
Atencoj de novembro 2015 en Parizo (revision 5800135)
|
||||
Aung San Suu Kyi (revision 5791362)
|
||||
Austin FX4 (revision 5583207)
|
||||
Azilo (revision 5751210)
|
||||
Aŭstrio (revision 5804014)
|
||||
Bahio (revision 5773065)
|
||||
Bamako (revision 5798202)
|
||||
Bataclan (revision 5795605)
|
||||
Bejruto (revision 5774306)
|
||||
Birmo (revision 5790386)
|
||||
Blonda (revision 5441229)
|
||||
Bohemian rhapsody (revision 5654078)
|
||||
Cayetano Redondo (revision 5591025)
|
||||
Ciro la 2-a (revision 5774667)
|
||||
DJ Abdel (revision 5628860)
|
||||
Daniela Mercury (revision 5764721)
|
||||
Decembro de 2015 (revision 5626904)
|
||||
Dilatkoeficiento (revision 5806460)
|
||||
Eksproprietigo (revision 5586845)
|
||||
Elektroniko (revision 5788966)
|
||||
Elle s'appelait Sarah (filmo) (revision 5475154)
|
||||
Esperanto (revision 5804190)
|
||||
Federaciero (revision 5696168)
|
||||
Fondaĵo Vikimedio (revision 5772681)
|
||||
Francio (revision 5759775)
|
||||
François Hollande (revision 5627721)
|
||||
|
||||
== End of Parsed pages ==
|
||||
|
||||
- Wikipedia parsing ended at: 2015-12-04 01:27:38.176708
|
||||
|
||||
56 characters appeared 342524 times.
|
||||
|
||||
First 35 characters:
|
||||
[ 0] Char a: 12.557952143499435 %
|
||||
[ 1] Char o: 9.84719318938235 %
|
||||
[ 2] Char e: 9.10242785906973 %
|
||||
[ 3] Char i: 8.362333734278474 %
|
||||
[ 4] Char n: 7.6359612757062285 %
|
||||
[ 5] Char r: 6.630192336887342 %
|
||||
[ 6] Char t: 5.70821314710794 %
|
||||
[ 7] Char l: 5.610409781504361 %
|
||||
[ 8] Char s: 5.004320865107262 %
|
||||
[ 9] Char k: 3.8855671427403626 %
|
||||
[10] Char d: 3.7194473963868226 %
|
||||
[11] Char j: 3.28531723324497 %
|
||||
[12] Char u: 2.8465158645817517 %
|
||||
[13] Char m: 2.787833845219605 %
|
||||
[14] Char p: 2.6582078920017285 %
|
||||
[15] Char g: 1.6825098387266293 %
|
||||
[16] Char v: 1.4048650605505015 %
|
||||
[17] Char c: 1.3823848839789328 %
|
||||
[18] Char b: 1.1406499982482978 %
|
||||
[19] Char f: 1.077296773364786 %
|
||||
[20] Char z: 0.7342551178895493 %
|
||||
[21] Char h: 0.6735294461118053 %
|
||||
[22] Char ĝ: 0.53572888323154 %
|
||||
[23] Char ŭ: 0.4268314045147202 %
|
||||
[24] Char ĉ: 0.33545094650301877 %
|
||||
[25] Char y: 0.17079095187490512 %
|
||||
[26] Char ŝ: 0.15327393116978666 %
|
||||
[27] Char w: 0.1442234704721421 %
|
||||
[28] Char ĵ: 0.1039343228503696 %
|
||||
[29] Char á: 0.0814541462788009 %
|
||||
[30] Char ó: 0.05430276418586727 %
|
||||
[31] Char é: 0.053718863495696656 %
|
||||
[32] Char q: 0.04350060141771087 %
|
||||
[33] Char x: 0.040873048311943105 %
|
||||
[34] Char ĥ: 0.03824549520617533 %
|
||||
|
||||
The first 35 characters have an accumulated ratio of 0.9991971365510156.
|
||||
|
||||
989 sequences found.
|
||||
|
||||
First 512 (typical positive ratio): 0.9942980632768038
|
||||
Next 512 (512-1024): 0.0057019367231962385
|
||||
Rest: -5.0306980803327406e-17
|
||||
|
||||
- Processing end: 2015-12-04 01:27:38.307198
|
||||
75
script/charsets/iso-8859-3.py
Normal file
75
script/charsets/iso-8859-3.py
Normal file
@ -0,0 +1,75 @@
|
||||
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# ##### BEGIN LICENSE BLOCK #####
|
||||
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
#
|
||||
# The contents of this file are subject to the Mozilla Public License Version
|
||||
# 1.1 (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
# http://www.mozilla.org/MPL/
|
||||
#
|
||||
# Software distributed under the License is distributed on an "AS IS" basis,
|
||||
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
# for the specific language governing rights and limitations under the
|
||||
# License.
|
||||
#
|
||||
# The Original Code is Mozilla Universal charset detector code.
|
||||
#
|
||||
# The Initial Developer of the Original Code is
|
||||
# Netscape Communications Corporation.
|
||||
# Portions created by the Initial Developer are Copyright (C) 2001
|
||||
# the Initial Developer. All Rights Reserved.
|
||||
#
|
||||
# Contributor(s):
|
||||
# Jehan <jehan@girinstud.io>
|
||||
#
|
||||
# Alternatively, the contents of this file may be used under the terms of
|
||||
# either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
# in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
# of those above. If you wish to allow use of your version of this file only
|
||||
# under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
# use your version of this file under the terms of the MPL, indicate your
|
||||
# decision by deleting the provisions above and replace them with the notice
|
||||
# and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
# the provisions above, a recipient may use your version of this file under
|
||||
# the terms of any one of the MPL, the GPL or the LGPL.
|
||||
#
|
||||
# ##### END LICENSE BLOCK #####
|
||||
|
||||
from codepoints import *
|
||||
|
||||
# ISO-8859-3 is the IANA-defined charset covering the full 8-bit range; it is a superset of ISO/IEC 8859-3.
|
||||
# It is basically the same as ISO/IEC 8859-3, but with control characters.
|
||||
name = 'ISO-8859-3'
|
||||
aliases = ['ISO_8859-3:1988', 'ISO_8859-3', 'iso-ir-109',
|
||||
'csISOLatin3', 'latin3', 'l3']
|
||||
|
||||
language = \
|
||||
{
|
||||
# Languages with complete coverage.
|
||||
'complete': [ 'eo', 'tr', 'mt' ],
|
||||
'incomplete': []
|
||||
}
|
||||
|
||||
# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF #
|
||||
charmap = \
|
||||
[
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X
|
||||
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X
|
||||
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X
|
||||
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X
|
||||
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 8X
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 9X
|
||||
SYM,LET,SYM,SYM,SYM,ILL,LET,SYM,SYM,LET,LET,LET,LET,SYM,ILL,LET, # AX
|
||||
SYM,LET,SYM,SYM,SYM,SYM,LET,SYM,SYM,LET,LET,LET,LET,SYM,ILL,LET, # BX
|
||||
LET,LET,LET,ILL,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # CX
|
||||
ILL,LET,LET,LET,LET,LET,LET,SYM,LET,LET,LET,LET,LET,LET,LET,LET, # DX
|
||||
LET,LET,LET,ILL,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # EX
|
||||
ILL,LET,LET,LET,LET,LET,LET,SYM,LET,LET,LET,LET,LET,LET,LET,SYM, # FX
|
||||
]
|
||||
76
script/langs/eo.py
Normal file
76
script/langs/eo.py
Normal file
@ -0,0 +1,76 @@
|
||||
#!/bin/python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# ##### BEGIN LICENSE BLOCK #####
|
||||
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
#
|
||||
# The contents of this file are subject to the Mozilla Public License Version
|
||||
# 1.1 (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
# http://www.mozilla.org/MPL/
|
||||
#
|
||||
# Software distributed under the License is distributed on an "AS IS" basis,
|
||||
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
# for the specific language governing rights and limitations under the
|
||||
# License.
|
||||
#
|
||||
# The Original Code is Mozilla Universal charset detector code.
|
||||
#
|
||||
# The Initial Developer of the Original Code is
|
||||
# Netscape Communications Corporation.
|
||||
# Portions created by the Initial Developer are Copyright (C) 2001
|
||||
# the Initial Developer. All Rights Reserved.
|
||||
#
|
||||
# Contributor(s):
|
||||
# Jehan <jehan@girinstud.io>
|
||||
#
|
||||
# Alternatively, the contents of this file may be used under the terms of
|
||||
# either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
# in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
# of those above. If you wish to allow use of your version of this file only
|
||||
# under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
# use your version of this file under the terms of the MPL, indicate your
|
||||
# decision by deleting the provisions above and replace them with the notice
|
||||
# and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
# the provisions above, a recipient may use your version of this file under
|
||||
# the terms of any one of the MPL, the GPL or the LGPL.
|
||||
#
|
||||
# ##### END LICENSE BLOCK #####
|
||||
|
||||
import re
|
||||
|
||||
## Mandatory Properties ##
|
||||
|
||||
# The human name for the language, in English.
|
||||
name = 'Esperanto'
|
||||
# Use 2-letter ISO 639-1 if possible, 3-letter ISO code otherwise,
|
||||
# or use another catalog as a last resort.
|
||||
code = 'eo'
|
||||
# Esperanto actually does use ASCII, but not q, w, x, or y.
|
||||
# So I just use the alphabet variable below instead.
|
||||
use_ascii = False
|
||||
# The charsets we want to support and create data for.
|
||||
charsets = ['ISO-8859-3']
|
||||
|
||||
## Optional Properties ##
|
||||
|
||||
# Alphabet characters.
|
||||
alphabet = 'abcĉdefgĝhĥijĵklmnoprsŝtuŭvz'
|
||||
# The start page. Though optional, it is advised to choose one yourself.
|
||||
start_pages = ['Vikipedio:Ĉefpaĝo']
|
||||
# give possibility to select another code for the Wikipedia URL.
|
||||
wikipedia_code = code
|
||||
# 'a' and 'A' will be considered the same character, and so on.
|
||||
# This uses Python algorithm to determine upper/lower-case of a given
|
||||
# character.
|
||||
case_mapping = True
|
||||
|
||||
# A function to clean content returned by the `wikipedia` python lib,
|
||||
# in case some unwanted data has been overlooked.
|
||||
def clean_wikipedia_content(content):
    """Strip wiki heading markup from text returned by the `wikipedia` lib.

    A heading such as "=== Articles connexes ===" is replaced by its bare
    title ("Articles connexes"); text without heading markers is returned
    unchanged.

    :param content: page text as a string.
    :return: the text with heading markers removed.
    """
    # Get rid of title syntax: "=== Articles connexes ==="
    # The title group is lazy so that the spaces preceding the closing run
    # of '=' are consumed by ' *' instead of being kept as trailing
    # whitespace on the title. '\1' forces the closing run to repeat the
    # opening one.
    cleaned = re.sub(r'(=+) *([^=]+?) *\1',
                     r'\2',
                     content)
    return cleaned
|
||||
@ -10,6 +10,7 @@ set(
|
||||
JpCntx.cpp
|
||||
LangModels/LangBulgarianModel.cpp
|
||||
LangModels/LangCyrillicModel.cpp
|
||||
LangModels/LangEsperantoModel.cpp
|
||||
LangModels/LangFrenchModel.cpp
|
||||
LangModels/LangGermanModel.cpp
|
||||
LangModels/LangGreekModel.cpp
|
||||
|
||||
141
src/LangModels/LangEsperantoModel.cpp
Normal file
141
src/LangModels/LangEsperantoModel.cpp
Normal file
@ -0,0 +1,141 @@
|
||||
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
|
||||
/* ***** BEGIN LICENSE BLOCK *****
|
||||
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
*
|
||||
* The contents of this file are subject to the Mozilla Public License Version
|
||||
* 1.1 (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
* http://www.mozilla.org/MPL/
|
||||
*
|
||||
* Software distributed under the License is distributed on an "AS IS" basis,
|
||||
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
* for the specific language governing rights and limitations under the
|
||||
* License.
|
||||
*
|
||||
* The Original Code is Mozilla Communicator client code.
|
||||
*
|
||||
* The Initial Developer of the Original Code is
|
||||
* Netscape Communications Corporation.
|
||||
* Portions created by the Initial Developer are Copyright (C) 1998
|
||||
* the Initial Developer. All Rights Reserved.
|
||||
*
|
||||
* Contributor(s):
|
||||
*
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
* in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
* of those above. If you wish to allow use of your version of this file only
|
||||
* under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
* use your version of this file under the terms of the MPL, indicate your
|
||||
* decision by deleting the provisions above and replace them with the notice
|
||||
* and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
* the provisions above, a recipient may use your version of this file under
|
||||
* the terms of any one of the MPL, the GPL or the LGPL.
|
||||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
#include "../nsSBCharSetProber.h"
|
||||
|
||||
/********* Language model for: Esperanto *********/
|
||||
|
||||
/**
|
||||
* Generated by BuildLangModel.py
|
||||
* On: 2015-12-04 01:27:38.177516
|
||||
**/
|
||||
|
||||
/* Character Mapping Table:
|
||||
* ILL: illegal character.
|
||||
* CTR: control character specific to the charset.
|
||||
* RET: carriage return or line feed (bytes 0x0D and 0x0A).
|
||||
* SYM: symbol (punctuation) that does not belong to word.
|
||||
* NUM: 0 - 9.
|
||||
*
|
||||
* Other characters are ordered by probabilities
|
||||
* (0 is the most common character in the language).
|
||||
*
|
||||
* Orders are generic to a language. So the codepoint with order X in
|
||||
* CHARSET1 maps to the same character as the codepoint with the same
|
||||
* order X in CHARSET2 for the same language.
|
||||
* As such, it is possible to get missing order. For instance the
|
||||
* ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1
|
||||
* even though they are both used for French. Same for the euro sign.
|
||||
*/
|
||||
static const unsigned char Iso_8859_3_CharToOrderMap[] =
|
||||
{
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
|
||||
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
|
||||
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
|
||||
SYM, 0, 18, 17, 10, 2, 19, 15, 21, 3, 11, 9, 7, 13, 4, 1, /* 4X */
|
||||
14, 32, 5, 8, 6, 12, 16, 27, 33, 25, 20,SYM,SYM,SYM,SYM,SYM, /* 5X */
|
||||
SYM, 0, 18, 17, 10, 2, 19, 15, 21, 3, 11, 9, 7, 13, 4, 1, /* 6X */
|
||||
14, 32, 5, 8, 6, 12, 16, 27, 33, 25, 20,SYM,SYM,SYM,SYM,CTR, /* 7X */
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */
|
||||
SYM, 56,SYM,SYM,SYM,ILL, 34,SYM,SYM, 57, 53, 58, 28,SYM,ILL, 40, /* AX */
|
||||
SYM, 59,SYM,SYM,SYM,SYM, 34,SYM,SYM, 60, 53, 61, 28,SYM,ILL, 40, /* BX */
|
||||
44, 29, 46,ILL, 43, 62, 24, 38, 41, 31, 48, 50, 54, 35, 49, 52, /* CX */
|
||||
ILL, 42, 63, 30, 47, 64, 36,SYM, 22, 51, 39, 55, 37, 23, 26, 45, /* DX */
|
||||
44, 29, 46,ILL, 43, 65, 24, 38, 41, 31, 48, 50, 54, 35, 49, 52, /* EX */
|
||||
ILL, 42, 66, 30, 47, 67, 36,SYM, 22, 51, 39, 55, 37, 23, 26,SYM, /* FX */
|
||||
};
|
||||
/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
|
||||
|
||||
|
||||
/* Model Table:
|
||||
* Total sequences: 989
|
||||
* First 512 sequences: 0.9942980632768038
|
||||
* Next 512 sequences (512-1024): 0.0057019367231962385
|
||||
* Rest: -5.0306980803327406e-17
|
||||
* Negative sequences: TODO
|
||||
*/
|
||||
static const PRUint8 EsperantoLangModel[] =
|
||||
{
|
||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,0,0,2,3,3,
|
||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,2,3,2,0,0,0,2,2,2,
|
||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,0,0,0,2,3,3,
|
||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,2,3,2,2,2,3,0,2,2,2,
|
||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,2,2,3,2,3,2,0,2,
|
||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,3,0,3,3,3,2,2,2,
|
||||
3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,2,3,2,3,3,3,3,0,0,2,3,2,2,2,3,3,2,0,2,0,
|
||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,2,3,2,2,2,3,2,2,0,3,3,3,2,0,2,
|
||||
3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,2,3,3,3,3,3,3,0,0,0,3,0,2,0,3,2,3,2,2,0,
|
||||
3,3,3,3,3,3,3,3,3,2,3,2,3,3,3,2,3,3,2,3,3,3,0,0,0,3,2,0,2,3,2,2,0,0,0,
|
||||
3,3,3,3,3,3,2,3,3,2,3,2,3,3,3,3,3,2,2,2,3,3,0,0,2,3,0,3,2,2,2,2,0,0,0,
|
||||
3,3,3,3,3,3,3,3,3,2,3,2,3,3,2,2,0,2,2,2,2,2,2,0,0,0,0,0,0,3,3,2,0,2,0,
|
||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,0,3,2,3,2,0,0,0,2,0,2,2,
|
||||
3,3,3,3,3,3,3,3,3,3,2,2,3,3,3,2,2,3,3,3,2,0,0,0,2,3,2,2,0,3,2,2,0,0,0,
|
||||
3,3,3,3,2,3,3,3,3,2,2,2,3,2,3,2,0,2,2,2,2,3,0,0,0,2,2,0,0,3,2,2,0,0,0,
|
||||
3,3,3,3,3,3,2,3,3,2,3,0,3,3,2,2,3,2,2,2,2,3,0,2,2,3,2,2,2,2,2,3,0,2,0,
|
||||
3,3,3,3,2,3,2,2,2,2,2,3,3,2,2,2,0,0,2,0,2,2,0,0,2,2,0,0,0,3,2,2,0,0,0,
|
||||
3,3,3,3,0,3,3,3,3,3,2,0,3,2,2,2,0,3,2,2,3,3,0,0,0,3,0,0,0,2,2,2,2,2,2,
|
||||
3,3,3,3,3,3,3,3,3,2,3,2,3,2,2,2,2,2,3,2,0,2,0,0,0,3,2,0,0,3,3,3,0,0,0,
|
||||
3,3,3,3,0,3,3,3,2,2,2,2,3,3,2,3,2,0,2,3,0,0,0,0,0,2,0,0,0,0,0,2,0,3,0,
|
||||
3,3,3,3,3,2,2,3,3,3,2,2,3,2,2,2,2,3,3,2,2,0,0,0,0,3,2,2,0,2,2,2,2,0,0,
|
||||
3,3,3,3,3,3,3,3,2,2,2,0,3,3,2,0,2,0,2,2,0,2,0,0,0,2,0,2,0,2,2,2,0,2,0,
|
||||
3,3,3,3,0,0,2,3,0,0,2,2,3,0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,2,3,3,2,3,3,3,3,3,3,2,2,2,2,3,2,0,2,2,3,2,0,0,2,0,3,0,0,0,0,0,0,0,0,
|
||||
3,3,3,3,0,0,2,2,0,2,3,2,3,3,0,0,2,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,3,3,3,3,3,2,3,3,2,3,2,2,2,2,2,2,2,2,2,0,0,0,0,0,0,0,2,0,2,0,2,0,0,0,
|
||||
3,3,3,3,2,2,3,2,0,2,0,2,3,2,2,0,3,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,
|
||||
3,3,3,3,2,2,2,2,3,2,0,0,2,0,0,0,0,0,0,2,0,2,0,0,0,2,0,3,0,0,2,0,0,0,0,
|
||||
3,3,2,2,2,2,0,2,0,2,0,0,3,0,0,2,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,2,0,3,3,3,3,3,2,3,0,0,2,2,2,2,3,2,2,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,3,3,2,2,2,2,2,2,0,0,2,2,2,0,2,2,3,0,0,0,0,0,0,3,0,0,0,0,0,0,0,
|
||||
2,2,2,0,3,3,3,3,3,2,2,0,2,2,2,2,2,2,2,2,2,0,0,0,0,0,0,0,0,0,0,0,2,0,0,
|
||||
2,0,0,2,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
2,2,2,3,0,0,2,2,0,0,0,0,2,2,2,2,0,0,0,2,0,2,0,0,0,0,0,2,0,0,0,0,0,0,0,
|
||||
3,3,3,2,2,0,2,0,0,0,2,0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
};
|
||||
|
||||
|
||||
const SequenceModel Iso_8859_3EsperantoModel =
|
||||
{
|
||||
Iso_8859_3_CharToOrderMap,
|
||||
EsperantoLangModel,
|
||||
35,
|
||||
(float)0.9942980632768038,
|
||||
PR_FALSE,
|
||||
"ISO-8859-3"
|
||||
};
|
||||
@ -88,6 +88,8 @@ nsSBCSGroupProber::nsSBCSGroupProber()
|
||||
mProbers[19] = new nsSingleByteCharSetProber(&Iso_8859_1GermanModel);
|
||||
mProbers[20] = new nsSingleByteCharSetProber(&Windows_1252GermanModel);
|
||||
|
||||
mProbers[21] = new nsSingleByteCharSetProber(&Iso_8859_3EsperantoModel);
|
||||
|
||||
Reset();
|
||||
}
|
||||
|
||||
|
||||
@ -40,7 +40,7 @@
|
||||
#define nsSBCSGroupProber_h__
|
||||
|
||||
|
||||
#define NUM_OF_SBCS_PROBERS 21
|
||||
#define NUM_OF_SBCS_PROBERS 22
|
||||
|
||||
class nsCharSetProber;
|
||||
class nsSBCSGroupProber: public nsCharSetProber {
|
||||
|
||||
@ -129,19 +129,28 @@ extern const SequenceModel Latin5CyrillicModel;
|
||||
extern const SequenceModel MacCyrillicModel;
|
||||
extern const SequenceModel Ibm866CyrillicModel;
|
||||
extern const SequenceModel Ibm855CyrillicModel;
|
||||
|
||||
extern const SequenceModel Latin7GreekModel;
|
||||
extern const SequenceModel Win1253GreekModel;
|
||||
|
||||
extern const SequenceModel Latin5BulgarianModel;
|
||||
extern const SequenceModel Win1251BulgarianModel;
|
||||
|
||||
extern const SequenceModel Latin2HungarianModel;
|
||||
extern const SequenceModel Win1250HungarianModel;
|
||||
|
||||
extern const SequenceModel Win1255Model;
|
||||
|
||||
extern const SequenceModel TIS620ThaiModel;
|
||||
|
||||
extern const SequenceModel Iso_8859_15FrenchModel;
|
||||
extern const SequenceModel Iso_8859_1FrenchModel;
|
||||
extern const SequenceModel Windows_1252FrenchModel;
|
||||
|
||||
extern const SequenceModel Iso_8859_1GermanModel;
|
||||
extern const SequenceModel Windows_1252GermanModel;
|
||||
|
||||
extern const SequenceModel Iso_8859_3EsperantoModel;
|
||||
|
||||
#endif /* nsSingleByteCharSetProber_h__ */
|
||||
|
||||
|
||||
7
test/eo/iso-8859-3.txt
Normal file
7
test/eo/iso-8859-3.txt
Normal file
@ -0,0 +1,7 @@
|
||||
Esperanto (origine Lingvo Internacia) estas la plej disvastigita internacia
|
||||
planlingvo.[3] La nomo venas de la kaŝnomo "Dr-o Esperanto", sub kiu la juda
|
||||
kuracisto Ludoviko Lazaro Zamenhofo en la jaro 1887 publikigis la bazon de la
|
||||
lingvo. La unua versio, la rusa, ricevis la cenzuran permeson disvastiĝi en la
|
||||
26-a de julio; ĉi tiun daton oni konsideras la naskiĝtago de Esperanto[4][5]. Li
|
||||
intencis krei facile lerneblan neŭtralan lingvon, taŭgan por uzo en la
|
||||
internacia komunikado, tamen ne anstataŭigi aliajn, naciajn lingvojn.
|
||||
Loading…
x
Reference in New Issue
Block a user