mirror of
https://gitlab.freedesktop.org/uchardet/uchardet.git
synced 2025-12-06 16:56:40 +08:00
script, src, test: adding Catalan support.
This adds support for UTF-8, ISO-8859-1 and WINDOWS-1252. The tests for UTF-8 and ISO-8859-1 are taken from the 'Marmota' page of the Catalan Wikipedia. The test for WINDOWS-1252 is taken from the 'Unió_Europea' page. Since ISO-8859-1 and WINDOWS-1252 are very similar for most letters (in particular the ones used in Catalan), I differentiated the WINDOWS-1252 test by using a text containing the '€' symbol, which occupies a position that is unused in ISO-8859-1.
This commit is contained in:
parent
cec8817d79
commit
d40e5868d5
@ -25,6 +25,10 @@ uchardet started as a C language binding of the original C++ implementation of t
|
|||||||
* UTF-8
|
* UTF-8
|
||||||
* ISO-8859-5
|
* ISO-8859-5
|
||||||
* WINDOWS-1251
|
* WINDOWS-1251
|
||||||
|
* Catalan
|
||||||
|
* UTF-8
|
||||||
|
* ISO-8859-1
|
||||||
|
* WINDOWS-1252
|
||||||
* Chinese
|
* Chinese
|
||||||
* UTF-8
|
* UTF-8
|
||||||
* ISO-2022-CN
|
* ISO-2022-CN
|
||||||
|
|||||||
238
script/BuildLangModelLogs/LangCatalanModel.log
Normal file
238
script/BuildLangModelLogs/LangCatalanModel.log
Normal file
@ -0,0 +1,238 @@
|
|||||||
|
= Logs of language model for Catalan (ca) =
|
||||||
|
|
||||||
|
- Generated by BuildLangModel.py
|
||||||
|
- Started: 2022-12-20 01:31:40.290803
|
||||||
|
- Maximum depth: 4
|
||||||
|
- Max number of pages: 200
|
||||||
|
|
||||||
|
== Parsed pages ==
|
||||||
|
|
||||||
|
Parlament_Europeu (revision 31056370)
|
||||||
|
Genji Monogatari (revision 31007904)
|
||||||
|
Bundestag (revision 30742728)
|
||||||
|
Kana (revision 29176811)
|
||||||
|
Jun'ichirō Tanizaki (revision 30750244)
|
||||||
|
Representació proporcional amb llista de partit (revision 22086795)
|
||||||
|
Agències de la Unió Europea (revision 30276199)
|
||||||
|
Poder executiu (revision 30290834)
|
||||||
|
Edicions Atalanta (revision 26048077)
|
||||||
|
Animació (revision 30865051)
|
||||||
|
Pressupost de la Unió Europea (revision 30231577)
|
||||||
|
Jorge Luis Borges (revision 30783720)
|
||||||
|
Universitat de Pittsburgh (revision 25411555)
|
||||||
|
Satiricó (revision 31019009)
|
||||||
|
Dramatis personae (revision 30858787)
|
||||||
|
Corpus lingüístic (revision 28600087)
|
||||||
|
Genji Monogatari Emaki (revision 30520718)
|
||||||
|
Era Keichō (revision 27881416)
|
||||||
|
Període Heian (revision 30351338)
|
||||||
|
Uji (revision 26298733)
|
||||||
|
Clan Minamoto (revision 29218047)
|
||||||
|
Ventafocs (revision 30167478)
|
||||||
|
わ (revision 28487155)
|
||||||
|
Japó (revision 30980338)
|
||||||
|
Agència Europea dels Sistemes Globals de Navegació per Satèl·lit (revision 28777516)
|
||||||
|
Període Shōwa (revision 30351346)
|
||||||
|
ム (revision 25190709)
|
||||||
|
Premi Balzan (revision 30321993)
|
||||||
|
Germans Grimm (revision 30104486)
|
||||||
|
Europol (revision 25369380)
|
||||||
|
Unió Europea (revision 30730061)
|
||||||
|
Kyoto (revision 30706119)
|
||||||
|
Incendi del Reichstag (revision 30894126)
|
||||||
|
Processament de llenguatge natural (revision 29016655)
|
||||||
|
794 (revision 29283769)
|
||||||
|
CANTIC (revision 30488826)
|
||||||
|
Casa de la Història Europea (revision 30703943)
|
||||||
|
VP:VER (revision 30232565)
|
||||||
|
Katakana (revision 29937701)
|
||||||
|
Shogunat Kamakura (revision 28808156)
|
||||||
|
Eleccions (revision 30449311)
|
||||||
|
Noam Chomsky (revision 30552025)
|
||||||
|
Eleccions federals alemanyes de 1994 (revision 28337358)
|
||||||
|
Conceptes d'unitat europea abans del 1945 (revision 30927921)
|
||||||
|
Era Heian (revision 30351338)
|
||||||
|
Gemeinsame Normdatei (revision 30883432)
|
||||||
|
La Bella Dorment (pel·lícula de 1959) (revision 30982067)
|
||||||
|
Població (revision 30352350)
|
||||||
|
Obra literària (revision 31011396)
|
||||||
|
も (revision 25190714)
|
||||||
|
Istituto Centrale per il Catalogo Unico (revision 28786509)
|
||||||
|
Política (revision 31014511)
|
||||||
|
ハ (revision 31071577)
|
||||||
|
Vot (revision 27865452)
|
||||||
|
Clan Taira (revision 26323649)
|
||||||
|
Permís de conducció europeu (revision 27672810)
|
||||||
|
Mandala (revision 30940608)
|
||||||
|
Campània antiga (revision 29855854)
|
||||||
|
ゐ (revision 28487156)
|
||||||
|
Consell de la Unió Europea (revision 30308594)
|
||||||
|
24 de juliol (revision 31063555)
|
||||||
|
Kyōto (revision 30706119)
|
||||||
|
Alfons X de Castella (revision 30535714)
|
||||||
|
VIAF (revision 28927187)
|
||||||
|
1975 (revision 31057077)
|
||||||
|
モ (revision 25190714)
|
||||||
|
Sistema Galileo (revision 30880731)
|
||||||
|
Densitat de població (revision 30174278)
|
||||||
|
Autodesk Maya (revision 30989692)
|
||||||
|
Nàpols (revision 31028649)
|
||||||
|
Memòria de traducció (revision 30341759)
|
||||||
|
Ryukyu (revision 29922259)
|
||||||
|
Agència Europea per a la Seguretat i la Salut en el Treball (revision 29049313)
|
||||||
|
ISNI (revision 30824306)
|
||||||
|
PDF (revision 29442049)
|
||||||
|
Eleccions federals alemanyes de 1972 (revision 30271501)
|
||||||
|
Sistema presidencialista (revision 30596011)
|
||||||
|
Primer ministre (revision 27174693)
|
||||||
|
Coeducació (revision 31048027)
|
||||||
|
Ko Tazawa (revision 30932179)
|
||||||
|
Poliomielitis (revision 30976061)
|
||||||
|
18 de setembre (revision 31063494)
|
||||||
|
Campanya electoral (revision 27935270)
|
||||||
|
Kōbō Abe (revision 30016508)
|
||||||
|
Rodopis (revision 28014188)
|
||||||
|
Política Agrària Comunitària (revision 30353551)
|
||||||
|
21 d'octubre (revision 30980460)
|
||||||
|
1984 (revision 31063521)
|
||||||
|
South Park (revision 31024165)
|
||||||
|
Hiragana (revision 29920075)
|
||||||
|
Associació de Votants de Schleswig Meridional (revision 30753058)
|
||||||
|
ひ (revision 31071564)
|
||||||
|
Lingüística (revision 31037031)
|
||||||
|
Blauet comú (revision 28729161)
|
||||||
|
Autodeterminació (revision 29349294)
|
||||||
|
Xina (revision 31007838)
|
||||||
|
Control d'autoritats (revision 29854505)
|
||||||
|
Guillermo de Torre (revision 30765552)
|
||||||
|
Unesco (revision 30129516)
|
||||||
|
Romanització Hepburn (revision 29144432)
|
||||||
|
Tanka (revision 30478859)
|
||||||
|
Clientelisme (revision 30811663)
|
||||||
|
Corpus Textual Informatitzat de la Llengua Catalana (revision 29876775)
|
||||||
|
Secessió (revision 29980781)
|
||||||
|
Fada protectora (revision 29175001)
|
||||||
|
を (revision 28487157)
|
||||||
|
Ōtsu (revision 30010938)
|
||||||
|
Gran Enciclopèdia Catalana (revision 30724375)
|
||||||
|
LCCN (revision 30638965)
|
||||||
|
Universitat privada (revision 28518823)
|
||||||
|
Robert Louis Stevenson (revision 30728093)
|
||||||
|
Kioto (revision 30706119)
|
||||||
|
7 de setembre (revision 30503878)
|
||||||
|
Aardman Animations (revision 30216975)
|
||||||
|
Llibertinatge (revision 29597307)
|
||||||
|
Bibliothèque nationale de France (revision 30715383)
|
||||||
|
Alemanya Occidental (revision 30239917)
|
||||||
|
National Library of Australia (revision 30977078)
|
||||||
|
Diccionari Descriptiu de la Llengua Catalana (revision 27017217)
|
||||||
|
1969 (revision 31060188)
|
||||||
|
Separació de poders (revision 30362225)
|
||||||
|
Isaac Titsingh (revision 29748956)
|
||||||
|
Adolf Hitler (revision 30951478)
|
||||||
|
Període Kamakura (revision 28808156)
|
||||||
|
Societas Europaea (revision 28857120)
|
||||||
|
Invasions japoneses a Corea (revision 30978745)
|
||||||
|
Agència de la Unió Europea (revision 30276199)
|
||||||
|
Sistema polític (revision 30713673)
|
||||||
|
1606 (revision 26237152)
|
||||||
|
Universitat Rovira i Virgili (revision 30865280)
|
||||||
|
IVA (revision 30328630)
|
||||||
|
Patricis (revision 30923152)
|
||||||
|
Els barrufets (revision 31008031)
|
||||||
|
Lapislàtzuli Editorial (revision 30176117)
|
||||||
|
Internet (revision 30894405)
|
||||||
|
BIBSYS (revision 30255267)
|
||||||
|
Agència Europea de Seguretat Marítima (revision 28888118)
|
||||||
|
National Diet Library (revision 30669422)
|
||||||
|
Grup Enciclopèdia Catalana (revision 31077222)
|
||||||
|
Competència comunicativa (revision 30307632)
|
||||||
|
Castell Fushimi (revision 30610308)
|
||||||
|
Walter Gropius (revision 30790098)
|
||||||
|
Biblioteca Nacional de España (revision 31071591)
|
||||||
|
Diccionari Normatiu Valencià (revision 29882403)
|
||||||
|
Oscar Wilde (revision 31078983)
|
||||||
|
Hampshire (revision 30823098)
|
||||||
|
Clan Fujiwara (revision 30894950)
|
||||||
|
Speedy Gonzales (revision 30151280)
|
||||||
|
Tlön, Uqbar, Orbis Tertius (revision 29688246)
|
||||||
|
Japó ocupat (revision 28083159)
|
||||||
|
Garbancito de la Mancha (revision 30219073)
|
||||||
|
SUDOC (revision 29231585)
|
||||||
|
Gerardo Diego (revision 29912471)
|
||||||
|
Universitat (revision 29907980)
|
||||||
|
Foliscopi (revision 29903436)
|
||||||
|
1980 (revision 31063457)
|
||||||
|
Infart de miocardi (revision 30894255)
|
||||||
|
Encyclopædia Britannica (revision 28347959)
|
||||||
|
Petroni (revision 29790499)
|
||||||
|
Horari de màxima audiència (revision 27872454)
|
||||||
|
Sutra (revision 23458427)
|
||||||
|
Medicina (revision 31002196)
|
||||||
|
ホ (revision 25190705)
|
||||||
|
Luci Appuleu (revision 30336717)
|
||||||
|
Novel·la (revision 30386814)
|
||||||
|
Kimba, el lleó blanc (revision 30273901)
|
||||||
|
UTC+09:00 (revision 25182859)
|
||||||
|
Arquitectura neogòtica (revision 30347122)
|
||||||
|
Segle I (revision 30953541)
|
||||||
|
Emperador del Japó (revision 27799841)
|
||||||
|
Biblioteca Nacional de la República Txeca (revision 29847950)
|
||||||
|
Gran Diccionari de la Llengua Catalana (revision 29063719)
|
||||||
|
Període Reiwa (revision 29227861)
|
||||||
|
|
||||||
|
== End of Parsed pages ==
|
||||||
|
|
||||||
|
- Wikipedia parsing ended at: 2022-12-20 01:34:38.734771
|
||||||
|
|
||||||
|
57 characters appeared 1339831 times.
|
||||||
|
|
||||||
|
Most Frequent characters:
|
||||||
|
[ 0] Char e: 12.524042211293812 %
|
||||||
|
[ 1] Char a: 11.715955221218199 %
|
||||||
|
[ 2] Char i: 7.815090112111155 %
|
||||||
|
[ 3] Char s: 7.809940208877089 %
|
||||||
|
[ 4] Char r: 6.866686917976969 %
|
||||||
|
[ 5] Char n: 6.706069646097157 %
|
||||||
|
[ 6] Char l: 6.58105387918327 %
|
||||||
|
[ 7] Char t: 6.268850325152949 %
|
||||||
|
[ 8] Char o: 5.046308079153267 %
|
||||||
|
[ 9] Char c: 4.242027539294135 %
|
||||||
|
[10] Char d: 4.013192708632656 %
|
||||||
|
[11] Char u: 3.5825413802188484 %
|
||||||
|
[12] Char m: 3.048966623402504 %
|
||||||
|
[13] Char p: 2.778783294310999 %
|
||||||
|
[14] Char g: 1.4824257686230575 %
|
||||||
|
[15] Char v: 1.3498717375549603 %
|
||||||
|
[16] Char b: 1.2941184373253045 %
|
||||||
|
[17] Char f: 0.975943980994618 %
|
||||||
|
[18] Char q: 0.7455417884792933 %
|
||||||
|
[19] Char h: 0.6949383914837021 %
|
||||||
|
[20] Char ó: 0.5910446914573555 %
|
||||||
|
[21] Char x: 0.5195431364104875 %
|
||||||
|
[22] Char é: 0.4443097674258918 %
|
||||||
|
[23] Char à: 0.3875115592936721 %
|
||||||
|
[24] Char j: 0.36474749427353154 %
|
||||||
|
[25] Char y: 0.3636279500922131 %
|
||||||
|
[26] Char è: 0.3583287743006394 %
|
||||||
|
[27] Char í: 0.3250409939761059 %
|
||||||
|
[28] Char k: 0.2481656268589098 %
|
||||||
|
[29] Char ò: 0.21577348187943107 %
|
||||||
|
[30] Char z: 0.17778361599336034 %
|
||||||
|
[31] Char w: 0.11673113997213082 %
|
||||||
|
[32] Char ç: 0.11016314744172959 %
|
||||||
|
[33] Char ú: 0.08792153637287091 %
|
||||||
|
[34] Char ü: 0.06709801460034885 %
|
||||||
|
[35] Char ï: 0.05448448349082832 %
|
||||||
|
|
||||||
|
The first 36 characters have an accumulated ratio of 0.9997462366522347.
|
||||||
|
The first 5 characters have an accumulated ratio of 0.4673171467147723.
|
||||||
|
All characters whose order is over 21 have an accumulated ratio of 0.03321687585971664.
|
||||||
|
|
||||||
|
1083 sequences found.
|
||||||
|
|
||||||
|
First 517 (typical positive ratio): 0.9950067888087288
|
||||||
|
Next 195 (712-517): 0.003994192320077694
|
||||||
|
Rest: 0.0009990188711934689
|
||||||
|
|
||||||
|
- Processing end: 2022-12-20 01:34:38.859159
|
||||||
79
script/langs/ca.py
Normal file
79
script/langs/ca.py
Normal file
@ -0,0 +1,79 @@
|
|||||||
|
#!/bin/python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
# ##### BEGIN LICENSE BLOCK #####
|
||||||
|
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||||
|
#
|
||||||
|
# The contents of this file are subject to the Mozilla Public License Version
|
||||||
|
# 1.1 (the "License"); you may not use this file except in compliance with
|
||||||
|
# the License. You may obtain a copy of the License at
|
||||||
|
# http://www.mozilla.org/MPL/
|
||||||
|
#
|
||||||
|
# Software distributed under the License is distributed on an "AS IS" basis,
|
||||||
|
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||||
|
# for the specific language governing rights and limitations under the
|
||||||
|
# License.
|
||||||
|
#
|
||||||
|
# The Original Code is Mozilla Universal charset detector code.
|
||||||
|
#
|
||||||
|
# The Initial Developer of the Original Code is
|
||||||
|
# Netscape Communications Corporation.
|
||||||
|
# Portions created by the Initial Developer are Copyright (C) 2001
|
||||||
|
# the Initial Developer. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Contributor(s):
|
||||||
|
# Jehan <jehan@girinstud.io>
|
||||||
|
#
|
||||||
|
# Alternatively, the contents of this file may be used under the terms of
|
||||||
|
# either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||||
|
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||||
|
# in which case the provisions of the GPL or the LGPL are applicable instead
|
||||||
|
# of those above. If you wish to allow use of your version of this file only
|
||||||
|
# under the terms of either the GPL or the LGPL, and not to allow others to
|
||||||
|
# use your version of this file under the terms of the MPL, indicate your
|
||||||
|
# decision by deleting the provisions above and replace them with the notice
|
||||||
|
# and other provisions required by the GPL or the LGPL. If you do not delete
|
||||||
|
# the provisions above, a recipient may use your version of this file under
|
||||||
|
# the terms of any one of the MPL, the GPL or the LGPL.
|
||||||
|
#
|
||||||
|
# ##### END LICENSE BLOCK #####
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
## Mandatory Properties ##
|
||||||
|
|
||||||
|
# The human name for the language, in English.
|
||||||
|
name = 'Catalan'
|
||||||
|
# Use 2-letter ISO 639-1 if possible, 3-letter ISO code otherwise,
|
||||||
|
# or use another catalog as a last resort.
|
||||||
|
code = 'ca'
|
||||||
|
# ASCII characters are also used in Catalan.
|
||||||
|
use_ascii = True
|
||||||
|
# The charsets we want to support and create data for.
|
||||||
|
charsets = ['ISO-8859-1', 'WINDOWS-1252']
|
||||||
|
|
||||||
|
## Optional Properties ##
|
||||||
|
|
||||||
|
# Alphabet characters.
|
||||||
|
# If use_ascii=True, there is no need to add any ASCII characters.
|
||||||
|
# If case_mapping=True, there is no need to add several cases of the same
|
||||||
|
# character (provided Python algorithms know the right cases).
|
||||||
|
alphabet = ['à', 'è', 'é', 'í', 'ï', 'ó', 'ò', 'ú', 'ü', 'ç']
|
||||||
|
# The start page. Though optional, it is advised to choose one yourself.
|
||||||
|
start_pages = ['Parlament_Europeu', 'Genji Monogatari']
|
||||||
|
# Gives the possibility to select another code for the Wikipedia URL.
|
||||||
|
wikipedia_code = code
|
||||||
|
# 'a' and 'A' will be considered the same character, and so on.
|
||||||
|
# This uses Python's algorithm to determine the upper/lower-case of a given
|
||||||
|
# character.
|
||||||
|
case_mapping = True
|
||||||
|
|
||||||
|
# A function to clean content returned by the `wikipedia` python lib,
|
||||||
|
# in case some unwanted data has been overlooked.
|
||||||
|
# Note that we are already cleaning away the '=' from the title syntax
|
||||||
|
# of Wikipedia, as well as double spaces. But sometimes, Wikipedia in
|
||||||
|
# some language may return weird syntax or UI text which should be
|
||||||
|
# discarded. If you encounter one of these cases, use this function.
|
||||||
|
def clean_wikipedia_content(content):
|
||||||
|
# Do your garbage text cleaning here.
|
||||||
|
return content
|
||||||
@ -1,6 +1,7 @@
|
|||||||
ar
|
ar
|
||||||
be
|
be
|
||||||
bg
|
bg
|
||||||
|
ca
|
||||||
cs
|
cs
|
||||||
da
|
da
|
||||||
de
|
de
|
||||||
|
|||||||
@ -10,6 +10,7 @@ set(
|
|||||||
LangModels/LangArabicModel.cpp
|
LangModels/LangArabicModel.cpp
|
||||||
LangModels/LangBelarusianModel.cpp
|
LangModels/LangBelarusianModel.cpp
|
||||||
LangModels/LangBulgarianModel.cpp
|
LangModels/LangBulgarianModel.cpp
|
||||||
|
LangModels/LangCatalanModel.cpp
|
||||||
LangModels/LangCroatianModel.cpp
|
LangModels/LangCroatianModel.cpp
|
||||||
LangModels/LangCzechModel.cpp
|
LangModels/LangCzechModel.cpp
|
||||||
LangModels/LangEnglishModel.cpp
|
LangModels/LangEnglishModel.cpp
|
||||||
|
|||||||
207
src/LangModels/LangCatalanModel.cpp
Normal file
207
src/LangModels/LangCatalanModel.cpp
Normal file
@ -0,0 +1,207 @@
|
|||||||
|
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
|
||||||
|
/* ***** BEGIN LICENSE BLOCK *****
|
||||||
|
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||||
|
*
|
||||||
|
* The contents of this file are subject to the Mozilla Public License Version
|
||||||
|
* 1.1 (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
* http://www.mozilla.org/MPL/
|
||||||
|
*
|
||||||
|
* Software distributed under the License is distributed on an "AS IS" basis,
|
||||||
|
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||||
|
* for the specific language governing rights and limitations under the
|
||||||
|
* License.
|
||||||
|
*
|
||||||
|
* The Original Code is Mozilla Communicator client code.
|
||||||
|
*
|
||||||
|
* The Initial Developer of the Original Code is
|
||||||
|
* Netscape Communications Corporation.
|
||||||
|
* Portions created by the Initial Developer are Copyright (C) 1998
|
||||||
|
* the Initial Developer. All Rights Reserved.
|
||||||
|
*
|
||||||
|
* Contributor(s):
|
||||||
|
*
|
||||||
|
* Alternatively, the contents of this file may be used under the terms of
|
||||||
|
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||||
|
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||||
|
* in which case the provisions of the GPL or the LGPL are applicable instead
|
||||||
|
* of those above. If you wish to allow use of your version of this file only
|
||||||
|
* under the terms of either the GPL or the LGPL, and not to allow others to
|
||||||
|
* use your version of this file under the terms of the MPL, indicate your
|
||||||
|
* decision by deleting the provisions above and replace them with the notice
|
||||||
|
* and other provisions required by the GPL or the LGPL. If you do not delete
|
||||||
|
* the provisions above, a recipient may use your version of this file under
|
||||||
|
* the terms of any one of the MPL, the GPL or the LGPL.
|
||||||
|
*
|
||||||
|
* ***** END LICENSE BLOCK ***** */
|
||||||
|
|
||||||
|
#include "../nsSBCharSetProber.h"
|
||||||
|
#include "../nsSBCharSetProber-generated.h"
|
||||||
|
#include "../nsLanguageDetector.h"
|
||||||
|
|
||||||
|
#include "../nsLanguageDetector-generated.h"
|
||||||
|
|
||||||
|
/********* Language model for: Catalan *********/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Generated by BuildLangModel.py
|
||||||
|
* On: 2022-12-20 01:34:38.735681
|
||||||
|
**/
|
||||||
|
|
||||||
|
/* Character Mapping Table:
|
||||||
|
* ILL: illegal character.
|
||||||
|
* CTR: control character specific to the charset.
|
||||||
|
* RET: carriage return or line feed.
|
||||||
|
* SYM: symbol (punctuation) that does not belong to word.
|
||||||
|
* NUM: 0 - 9.
|
||||||
|
*
|
||||||
|
* Other characters are ordered by probabilities
|
||||||
|
* (0 is the most common character in the language).
|
||||||
|
*
|
||||||
|
* Orders are generic to a language. So the codepoint with order X in
|
||||||
|
* CHARSET1 maps to the same character as the codepoint with the same
|
||||||
|
* order X in CHARSET2 for the same language.
|
||||||
|
* As such, some orders may be missing for a given charset. For instance the
|
||||||
|
* ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1
|
||||||
|
* even though they are both used for French. Same for the euro sign.
|
||||||
|
*/
|
||||||
|
static const unsigned char Iso_8859_1_CharToOrderMap[] =
|
||||||
|
{
|
||||||
|
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
|
||||||
|
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
|
||||||
|
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
|
||||||
|
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
|
||||||
|
SYM, 1, 16, 9, 10, 0, 17, 14, 19, 2, 24, 28, 6, 12, 5, 8, /* 4X */
|
||||||
|
13, 18, 4, 3, 7, 11, 15, 31, 21, 25, 30,SYM,SYM,SYM,SYM,SYM, /* 5X */
|
||||||
|
SYM, 1, 16, 9, 10, 0, 17, 14, 19, 2, 24, 28, 6, 12, 5, 8, /* 6X */
|
||||||
|
13, 18, 4, 3, 7, 11, 15, 31, 21, 25, 30,SYM,SYM,SYM,SYM,CTR, /* 7X */
|
||||||
|
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */
|
||||||
|
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */
|
||||||
|
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */
|
||||||
|
SYM,SYM,SYM,SYM,SYM, 54,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */
|
||||||
|
23, 36, 45, 41, 39, 57, 40, 32, 26, 22, 49, 42, 50, 27, 58, 35, /* CX */
|
||||||
|
53, 38, 29, 20, 43, 52, 37,SYM, 56, 51, 33, 55, 34, 59, 60, 44, /* DX */
|
||||||
|
23, 36, 45, 41, 39, 61, 40, 32, 26, 22, 49, 42, 50, 27, 62, 35, /* EX */
|
||||||
|
53, 38, 29, 20, 43, 52, 37,SYM, 56, 51, 33, 55, 34, 63, 64, 65, /* FX */
|
||||||
|
};
|
||||||
|
/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
|
||||||
|
|
||||||
|
static const unsigned char Windows_1252_CharToOrderMap[] =
|
||||||
|
{
|
||||||
|
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
|
||||||
|
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
|
||||||
|
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
|
||||||
|
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
|
||||||
|
SYM, 1, 16, 9, 10, 0, 17, 14, 19, 2, 24, 28, 6, 12, 5, 8, /* 4X */
|
||||||
|
13, 18, 4, 3, 7, 11, 15, 31, 21, 25, 30,SYM,SYM,SYM,SYM,SYM, /* 5X */
|
||||||
|
SYM, 1, 16, 9, 10, 0, 17, 14, 19, 2, 24, 28, 6, 12, 5, 8, /* 6X */
|
||||||
|
13, 18, 4, 3, 7, 11, 15, 31, 21, 25, 30,SYM,SYM,SYM,SYM,CTR, /* 7X */
|
||||||
|
SYM,ILL,SYM, 66,SYM,SYM,SYM,SYM,SYM,SYM, 48,SYM, 47,ILL, 46,ILL, /* 8X */
|
||||||
|
ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 48,SYM, 47,ILL, 46, 67, /* 9X */
|
||||||
|
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */
|
||||||
|
SYM,SYM,SYM,SYM,SYM, 54,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */
|
||||||
|
23, 36, 45, 41, 39, 68, 40, 32, 26, 22, 49, 42, 50, 27, 69, 35, /* CX */
|
||||||
|
53, 38, 29, 20, 43, 52, 37,SYM, 56, 51, 33, 55, 34, 70, 71, 44, /* DX */
|
||||||
|
23, 36, 45, 41, 39, 72, 40, 32, 26, 22, 49, 42, 50, 27, 73, 35, /* EX */
|
||||||
|
53, 38, 29, 20, 43, 52, 37,SYM, 56, 51, 33, 55, 34, 74, 75, 76, /* FX */
|
||||||
|
};
|
||||||
|
/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
|
||||||
|
|
||||||
|
static const int Unicode_Char_size = 72;
|
||||||
|
static const unsigned int Unicode_CharOrder[] =
|
||||||
|
{
|
||||||
|
65, 1, 66, 16, 67, 9, 68, 10, 69, 0, 70, 17, 71, 14, 72, 19,
|
||||||
|
73, 2, 74, 24, 75, 28, 76, 6, 77, 12, 78, 5, 79, 8, 80, 13,
|
||||||
|
81, 18, 82, 4, 83, 3, 84, 7, 85, 11, 86, 15, 87, 31, 88, 21,
|
||||||
|
89, 25, 90, 30, 97, 1, 98, 16, 99, 9, 100, 10, 101, 0,102, 17,
|
||||||
|
103, 14, 104, 19, 105, 2, 106, 24, 107, 28, 108, 6, 109, 12,110, 5,
|
||||||
|
111, 8, 112, 13, 113, 18, 114, 4, 115, 3, 116, 7, 117, 11,118, 15,
|
||||||
|
119, 31, 120, 21, 121, 25, 122, 30, 192, 23, 199, 32, 200, 26,201, 22,
|
||||||
|
205, 27, 207, 35, 210, 29, 211, 20, 218, 33, 220, 34, 224, 23,231, 32,
|
||||||
|
232, 26, 233, 22, 237, 27, 239, 35, 242, 29, 243, 20, 250, 33,252, 34,
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
/* Model Table:
|
||||||
|
* Total considered sequences: 1083 / 1296
|
||||||
|
* - Positive sequences: first 517 (0.9950067888087288)
|
||||||
|
* - Probable sequences: next 195 (712-517) (0.003994192320077694)
|
||||||
|
* - Neutral sequences: last 584 (0.0009990188711934689)
|
||||||
|
* - Negative sequences: 213 (off-ratio)
|
||||||
|
* Negative sequences: TODO
|
||||||
|
*/
|
||||||
|
static const PRUint8 CatalanLangModel[] =
|
||||||
|
{
|
||||||
|
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,3,1,3,3,3,0,1,3,2,3,3,2,0,2,3,
|
||||||
|
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,1,0,3,3,2,3,3,1,3,3,3,1,1,3,
|
||||||
|
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,0,3,3,3,3,2,0,0,1,
|
||||||
|
3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,1,2,3,0,3,3,3,3,2,1,3,0,2,1,0,
|
||||||
|
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,2,3,3,3,3,3,2,2,3,3,2,1,
|
||||||
|
3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,2,3,3,0,0,
|
||||||
|
3,3,3,3,2,2,3,3,3,3,3,3,3,3,3,3,3,3,1,3,3,2,3,3,1,3,3,3,3,3,3,1,2,2,1,0,
|
||||||
|
3,3,3,3,3,3,3,3,3,3,1,3,3,2,3,2,2,2,0,3,2,3,3,3,3,3,3,3,1,3,3,2,0,2,1,0,
|
||||||
|
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,1,2,3,3,2,0,3,1,2,3,1,0,0,2,
|
||||||
|
3,3,3,3,3,3,3,3,3,3,2,3,2,2,2,1,0,1,2,3,2,1,3,3,1,3,3,3,3,3,1,1,0,2,0,0,
|
||||||
|
3,3,3,3,3,2,3,2,3,2,3,3,3,2,3,2,1,3,3,2,2,0,3,3,2,3,3,3,1,2,2,3,0,3,0,0,
|
||||||
|
3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,2,3,3,2,2,1,3,3,2,3,2,3,3,3,0,2,1,0,0,0,3,
|
||||||
|
3,3,3,3,2,3,2,3,3,3,1,3,3,3,1,1,3,3,0,1,3,1,3,3,2,2,3,3,0,3,1,1,0,3,1,0,
|
||||||
|
3,3,3,3,3,2,3,3,3,3,3,3,1,3,1,2,2,2,0,3,3,0,2,3,0,2,3,3,1,3,0,1,2,3,0,0,
|
||||||
|
3,3,3,3,3,3,3,2,3,1,2,3,3,2,3,1,2,1,0,3,2,1,1,2,2,3,3,2,1,2,1,2,0,2,3,0,
|
||||||
|
3,3,3,1,2,1,1,1,3,0,1,3,0,1,0,1,1,1,0,0,0,1,3,3,0,1,2,3,1,1,0,1,0,0,0,0,
|
||||||
|
3,3,3,3,3,3,3,3,3,2,3,3,2,2,2,2,3,0,0,1,2,1,3,3,3,2,3,2,1,2,1,0,0,2,1,0,
|
||||||
|
3,3,3,2,3,1,3,3,3,1,1,3,1,1,2,1,0,3,0,0,2,0,2,3,0,1,2,3,1,3,0,0,0,1,2,0,
|
||||||
|
0,1,1,1,1,0,0,1,0,0,0,3,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,
|
||||||
|
3,3,3,2,3,3,3,3,3,1,1,3,2,1,0,0,1,1,1,1,1,0,1,2,1,2,1,1,2,1,1,2,0,1,1,0,
|
||||||
|
1,0,0,3,2,3,1,1,0,1,1,0,1,2,1,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,
|
||||||
|
3,3,3,1,1,0,1,3,3,3,1,2,1,3,0,3,1,2,1,2,1,3,1,2,0,0,3,3,1,3,0,1,0,1,0,0,
|
||||||
|
2,1,2,3,2,3,2,2,1,1,1,2,2,1,1,1,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,
|
||||||
|
0,0,2,3,3,3,3,3,0,3,2,3,3,3,3,2,2,3,2,1,0,3,0,0,0,0,0,0,1,0,0,0,0,0,0,1,
|
||||||
|
3,3,3,1,1,1,0,0,3,1,1,3,1,1,1,1,0,1,0,0,1,0,1,3,0,0,1,0,1,0,1,1,0,1,1,0,
|
||||||
|
3,3,3,3,2,2,3,2,3,2,2,3,2,2,1,1,3,0,0,1,1,0,1,2,0,1,0,0,1,0,0,1,0,1,0,0,
|
||||||
|
0,0,3,3,3,3,3,3,0,3,3,1,3,3,3,2,1,2,1,0,0,3,0,0,1,0,0,0,1,0,0,0,0,0,0,0,
|
||||||
|
2,3,0,3,2,3,3,3,3,3,3,0,3,2,3,2,2,3,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,
|
||||||
|
3,3,3,3,2,2,2,2,3,1,1,3,2,2,1,1,1,2,0,2,1,1,0,1,1,3,0,1,2,0,0,2,0,1,0,0,
|
||||||
|
0,0,0,3,3,3,3,3,1,3,3,0,3,3,3,1,2,2,2,0,0,2,0,0,0,0,0,0,1,0,0,0,0,0,0,0,
|
||||||
|
3,3,3,1,1,1,2,1,3,1,0,3,1,1,1,0,1,0,1,2,1,1,2,2,0,1,1,0,1,1,2,1,0,0,1,0,
|
||||||
|
3,3,3,2,2,2,1,1,3,0,1,1,1,1,0,0,1,0,0,2,0,0,1,0,0,1,0,0,1,0,0,2,0,0,1,0,
|
||||||
|
1,3,0,0,0,0,0,0,3,0,0,2,1,0,0,0,0,0,0,0,2,0,0,3,0,0,0,0,0,1,0,0,0,0,0,0,
|
||||||
|
0,0,1,3,2,3,3,2,0,1,1,0,3,1,0,1,3,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,
|
||||||
|
3,0,3,0,2,2,2,1,0,1,1,0,1,0,0,0,1,0,0,2,0,0,1,0,0,0,3,3,0,0,0,0,0,0,0,0,
|
||||||
|
2,2,0,3,1,3,2,3,0,2,3,0,1,0,0,0,1,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
const SequenceModel Iso_8859_1CatalanModel =
|
||||||
|
{
|
||||||
|
Iso_8859_1_CharToOrderMap,
|
||||||
|
CatalanLangModel,
|
||||||
|
36,
|
||||||
|
(float)0.9990009811288065,
|
||||||
|
PR_TRUE,
|
||||||
|
"ISO-8859-1",
|
||||||
|
"ca"
|
||||||
|
};
|
||||||
|
|
||||||
|
const SequenceModel Windows_1252CatalanModel =
|
||||||
|
{
|
||||||
|
Windows_1252_CharToOrderMap,
|
||||||
|
CatalanLangModel,
|
||||||
|
36,
|
||||||
|
(float)0.9990009811288065,
|
||||||
|
PR_TRUE,
|
||||||
|
"WINDOWS-1252",
|
||||||
|
"ca"
|
||||||
|
};
|
||||||
|
|
||||||
|
const LanguageModel CatalanModel =
|
||||||
|
{
|
||||||
|
"ca",
|
||||||
|
Unicode_CharOrder,
|
||||||
|
72,
|
||||||
|
CatalanLangModel,
|
||||||
|
36,
|
||||||
|
5,
|
||||||
|
(float)0.4673171467147723,
|
||||||
|
21,
|
||||||
|
(float)0.03321687585971664,
|
||||||
|
};
|
||||||
@ -38,11 +38,12 @@
|
|||||||
#ifndef nsLanguageDetector_h_generated_h__
|
#ifndef nsLanguageDetector_h_generated_h__
|
||||||
#define nsLanguageDetector_h_generated_h__
|
#define nsLanguageDetector_h_generated_h__
|
||||||
|
|
||||||
#define NUM_OF_LANGUAGE_MODELS 36
|
#define NUM_OF_LANGUAGE_MODELS 37
|
||||||
|
|
||||||
extern const LanguageModel ArabicModel;
|
extern const LanguageModel ArabicModel;
|
||||||
extern const LanguageModel BelarusianModel;
|
extern const LanguageModel BelarusianModel;
|
||||||
extern const LanguageModel BulgarianModel;
|
extern const LanguageModel BulgarianModel;
|
||||||
|
extern const LanguageModel CatalanModel;
|
||||||
extern const LanguageModel CzechModel;
|
extern const LanguageModel CzechModel;
|
||||||
extern const LanguageModel DanishModel;
|
extern const LanguageModel DanishModel;
|
||||||
extern const LanguageModel GermanModel;
|
extern const LanguageModel GermanModel;
|
||||||
|
|||||||
@ -95,6 +95,7 @@ nsMBCSGroupProber::nsMBCSGroupProber(PRUint32 aLanguageFilter)
|
|||||||
langDetectors[i][j++] = new nsLanguageDetector(&ArabicModel);
|
langDetectors[i][j++] = new nsLanguageDetector(&ArabicModel);
|
||||||
langDetectors[i][j++] = new nsLanguageDetector(&BelarusianModel);
|
langDetectors[i][j++] = new nsLanguageDetector(&BelarusianModel);
|
||||||
langDetectors[i][j++] = new nsLanguageDetector(&BulgarianModel);
|
langDetectors[i][j++] = new nsLanguageDetector(&BulgarianModel);
|
||||||
|
langDetectors[i][j++] = new nsLanguageDetector(&CatalanModel);
|
||||||
langDetectors[i][j++] = new nsLanguageDetector(&CroatianModel);
|
langDetectors[i][j++] = new nsLanguageDetector(&CroatianModel);
|
||||||
langDetectors[i][j++] = new nsLanguageDetector(&CzechModel);
|
langDetectors[i][j++] = new nsLanguageDetector(&CzechModel);
|
||||||
langDetectors[i][j++] = new nsLanguageDetector(&DanishModel);
|
langDetectors[i][j++] = new nsLanguageDetector(&DanishModel);
|
||||||
|
|||||||
@ -237,6 +237,9 @@ nsSBCSGroupProber::nsSBCSGroupProber()
|
|||||||
mProbers[n++] = new nsSingleByteCharSetProber(&Ibm855MacedonianModel);
|
mProbers[n++] = new nsSingleByteCharSetProber(&Ibm855MacedonianModel);
|
||||||
mProbers[n++] = new nsSingleByteCharSetProber(&Iso_8859_5MacedonianModel);
|
mProbers[n++] = new nsSingleByteCharSetProber(&Iso_8859_5MacedonianModel);
|
||||||
|
|
||||||
|
mProbers[n++] = new nsSingleByteCharSetProber(&Iso_8859_1CatalanModel);
|
||||||
|
mProbers[n++] = new nsSingleByteCharSetProber(&Windows_1252CatalanModel);
|
||||||
|
|
||||||
assert (n_sbcs_probers == n);
|
assert (n_sbcs_probers == n);
|
||||||
|
|
||||||
Reset();
|
Reset();
|
||||||
|
|||||||
@ -38,7 +38,7 @@
|
|||||||
#ifndef nsSingleByteCharSetProber_generated_h__
|
#ifndef nsSingleByteCharSetProber_generated_h__
|
||||||
#define nsSingleByteCharSetProber_generated_h__
|
#define nsSingleByteCharSetProber_generated_h__
|
||||||
|
|
||||||
#define NUM_OF_SEQUENCE_MODELS 116
|
#define NUM_OF_SEQUENCE_MODELS 118
|
||||||
|
|
||||||
extern const SequenceModel Iso_8859_6ArabicModel;
|
extern const SequenceModel Iso_8859_6ArabicModel;
|
||||||
extern const SequenceModel Windows_1256ArabicModel;
|
extern const SequenceModel Windows_1256ArabicModel;
|
||||||
@ -49,6 +49,9 @@ extern const SequenceModel Iso_8859_5BelarusianModel;
|
|||||||
extern const SequenceModel Windows_1251BulgarianModel;
|
extern const SequenceModel Windows_1251BulgarianModel;
|
||||||
extern const SequenceModel Iso_8859_5BulgarianModel;
|
extern const SequenceModel Iso_8859_5BulgarianModel;
|
||||||
|
|
||||||
|
extern const SequenceModel Iso_8859_1CatalanModel;
|
||||||
|
extern const SequenceModel Windows_1252CatalanModel;
|
||||||
|
|
||||||
extern const SequenceModel Iso_8859_2CzechModel;
|
extern const SequenceModel Iso_8859_2CzechModel;
|
||||||
extern const SequenceModel Windows_1250CzechModel;
|
extern const SequenceModel Windows_1250CzechModel;
|
||||||
extern const SequenceModel Ibm852CzechModel;
|
extern const SequenceModel Ibm852CzechModel;
|
||||||
|
|||||||
1
test/ca/iso-8859-1.txt
Normal file
1
test/ca/iso-8859-1.txt
Normal file
@ -0,0 +1 @@
|
|||||||
|
Les marmotes (Marmota) són un gènere de mamífers de la família dels esciúrids.[1] Viuen a l'alta muntanya a l'hemisferi nord. Són rosegadors de mida mitjana, una mica més grans que els gats domèstics, de potes curtes i cos ample que els proporcionen un aspecte força rabassut.
|
||||||
1
test/ca/utf-8.txt
Normal file
1
test/ca/utf-8.txt
Normal file
@ -0,0 +1 @@
|
|||||||
|
Les marmotes (Marmota) són un gènere de mamífers de la família dels esciúrids.[1] Viuen a l'alta muntanya a l'hemisferi nord. Són rosegadors de mida mitjana, una mica més grans que els gats domèstics, de potes curtes i cos ample que els proporcionen un aspecte força rabassut.
|
||||||
1
test/ca/windows-1252.txt
Normal file
1
test/ca/windows-1252.txt
Normal file
@ -0,0 +1 @@
|
|||||||
|
Les especials relacions econòmiques es fonamenten en la llibertat de trànsit de mercaderies, treballadors i capitals, així com en l'establiment d'una moneda única, l'euro (€) per tots els estats membres (la denominada Eurozona).
|
||||||
Loading…
x
Reference in New Issue
Block a user