diff --git a/README.md b/README.md index 3740b14..e464cf8 100644 --- a/README.md +++ b/README.md @@ -57,6 +57,10 @@ Techniques used by universalchardet are described at http://www.mozilla.org/proj * MAC-CYRILLIC * IBM866 * IBM855 + * Spanish + * ISO-8859-1 + * ISO-8859-15 + * WINDOWS-1252 * Thai * TIS-620 * ISO-8859-11 diff --git a/script/BuildLangModelLogs/LangSpanishModel.log b/script/BuildLangModelLogs/LangSpanishModel.log new file mode 100644 index 0000000..1fd5d83 --- /dev/null +++ b/script/BuildLangModelLogs/LangSpanishModel.log @@ -0,0 +1,109 @@ += Logs of language model for Spanish (es) = + +- Generated by BuildLangModel.py +- Started: 2015-12-12 18:37:37.085123 +- Maximum depth: 2 +- Max number of pages: 50 + +== Parsed pages == + +Wikipedia:Portada (revision 84894710) +11 de diciembre (revision 87735970) +12 de diciembre (revision 87742023) +13 de diciembre (revision 87697780) +1474 (revision 66715698) +1915 (revision 86935345) +2000 (revision 87686385) +2015 (revision 87743360) +Actuación (revision 87459085) +Akiyuki Nosaka (revision 87726149) +Alberto Podestá (revision 87729965) +Alejandro Magno (revision 87717064) +Argentina (revision 87742018) +Arnold Peralta (revision 87733100) +Atentados del 11 de diciembre de 2007 (revision 87720544) +Cantante (revision 86761085) +Canto (revision 87664585) +Carlo Furno (revision 87726011) +Ciencia ficción (revision 87662615) +Copa Mundial de Clubes de la FIFA 2015 (revision 87734956) +Corona de Castilla (revision 87209578) +Crisis migratoria en Europa (revision 87609406) +Dictadura de Primo de Rivera (revision 87371131) +Dionisio Miguel Recio (revision 87724426) +Disneyland (revision 87665192) +Dolph Schayes (revision 87730770) +Día Internacional de las Montañas (revision 87739490) +El discurso del rey (revision 87570241) +Elecciones regionales de Francia de 2015 (revision 87744011) +Estados Unidos (revision 87510736) +Fiction House (revision 87732511) +Filoxeno de Eretria (revision 83958621) +Frank Sinatra 
(revision 87742871) +Fundación Wikimedia (revision 87703852) +Geoffrey Marcy (revision 87706505) +Gheorghe Gruia (revision 87737327) +Grupo de Acción Republicana (revision 87739104) +Guerra contra el Estado Islámico (revision 87648946) +Here We Go Again (canción) (revision 87680365) +Isaac Asimov (revision 87591711) +Isabel I de Castilla (revision 87743713) +John "Hot Rod" Williams (revision 87730438) +José Subirà-Puig (revision 87740413) +Julio Terrazas Sandoval (revision 87736542) +Libertad Lamarque (revision 87508996) +Mosaico de Issos (revision 87731652) +Museo Arqueológico Nacional de Nápoles (revision 87302262) +Philip K. Dick (revision 87725371) +Planet Comics (revision 86698920) + +== End of Parsed pages == + +- Wikipedia parsing ended at: 2015-12-12 18:39:02.288858 + +52 characters appeared 991829 times. + +First 33 characters: +[ 0] Char e: 12.571925200815867 % +[ 1] Char a: 11.81988024145291 % +[ 2] Char o: 8.07941691561751 % +[ 3] Char n: 7.234513207417812 % +[ 4] Char s: 7.042242160695039 % +[ 5] Char i: 7.040528155559072 % +[ 6] Char r: 6.8208330266608455 % +[ 7] Char l: 5.722559029832763 % +[ 8] Char d: 5.275707808503281 % +[ 9] Char t: 4.668647518876742 % +[10] Char c: 4.466999855821921 % +[11] Char u: 3.673717949364255 % +[12] Char m: 2.710547886782903 % +[13] Char p: 2.4541528832086983 % +[14] Char b: 1.3867309788280036 % +[15] Char g: 1.2748165258325779 % +[16] Char f: 0.925058654263991 % +[17] Char y: 0.9045914164639268 % +[18] Char v: 0.8877538365988492 % +[19] Char ó: 0.8641610600214351 % +[20] Char h: 0.7369213846338432 % +[21] Char q: 0.5913317719082624 % +[22] Char í: 0.5612862701130941 % +[23] Char j: 0.43283670874717317 % +[24] Char z: 0.38071078784750195 % +[25] Char á: 0.37587124393418625 % +[26] Char é: 0.29632124085905936 % +[27] Char k: 0.2001353055819098 % +[28] Char x: 0.18743150280945606 % +[29] Char ñ: 0.17462687620547493 % +[30] Char ú: 0.12865120902897575 % +[31] Char w: 0.0972949974239511 % +[32] Char ü: 0.004436248587206061 % 
+ +The first 33 characters have an accumulated ratio of 0.9999263986029848. + +897 sequences found. + +First 512 (typical positive ratio): 0.9970385677528184 +Next 512 (512-1024): 1.0082383152741046e-06 +Rest: 4.597017211338539e-17 + +- Processing end: 2015-12-12 18:39:02.460105 diff --git a/script/langs/es.py b/script/langs/es.py new file mode 100644 index 0000000..af4ac89 --- /dev/null +++ b/script/langs/es.py @@ -0,0 +1,77 @@ +#!/bin/python3 +# -*- coding: utf-8 -*- + +# ##### BEGIN LICENSE BLOCK ##### +# Version: MPL 1.1/GPL 2.0/LGPL 2.1 +# +# The contents of this file are subject to the Mozilla Public License Version +# 1.1 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License +# for the specific language governing rights and limitations under the +# License. +# +# The Original Code is Mozilla Universal charset detector code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 2001 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Jehan +# +# Alternatively, the contents of this file may be used under the terms of +# either the GNU General Public License Version 2 or later (the "GPL"), or +# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +# in which case the provisions of the GPL or the LGPL are applicable instead +# of those above. 
If you wish to allow use of your version of this file only +# under the terms of either the GPL or the LGPL, and not to allow others to +# use your version of this file under the terms of the MPL, indicate your +# decision by deleting the provisions above and replace them with the notice +# and other provisions required by the GPL or the LGPL. If you do not delete +# the provisions above, a recipient may use your version of this file under +# the terms of any one of the MPL, the GPL or the LGPL. +# +# ##### END LICENSE BLOCK ##### + +import re + +## Mandatory Properties ## + +# The human name for the language, in English. +name = 'Spanish' +# Use 2-letter ISO 639-1 if possible, 3-letter ISO code otherwise, +# or use another catalog as a last resort. +code = 'es' +# ASCII characters are also used in Spanish. +use_ascii = True +# The charsets we want to support and create data for. +charsets = ['ISO-8859-15', 'ISO-8859-1', 'WINDOWS-1252'] + +## Optional Properties ## + +# Alphabet characters. +# If use_ascii=True, there is no need to add any ASCII characters. +# If case_mapping=True, there is no need to add several cases of the same +# character (provided Python algorithms know the right cases). +alphabet = 'ñáéíóúü' +# The start page. Though optional, it is advised to choose one yourself. +start_pages = ['Wikipedia:Portada'] +# Gives the possibility to select another code for the Wikipedia URL. +wikipedia_code = code +# 'a' and 'A' will be considered the same character, and so on. +# This uses Python's algorithm to determine upper/lower-case of a given +# character. +case_mapping = True + +# A function to clean content returned by the `wikipedia` python lib, +# in case some unwanted data has been overlooked.
+def clean_wikipedia_content(content): + cleaned = re.sub(r'(=+) *([^=]+) *Editar \1', + r'\2', + content) + return cleaned diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 217edfc..babe8ae 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -16,6 +16,7 @@ set( LangModels/LangGreekModel.cpp LangModels/LangHungarianModel.cpp LangModels/LangHebrewModel.cpp + LangModels/LangSpanishModel.cpp LangModels/LangThaiModel.cpp LangModels/LangTurkishModel.cpp nsHebrewProber.cpp diff --git a/src/LangModels/LangSpanishModel.cpp b/src/LangModels/LangSpanishModel.cpp new file mode 100644 index 0000000..362bc5e --- /dev/null +++ b/src/LangModels/LangSpanishModel.cpp @@ -0,0 +1,201 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is Mozilla Communicator client code. + * + * The Initial Developer of the Original Code is + * Netscape Communications Corporation. + * Portions created by the Initial Developer are Copyright (C) 1998 + * the Initial Developer. All Rights Reserved. + * + * Contributor(s): + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. 
If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. + * + * ***** END LICENSE BLOCK ***** */ + +#include "../nsSBCharSetProber.h" + +/********* Language model for: Spanish *********/ + +/** + * Generated by BuildLangModel.py + * On: 2015-12-12 18:39:02.290370 + **/ + +/* Character Mapping Table: + * ILL: illegal character. + * CTR: control character specific to the charset. + * RET: carriage return or line feed. + * SYM: symbol (punctuation) that does not belong to a word. + * NUM: 0 - 9. + * + * Other characters are ordered by probabilities + * (0 is the most common character in the language). + * + * Orders are generic to a language. So the codepoint with order X in + * CHARSET1 maps to the same character as the codepoint with the same + * order X in CHARSET2 for the same language. + * As such, it is possible to have missing orders. For instance the + * ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 + * even though they are both used for French. Same for the euro sign.
+ */ +static const unsigned char Iso_8859_1_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 1, 14, 10, 8, 0, 16, 15, 20, 5, 23, 27, 7, 12, 3, 2, /* 4X */ + 13, 21, 6, 4, 9, 11, 18, 31, 28, 17, 24,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 1, 14, 10, 8, 0, 16, 15, 20, 5, 23, 27, 7, 12, 3, 2, /* 6X */ + 13, 21, 6, 4, 9, 11, 18, 31, 28, 17, 24,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM,SYM, 52,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ + 33, 25, 39, 46, 37, 45, 47, 35, 36, 26, 48, 40, 53, 22, 41, 43, /* CX */ + 49, 29, 38, 19, 50, 54, 34,SYM, 44, 51, 30, 55, 32, 42, 56, 57, /* DX */ + 33, 25, 39, 46, 37, 45, 47, 35, 36, 26, 48, 40, 58, 22, 41, 43, /* EX */ + 49, 29, 38, 19, 50, 59, 34,SYM, 44, 51, 30, 60, 32, 42, 61, 62, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Iso_8859_15_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 1, 14, 10, 8, 0, 16, 15, 20, 5, 23, 27, 7, 12, 3, 2, /* 4X */ + 13, 21, 6, 4, 9, 11, 18, 31, 28, 17, 24,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 1, 14, 10, 8, 0, 16, 15, 20, 5, 23, 27, 7, 12, 3, 2, /* 6X */ + 13, 21, 6, 4, 9, 11, 18, 31, 28, 17, 24,SYM,SYM,SYM,SYM,CTR, /* 7X */ + 
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM, 63,SYM, 64,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM, 65, 66,SYM,SYM, 67,SYM,SYM,SYM, 68, 69, 70,SYM, /* BX */ + 33, 25, 39, 46, 37, 45, 47, 35, 36, 26, 48, 40, 71, 22, 41, 43, /* CX */ + 49, 29, 38, 19, 50, 72, 34,SYM, 44, 51, 30, 73, 32, 42, 74, 75, /* DX */ + 33, 25, 39, 46, 37, 45, 47, 35, 36, 26, 48, 40, 76, 22, 41, 43, /* EX */ + 49, 29, 38, 19, 50, 77, 34,SYM, 44, 51, 30, 78, 32, 42, 79, 80, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Windows_1252_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 1, 14, 10, 8, 0, 16, 15, 20, 5, 23, 27, 7, 12, 3, 2, /* 4X */ + 13, 21, 6, 4, 9, 11, 18, 31, 28, 17, 24,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 1, 14, 10, 8, 0, 16, 15, 20, 5, 23, 27, 7, 12, 3, 2, /* 6X */ + 13, 21, 6, 4, 9, 11, 18, 31, 28, 17, 24,SYM,SYM,SYM,SYM,CTR, /* 7X */ + SYM,ILL,SYM, 81,SYM,SYM,SYM,SYM,SYM,SYM, 82,SYM, 83,ILL, 84,ILL, /* 8X */ + ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 85,SYM, 86,ILL, 87, 88, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM,SYM, 89,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ + 33, 25, 39, 46, 37, 45, 47, 35, 36, 26, 48, 40, 90, 22, 41, 43, /* CX */ + 49, 29, 38, 19, 50, 91, 34,SYM, 44, 51, 30, 92, 32, 42, 93, 94, /* DX */ + 33, 25, 39, 46, 37, 45, 47, 35, 36, 26, 48, 40, 95, 22, 41, 43, /* EX */ + 49, 29, 38, 19, 50, 96, 34,SYM, 44, 51, 30, 97, 32, 42, 98, 99, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + + +/* Model Table: + * 
Total sequences: 897 + * First 512 sequences: 0.9970385677528184 + * Next 512 sequences (512-1024): 0.0029614322471815486 + * Rest: 4.597017211338539e-17 + * Negative sequences: TODO + */ +static const PRUint8 SpanishLangModel[] = +{ + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,0,3,3,3,2,3,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,2,3,3,3,3,3,3,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,2,3,3,2,2,3,3,2,2,3,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,0,3,2,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,0,2,3,3,3,0,0,2,2,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,2,3,3,3,3,2,0,3,2,2, + 3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,2,2,3,3,2,2,0,2,2,0, + 3,3,3,2,3,3,3,3,2,2,2,3,3,2,2,3,2,3,3,3,3,2,3,2,2,3,3,2,0,0,2,2,2, + 3,3,3,3,3,3,3,3,2,3,3,3,2,2,3,2,2,3,2,3,3,0,3,2,2,3,3,0,0,0,2,2,2, + 3,3,3,3,3,3,3,3,2,3,3,3,2,2,2,2,2,3,0,3,3,2,3,0,2,3,3,3,0,0,2,0,0, + 3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,2,3,0,2,0, + 3,3,3,3,3,3,2,2,2,2,2,3,3,3,3,2,2,3,0,3,2,0,3,2,0,3,3,2,2,0,3,2,2, + 3,3,3,2,3,3,3,3,2,3,3,3,2,3,3,0,2,2,2,3,3,0,3,2,0,3,3,2,0,0,3,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,2,2,3,2,2,3,2,3,2,2,3,3,0,3,2,2,0,0,2,2,0, + 3,3,3,3,3,3,3,3,3,3,0,3,3,0,2,2,2,2,2,3,3,0,3,2,2,2,3,2,0,0,3,2,3, + 3,3,3,2,2,3,3,3,2,3,2,3,2,2,2,2,3,2,0,3,0,0,3,2,0,2,2,2,0,0,3,2,0, + 3,3,3,3,3,3,3,3,2,2,2,3,2,2,2,2,2,2,0,3,2,0,0,2,2,2,2,2,0,0,2,2,0, + 3,3,3,2,2,3,2,2,2,0,2,3,0,2,0,2,2,2,2,3,0,0,3,0,0,2,3,2,0,0,0,0,0, + 0,0,0,3,3,0,3,3,3,3,3,0,3,3,2,3,2,0,3,0,0,0,0,0,0,0,0,0,2,0,0,0,0, + 3,3,3,3,2,3,3,3,3,3,2,3,3,0,2,0,2,3,2,2,2,0,3,2,2,2,3,0,2,0,2,2,2, + 2,3,2,0,2,2,0,2,2,2,0,3,0,0,0,0,0,0,0,0,0,2,2,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,0,2,2,3,3,3,2,3,2,3,3,3,0,2,0,0,2,0,2,2,0,0,0,0,0,0,0,0, + 3,3,3,2,0,3,2,2,2,2,0,3,2,2,0,0,0,0,0,3,0,0,2,2,0,2,3,0,0,0,2,0,2, + 3,3,3,2,0,3,2,0,2,2,2,3,2,2,2,3,0,2,0,3,2,3,2,0,3,3,2,2,0,0,2,0,0, + 
2,0,0,3,3,2,3,3,2,3,3,2,3,3,2,3,3,2,2,0,2,2,0,2,2,0,0,0,2,2,0,0,0, + 2,3,2,3,3,2,3,3,3,3,3,2,2,3,2,3,2,2,2,0,0,0,0,2,0,0,0,0,3,0,0,0,0, + 3,3,3,2,3,3,3,3,2,2,2,3,3,0,2,2,2,3,2,0,2,0,2,0,0,0,0,2,0,0,2,2,0, + 3,3,3,2,2,3,2,2,2,3,3,3,2,3,2,0,2,2,3,2,2,2,0,2,0,2,2,2,3,0,0,2,0, + 3,3,3,0,0,2,0,0,0,0,0,2,0,0,0,0,0,0,0,2,0,0,3,0,0,2,0,0,0,0,0,0,0, + 2,3,2,3,3,0,2,3,2,3,2,0,3,2,3,0,2,0,0,0,0,0,0,2,2,0,0,0,0,2,0,0,0, + 3,3,3,3,2,3,2,2,2,2,2,2,0,0,2,0,2,2,0,0,2,0,0,2,0,2,0,2,0,0,0,2,0, + 3,0,0,2,2,2,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,2,0,0,0,0,0,0, +}; + + +const SequenceModel Iso_8859_1SpanishModel = +{ + Iso_8859_1_CharToOrderMap, + SpanishLangModel, + 33, + (float)0.9970385677528184, + PR_TRUE, + "ISO-8859-1" +}; + +const SequenceModel Iso_8859_15SpanishModel = +{ + Iso_8859_15_CharToOrderMap, + SpanishLangModel, + 33, + (float)0.9970385677528184, + PR_TRUE, + "ISO-8859-15" +}; + +const SequenceModel Windows_1252SpanishModel = +{ + Windows_1252_CharToOrderMap, + SpanishLangModel, + 33, + (float)0.9970385677528184, + PR_TRUE, + "WINDOWS-1252" +}; \ No newline at end of file diff --git a/src/nsSBCSGroupProber.cpp b/src/nsSBCSGroupProber.cpp index 07109f1..e150a50 100644 --- a/src/nsSBCSGroupProber.cpp +++ b/src/nsSBCSGroupProber.cpp @@ -86,16 +86,20 @@ nsSBCSGroupProber::nsSBCSGroupProber() mProbers[16] = new nsSingleByteCharSetProber(&Iso_8859_15FrenchModel); mProbers[17] = new nsSingleByteCharSetProber(&Windows_1252FrenchModel); - mProbers[18] = new nsSingleByteCharSetProber(&Iso_8859_2HungarianModel); - mProbers[19] = new nsSingleByteCharSetProber(&Windows_1250HungarianModel); + mProbers[18] = new nsSingleByteCharSetProber(&Iso_8859_1SpanishModel); + mProbers[19] = new nsSingleByteCharSetProber(&Iso_8859_15SpanishModel); + mProbers[20] = new nsSingleByteCharSetProber(&Windows_1252SpanishModel); - mProbers[20] = new nsSingleByteCharSetProber(&Iso_8859_1GermanModel); - mProbers[21] = new nsSingleByteCharSetProber(&Windows_1252GermanModel); + mProbers[21] = new 
nsSingleByteCharSetProber(&Iso_8859_2HungarianModel); + mProbers[22] = new nsSingleByteCharSetProber(&Windows_1250HungarianModel); - mProbers[22] = new nsSingleByteCharSetProber(&Iso_8859_3EsperantoModel); + mProbers[23] = new nsSingleByteCharSetProber(&Iso_8859_1GermanModel); + mProbers[24] = new nsSingleByteCharSetProber(&Windows_1252GermanModel); - mProbers[23] = new nsSingleByteCharSetProber(&Iso_8859_3TurkishModel); - mProbers[24] = new nsSingleByteCharSetProber(&Iso_8859_9TurkishModel); + mProbers[25] = new nsSingleByteCharSetProber(&Iso_8859_3EsperantoModel); + + mProbers[26] = new nsSingleByteCharSetProber(&Iso_8859_3TurkishModel); + mProbers[27] = new nsSingleByteCharSetProber(&Iso_8859_9TurkishModel); Reset(); } diff --git a/src/nsSBCSGroupProber.h b/src/nsSBCSGroupProber.h index 88beeb3..b959483 100644 --- a/src/nsSBCSGroupProber.h +++ b/src/nsSBCSGroupProber.h @@ -40,7 +40,7 @@ #define nsSBCSGroupProber_h__ -#define NUM_OF_SBCS_PROBERS 25 +#define NUM_OF_SBCS_PROBERS 28 class nsCharSetProber; class nsSBCSGroupProber: public nsCharSetProber { diff --git a/src/nsSBCharSetProber.h b/src/nsSBCharSetProber.h index 408954f..4c815cf 100644 --- a/src/nsSBCharSetProber.h +++ b/src/nsSBCharSetProber.h @@ -148,6 +148,10 @@ extern const SequenceModel Iso_8859_15FrenchModel; extern const SequenceModel Iso_8859_1FrenchModel; extern const SequenceModel Windows_1252FrenchModel; +extern const SequenceModel Iso_8859_15SpanishModel; +extern const SequenceModel Iso_8859_1SpanishModel; +extern const SequenceModel Windows_1252SpanishModel; + extern const SequenceModel Iso_8859_1GermanModel; extern const SequenceModel Windows_1252GermanModel;