diff --git a/script/BuildLangModelLogs/LangPortugueseModel.log b/script/BuildLangModelLogs/LangPortugueseModel.log new file mode 100644 index 0000000..dce6f36 --- /dev/null +++ b/script/BuildLangModelLogs/LangPortugueseModel.log @@ -0,0 +1,166 @@ += Logs of language model for Portuguese (pt) = + +- Generated by BuildLangModel.py +- Started: 2016-09-20 23:44:39.722451 +- Maximum depth: 5 +- Max number of pages: 100 + +== Parsed pages == + +Papagaio-das-mascarenhas (revision 46763149) +Albinismo (revision 46498446) +Alfred Newton (revision 43617011) +Alphonse Milne-Edwards (revision 39740747) +Animalia (revision 46727732) +Asa (revision 46338820) +August von Pelzeln (revision 34726241) +Aves (revision 46728980) +Bico (revision 45311553) +Carl Wilhelm Hahn (revision 45025566) +Carlos Lineu (revision 46625396) +Carolus Linnaeus (revision 46625396) +Cauda (revision 43275401) +Charles Lucien Bonaparte (revision 45529712) +Chordata (revision 46640101) +Cladograma (revision 46700307) +Classe (biologia) (revision 46701409) +Classificação científica (revision 46306288) +Coleção Leverian (revision 45026647) +Comores (revision 46181501) +Coracopsinae (revision 36946101) +Coracopsis nigra (revision 44338845) +Coracopsis vasa (revision 42905822) +Cylindraspis indica (revision 42905410) +Cúlmen (revision 45311553) +Digital object identifier (revision 42172651) +Eclectus roratus (revision 44380798) +Edward Newton (revision 39261469) +Endemismo (revision 45260961) +Epíteto específico (revision 35101647) +Espécie (revision 45685675) +Esquilo-vermelho (revision 43489595) +Estado de conservação (revision 46662839) +Extinção (revision 46526607) +Família (biologia) (revision 46636004) +Filo (revision 46704246) +França (revision 46740839) +François-Nicolas Martinet (revision 43679514) +François Levaillant (revision 40142351) +Fredrik Hasselqvist (revision 44381122) +Fregilupus varius (revision 46555765) +Fumigação (revision 42458244) +George Robert Gray (revision 39047844) +Georges-Louis 
Leclerc, conde de Buffon (revision 45622418) +Género (biologia) (revision 45296588) +Hermann Schlegel (revision 43137605) +Herpetologista (revision 46207704) +Histoire Naturelle (revision 44293456) +Holótipo (revision 44029660) +Ilha da Reunião (revision 45458206) +Ilha vulcânica (revision 37924535) +Ilhas Mascarenhas (revision 45858660) +Ilhas Molucas (revision 45476933) +International Standard Book Number (revision 46326494) +Jacques Barraband (revision 45007769) +Jean Feuilley (revision 43140791) +Johann Georg Wagler (revision 34585234) +John Gerrard Keulemans (revision 39664498) +Julian Hume (revision 41876605) +Leiolopisma (revision 43997173) +Lionel Walter Rothschild (revision 46022922) +Lista Vermelha da IUCN (revision 46569884) +Lista Vermelha da União Internacional para a Conservação da Natureza e dos Recursos Naturais (revision 46569884) +Lista Vermelha de Espécies Ameaçadas da IUCN (revision 46569884) +Lista de aves extintas (revision 45507420) +Londres (revision 46310311) +Língua inglesa (revision 46609785) +Madagascar (revision 46617630) +Mascarenotus grucheti (revision 43145662) +Mathurin Jacques Brisson (revision 36018826) +Maurício (revision 46723599) +Maximiliano I José da Baviera (revision 46372080) +Melanina (revision 46762903) +Museu Nacional de História Natural (França) (revision 43731807) +Naturhistorisches Museum (revision 46694247) +Nesoenas duboisi (revision 43995805) +Nome científico (revision 46671641) +Nomenclatura binomial (revision 46671641) +Nycticorax duboisi (revision 43816214) +Nível do mar (revision 46414695) +Ordem (biologia) (revision 46360024) +Otto Finsch (revision 42362273) +Papagaio (revision 46738207) +Papagaio-cinzento (revision 46673943) +Papagaio-cinzento-de-maurício (revision 46664408) +Pedro Mascarenhas (c. 
1484-1555) (revision 45541977) +Periquito-de-maurício (revision 43010883) +Periquito-de-reunião (revision 43048764) +Peter Mundy (revision 43563846) +Piton des Neiges (revision 45632497) +Pleistoceno (revision 45916874) +Plumagem (revision 34951058) +Ponto quente (revision 45375495) +Porphyrio coerulescens (revision 43672493) +Praslin (revision 40728143) +Psitacídeos (revision 46598835) +Psittaciformes (revision 46598835) +Psittacula (revision 42856453) +Psittaculinae (revision 46760737) +Psittaculini (revision 43015966) +Psittrichasiidae (revision 44385977) + +== End of Parsed pages == + +- Wikipedia parsing ended at: 2016-09-20 23:47:27.346826 + +51 characters appeared 558324 times. + +First 38 characters: +[ 0] Char a: 11.864795351802895 % +[ 1] Char e: 11.44604208309154 % +[ 2] Char o: 9.868284365350585 % +[ 3] Char s: 8.346587286235232 % +[ 4] Char i: 7.118089138206489 % +[ 5] Char r: 6.394136737808154 % +[ 6] Char n: 5.568272186042513 % +[ 7] Char d: 5.243192125002687 % +[ 8] Char t: 4.80061756256224 % +[ 9] Char m: 4.498105042949971 % +[10] Char c: 3.9747530107965985 % +[11] Char u: 3.7229279056605127 % +[12] Char l: 3.207814817202914 % +[13] Char p: 2.77562848811801 % +[14] Char g: 1.3850380782484721 % +[15] Char v: 1.3210967108703908 % +[16] Char f: 1.122466524813549 % +[17] Char b: 0.9702251739133549 % +[18] Char h: 0.9130898904578704 % +[19] Char é: 0.7026386112723079 % +[20] Char ã: 0.7022803963290133 % +[21] Char q: 0.5903382265494588 % +[22] Char ç: 0.5856814322866293 % +[23] Char í: 0.41391736697688086 % +[24] Char x: 0.3913498255493226 % +[25] Char á: 0.34567742027926435 % +[26] Char z: 0.3170202248156984 % +[27] Char ó: 0.22925756370852768 % +[28] Char j: 0.20454073262120204 % +[29] Char ê: 0.20239144296143458 % +[30] Char õ: 0.16155493942585308 % +[31] Char y: 0.15080849112701586 % +[32] Char w: 0.09241945537000021 % +[33] Char ú: 0.08794176857881804 % +[34] Char k: 0.08364318925928313 % +[35] Char â: 0.07898639499645367 % +[36] Char à: 
0.06859816164091102 % +[37] Char ô: 0.031164700066627977 % + +The first 38 characters have an accumulated ratio of 0.9998137282294869. + +891 sequences found. + +First 512 (typical positive ratio): 0.9953179582313172 +Next 512 (512-1024): 1.7910747164728723e-06 +Rest: 2.42861286636753e-17 + +- Processing end: 2016-09-20 23:47:27.489355 diff --git a/script/langs/pt.py b/script/langs/pt.py new file mode 100644 index 0000000..e8bde16 --- /dev/null +++ b/script/langs/pt.py @@ -0,0 +1,80 @@ +#!/bin/python3 +# -*- coding: utf-8 -*- + +# ##### BEGIN LICENSE BLOCK ##### +# Version: MPL 1.1/GPL 2.0/LGPL 2.1 +# +# The contents of this file are subject to the Mozilla Public License Version +# 1.1 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License +# for the specific language governing rights and limitations under the +# License. +# +# The Original Code is Mozilla Universal charset detector code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 2001 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Jehan +# +# Alternatively, the contents of this file may be used under the terms of +# either the GNU General Public License Version 2 or later (the "GPL"), or +# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +# in which case the provisions of the GPL or the LGPL are applicable instead +# of those above. 
If you wish to allow use of your version of this file only +# under the terms of either the GPL or the LGPL, and not to allow others to +# use your version of this file under the terms of the MPL, indicate your +# decision by deleting the provisions above and replace them with the notice +# and other provisions required by the GPL or the LGPL. If you do not delete +# the provisions above, a recipient may use your version of this file under +# the terms of any one of the MPL, the GPL or the LGPL. +# +# ##### END LICENSE BLOCK ##### + +import re + +## Mandatory Properties ## + +# The human name for the language, in English. +name = 'Portuguese' +# Use 2-letter ISO 639-1 if possible, 3-letter ISO code otherwise, +# or use another catalog as a last resort. +code = 'pt' +# ASCII characters are also used in Portuguese. +use_ascii = True +# The charsets we want to support and create data for. +charsets = ['ISO-8859-15', 'ISO-8859-1', 'WINDOWS-1252', 'ISO-8859-9'] + +## Optional Properties ## + +# Alphabet characters. +# If use_ascii=True, there is no need to add any ASCII characters. +# If case_mapping=True, there is no need to add several cases of a same +# character (provided Python algorithms know the right cases). +alphabet = 'áâãàçéêíóôõú' +# The starred page which was rewarded on the main page when I created +# the data. +start_pages = ['Papagaio-das-mascarenhas'] +# give possibility to select another code for the Wikipedia URL. +wikipedia_code = code +# 'a' and 'A' will be considered the same character, and so on. +# This uses Python algorithm to determine upper/lower-case of a given +# character. +case_mapping = True + +# A function to clean content returned by the `wikipedia` python lib, +# in case some unwanted data has been overlooked. +# Note that we are already cleaning away the '=' from the title syntax +# of Wikipedia, as well as double spaces. But sometimes, Wikipedia in +# some language may return weird syntax or UI text which should be +# discarded. 
If you encounter one of these cases, use this function. +def clean_wikipedia_content(content): + # Do your garbage text cleaning here. + return content diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 9d9e8bc..5209ab5 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -19,6 +19,7 @@ set( LangModels/LangHebrewModel.cpp LangModels/LangLithuanianModel.cpp LangModels/LangLatvianModel.cpp + LangModels/LangPortugueseModel.cpp LangModels/LangSpanishModel.cpp LangModels/LangThaiModel.cpp LangModels/LangTurkishModel.cpp diff --git a/src/LangModels/LangPortugueseModel.cpp b/src/LangModels/LangPortugueseModel.cpp new file mode 100644 index 0000000..8886597 --- /dev/null +++ b/src/LangModels/LangPortugueseModel.cpp @@ -0,0 +1,237 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is Mozilla Communicator client code. + * + * The Initial Developer of the Original Code is + * Netscape Communications Corporation. + * Portions created by the Initial Developer are Copyright (C) 1998 + * the Initial Developer. All Rights Reserved. 
+ * + * Contributor(s): + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. + * + * ***** END LICENSE BLOCK ***** */ + +#include "../nsSBCharSetProber.h" + +/********* Language model for: Portuguese *********/ + +/** + * Generated by BuildLangModel.py + * On: 2016-09-20 23:47:27.348423 + **/ + +/* Character Mapping Table: + * ILL: illegal character. + * CTR: control character specific to the charset. + * RET: carriage/return. + * SYM: symbol (punctuation) that does not belong to word. + * NUM: 0 - 9. + * + * Other characters are ordered by probabilities + * (0 is the most common character in the language). + * + * Orders are generic to a language. So the codepoint with order X in + * CHARSET1 maps to the same character as the codepoint with the same + * order X in CHARSET2 for the same language. + * As such, it is possible to get missing order. For instance the + * ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 + * even though they are both used for French. Same for the euro sign. 
+ */ +static const unsigned char Iso_8859_1_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 17, 10, 7, 1, 16, 14, 18, 4, 28, 34, 12, 9, 6, 2, /* 4X */ + 13, 21, 5, 3, 8, 11, 15, 32, 24, 31, 26,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 17, 10, 7, 1, 16, 14, 18, 4, 28, 34, 12, 9, 6, 2, /* 6X */ + 13, 21, 5, 3, 8, 11, 15, 32, 24, 31, 26,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM,SYM, 51,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ + 36, 25, 35, 20, 41, 42, 43, 22, 39, 19, 29, 44, 52, 23, 45, 47, /* CX */ + 48, 53, 46, 27, 37, 30, 38,SYM, 54, 55, 33, 56, 40, 57, 58, 49, /* DX */ + 36, 25, 35, 20, 41, 42, 43, 22, 39, 19, 29, 44, 59, 23, 45, 47, /* EX */ + 48, 60, 46, 27, 37, 30, 38,SYM, 61, 62, 33, 63, 40, 64, 65, 50, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Iso_8859_9_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 17, 10, 7, 1, 16, 14, 18, 4, 28, 34, 12, 9, 6, 2, /* 4X */ + 13, 21, 5, 3, 8, 11, 15, 32, 24, 31, 26,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 17, 10, 7, 1, 16, 14, 18, 4, 28, 34, 12, 9, 6, 2, /* 6X */ + 13, 21, 5, 3, 8, 11, 15, 32, 24, 31, 26,SYM,SYM,SYM,SYM,CTR, /* 7X */ + 
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM,SYM, 66,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ + 36, 25, 35, 20, 41, 42, 43, 22, 39, 19, 29, 44, 67, 23, 45, 47, /* CX */ + 68, 69, 46, 27, 37, 30, 38,SYM, 70, 71, 33, 72, 40, 73, 74, 49, /* DX */ + 36, 25, 35, 20, 41, 42, 43, 22, 39, 19, 29, 44, 75, 23, 45, 47, /* EX */ + 76, 77, 46, 27, 37, 30, 38,SYM, 78, 79, 33, 80, 40, 81, 82, 50, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Iso_8859_15_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 17, 10, 7, 1, 16, 14, 18, 4, 28, 34, 12, 9, 6, 2, /* 4X */ + 13, 21, 5, 3, 8, 11, 15, 32, 24, 31, 26,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 17, 10, 7, 1, 16, 14, 18, 4, 28, 34, 12, 9, 6, 2, /* 6X */ + 13, 21, 5, 3, 8, 11, 15, 32, 24, 31, 26,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM, 83,SYM, 84,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM, 85, 86,SYM,SYM, 87,SYM,SYM,SYM, 88, 89, 50,SYM, /* BX */ + 36, 25, 35, 20, 41, 42, 43, 22, 39, 19, 29, 44, 90, 23, 45, 47, /* CX */ + 48, 91, 46, 27, 37, 30, 38,SYM, 92, 93, 33, 94, 40, 95, 96, 49, /* DX */ + 36, 25, 35, 20, 41, 42, 43, 22, 39, 19, 29, 44, 97, 23, 45, 47, /* EX */ + 48, 98, 46, 27, 37, 30, 38,SYM, 99,100, 33,101, 40,102,103, 50, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned 
char Windows_1252_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 17, 10, 7, 1, 16, 14, 18, 4, 28, 34, 12, 9, 6, 2, /* 4X */ + 13, 21, 5, 3, 8, 11, 15, 32, 24, 31, 26,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 17, 10, 7, 1, 16, 14, 18, 4, 28, 34, 12, 9, 6, 2, /* 6X */ + 13, 21, 5, 3, 8, 11, 15, 32, 24, 31, 26,SYM,SYM,SYM,SYM,CTR, /* 7X */ + SYM,ILL,SYM,104,SYM,SYM,SYM,SYM,SYM,SYM,105,SYM,106,ILL,107,ILL, /* 8X */ + ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,108,SYM,109,ILL,110, 50, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM,SYM,111,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ + 36, 25, 35, 20, 41, 42, 43, 22, 39, 19, 29, 44,112, 23, 45, 47, /* CX */ + 48,113, 46, 27, 37, 30, 38,SYM,114,115, 33,116, 40,117,118, 49, /* DX */ + 36, 25, 35, 20, 41, 42, 43, 22, 39, 19, 29, 44,119, 23, 45, 47, /* EX */ + 48,120, 46, 27, 37, 30, 38,SYM,121,122, 33,123, 40,124,125, 50, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + + +/* Model Table: + * Total sequences: 891 + * First 512 sequences: 0.9953179582313172 + * Next 512 sequences (512-1024): 0.0046820417686827855 + * Rest: 2.42861286636753e-17 + * Negative sequences: TODO + */ +static const PRUint8 PortugueseLangModel[] = +{ + 2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,0,3,3,3,3,0,3,2,3,0,0,3,2,2,3,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,0,2,3,3,2,3,2,3,2,3,0,2,3,3,2,2,2,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,3,2,3,2,3,2,3,0,2,3,3,0,3,0,0,0, + 3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,2,3,3,3,3,3,3,0,3,0,3,2,3,0,2,3,3,2,2,3,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,3,3,3,0,3,3,3,3,2,3,3,2,2,2,3,2,0,0, + 
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,2,3,2,3,3,3,2,2,3,3,0,3, + 3,3,3,3,3,2,3,3,3,2,3,3,2,2,3,3,3,2,3,3,3,3,3,3,2,3,3,3,3,2,0,3,2,3,3,2,0,3, + 3,3,3,3,3,3,2,3,2,3,2,3,3,2,3,2,2,2,2,3,3,2,0,3,0,3,0,3,2,3,2,3,3,3,0,2,0,2, + 3,3,3,3,3,3,3,0,3,3,3,3,3,2,2,2,2,2,3,3,3,0,0,3,0,3,2,3,0,3,2,3,2,2,2,3,0,3, + 3,3,3,3,3,2,3,2,2,3,2,3,2,3,2,0,2,3,0,3,3,2,0,3,0,3,2,3,0,2,2,3,2,3,0,3,0,3, + 3,3,3,2,3,3,3,2,3,3,3,3,3,2,2,0,2,2,3,3,2,2,3,3,0,3,2,3,0,3,2,3,0,2,3,3,0,2, + 3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,2,3,2,2,3,3,3,2,3,0,3,3,0,2,2,0,2,0,0,0, + 3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,3,0,3,2,3,0,3,0,3,2,2,2,3,0,3, + 3,3,3,3,3,3,2,2,3,0,2,3,3,3,0,0,0,2,3,3,2,2,3,3,0,3,2,3,0,2,2,2,0,3,0,2,0,2, + 3,3,3,3,3,3,3,2,2,3,2,3,3,2,2,2,0,2,3,3,2,0,0,2,0,3,0,2,0,3,2,3,2,2,0,2,0,0, + 3,3,3,0,3,3,0,2,0,0,0,3,0,0,0,2,0,0,0,3,2,0,0,3,0,3,0,2,0,3,2,0,0,0,0,2,0,2, + 3,3,3,2,3,3,0,2,2,2,2,3,3,2,2,0,3,2,0,3,0,0,0,3,0,2,0,3,0,3,0,2,0,2,0,0,0,2, + 3,3,3,3,3,3,3,3,3,2,2,3,3,2,2,2,3,2,2,3,2,0,0,2,0,2,2,2,3,2,0,2,2,2,0,0,0,0, + 3,3,3,3,3,3,3,2,3,2,0,3,3,0,0,0,2,2,2,2,3,0,0,2,0,3,0,2,0,0,3,3,2,0,2,0,0,0, + 2,2,2,3,2,3,3,3,3,3,3,2,3,3,2,2,2,0,0,0,0,0,0,0,2,0,2,0,0,0,0,0,0,0,0,0,0,0, + 0,2,3,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 2,0,0,2,0,0,0,0,0,0,0,3,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,0,3,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,3,0,0,2,0,0,0,0, + 3,0,3,3,0,3,3,3,3,3,3,0,3,3,3,3,3,3,0,0,0,2,2,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,2,3,0,0,0,3,0,3,3,2,3,0,3,2,0,2,2,2,0,0,2,3,2,0,2,2,0,2,0,0,0,0,0,0,2, + 0,0,0,3,0,3,2,2,3,0,3,2,3,3,3,3,3,3,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,0,3,2,2,0,0,2,2,3,0,0,0,0,0,2,2,2,2,0,0,0,0,2,2,2,0,2,2,0,2,0,0,2,0,0, + 0,0,0,3,2,3,3,3,3,3,3,0,3,3,3,2,3,2,0,0,0,2,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,2,2,0,2,0,0,0,2,3,0,2,0,0,0,0,0,0,2,0,0,0,0,3,0,0,0,0,0,0,0,2,2,0,0,0, + 0,0,0,3,0,0,3,0,2,3,0,2,2,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0, + 
0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,2,3,2,2,3,2,3,2,3,2,2,0,2,2,2,0,0,0,0,0,3,0,2,0,2,0,0,0,2,0,2,0,0,0, + 3,3,3,2,3,2,2,2,3,2,2,2,2,0,0,2,0,2,3,0,0,0,0,0,0,0,2,0,0,0,0,2,2,0,2,0,0,0, + 0,0,0,3,0,2,3,3,2,3,2,0,3,2,0,2,0,3,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,2,3,2,2,0,0,3,2,2,2,0,2,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,2,2,0,2,0,0,0, + 0,0,0,0,0,0,3,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,2,2,0,0,3,2,2,3,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +}; + + +const SequenceModel Iso_8859_1PortugueseModel = +{ + Iso_8859_1_CharToOrderMap, + PortugueseLangModel, + 38, + (float)0.9953179582313172, + PR_TRUE, + "ISO-8859-1" +}; + +const SequenceModel Iso_8859_9PortugueseModel = +{ + Iso_8859_9_CharToOrderMap, + PortugueseLangModel, + 38, + (float)0.9953179582313172, + PR_TRUE, + "ISO-8859-9" +}; + +const SequenceModel Iso_8859_15PortugueseModel = +{ + Iso_8859_15_CharToOrderMap, + PortugueseLangModel, + 38, + (float)0.9953179582313172, + PR_TRUE, + "ISO-8859-15" +}; + +const SequenceModel Windows_1252PortugueseModel = +{ + Windows_1252_CharToOrderMap, + PortugueseLangModel, + 38, + (float)0.9953179582313172, + PR_TRUE, + "WINDOWS-1252" +}; \ No newline at end of file diff --git a/src/nsSBCSGroupProber.cpp b/src/nsSBCSGroupProber.cpp index 0a13c1a..57dbafe 100644 --- a/src/nsSBCSGroupProber.cpp +++ b/src/nsSBCSGroupProber.cpp @@ -115,6 +115,11 @@ nsSBCSGroupProber::nsSBCSGroupProber() mProbers[36] = new nsSingleByteCharSetProber(&Iso_8859_13LatvianModel); + mProbers[37] = new nsSingleByteCharSetProber(&Iso_8859_1PortugueseModel); + mProbers[38] = new nsSingleByteCharSetProber(&Iso_8859_9PortugueseModel); + mProbers[39] = new nsSingleByteCharSetProber(&Iso_8859_15PortugueseModel); + mProbers[40] = new nsSingleByteCharSetProber(&Windows_1252PortugueseModel); + Reset(); } diff --git 
a/src/nsSBCSGroupProber.h b/src/nsSBCSGroupProber.h index 95081c3..9cf7ea4 100644 --- a/src/nsSBCSGroupProber.h +++ b/src/nsSBCSGroupProber.h @@ -40,7 +40,7 @@ #define nsSBCSGroupProber_h__ -#define NUM_OF_SBCS_PROBERS 37 +#define NUM_OF_SBCS_PROBERS 41 class nsCharSetProber; class nsSBCSGroupProber: public nsCharSetProber { diff --git a/src/nsSBCharSetProber.h b/src/nsSBCharSetProber.h index 4369a17..6000838 100644 --- a/src/nsSBCharSetProber.h +++ b/src/nsSBCharSetProber.h @@ -176,5 +176,10 @@ extern const SequenceModel Iso_8859_13LithuanianModel; extern const SequenceModel Iso_8859_13LatvianModel; +extern const SequenceModel Iso_8859_1PortugueseModel; +extern const SequenceModel Iso_8859_9PortugueseModel; +extern const SequenceModel Iso_8859_15PortugueseModel; +extern const SequenceModel Windows_1252PortugueseModel; + #endif /* nsSingleByteCharSetProber_h__ */ diff --git a/test/pt/iso-8859-1.txt b/test/pt/iso-8859-1.txt new file mode 100644 index 0000000..ec6bdda --- /dev/null +++ b/test/pt/iso-8859-1.txt @@ -0,0 +1,6 @@ +Albertossauro (Albertosaurus sp., que significa "lagarto de Alberta" no Canadá), +foi um género de dinossauro carnívoro e bípede presente no fim do período +Cretáceo. Media cerca de 8 a 9 metros de comprimento, 3 metros de altura e +pesava menos de 2 toneladas. O Albertossauro viveu na América do Norte e foi +descoberto no ano de 1884 por Joseph Burr Tyrrell em Alberta, no Canadá, local +ao qual deve seu nome. diff --git a/test/pt/utf-8.txt b/test/pt/utf-8.txt new file mode 100644 index 0000000..1729291 --- /dev/null +++ b/test/pt/utf-8.txt @@ -0,0 +1,6 @@ +Albertossauro (Albertosaurus sp., que significa "lagarto de Alberta" no Canadá), +foi um género de dinossauro carnívoro e bípede presente no fim do período +Cretáceo. Media cerca de 8 a 9 metros de comprimento, 3 metros de altura e +pesava menos de 2 toneladas. 
O Albertossauro viveu na América do Norte e foi +descoberto no ano de 1884 por Joseph Burr Tyrrell em Alberta, no Canadá, local +ao qual deve seu nome.