mirror of
https://gitlab.freedesktop.org/uchardet/uchardet.git
synced 2025-12-06 16:56:40 +08:00
LangModels: add support for Portuguese / ISO-8859-1.
I also added the pairings with ISO-8859-9, ISO-8859-15 and Windows-1252. Nevertheless, these charsets do not differ on the main characters used in Portuguese, so they can hardly be told apart and detection will usually return ISO-8859-1 only.
This commit is contained in:
parent
e98d257ec4
commit
e138839f07
166
script/BuildLangModelLogs/LangPortugueseModel.log
Normal file
166
script/BuildLangModelLogs/LangPortugueseModel.log
Normal file
@ -0,0 +1,166 @@
|
|||||||
|
= Logs of language model for Portuguese (pt) =
|
||||||
|
|
||||||
|
- Generated by BuildLangModel.py
|
||||||
|
- Started: 2016-09-20 23:44:39.722451
|
||||||
|
- Maximum depth: 5
|
||||||
|
- Max number of pages: 100
|
||||||
|
|
||||||
|
== Parsed pages ==
|
||||||
|
|
||||||
|
Papagaio-das-mascarenhas (revision 46763149)
|
||||||
|
Albinismo (revision 46498446)
|
||||||
|
Alfred Newton (revision 43617011)
|
||||||
|
Alphonse Milne-Edwards (revision 39740747)
|
||||||
|
Animalia (revision 46727732)
|
||||||
|
Asa (revision 46338820)
|
||||||
|
August von Pelzeln (revision 34726241)
|
||||||
|
Aves (revision 46728980)
|
||||||
|
Bico (revision 45311553)
|
||||||
|
Carl Wilhelm Hahn (revision 45025566)
|
||||||
|
Carlos Lineu (revision 46625396)
|
||||||
|
Carolus Linnaeus (revision 46625396)
|
||||||
|
Cauda (revision 43275401)
|
||||||
|
Charles Lucien Bonaparte (revision 45529712)
|
||||||
|
Chordata (revision 46640101)
|
||||||
|
Cladograma (revision 46700307)
|
||||||
|
Classe (biologia) (revision 46701409)
|
||||||
|
Classificação científica (revision 46306288)
|
||||||
|
Coleção Leverian (revision 45026647)
|
||||||
|
Comores (revision 46181501)
|
||||||
|
Coracopsinae (revision 36946101)
|
||||||
|
Coracopsis nigra (revision 44338845)
|
||||||
|
Coracopsis vasa (revision 42905822)
|
||||||
|
Cylindraspis indica (revision 42905410)
|
||||||
|
Cúlmen (revision 45311553)
|
||||||
|
Digital object identifier (revision 42172651)
|
||||||
|
Eclectus roratus (revision 44380798)
|
||||||
|
Edward Newton (revision 39261469)
|
||||||
|
Endemismo (revision 45260961)
|
||||||
|
Epíteto específico (revision 35101647)
|
||||||
|
Espécie (revision 45685675)
|
||||||
|
Esquilo-vermelho (revision 43489595)
|
||||||
|
Estado de conservação (revision 46662839)
|
||||||
|
Extinção (revision 46526607)
|
||||||
|
Família (biologia) (revision 46636004)
|
||||||
|
Filo (revision 46704246)
|
||||||
|
França (revision 46740839)
|
||||||
|
François-Nicolas Martinet (revision 43679514)
|
||||||
|
François Levaillant (revision 40142351)
|
||||||
|
Fredrik Hasselqvist (revision 44381122)
|
||||||
|
Fregilupus varius (revision 46555765)
|
||||||
|
Fumigação (revision 42458244)
|
||||||
|
George Robert Gray (revision 39047844)
|
||||||
|
Georges-Louis Leclerc, conde de Buffon (revision 45622418)
|
||||||
|
Género (biologia) (revision 45296588)
|
||||||
|
Hermann Schlegel (revision 43137605)
|
||||||
|
Herpetologista (revision 46207704)
|
||||||
|
Histoire Naturelle (revision 44293456)
|
||||||
|
Holótipo (revision 44029660)
|
||||||
|
Ilha da Reunião (revision 45458206)
|
||||||
|
Ilha vulcânica (revision 37924535)
|
||||||
|
Ilhas Mascarenhas (revision 45858660)
|
||||||
|
Ilhas Molucas (revision 45476933)
|
||||||
|
International Standard Book Number (revision 46326494)
|
||||||
|
Jacques Barraband (revision 45007769)
|
||||||
|
Jean Feuilley (revision 43140791)
|
||||||
|
Johann Georg Wagler (revision 34585234)
|
||||||
|
John Gerrard Keulemans (revision 39664498)
|
||||||
|
Julian Hume (revision 41876605)
|
||||||
|
Leiolopisma (revision 43997173)
|
||||||
|
Lionel Walter Rothschild (revision 46022922)
|
||||||
|
Lista Vermelha da IUCN (revision 46569884)
|
||||||
|
Lista Vermelha da União Internacional para a Conservação da Natureza e dos Recursos Naturais (revision 46569884)
|
||||||
|
Lista Vermelha de Espécies Ameaçadas da IUCN (revision 46569884)
|
||||||
|
Lista de aves extintas (revision 45507420)
|
||||||
|
Londres (revision 46310311)
|
||||||
|
Língua inglesa (revision 46609785)
|
||||||
|
Madagascar (revision 46617630)
|
||||||
|
Mascarenotus grucheti (revision 43145662)
|
||||||
|
Mathurin Jacques Brisson (revision 36018826)
|
||||||
|
Maurício (revision 46723599)
|
||||||
|
Maximiliano I José da Baviera (revision 46372080)
|
||||||
|
Melanina (revision 46762903)
|
||||||
|
Museu Nacional de História Natural (França) (revision 43731807)
|
||||||
|
Naturhistorisches Museum (revision 46694247)
|
||||||
|
Nesoenas duboisi (revision 43995805)
|
||||||
|
Nome científico (revision 46671641)
|
||||||
|
Nomenclatura binomial (revision 46671641)
|
||||||
|
Nycticorax duboisi (revision 43816214)
|
||||||
|
Nível do mar (revision 46414695)
|
||||||
|
Ordem (biologia) (revision 46360024)
|
||||||
|
Otto Finsch (revision 42362273)
|
||||||
|
Papagaio (revision 46738207)
|
||||||
|
Papagaio-cinzento (revision 46673943)
|
||||||
|
Papagaio-cinzento-de-maurício (revision 46664408)
|
||||||
|
Pedro Mascarenhas (c. 1484-1555) (revision 45541977)
|
||||||
|
Periquito-de-maurício (revision 43010883)
|
||||||
|
Periquito-de-reunião (revision 43048764)
|
||||||
|
Peter Mundy (revision 43563846)
|
||||||
|
Piton des Neiges (revision 45632497)
|
||||||
|
Pleistoceno (revision 45916874)
|
||||||
|
Plumagem (revision 34951058)
|
||||||
|
Ponto quente (revision 45375495)
|
||||||
|
Porphyrio coerulescens (revision 43672493)
|
||||||
|
Praslin (revision 40728143)
|
||||||
|
Psitacídeos (revision 46598835)
|
||||||
|
Psittaciformes (revision 46598835)
|
||||||
|
Psittacula (revision 42856453)
|
||||||
|
Psittaculinae (revision 46760737)
|
||||||
|
Psittaculini (revision 43015966)
|
||||||
|
Psittrichasiidae (revision 44385977)
|
||||||
|
|
||||||
|
== End of Parsed pages ==
|
||||||
|
|
||||||
|
- Wikipedia parsing ended at: 2016-09-20 23:47:27.346826
|
||||||
|
|
||||||
|
51 characters appeared 558324 times.
|
||||||
|
|
||||||
|
First 38 characters:
|
||||||
|
[ 0] Char a: 11.864795351802895 %
|
||||||
|
[ 1] Char e: 11.44604208309154 %
|
||||||
|
[ 2] Char o: 9.868284365350585 %
|
||||||
|
[ 3] Char s: 8.346587286235232 %
|
||||||
|
[ 4] Char i: 7.118089138206489 %
|
||||||
|
[ 5] Char r: 6.394136737808154 %
|
||||||
|
[ 6] Char n: 5.568272186042513 %
|
||||||
|
[ 7] Char d: 5.243192125002687 %
|
||||||
|
[ 8] Char t: 4.80061756256224 %
|
||||||
|
[ 9] Char m: 4.498105042949971 %
|
||||||
|
[10] Char c: 3.9747530107965985 %
|
||||||
|
[11] Char u: 3.7229279056605127 %
|
||||||
|
[12] Char l: 3.207814817202914 %
|
||||||
|
[13] Char p: 2.77562848811801 %
|
||||||
|
[14] Char g: 1.3850380782484721 %
|
||||||
|
[15] Char v: 1.3210967108703908 %
|
||||||
|
[16] Char f: 1.122466524813549 %
|
||||||
|
[17] Char b: 0.9702251739133549 %
|
||||||
|
[18] Char h: 0.9130898904578704 %
|
||||||
|
[19] Char é: 0.7026386112723079 %
|
||||||
|
[20] Char ã: 0.7022803963290133 %
|
||||||
|
[21] Char q: 0.5903382265494588 %
|
||||||
|
[22] Char ç: 0.5856814322866293 %
|
||||||
|
[23] Char í: 0.41391736697688086 %
|
||||||
|
[24] Char x: 0.3913498255493226 %
|
||||||
|
[25] Char á: 0.34567742027926435 %
|
||||||
|
[26] Char z: 0.3170202248156984 %
|
||||||
|
[27] Char ó: 0.22925756370852768 %
|
||||||
|
[28] Char j: 0.20454073262120204 %
|
||||||
|
[29] Char ê: 0.20239144296143458 %
|
||||||
|
[30] Char õ: 0.16155493942585308 %
|
||||||
|
[31] Char y: 0.15080849112701586 %
|
||||||
|
[32] Char w: 0.09241945537000021 %
|
||||||
|
[33] Char ú: 0.08794176857881804 %
|
||||||
|
[34] Char k: 0.08364318925928313 %
|
||||||
|
[35] Char â: 0.07898639499645367 %
|
||||||
|
[36] Char à: 0.06859816164091102 %
|
||||||
|
[37] Char ô: 0.031164700066627977 %
|
||||||
|
|
||||||
|
The first 38 characters have an accumulated ratio of 0.9998137282294869.
|
||||||
|
|
||||||
|
891 sequences found.
|
||||||
|
|
||||||
|
First 512 (typical positive ratio): 0.9953179582313172
|
||||||
|
Next 512 (512-1024): 1.7910747164728723e-06
|
||||||
|
Rest: 2.42861286636753e-17
|
||||||
|
|
||||||
|
- Processing end: 2016-09-20 23:47:27.489355
|
||||||
80
script/langs/pt.py
Normal file
80
script/langs/pt.py
Normal file
@ -0,0 +1,80 @@
|
|||||||
|
#!/bin/python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
# ##### BEGIN LICENSE BLOCK #####
|
||||||
|
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||||
|
#
|
||||||
|
# The contents of this file are subject to the Mozilla Public License Version
|
||||||
|
# 1.1 (the "License"); you may not use this file except in compliance with
|
||||||
|
# the License. You may obtain a copy of the License at
|
||||||
|
# http://www.mozilla.org/MPL/
|
||||||
|
#
|
||||||
|
# Software distributed under the License is distributed on an "AS IS" basis,
|
||||||
|
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||||
|
# for the specific language governing rights and limitations under the
|
||||||
|
# License.
|
||||||
|
#
|
||||||
|
# The Original Code is Mozilla Universal charset detector code.
|
||||||
|
#
|
||||||
|
# The Initial Developer of the Original Code is
|
||||||
|
# Netscape Communications Corporation.
|
||||||
|
# Portions created by the Initial Developer are Copyright (C) 2001
|
||||||
|
# the Initial Developer. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Contributor(s):
|
||||||
|
# Jehan <jehan@girinstud.io>
|
||||||
|
#
|
||||||
|
# Alternatively, the contents of this file may be used under the terms of
|
||||||
|
# either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||||
|
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||||
|
# in which case the provisions of the GPL or the LGPL are applicable instead
|
||||||
|
# of those above. If you wish to allow use of your version of this file only
|
||||||
|
# under the terms of either the GPL or the LGPL, and not to allow others to
|
||||||
|
# use your version of this file under the terms of the MPL, indicate your
|
||||||
|
# decision by deleting the provisions above and replace them with the notice
|
||||||
|
# and other provisions required by the GPL or the LGPL. If you do not delete
|
||||||
|
# the provisions above, a recipient may use your version of this file under
|
||||||
|
# the terms of any one of the MPL, the GPL or the LGPL.
|
||||||
|
#
|
||||||
|
# ##### END LICENSE BLOCK #####
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
## Mandatory Properties ##
|
||||||
|
|
||||||
|
# The human name for the language, in English.
|
||||||
|
name = 'Portuguese'
|
||||||
|
# Use 2-letter ISO 639-1 if possible, 3-letter ISO code otherwise,
|
||||||
|
# or use another catalog as a last resort.
|
||||||
|
code = 'pt'
|
||||||
|
# ASCII characters are also used in Portuguese.
|
||||||
|
use_ascii = True
|
||||||
|
# The charsets we want to support and create data for.
|
||||||
|
charsets = ['ISO-8859-15', 'ISO-8859-1', 'WINDOWS-1252', 'ISO-8859-9']
|
||||||
|
|
||||||
|
## Optional Properties ##
|
||||||
|
|
||||||
|
# Alphabet characters.
|
||||||
|
# If use_ascii=True, there is no need to add any ASCII characters.
|
||||||
|
# If case_mapping=True, there is no need to add several cases of a same
|
||||||
|
# character (provided Python algorithms know the right cases).
|
||||||
|
alphabet = 'áâãàçéêíóôõú'
|
||||||
|
# The featured page that was highlighted on the main page when I created
|
||||||
|
# the data.
|
||||||
|
start_pages = ['Papagaio-das-mascarenhas']
|
||||||
|
# Gives the possibility to select another code for the Wikipedia URL.
|
||||||
|
wikipedia_code = code
|
||||||
|
# 'a' and 'A' will be considered the same character, and so on.
|
||||||
|
# This uses Python's algorithm to determine the upper/lower case of a given
|
||||||
|
# character.
|
||||||
|
case_mapping = True
|
||||||
|
|
||||||
|
# A function to clean content returned by the `wikipedia` python lib,
|
||||||
|
# in case some unwanted data has been overlooked.
|
||||||
|
# Note that we are already cleaning away the '=' from the title syntax
|
||||||
|
# of Wikipedia, as well as double spaces. But sometimes, Wikipedia in
|
||||||
|
# some language may return weird syntax or UI text which should be
|
||||||
|
# discarded. If you encounter one of these cases, use this function.
|
||||||
|
def clean_wikipedia_content(content):
    """Clean the text returned by the `wikipedia` Python library.

    This is the per-language hook for discarding unwanted syntax or UI
    text that was overlooked by the generic cleaning. No Portuguese-specific
    cleaning is implemented, so the content is returned unchanged.

    :param content: page text as returned by the `wikipedia` library.
    :return: the cleaned text (currently identical to `content`).
    """
    # Do your garbage text cleaning here.
    return content
|
||||||
@ -19,6 +19,7 @@ set(
|
|||||||
LangModels/LangHebrewModel.cpp
|
LangModels/LangHebrewModel.cpp
|
||||||
LangModels/LangLithuanianModel.cpp
|
LangModels/LangLithuanianModel.cpp
|
||||||
LangModels/LangLatvianModel.cpp
|
LangModels/LangLatvianModel.cpp
|
||||||
|
LangModels/LangPortugueseModel.cpp
|
||||||
LangModels/LangSpanishModel.cpp
|
LangModels/LangSpanishModel.cpp
|
||||||
LangModels/LangThaiModel.cpp
|
LangModels/LangThaiModel.cpp
|
||||||
LangModels/LangTurkishModel.cpp
|
LangModels/LangTurkishModel.cpp
|
||||||
|
|||||||
237
src/LangModels/LangPortugueseModel.cpp
Normal file
237
src/LangModels/LangPortugueseModel.cpp
Normal file
@ -0,0 +1,237 @@
|
|||||||
|
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
|
||||||
|
/* ***** BEGIN LICENSE BLOCK *****
|
||||||
|
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||||
|
*
|
||||||
|
* The contents of this file are subject to the Mozilla Public License Version
|
||||||
|
* 1.1 (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
* http://www.mozilla.org/MPL/
|
||||||
|
*
|
||||||
|
* Software distributed under the License is distributed on an "AS IS" basis,
|
||||||
|
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||||
|
* for the specific language governing rights and limitations under the
|
||||||
|
* License.
|
||||||
|
*
|
||||||
|
* The Original Code is Mozilla Communicator client code.
|
||||||
|
*
|
||||||
|
* The Initial Developer of the Original Code is
|
||||||
|
* Netscape Communications Corporation.
|
||||||
|
* Portions created by the Initial Developer are Copyright (C) 1998
|
||||||
|
* the Initial Developer. All Rights Reserved.
|
||||||
|
*
|
||||||
|
* Contributor(s):
|
||||||
|
*
|
||||||
|
* Alternatively, the contents of this file may be used under the terms of
|
||||||
|
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||||
|
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||||
|
* in which case the provisions of the GPL or the LGPL are applicable instead
|
||||||
|
* of those above. If you wish to allow use of your version of this file only
|
||||||
|
* under the terms of either the GPL or the LGPL, and not to allow others to
|
||||||
|
* use your version of this file under the terms of the MPL, indicate your
|
||||||
|
* decision by deleting the provisions above and replace them with the notice
|
||||||
|
* and other provisions required by the GPL or the LGPL. If you do not delete
|
||||||
|
* the provisions above, a recipient may use your version of this file under
|
||||||
|
* the terms of any one of the MPL, the GPL or the LGPL.
|
||||||
|
*
|
||||||
|
* ***** END LICENSE BLOCK ***** */
|
||||||
|
|
||||||
|
#include "../nsSBCharSetProber.h"
|
||||||
|
|
||||||
|
/********* Language model for: Portuguese *********/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Generated by BuildLangModel.py
|
||||||
|
* On: 2016-09-20 23:47:27.348423
|
||||||
|
**/
|
||||||
|
|
||||||
|
/* Character Mapping Table:
|
||||||
|
* ILL: illegal character.
|
||||||
|
* CTR: control character specific to the charset.
|
||||||
|
* RET: line feed or carriage return.
|
||||||
|
* SYM: symbol (punctuation) that does not belong to word.
|
||||||
|
* NUM: 0 - 9.
|
||||||
|
*
|
||||||
|
* Other characters are ordered by probabilities
|
||||||
|
* (0 is the most common character in the language).
|
||||||
|
*
|
||||||
|
* Orders are generic to a language. So the codepoint with order X in
|
||||||
|
* CHARSET1 maps to the same character as the codepoint with the same
|
||||||
|
* order X in CHARSET2 for the same language.
|
||||||
|
* As such, some orders may be missing from a given charset. For instance the
|
||||||
|
* ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1
|
||||||
|
* even though they are both used for French. Same for the euro sign.
|
||||||
|
*/
|
||||||
|
static const unsigned char Iso_8859_1_CharToOrderMap[] =
|
||||||
|
{
|
||||||
|
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
|
||||||
|
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
|
||||||
|
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
|
||||||
|
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
|
||||||
|
SYM, 0, 17, 10, 7, 1, 16, 14, 18, 4, 28, 34, 12, 9, 6, 2, /* 4X */
|
||||||
|
13, 21, 5, 3, 8, 11, 15, 32, 24, 31, 26,SYM,SYM,SYM,SYM,SYM, /* 5X */
|
||||||
|
SYM, 0, 17, 10, 7, 1, 16, 14, 18, 4, 28, 34, 12, 9, 6, 2, /* 6X */
|
||||||
|
13, 21, 5, 3, 8, 11, 15, 32, 24, 31, 26,SYM,SYM,SYM,SYM,CTR, /* 7X */
|
||||||
|
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */
|
||||||
|
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */
|
||||||
|
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */
|
||||||
|
SYM,SYM,SYM,SYM,SYM, 51,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */
|
||||||
|
36, 25, 35, 20, 41, 42, 43, 22, 39, 19, 29, 44, 52, 23, 45, 47, /* CX */
|
||||||
|
48, 53, 46, 27, 37, 30, 38,SYM, 54, 55, 33, 56, 40, 57, 58, 49, /* DX */
|
||||||
|
36, 25, 35, 20, 41, 42, 43, 22, 39, 19, 29, 44, 59, 23, 45, 47, /* EX */
|
||||||
|
48, 60, 46, 27, 37, 30, 38,SYM, 61, 62, 33, 63, 40, 64, 65, 50, /* FX */
|
||||||
|
};
|
||||||
|
/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
|
||||||
|
|
||||||
|
static const unsigned char Iso_8859_9_CharToOrderMap[] =
|
||||||
|
{
|
||||||
|
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
|
||||||
|
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
|
||||||
|
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
|
||||||
|
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
|
||||||
|
SYM, 0, 17, 10, 7, 1, 16, 14, 18, 4, 28, 34, 12, 9, 6, 2, /* 4X */
|
||||||
|
13, 21, 5, 3, 8, 11, 15, 32, 24, 31, 26,SYM,SYM,SYM,SYM,SYM, /* 5X */
|
||||||
|
SYM, 0, 17, 10, 7, 1, 16, 14, 18, 4, 28, 34, 12, 9, 6, 2, /* 6X */
|
||||||
|
13, 21, 5, 3, 8, 11, 15, 32, 24, 31, 26,SYM,SYM,SYM,SYM,CTR, /* 7X */
|
||||||
|
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */
|
||||||
|
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */
|
||||||
|
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */
|
||||||
|
SYM,SYM,SYM,SYM,SYM, 66,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */
|
||||||
|
36, 25, 35, 20, 41, 42, 43, 22, 39, 19, 29, 44, 67, 23, 45, 47, /* CX */
|
||||||
|
68, 69, 46, 27, 37, 30, 38,SYM, 70, 71, 33, 72, 40, 73, 74, 49, /* DX */
|
||||||
|
36, 25, 35, 20, 41, 42, 43, 22, 39, 19, 29, 44, 75, 23, 45, 47, /* EX */
|
||||||
|
76, 77, 46, 27, 37, 30, 38,SYM, 78, 79, 33, 80, 40, 81, 82, 50, /* FX */
|
||||||
|
};
|
||||||
|
/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
|
||||||
|
|
||||||
|
static const unsigned char Iso_8859_15_CharToOrderMap[] =
|
||||||
|
{
|
||||||
|
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
|
||||||
|
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
|
||||||
|
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
|
||||||
|
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
|
||||||
|
SYM, 0, 17, 10, 7, 1, 16, 14, 18, 4, 28, 34, 12, 9, 6, 2, /* 4X */
|
||||||
|
13, 21, 5, 3, 8, 11, 15, 32, 24, 31, 26,SYM,SYM,SYM,SYM,SYM, /* 5X */
|
||||||
|
SYM, 0, 17, 10, 7, 1, 16, 14, 18, 4, 28, 34, 12, 9, 6, 2, /* 6X */
|
||||||
|
13, 21, 5, 3, 8, 11, 15, 32, 24, 31, 26,SYM,SYM,SYM,SYM,CTR, /* 7X */
|
||||||
|
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */
|
||||||
|
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */
|
||||||
|
SYM,SYM,SYM,SYM,SYM,SYM, 83,SYM, 84,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */
|
||||||
|
SYM,SYM,SYM,SYM, 85, 86,SYM,SYM, 87,SYM,SYM,SYM, 88, 89, 50,SYM, /* BX */
|
||||||
|
36, 25, 35, 20, 41, 42, 43, 22, 39, 19, 29, 44, 90, 23, 45, 47, /* CX */
|
||||||
|
48, 91, 46, 27, 37, 30, 38,SYM, 92, 93, 33, 94, 40, 95, 96, 49, /* DX */
|
||||||
|
36, 25, 35, 20, 41, 42, 43, 22, 39, 19, 29, 44, 97, 23, 45, 47, /* EX */
|
||||||
|
48, 98, 46, 27, 37, 30, 38,SYM, 99,100, 33,101, 40,102,103, 50, /* FX */
|
||||||
|
};
|
||||||
|
/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
|
||||||
|
|
||||||
|
static const unsigned char Windows_1252_CharToOrderMap[] =
|
||||||
|
{
|
||||||
|
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
|
||||||
|
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
|
||||||
|
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
|
||||||
|
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
|
||||||
|
SYM, 0, 17, 10, 7, 1, 16, 14, 18, 4, 28, 34, 12, 9, 6, 2, /* 4X */
|
||||||
|
13, 21, 5, 3, 8, 11, 15, 32, 24, 31, 26,SYM,SYM,SYM,SYM,SYM, /* 5X */
|
||||||
|
SYM, 0, 17, 10, 7, 1, 16, 14, 18, 4, 28, 34, 12, 9, 6, 2, /* 6X */
|
||||||
|
13, 21, 5, 3, 8, 11, 15, 32, 24, 31, 26,SYM,SYM,SYM,SYM,CTR, /* 7X */
|
||||||
|
SYM,ILL,SYM,104,SYM,SYM,SYM,SYM,SYM,SYM,105,SYM,106,ILL,107,ILL, /* 8X */
|
||||||
|
ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,108,SYM,109,ILL,110, 50, /* 9X */
|
||||||
|
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */
|
||||||
|
SYM,SYM,SYM,SYM,SYM,111,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */
|
||||||
|
36, 25, 35, 20, 41, 42, 43, 22, 39, 19, 29, 44,112, 23, 45, 47, /* CX */
|
||||||
|
48,113, 46, 27, 37, 30, 38,SYM,114,115, 33,116, 40,117,118, 49, /* DX */
|
||||||
|
36, 25, 35, 20, 41, 42, 43, 22, 39, 19, 29, 44,119, 23, 45, 47, /* EX */
|
||||||
|
48,120, 46, 27, 37, 30, 38,SYM,121,122, 33,123, 40,124,125, 50, /* FX */
|
||||||
|
};
|
||||||
|
/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
|
||||||
|
|
||||||
|
|
||||||
|
/* Model Table:
|
||||||
|
* Total sequences: 891
|
||||||
|
* First 512 sequences: 0.9953179582313172
|
||||||
|
* Next 512 sequences (512-1024): 0.0046820417686827855
|
||||||
|
* Rest: 2.42861286636753e-17
|
||||||
|
* Negative sequences: TODO
|
||||||
|
*/
|
||||||
|
static const PRUint8 PortugueseLangModel[] =
|
||||||
|
{
|
||||||
|
2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,0,3,3,3,3,0,3,2,3,0,0,3,2,2,3,0,0,0,
|
||||||
|
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,0,2,3,3,2,3,2,3,2,3,0,2,3,3,2,2,2,0,0,
|
||||||
|
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,3,2,3,2,3,2,3,0,2,3,3,0,3,0,0,0,
|
||||||
|
3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,2,3,3,3,3,3,3,0,3,0,3,2,3,0,2,3,3,2,2,3,0,0,0,
|
||||||
|
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,3,3,3,0,3,3,3,3,2,3,3,2,2,2,3,2,0,0,
|
||||||
|
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,2,3,2,3,3,3,2,2,3,3,0,3,
|
||||||
|
3,3,3,3,3,2,3,3,3,2,3,3,2,2,3,3,3,2,3,3,3,3,3,3,2,3,3,3,3,2,0,3,2,3,3,2,0,3,
|
||||||
|
3,3,3,3,3,3,2,3,2,3,2,3,3,2,3,2,2,2,2,3,3,2,0,3,0,3,0,3,2,3,2,3,3,3,0,2,0,2,
|
||||||
|
3,3,3,3,3,3,3,0,3,3,3,3,3,2,2,2,2,2,3,3,3,0,0,3,0,3,2,3,0,3,2,3,2,2,2,3,0,3,
|
||||||
|
3,3,3,3,3,2,3,2,2,3,2,3,2,3,2,0,2,3,0,3,3,2,0,3,0,3,2,3,0,2,2,3,2,3,0,3,0,3,
|
||||||
|
3,3,3,2,3,3,3,2,3,3,3,3,3,2,2,0,2,2,3,3,2,2,3,3,0,3,2,3,0,3,2,3,0,2,3,3,0,2,
|
||||||
|
3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,2,3,2,2,3,3,3,2,3,0,3,3,0,2,2,0,2,0,0,0,
|
||||||
|
3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,3,0,3,2,3,0,3,0,3,2,2,2,3,0,3,
|
||||||
|
3,3,3,3,3,3,2,2,3,0,2,3,3,3,0,0,0,2,3,3,2,2,3,3,0,3,2,3,0,2,2,2,0,3,0,2,0,2,
|
||||||
|
3,3,3,3,3,3,3,2,2,3,2,3,3,2,2,2,0,2,3,3,2,0,0,2,0,3,0,2,0,3,2,3,2,2,0,2,0,0,
|
||||||
|
3,3,3,0,3,3,0,2,0,0,0,3,0,0,0,2,0,0,0,3,2,0,0,3,0,3,0,2,0,3,2,0,0,0,0,2,0,2,
|
||||||
|
3,3,3,2,3,3,0,2,2,2,2,3,3,2,2,0,3,2,0,3,0,0,0,3,0,2,0,3,0,3,0,2,0,2,0,0,0,2,
|
||||||
|
3,3,3,3,3,3,3,3,3,2,2,3,3,2,2,2,3,2,2,3,2,0,0,2,0,2,2,2,3,2,0,2,2,2,0,0,0,0,
|
||||||
|
3,3,3,3,3,3,3,2,3,2,0,3,3,0,0,0,2,2,2,2,3,0,0,2,0,3,0,2,0,0,3,3,2,0,2,0,0,0,
|
||||||
|
2,2,2,3,2,3,3,3,3,3,3,2,3,3,2,2,2,0,0,0,0,0,0,0,2,0,2,0,0,0,0,0,0,0,0,0,0,0,
|
||||||
|
0,2,3,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||||
|
2,0,0,2,0,0,0,0,0,0,0,3,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||||
|
3,0,3,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,3,0,0,2,0,0,0,0,
|
||||||
|
3,0,3,3,0,3,3,3,3,3,3,0,3,3,3,3,3,3,0,0,0,2,2,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,
|
||||||
|
3,3,3,2,3,0,0,0,3,0,3,3,2,3,0,3,2,0,2,2,2,0,0,2,3,2,0,2,2,0,2,0,0,0,0,0,0,2,
|
||||||
|
0,0,0,3,0,3,2,2,3,0,3,2,3,3,3,3,3,3,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||||
|
3,3,3,0,3,2,2,0,0,2,2,3,0,0,0,0,0,2,2,2,2,0,0,0,0,2,2,2,0,2,2,0,2,0,0,2,0,0,
|
||||||
|
0,0,0,3,2,3,3,3,3,3,3,0,3,3,3,2,3,2,0,0,0,2,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||||
|
3,3,3,2,2,0,2,0,0,0,2,3,0,2,0,0,0,0,0,0,2,0,0,0,0,3,0,0,0,0,0,0,0,2,2,0,0,0,
|
||||||
|
0,0,0,3,0,0,3,0,2,3,0,2,2,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||||
|
0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||||
|
3,3,3,3,2,3,2,2,3,2,3,2,3,2,2,0,2,2,2,0,0,0,0,0,3,0,2,0,2,0,0,0,2,0,2,0,0,0,
|
||||||
|
3,3,3,2,3,2,2,2,3,2,2,2,2,0,0,2,0,2,3,0,0,0,0,0,0,0,2,0,0,0,0,2,2,0,2,0,0,0,
|
||||||
|
0,0,0,3,0,2,3,3,2,3,2,0,3,2,0,2,0,3,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,
|
||||||
|
3,3,3,2,3,2,2,0,0,3,2,2,2,0,2,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,2,2,0,2,0,0,0,
|
||||||
|
0,0,0,0,0,0,3,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||||
|
0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||||
|
0,0,2,2,0,0,3,2,2,3,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
/* Portuguese sequence model for ISO-8859-1: pairs the ISO-8859-1
 * character-to-order table with the shared Portuguese model table.
 * NOTE(review): per-field meanings are inferred from the generator output;
 * confirm against the SequenceModel declaration in nsSBCharSetProber.h. */
const SequenceModel Iso_8859_1PortugueseModel =
{
  Iso_8859_1_CharToOrderMap,
  PortugueseLangModel,
  38, /* the model table is 38 x 38 */
  static_cast<float>(0.9953179582313172), /* "First 512 sequences" ratio */
  PR_TRUE,
  "ISO-8859-1"
};
|
||||||
|
|
||||||
|
/* Portuguese sequence model for ISO-8859-9: pairs the ISO-8859-9
 * character-to-order table with the shared Portuguese model table.
 * NOTE(review): per-field meanings are inferred from the generator output;
 * confirm against the SequenceModel declaration in nsSBCharSetProber.h. */
const SequenceModel Iso_8859_9PortugueseModel =
{
  Iso_8859_9_CharToOrderMap,
  PortugueseLangModel,
  38, /* the model table is 38 x 38 */
  static_cast<float>(0.9953179582313172), /* "First 512 sequences" ratio */
  PR_TRUE,
  "ISO-8859-9"
};
|
||||||
|
|
||||||
|
/* Portuguese sequence model for ISO-8859-15: pairs the ISO-8859-15
 * character-to-order table with the shared Portuguese model table.
 * NOTE(review): per-field meanings are inferred from the generator output;
 * confirm against the SequenceModel declaration in nsSBCharSetProber.h. */
const SequenceModel Iso_8859_15PortugueseModel =
{
  Iso_8859_15_CharToOrderMap,
  PortugueseLangModel,
  38, /* the model table is 38 x 38 */
  static_cast<float>(0.9953179582313172), /* "First 512 sequences" ratio */
  PR_TRUE,
  "ISO-8859-15"
};
|
||||||
|
|
||||||
|
/* Portuguese sequence model for Windows-1252: pairs the Windows-1252
 * character-to-order table with the shared Portuguese model table.
 * NOTE(review): per-field meanings are inferred from the generator output;
 * confirm against the SequenceModel declaration in nsSBCharSetProber.h. */
const SequenceModel Windows_1252PortugueseModel =
{
  Windows_1252_CharToOrderMap,
  PortugueseLangModel,
  38, /* the model table is 38 x 38 */
  static_cast<float>(0.9953179582313172), /* "First 512 sequences" ratio */
  PR_TRUE,
  "WINDOWS-1252"
};
|
||||||
@ -115,6 +115,11 @@ nsSBCSGroupProber::nsSBCSGroupProber()
|
|||||||
|
|
||||||
mProbers[36] = new nsSingleByteCharSetProber(&Iso_8859_13LatvianModel);
|
mProbers[36] = new nsSingleByteCharSetProber(&Iso_8859_13LatvianModel);
|
||||||
|
|
||||||
|
mProbers[37] = new nsSingleByteCharSetProber(&Iso_8859_1PortugueseModel);
|
||||||
|
mProbers[38] = new nsSingleByteCharSetProber(&Iso_8859_9PortugueseModel);
|
||||||
|
mProbers[39] = new nsSingleByteCharSetProber(&Iso_8859_15PortugueseModel);
|
||||||
|
mProbers[40] = new nsSingleByteCharSetProber(&Windows_1252PortugueseModel);
|
||||||
|
|
||||||
Reset();
|
Reset();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -40,7 +40,7 @@
|
|||||||
#define nsSBCSGroupProber_h__
|
#define nsSBCSGroupProber_h__
|
||||||
|
|
||||||
|
|
||||||
#define NUM_OF_SBCS_PROBERS 37
|
#define NUM_OF_SBCS_PROBERS 41
|
||||||
|
|
||||||
class nsCharSetProber;
|
class nsCharSetProber;
|
||||||
class nsSBCSGroupProber: public nsCharSetProber {
|
class nsSBCSGroupProber: public nsCharSetProber {
|
||||||
|
|||||||
@ -176,5 +176,10 @@ extern const SequenceModel Iso_8859_13LithuanianModel;
|
|||||||
|
|
||||||
extern const SequenceModel Iso_8859_13LatvianModel;
|
extern const SequenceModel Iso_8859_13LatvianModel;
|
||||||
|
|
||||||
|
extern const SequenceModel Iso_8859_1PortugueseModel;
|
||||||
|
extern const SequenceModel Iso_8859_9PortugueseModel;
|
||||||
|
extern const SequenceModel Iso_8859_15PortugueseModel;
|
||||||
|
extern const SequenceModel Windows_1252PortugueseModel;
|
||||||
|
|
||||||
#endif /* nsSingleByteCharSetProber_h__ */
|
#endif /* nsSingleByteCharSetProber_h__ */
|
||||||
|
|
||||||
|
|||||||
6
test/pt/iso-8859-1.txt
Normal file
6
test/pt/iso-8859-1.txt
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
Albertossauro (Albertosaurus sp., que significa "lagarto de Alberta" no Canadá),
|
||||||
|
foi um género de dinossauro carnívoro e bípede presente no fim do período
|
||||||
|
Cretáceo. Media cerca de 8 a 9 metros de comprimento, 3 metros de altura e
|
||||||
|
pesava menos de 2 toneladas. O Albertossauro viveu na América do Norte e foi
|
||||||
|
descoberto no ano de 1884 por Joseph Burr Tyrrell em Alberta, no Canadá, local
|
||||||
|
ao qual deve seu nome.
|
||||||
6
test/pt/utf-8.txt
Normal file
6
test/pt/utf-8.txt
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
Albertossauro (Albertosaurus sp., que significa "lagarto de Alberta" no Canadá),
|
||||||
|
foi um género de dinossauro carnívoro e bípede presente no fim do período
|
||||||
|
Cretáceo. Media cerca de 8 a 9 metros de comprimento, 3 metros de altura e
|
||||||
|
pesava menos de 2 toneladas. O Albertossauro viveu na América do Norte e foi
|
||||||
|
descoberto no ano de 1884 por Joseph Burr Tyrrell em Alberta, no Canadá, local
|
||||||
|
ao qual deve seu nome.
|
||||||
Loading…
x
Reference in New Issue
Block a user