From 0270b1e856d3bf8815d938359663f52556d55488 Mon Sep 17 00:00:00 2001 From: Jehan Date: Thu, 3 Dec 2015 21:22:30 +0100 Subject: [PATCH] Adding French Windows-1252 support. --- script/BuildLangModelLogs/LangFrenchModel.log | 256 +++++++----------- script/charsets/windows-1252.py | 76 ++++++ script/langs/fr.py | 2 +- src/LangModels/LangFrenchModel.cpp | 169 +++++++----- src/nsSBCSGroupProber.cpp | 5 +- src/nsSBCSGroupProber.h | 2 +- src/nsSBCharSetProber.h | 1 + 7 files changed, 274 insertions(+), 237 deletions(-) create mode 100644 script/charsets/windows-1252.py diff --git a/script/BuildLangModelLogs/LangFrenchModel.log b/script/BuildLangModelLogs/LangFrenchModel.log index e75ee7f..ff6d14a 100644 --- a/script/BuildLangModelLogs/LangFrenchModel.log +++ b/script/BuildLangModelLogs/LangFrenchModel.log @@ -1,188 +1,116 @@ = Logs of language model for French (fr) = - Generated by BuildLangModel.py -- Started: 2015-11-30 18:53:23.881008 +- Started: 2015-12-03 21:07:37.508739 - Maximum depth: 2 -- Max number of pages: 10 +- Max number of pages: 50 == Parsed pages == Wikipédia:Accueil_principal (revision 115957655) Bœuf (animal) (revision 115500130) -1672 (revision 120907902) -1727 (revision 120908296) +1500 av. J.-C. (revision 110583603) +1898 dans les chemins de fer (revision 106801806) +1913 dans les chemins de fer (revision 112852042) +1974 dans les chemins de fer (revision 90170756) +1er décembre (revision 121012781) +2009 dans les chemins de fer (revision 107042206) +2011 dans les chemins de fer (revision 109560866) 24 novembre (revision 120782024) 26 novembre (revision 120833172) -27 novembre (revision 120860032) -28 novembre (revision 120900893) -30 novembre (revision 120934923) -Amsterdam (revision 120834895) -Amérique (revision 120916912) -An mil (revision 120416538) -Ancien Régime (revision 120708739) -Anjou (revision 120590957) -António Costa (revision 120928729) -Armée de l'air turque (revision 120764207) -Artémise II (revision 120920820) -Attentat du 24 novembre 2015 à Tunis (revision 120924574) -Barbro Hiort af Ornäs (revision 120933311) -Bataille d’Attu (revision 120942542) -Bretagne (revision 120828180) -Candé (revision 120928722) -Canton de Candé (revision 120383860) -Chef-lieu (revision 119340707) -Chouannerie (revision 119799524) -Commune (France) (revision 120627882) -Conférence de Paris de 2015 sur le climat (revision 120944002) +29 novembre (revision 120918160) +2 décembre (revision 121025437) +30 novembre (revision 120947714) +3 décembre (revision 121030621) +Amphibien (revision 120332329) +Angleterre (revision 120784240) +Anne-Josèphe Théroigne de Méricourt (revision 121009789) +Années 1930 (revision 120558236) +Antonio Troyo Calderón (revision 121028881) +António Costa (revision 120993829) +Attentat du 24 novembre 2015 à Tunis (revision 121015161) +Balard (métro de Paris) (revision 118979088) +Bois de Vincennes (revision 120822909) +Buse à tête blanche (revision 121009499) +Californie (revision 120922479) +Charenton-le-Pont (revision 120210025) +Charenton - Écoles (métro de Paris) (revision 108644873) +Chronique médiévale (revision 100253272) +Concorde (métro de Paris) (revision 120856751) +Conférence de Paris de 2015 sur le climat (revision 121029398) Crise de la dette publique grecque (revision 120905208) Crise entre la Colombie et le Venezuela de 2015 (revision 120857143) -Crise migratoire en Europe (revision 120906358) -Crise russo-turque de 2015 (revision 120936864) -Deuxième guerre civile libyenne (revision 120673125) -Déesse mère (revision 120904195) -Départements français (revision 120873309) -Effet Shapiro (revision 120893782) -Eldar Riazanov (revision 120924339) -Fatima Mernissi (revision 120942794) +Crise migratoire en Europe (revision 121002308) +Crise russo-turque de 2015 (revision 121030214) +Créteil (revision 120684618) +Créteil - Préfecture (métro de Paris) (revision 113486387) +Deuxième guerre civile libyenne (revision 121027704) +Devise (monnaie) (revision 121015771) +Droits de tirage spéciaux (revision 121009135) +Décembre 2015 (revision 121010045) +Département français (revision 120993190) +Eldar Riazanov (revision 120996396) +Enfants verts de Woolpit (revision 121002303) +Ernst Larsen (revision 121026772) +Fatima Mernissi (revision 120992271) +Fejervarya cancrivora (revision 120353807) +Fonds monétaire international (revision 120754406) Français (revision 120883858) -Gerry Byrne (football) (revision 120943526) -Guerre civile sud-soudanaise (revision 120672963) -Guerre civile syrienne (revision 120868598) -Guerre d'Afghanistan (depuis 2015) (revision 120675052) -Guerre du Donbass (revision 120862085) -Guerre du Yémen (depuis 2001) (revision 118472483) -Insurrection djihadiste au Nigeria (revision 120550223) -Irwin Shapiro (revision 116730530) -Ismaïl ben Chérif (revision 120930731) -Ivan Hlevnjak (revision 120917619) -Jean Corti (revision 120935599) -Jean Joubert (revision 120924134) -Karashima Noboru (revision 120892854) -Latin (revision 120360207) -Luc Bondy (revision 120941142) -Maine-et-Loire (revision 120890165) -Marches de Bretagne (revision 115772332) -Mark Behr (revision 120943649) -Maroc (revision 120937137) -Maurice Strong (revision 120927161) -Mausole (revision 120904648) -Moyen Âge (revision 120943615) -Novembre 2015 (revision 120866496) -Olene S. Walker (revision 120927070) -Paternité (revision 119371049) -Pays de la Loire (revision 120719853) -Philippe II Auguste (revision 120910593) -Philippe Washer (revision 120939362) -Premier ministre de Portugal (revision 120888501) -Relativité générale (revision 120814809) -Régions françaises (revision 120692851) -Seconde Guerre mondiale (revision 120884001) -Seconde guerre civile irakienne (revision 120893282) -Shigeru Mizuki (revision 120931351) -Soukhoï Su-24 (revision 120892538) -Spuistraat (revision 119667601) -Syrie (revision 120692724) -Tahir Elçi (revision 120942499) -Tunis (revision 120628797) -Vague de violence israélo-palestinienne de l'automne 2015 (revision 120927782) -Wiki (revision 120671138) -Wikimedia Foundation (revision 120519147) -Wikipédia en français (revision 120692561) -XVIIIe siècle (revision 119843235) -XVIIe siècle (revision 120773755) -Église de Jésus-Christ des saints des derniers jours (revision 120924507) -Agriculture (revision 120943777) -Anesthésie (revision 120319446) -Animal de trait (revision 120819989) -Bien-être animal (revision 120205455) -Bière (revision 119961318) -Bos taurus (revision 119683704) -Bête de somme (revision 117842569) -Bœuf Gras (revision 119942055) -Bœuf de Kobe (revision 120829709) -Castration (revision 119751330) -Chapon (revision 114928344) -Charrette (revision 120909407) -Charrue (revision 120819690) -Colonisation (revision 120146837) -Edme Gaulle (revision 118241504) -Europe de l'Ouest (revision 120854797) -Géant-Bœuf du Carnaval de Paris (revision 118480900) -Hongre (revision 120607208) -Hypoxie (revision 118470557) -Japon (revision 120742182) -Labour (revision 120144019) -Marché des Blancs-Manteaux (revision 106807185) -Monde musulman (revision 120793714) -Mâle (biologie) (revision 111721849) -Mésopotamie (revision 120642895) -Promenade du Bœuf Gras au Carnaval de Paris (revision 120874240) -Rue des Hospitalières-Saint-Gervais (revision 107834996) -Takayama (revision 118810594) -Taureau (revision 120459397) -Testicule (revision 120432335) -Testostérone (revision 119909685) -Traction animale (revision 120819989) -Traction bovine (revision 111651361) -Traîneau (revision 120604907) -Viande (revision 120600247) -Viande bovine (revision 119480442) -Wagyu (revision 120910460) -XXe siècle (revision 120793535) -Élevage bovin (revision 120877235) +Freyja (revision 121028677) +Fusillade du 2 décembre 2015 en Californie (revision 121030353) == End of Parsed pages == -- Wikipedia parsing ended at: 2015-11-30 19:05:38.631196 +- Wikipedia parsing ended at: 2015-12-03 21:10:27.682316 -58 characters appeared 2625348 times. +56 characters appeared 728239 times. First 38 characters: -[ 0] Char e: 14.297990209297968 % -[ 1] Char s: 8.062245462315854 % -[ 2] Char a: 8.006862328346566 % -[ 3] Char n: 7.458401705221555 % -[ 4] Char i: 7.3982572976992 % -[ 5] Char r: 6.902246864034788 % -[ 6] Char t: 6.851777364372266 % -[ 7] Char l: 5.928699738091865 % -[ 8] Char o: 5.30996271732357 % -[ 9] Char u: 5.181065519694913 % -[10] Char d: 4.153773137884959 % -[11] Char c: 3.1908912647009084 % -[12] Char m: 2.8650297027289335 % -[13] Char p: 2.801228637117822 % -[14] Char é: 2.4742624596815355 % -[15] Char v: 1.2647847066369868 % -[16] Char g: 1.2577761119668707 % -[17] Char f: 1.1079293107047143 % -[18] Char b: 1.030415777260767 % -[19] Char h: 0.9089842565633204 % -[20] Char q: 0.7969610124067362 % -[21] Char x: 0.43415196766295366 % -[22] Char è: 0.398613821862854 % -[23] Char à: 0.38916745513356704 % -[24] Char y: 0.3763310616344957 % -[25] Char j: 0.31298707828447886 % -[26] Char k: 0.20576319786938724 % -[27] Char z: 0.11880329769615304 % -[28] Char ê: 0.11221369509870692 % -[29] Char ç: 0.07610419647223911 % -[30] Char w: 0.06574366522076312 % -[31] Char ô: 0.04845071967602009 % -[32] Char â: 0.0448321517756884 % -[33] Char œ: 0.03778546691714774 % -[34] Char î: 0.03725220427920413 % -[35] Char ï: 0.02704403378142631 % -[36] Char û: 0.02285411305472646 % -[37] Char ù: 0.02034016061870655 % +[ 0] Char e: 14.339660468609894 % +[ 1] Char s: 7.954806045817375 % +[ 2] Char a: 7.864176458552756 % +[ 3] Char n: 7.572102015959047 % +[ 4] Char i: 7.34154583866011 % +[ 5] Char r: 7.020222756540091 % +[ 6] Char t: 6.833608197308851 % +[ 7] Char l: 5.9446143367768 % +[ 8] Char o: 5.386418469760614 % +[ 9] Char u: 5.024861343597363 % +[10] Char d: 4.169235649285468 % +[11] Char c: 3.4240132703686568 % +[12] Char p: 2.8882001650557028 % +[13] Char m: 2.803063280049544 % +[14] Char é: 2.498355622261373 % +[15] Char g: 1.277739862874688 % +[16] Char v: 1.1729665672945284 % +[17] Char f: 1.1614318925517584 % +[18] Char b: 0.9925312981040565 % +[19] Char h: 0.8580974103282026 % +[20] Char q: 0.7740590657737364 % +[21] Char x: 0.43570860665248634 % +[22] Char y: 0.41044217626356183 % +[23] Char è: 0.4100302235941771 % +[24] Char à: 0.363479571953713 % +[25] Char j: 0.29591933417463223 % +[26] Char k: 0.1359443808969308 % +[27] Char ç: 0.11685724054877589 % +[28] Char ê: 0.11218844362908331 % +[29] Char z: 0.10738232915292918 % +[30] Char w: 0.08239053387692777 % +[31] Char ô: 0.04792382720507965 % +[32] Char â: 0.03364280133307884 % +[33] Char î: 0.029385957082770905 % +[34] Char û: 0.024854477719539875 % +[35] Char œ: 0.021146903695078125 % +[36] Char ï: 0.017851282340001016 % +[37] Char ù: 0.015242248767231636 % -The first 38 characters have an accumulated ratio of 0.9997798387109063. +The first 38 characters have an accumulated ratio of 0.999621003544166. -1149 sequences found. +914 sequences found. -First 512 (typical positive ratio): 0.997044499777764 -Next 512 (512-1024): 3.8090188424544096e-07 -Rest: 5.974086801089403e-05 +First 512 (typical positive ratio): 0.997057879992383 +Next 512 (512-1024): 1.3731755646154627e-06 +Rest: 3.8163916471489756e-17 -- Processing end: 2015-11-30 19:05:38.842420 +- Processing end: 2015-12-03 21:10:27.987730 diff --git a/script/charsets/windows-1252.py b/script/charsets/windows-1252.py new file mode 100644 index 0000000..f308087 --- /dev/null +++ b/script/charsets/windows-1252.py @@ -0,0 +1,76 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +# ##### BEGIN LICENSE BLOCK ##### +# Version: MPL 1.1/GPL 2.0/LGPL 2.1 +# +# The contents of this file are subject to the Mozilla Public License Version +# 1.1 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License +# for the specific language governing rights and limitations under the +# License. +# +# The Original Code is Mozilla Universal charset detector code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 2001 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Jehan +# +# Alternatively, the contents of this file may be used under the terms of +# either the GNU General Public License Version 2 or later (the "GPL"), or +# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +# in which case the provisions of the GPL or the LGPL are applicable instead +# of those above. If you wish to allow use of your version of this file only +# under the terms of either the GPL or the LGPL, and not to allow others to +# use your version of this file under the terms of the MPL, indicate your +# decision by deleting the provisions above and replace them with the notice +# and other provisions required by the GPL or the LGPL. If you do not delete +# the provisions above, a recipient may use your version of this file under +# the terms of any one of the MPL, the GPL or the LGPL. +# +# ##### END LICENSE BLOCK ##### + +from codepoints import * + +name = 'WINDOWS-1252' +aliases = ['CP-1252', 'cswindows1252'] + +language = \ +{ + # Languages with complete coverage. + # Basically a mix of ISO-8859-1 and ISO-8859-15. + 'complete': [ 'af', 'sq', 'eu', 'br', 'co', 'da', 'en', 'fo', 'gl', 'de', + 'is', 'id', 'it', 'ku', 'leon1250', 'lb', 'ms', 'gv', 'no', + 'oc', 'pt', 'rm', 'gd', 'es', 'sw', 'sv', 'wa', 'ca', 'et', + 'fi', 'fr', 'ga', 'la' ], + 'incomplete': [] +} + +# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF # +charmap = \ +[ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X + SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X + SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X + SYM,ILL,SYM,LET,SYM,SYM,SYM,SYM,SYM,SYM,LET,SYM,LET,ILL,LET,ILL, # 8X + ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,LET,SYM,LET,ILL,LET,LET, # 9X + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # AX + SYM,SYM,SYM,SYM,SYM,LET,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # BX + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # CX + LET,LET,LET,LET,LET,LET,LET,SYM,LET,LET,LET,LET,LET,LET,LET,LET, # DX + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # EX + LET,LET,LET,LET,LET,LET,LET,SYM,LET,LET,LET,LET,LET,LET,LET,LET, # FX +] diff --git a/script/langs/fr.py b/script/langs/fr.py index 4e843a6..4bbc59d 100644 --- a/script/langs/fr.py +++ b/script/langs/fr.py @@ -50,7 +50,7 @@ code = 'fr' # ASCII characters are also used in French. use_ascii = True # The charsets we want to support and create data for. -charsets = ['ISO-8859-15', 'ISO-8859-1'] +charsets = ['ISO-8859-15', 'ISO-8859-1', 'WINDOWS-1252'] ## Optional Properties ## diff --git a/src/LangModels/LangFrenchModel.cpp b/src/LangModels/LangFrenchModel.cpp index d0e5f41..4c05498 100644 --- a/src/LangModels/LangFrenchModel.cpp +++ b/src/LangModels/LangFrenchModel.cpp @@ -41,7 +41,7 @@ /** * Generated by BuildLangModel.py - * On: 2015-11-30 19:05:38.632969 + * On: 2015-12-03 21:10:27.685575 **/ /* Character Mapping Table: @@ -61,24 +61,24 @@ * ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 * even though they are both used for French. Same for the euro sign. */ -static const unsigned char Iso_8859_15_CharToOrderMap[] = +static const unsigned char Windows_1252_CharToOrderMap[] = { CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ - SYM, 2, 18, 11, 10, 0, 17, 16, 19, 4, 25, 26, 7, 12, 3, 8, /* 4X */ - 13, 20, 5, 1, 6, 9, 15, 30, 21, 24, 27,SYM,SYM,SYM,SYM,SYM, /* 5X */ - SYM, 2, 18, 11, 10, 0, 17, 16, 19, 4, 25, 26, 7, 12, 3, 8, /* 6X */ - 13, 20, 5, 1, 6, 9, 15, 30, 21, 24, 27,SYM,SYM,SYM,SYM,CTR, /* 7X */ - CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ - CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ - SYM,SYM,SYM,SYM,SYM,SYM, 48,SYM, 48,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ - SYM,SYM,SYM,SYM, 50, 58,SYM,SYM, 50,SYM,SYM,SYM, 33, 33, 57,SYM, /* BX */ - 23, 39, 32, 46, 49, 56, 47, 29, 22, 14, 28, 38, 54, 40, 34, 35, /* CX */ - 59, 45, 60, 41, 31, 53, 43,SYM, 52, 37, 44, 36, 42, 55, 51, 61, /* DX */ - 23, 39, 32, 46, 49, 56, 47, 29, 22, 14, 28, 38, 54, 40, 34, 35, /* EX */ - 62, 45, 63, 41, 31, 53, 43,SYM, 52, 37, 44, 36, 42, 55, 51, 57, /* FX */ + SYM, 2, 18, 11, 10, 0, 17, 15, 19, 4, 25, 26, 7, 13, 3, 8, /* 4X */ + 12, 20, 5, 1, 6, 9, 16, 30, 21, 22, 29,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 2, 18, 11, 10, 0, 17, 15, 19, 4, 25, 26, 7, 13, 3, 8, /* 6X */ + 12, 20, 5, 1, 6, 9, 16, 30, 21, 22, 29,SYM,SYM,SYM,SYM,CTR, /* 7X */ + SYM,ILL,SYM, 56,SYM,SYM,SYM,SYM,SYM,SYM, 51,SYM, 35,ILL, 57,ILL, /* 8X */ + ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 51,SYM, 35,ILL, 58, 59, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM,SYM, 60,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ + 24, 38, 32, 46, 49, 61, 47, 27, 23, 14, 28, 41, 62, 39, 33, 36, /* CX */ + 48, 45, 54, 40, 31, 55, 42,SYM, 52, 37, 43, 34, 44, 53, 50, 63, /* DX */ + 24, 38, 32, 46, 49, 64, 47, 27, 23, 14, 28, 41, 65, 39, 33, 36, /* EX */ + 48, 45, 54, 40, 31, 55, 42,SYM, 52, 37, 43, 34, 44, 53, 50, 66, /* FX */ }; /*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ @@ -88,80 +88,101 @@ static const unsigned char Iso_8859_1_CharToOrderMap[] = CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ - SYM, 2, 18, 11, 10, 0, 17, 16, 19, 4, 25, 26, 7, 12, 3, 8, /* 4X */ - 13, 20, 5, 1, 6, 9, 15, 30, 21, 24, 27,SYM,SYM,SYM,SYM,SYM, /* 5X */ - SYM, 2, 18, 11, 10, 0, 17, 16, 19, 4, 25, 26, 7, 12, 3, 8, /* 6X */ - 13, 20, 5, 1, 6, 9, 15, 30, 21, 24, 27,SYM,SYM,SYM,SYM,CTR, /* 7X */ + SYM, 2, 18, 11, 10, 0, 17, 15, 19, 4, 25, 26, 7, 13, 3, 8, /* 4X */ + 12, 20, 5, 1, 6, 9, 16, 30, 21, 22, 29,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 2, 18, 11, 10, 0, 17, 15, 19, 4, 25, 26, 7, 13, 3, 8, /* 6X */ + 12, 20, 5, 1, 6, 9, 16, 30, 21, 22, 29,SYM,SYM,SYM,SYM,CTR, /* 7X */ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ - SYM,SYM,SYM,SYM,SYM, 64,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ - 23, 39, 32, 46, 49, 56, 47, 29, 22, 14, 28, 38, 54, 40, 34, 35, /* CX */ - 65, 45, 66, 41, 31, 53, 43,SYM, 52, 37, 44, 36, 42, 55, 51, 67, /* DX */ - 23, 39, 32, 46, 49, 56, 47, 29, 22, 14, 28, 38, 54, 40, 34, 35, /* EX */ - 68, 45, 69, 41, 31, 53, 43,SYM, 52, 37, 44, 36, 42, 55, 51, 57, /* FX */ + SYM,SYM,SYM,SYM,SYM, 67,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ + 24, 38, 32, 46, 49, 68, 47, 27, 23, 14, 28, 41, 69, 39, 33, 36, /* CX */ + 48, 45, 54, 40, 31, 55, 42,SYM, 52, 37, 43, 34, 44, 53, 50, 70, /* DX */ + 24, 38, 32, 46, 49, 71, 47, 27, 23, 14, 28, 41, 72, 39, 33, 36, /* EX */ + 48, 45, 54, 40, 31, 55, 42,SYM, 52, 37, 43, 34, 44, 53, 50, 73, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Iso_8859_15_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 2, 18, 11, 10, 0, 17, 15, 19, 4, 25, 26, 7, 13, 3, 8, /* 4X */ + 12, 20, 5, 1, 6, 9, 16, 30, 21, 22, 29,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 2, 18, 11, 10, 0, 17, 15, 19, 4, 25, 26, 7, 13, 3, 8, /* 6X */ + 12, 20, 5, 1, 6, 9, 16, 30, 21, 22, 29,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM, 51,SYM, 51,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM, 74, 75,SYM,SYM, 76,SYM,SYM,SYM, 35, 35, 77,SYM, /* BX */ + 24, 38, 32, 46, 49, 78, 47, 27, 23, 14, 28, 41, 79, 39, 33, 36, /* CX */ + 48, 45, 54, 40, 31, 55, 42,SYM, 52, 37, 43, 34, 44, 53, 50, 80, /* DX */ + 24, 38, 32, 46, 49, 81, 47, 27, 23, 14, 28, 41, 82, 39, 33, 36, /* EX */ + 48, 45, 54, 40, 31, 55, 42,SYM, 52, 37, 43, 34, 44, 53, 50, 83, /* FX */ }; /*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ /* Model Table: - * Total sequences: 1149 - * First 512 sequences: 0.997044499777764 - * Next 512 sequences (512-1024): 0.002895759354225113 - * Rest: 5.974086801089403e-05 + * Total sequences: 914 + * First 512 sequences: 0.997057879992383 + * Next 512 sequences (512-1024): 0.002942120007616917 + * Rest: 3.8163916471489756e-17 * Negative sequences: TODO */ static const PRUint8 FrenchLangModel[] = { - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,3,3,3,3,0,0,3,3,3,3,0,3,3,2,1,0,0,2,2,0, - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,3,3,3,3,1,3,0,3,2,3,2,0,0,2,2,2,2,1,0,2,0, - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,0,3,3,3,3,0,3,3,2,2,0,3,3,0,0, - 3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,2,3,0,3,3,3,3,2,3,2,2,2,2,2,0,2,0, - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,3,2,1,2,2,2,2,0,2,0,0, - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,0,3,2,3,3,3,3,2,3,3,0,1,1,2,2, - 3,3,3,2,3,3,3,3,3,3,2,3,3,2,3,2,2,2,3,3,1,0,3,0,3,2,2,3,3,0,2,3,2,1,0,0,2,0, - 3,3,3,2,3,2,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,3,3,3,2,3,2,2,2,2,2,2,1,1,0,2,1, - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,0,3,3,3,3,0,0,3,1,1,1,2,3,3,3, - 3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,3,3,2,2,0,0,0,2,2,0,0, - 3,3,3,2,3,3,2,3,3,3,3,2,3,2,3,2,3,2,2,3,1,0,3,0,3,3,2,2,0,0,2,2,2,0,2,0,2,0, - 3,3,3,2,3,3,3,3,3,3,2,3,2,2,3,2,2,2,2,3,3,1,3,1,3,0,3,2,2,0,2,3,2,2,0,0,0,0, - 3,3,3,3,3,2,3,2,3,3,2,2,3,3,3,2,2,2,3,2,0,1,3,0,3,1,2,2,3,0,2,2,3,2,2,0,0,0, - 3,3,3,2,3,3,3,3,3,3,2,2,2,3,3,2,2,2,2,3,0,2,3,0,2,2,2,2,3,2,2,3,2,0,2,0,0,0, - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,0,0,3,2,3,0,2,0,0,3,0,0,0,0,0, - 3,2,3,2,3,3,0,2,3,3,2,0,2,0,3,2,2,2,2,0,0,0,3,0,2,1,2,0,3,0,0,2,0,2,2,0,2,0, - 3,3,3,3,3,3,3,3,3,3,2,2,3,2,3,2,3,2,2,3,2,2,3,0,3,0,2,2,2,0,2,0,2,2,2,0,0,0, - 3,3,3,2,3,3,2,3,3,3,2,2,2,2,3,0,2,3,1,0,0,0,3,0,2,0,2,0,3,0,1,0,2,2,0,0,2,0, - 3,3,3,3,3,3,3,3,3,3,3,2,2,2,3,2,2,2,3,2,2,0,3,0,3,3,2,2,2,0,2,0,3,3,2,0,2,0, - 3,3,3,3,3,3,3,3,3,3,2,2,3,2,3,2,2,2,2,2,2,0,3,0,3,2,2,2,2,0,2,3,3,2,0,0,0,0, - 2,2,3,0,2,1,2,2,2,3,1,2,1,2,0,1,1,2,2,0,2,0,0,0,1,0,0,0,0,0,0,0,2,0,0,0,2,0, - 3,0,3,0,3,0,3,2,2,3,0,3,2,3,3,3,0,2,0,2,2,3,2,0,3,0,0,0,0,0,2,0,0,0,0,0,0,0, - 0,3,0,3,0,3,3,3,0,0,3,3,3,0,0,3,3,2,3,0,3,0,0,0,1,0,2,2,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 3,3,3,3,2,3,3,3,3,2,3,3,3,3,3,2,3,2,2,2,1,2,2,0,2,2,2,3,0,2,2,2,2,0,0,2,0,0, - 3,2,3,2,3,2,2,2,3,3,2,2,2,2,3,1,1,2,2,2,0,0,0,3,2,2,2,1,0,0,0,1,2,0,0,0,0,0, - 3,3,3,2,3,3,2,2,3,3,1,2,3,2,2,2,2,2,2,3,0,0,0,0,3,2,3,1,0,0,2,0,1,1,0,0,0,0, - 3,2,3,2,3,2,2,2,3,3,1,2,2,0,3,2,2,2,2,3,2,0,2,0,2,0,2,2,0,0,2,0,2,0,0,0,0,0, - 0,0,0,2,0,0,3,2,0,0,0,3,3,2,0,2,0,0,0,0,3,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0, - 0,0,3,0,2,0,0,0,3,3,0,0,0,0,2,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 3,2,3,2,3,2,2,2,3,2,2,2,2,2,2,0,0,2,1,2,2,0,0,0,2,0,2,1,0,0,2,0,2,0,0,0,0,0, - 0,2,0,3,0,0,3,3,0,0,0,0,3,2,0,0,1,0,0,2,0,0,0,0,0,2,1,1,0,0,0,0,0,0,0,0,0,0, - 0,2,2,2,0,2,3,3,2,0,2,3,2,2,0,0,3,0,2,2,2,0,0,0,2,2,1,2,0,0,0,0,0,0,0,0,0,0, - 0,2,0,2,2,1,2,1,0,3,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,2,0,3,0,0,3,3,0,0,0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 2,3,2,2,0,2,2,3,1,0,3,2,2,0,0,2,2,2,2,0,2,0,0,0,1,2,2,0,0,0,2,0,0,0,0,0,0,0, - 0,1,0,2,0,2,3,2,0,0,1,2,2,2,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,3,3,3,3,3,0,0,3,3,3,0,3,3,0,0,0,2,0,2,0, + 3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,2,2,3,3,3,3,0,3,3,0,0,3,0,0,2,3,0,0,0,2,2,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,0,3,3,3,0,3,3,2,2,3,0,0,3,0, + 3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,3,3,0,3,3,3,2,3,2,0,2,2,0,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,0,2,3,0,2,3,2,0,0,0,0,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,0,2,3,3,3,2,3,3,3,0,2,0,0,0, + 3,3,3,2,3,3,3,3,3,3,2,3,3,3,3,2,2,2,3,3,2,2,3,3,2,0,2,0,3,3,2,3,2,0,0,0,0,0, + 3,3,3,2,3,2,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,0,3,3,3,2,3,0,0,2,2,2,2,0,2,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,0,3,3,0,0,3,3,0,0,2,3,0,3,3, + 3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,2,3,3,2,0,3,3,2,3,3,2,0,0,0,0,0,2,0, + 3,3,3,2,3,3,3,2,3,3,3,2,2,3,3,3,2,2,2,3,0,0,3,3,0,3,0,0,2,2,3,2,2,2,3,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,2,2,3,2,2,2,2,3,3,0,3,3,0,0,3,0,2,2,2,3,2,0,0,2,0,0, + 3,3,3,2,3,3,3,3,3,3,2,2,3,2,3,0,0,2,2,3,0,0,3,3,0,0,2,2,3,2,2,3,2,0,0,0,0,0, + 3,3,3,3,3,2,3,2,3,3,2,3,3,3,3,2,0,2,3,2,0,0,3,3,0,2,2,0,3,0,2,2,3,0,2,2,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,0,0,0,3,2,2,0,3,0,0,2,0,0,0,0,0, + 3,3,3,3,3,3,3,3,3,3,2,2,2,3,3,3,2,2,3,3,0,2,3,3,0,0,0,0,2,0,2,0,2,0,0,0,0,0, + 3,2,3,2,3,3,0,2,3,3,0,0,0,2,3,0,2,2,0,0,0,0,2,3,0,0,2,0,3,0,0,0,0,0,0,2,0,0, + 3,3,3,2,3,3,3,3,3,3,2,2,2,3,3,2,0,3,0,0,0,0,0,3,0,2,0,0,3,0,0,0,0,0,2,2,0,0, + 3,3,3,3,3,3,3,3,3,3,3,2,2,2,3,2,2,0,3,2,0,0,3,2,0,3,0,0,0,0,0,0,3,2,0,2,0,0, + 3,3,3,3,3,3,3,3,3,3,0,2,0,3,3,0,0,2,2,0,0,0,3,3,0,2,2,0,2,2,2,3,3,0,0,2,0,0, + 0,0,2,0,0,0,0,2,2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,0,3,0,3,0,3,2,3,2,2,3,3,2,3,0,3,2,2,2,2,3,2,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0, + 3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,0,2,2,2,0,3,2,0,0,2,2,0,0,0,0,0,0,0, + 0,3,0,3,0,3,3,3,0,0,3,3,2,3,0,3,3,2,3,0,3,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,2,3,2,3,2,2,2,3,3,2,2,2,2,3,0,0,0,0,0,0,0,0,0,3,2,0,0,0,0,0,0,2,0,0,0,0,0, + 3,3,3,2,3,3,2,3,3,3,0,0,2,3,2,2,2,2,2,3,0,0,3,0,0,0,2,0,0,0,2,0,0,0,0,0,0,0, + 0,0,3,0,0,0,0,0,3,3,0,0,0,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,2,0,0,3,2,0,0,0,3,0,3,0,0,2,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,0,3,2,3,2,0,2,3,3,0,2,0,2,2,2,0,0,2,2,2,0,3,0,0,0,2,0,0,3,2,0,0,0,0,0,0,0, + 3,2,3,2,3,2,2,2,3,2,0,2,0,0,2,0,0,2,2,2,0,0,2,0,0,0,2,0,0,0,2,0,0,0,0,0,0,0, + 0,2,0,3,0,0,3,3,0,0,0,0,2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,2,0,2,2,0,3,3,0,0,0,3,2,2,0,3,0,0,2,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,3,0,0,3,3,0,0,0,2,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,2,3,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,2,0,0,2,0,2,2,0,3,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 2,2,2,2,0,2,2,3,0,0,2,2,0,2,0,2,0,2,2,0,2,0,0,0,0,0,2,0,0,0,2,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, }; -const SequenceModel Iso_8859_15FrenchModel = +const SequenceModel Windows_1252FrenchModel = { - Iso_8859_15_CharToOrderMap, + Windows_1252_CharToOrderMap, FrenchLangModel, 38, - (float)0.997044499777764, + (float)0.997057879992383, PR_TRUE, - "ISO-8859-15" + "WINDOWS-1252" }; const SequenceModel Iso_8859_1FrenchModel = @@ -169,7 +190,17 @@ const SequenceModel Iso_8859_1FrenchModel = Iso_8859_1_CharToOrderMap, FrenchLangModel, 38, - (float)0.997044499777764, + (float)0.997057879992383, PR_TRUE, "ISO-8859-1" +}; + +const SequenceModel Iso_8859_15FrenchModel = +{ + Iso_8859_15_CharToOrderMap, + FrenchLangModel, + 38, + (float)0.997057879992383, + PR_TRUE, + "ISO-8859-15" }; \ No newline at end of file diff --git a/src/nsSBCSGroupProber.cpp b/src/nsSBCSGroupProber.cpp index 32cfda9..0f3cbb2 100644 --- a/src/nsSBCSGroupProber.cpp +++ b/src/nsSBCSGroupProber.cpp @@ -80,9 +80,10 @@ nsSBCSGroupProber::nsSBCSGroupProber() mProbers[14] = new nsSingleByteCharSetProber(&Iso_8859_1FrenchModel); mProbers[15] = new nsSingleByteCharSetProber(&Iso_8859_15FrenchModel); + mProbers[16] = new nsSingleByteCharSetProber(&Windows_1252FrenchModel); - mProbers[16] = new nsSingleByteCharSetProber(&Latin2HungarianModel); - mProbers[17] = new nsSingleByteCharSetProber(&Win1250HungarianModel); + mProbers[17] = new nsSingleByteCharSetProber(&Latin2HungarianModel); + mProbers[18] = new nsSingleByteCharSetProber(&Win1250HungarianModel); Reset(); } diff --git a/src/nsSBCSGroupProber.h b/src/nsSBCSGroupProber.h index deb6c03..8dd25e0 100644 --- a/src/nsSBCSGroupProber.h +++ b/src/nsSBCSGroupProber.h @@ -40,7 +40,7 @@ #define nsSBCSGroupProber_h__ -#define NUM_OF_SBCS_PROBERS 18 +#define NUM_OF_SBCS_PROBERS 19 class nsCharSetProber; class nsSBCSGroupProber: public nsCharSetProber { diff --git a/src/nsSBCharSetProber.h b/src/nsSBCharSetProber.h index 20e392e..3af8c60 100644 --- a/src/nsSBCharSetProber.h +++ b/src/nsSBCharSetProber.h @@ -138,6 +138,7 @@ extern const SequenceModel Win1255Model; extern const SequenceModel TIS620ThaiModel; extern const SequenceModel Iso_8859_15FrenchModel; extern const SequenceModel Iso_8859_1FrenchModel; +extern const SequenceModel Windows_1252FrenchModel; #endif /* nsSingleByteCharSetProber_h__ */