diff --git a/script/BuildLangModelLogs/LangPolishModel.log b/script/BuildLangModelLogs/LangPolishModel.log new file mode 100644 index 0000000..f90f2de --- /dev/null +++ b/script/BuildLangModelLogs/LangPolishModel.log @@ -0,0 +1,154 @@ += Logs of language model for Polish (pl) = + +- Generated by BuildLangModel.py +- Started: 2016-09-21 17:06:43.735784 +- Maximum depth: 5 +- Max number of pages: 100 + +== Parsed pages == + +Krasnyj Krym (revision 46884814) +1913 (revision 46708474) +1915 (revision 46743905) +1917 (revision 46559521) +1925 (revision 46809935) +1928 (revision 46875978) +1929 (revision 46760445) +1935 (revision 46487358) +1936 (revision 46874348) +1939 (revision 46789269) +1941 (revision 46856112) +1942 (revision 46851808) +1943 (revision 46768330) +1944 (revision 46866229) +1949 (revision 46882598) +1953 (revision 46437607) +1957 (revision 46591716) +1959 (revision 46255886) +Admirał Butakow (revision 45993412) +Admirał Spiridow (revision 45993412) +Aparat torpedowy (revision 46633263) +Askold (revision 45787848) +Avro 504 (revision 44668646) +Ałmaz (1903) (revision 46472283) +Batumi (revision 46594611) +Bomba głębinowa (revision 46011227) +Brest (revision 45771242) +Burta (revision 45569092) +Cagliari (revision 46235605) +Cesariewicz (revision 40031486) +Czerwona Ukraina (revision 45993524) +Daty nowego i starego porządku (revision 45622575) +Drednot (revision 45789788) +Działo przeciwlotnicze (revision 45160162) +Flota Bałtycka Marynarki Wojennej Rosji (revision 45700667) +Gromoboj (revision 44328986) +Hulk (okręt) (revision 46020688) +II wojna światowa (revision 46871591) +I wojna światowa (revision 46869119) +Imperator Nikołaj I (okręt lotniczy) (revision 45520638) +Imperium Rosyjskie (revision 46604959) +Impierator Nikołaj I (1916) (revision 46534166) +Język rosyjski (revision 46433952) +Kanonierka (revision 41091952) +Kanonierki typu Ardagan (revision 46534166) +Kanonierki typu Bobr (revision 45788694) +Kanonierki typu Chiwiniec 
(revision 46534166) +Kanonierki typu Groziaszczij (revision 46534166) +Kanonierki typu Mandżur (revision 46534166) +Karabin maszynowy DSzK (revision 45587452) +Karabin maszynowy Vickers 12,7 mm (revision 44572918) +Kocioł parowy (revision 46716473) +Konstrukcyjna linia wodna (revision 37082620) +Kontrtorpedowce typu Biesstrasznyj (revision 46534166) +Kontrtorpedowce typu Brawyj (revision 46534166) +Kontrtorpedowce typu Grozowoj (revision 46534166) +Kontrtorpedowce typu Prytkij (revision 46534166) +Koń mechaniczny (revision 44722357) +Krab (1915) (revision 42791389) +Kronsztad (revision 46425497) +Krążownik lekki (revision 40661490) +Krążownik liniowy (revision 40601776) +Krążownik pancernopokładowy (revision 40055901) +Krążownik pancerny (revision 40324458) +Krążowniki lekkie typu Swietłana (revision 45993412) +Krążowniki liniowe typu Borodino (revision 45990866) +Krążowniki typu Admirał Nachimow (revision 45993521) +Krążowniki typu Bajan (revision 45991279) +Krążowniki typu Diana (revision 45991349) +Krążowniki typu Izumrud (revision 45991349) +Lend-Lease Act (revision 46877263) +Marynarka Wojenna Związku Socjalistycznych Republik Radzieckich (revision 45795993) +Maszyna sterowa (revision 28497888) +Mecidiye (1903) (revision 43956539) +Mila morska (revision 45754209) +Mina morska (revision 45781427) +Morze Czarne (revision 46729213) +Nadbudówka (revision 45292731) +Neapol (revision 46823083) +Niszczyciel (revision 45799132) +Niszczyciele rakietowe projektu 61 (revision 46498775) +Niszczyciele typu Finn (revision 46620140) +Niszczyciele typu Lejtienant Szestakow (revision 46620140) +Niszczyciele typu Ochotnik (revision 46620140) +Niszczyciele typu Ukraina (revision 46620140) +Noworosyjsk (revision 44721836) +Odessa (revision 45629804) +Oerlikon 20 mm (revision 45493862) +Okres międzywojenny (revision 46668249) +Okręt-baza wodnosamolotów (revision 45115462) + +== End of Parsed pages == + +- Wikipedia parsing ended at: 2016-09-21 17:21:04.404471 + +78 
characters appeared 1159291 times. + +First 37 characters: +[ 0] Char a: 9.685575062689178 % +[ 1] Char i: 8.815819324052374 % +[ 2] Char o: 7.920185699707839 % +[ 3] Char e: 6.871613770830621 % +[ 4] Char r: 5.8672067668945935 % +[ 5] Char n: 5.763608964444647 % +[ 6] Char s: 4.736688199942896 % +[ 7] Char k: 4.722196583946568 % +[ 8] Char z: 4.519227700378939 % +[ 9] Char w: 4.279512219106333 % +[10] Char t: 4.0191806888865695 % +[11] Char c: 3.6891513864939864 % +[12] Char y: 3.565282573572986 % +[13] Char p: 3.0190004062828053 % +[14] Char d: 2.851052928039638 % +[15] Char l: 2.7930002044352973 % +[16] Char m: 2.7530620008263673 % +[17] Char u: 2.348504387595522 % +[18] Char j: 1.881236031332944 % +[19] Char ł: 1.6885320424293815 % +[20] Char b: 1.394559260789569 % +[21] Char g: 1.3928340684090534 % +[22] Char h: 1.163901039514669 % +[23] Char ę: 0.8066136975099435 % +[24] Char ó: 0.5971753425153823 % +[25] Char ą: 0.563275312238256 % +[26] Char f: 0.5245447432956868 % +[27] Char ż: 0.4545019326467643 % +[28] Char ś: 0.39567287247119143 % +[29] Char ń: 0.3857530162832283 % +[30] Char ć: 0.1397405828217419 % +[31] Char v: 0.12455888987320698 % +[32] Char ź: 0.10204512930748191 % +[33] Char x: 0.05468859846233603 % +[34] Char é: 0.020961087423261287 % +[35] Char á: 0.01707940456710179 % +[36] Char q: 0.011386269711401192 % + +The first 37 characters have an accumulated ratio of 0.9993892818972973. + +1321 sequences found. 
+ +First 512 (typical positive ratio): 0.9894531815946438 +Next 512 (512-1024): 1.7251923805153322e-06 +Rest: 0.0003530230403650733 + +- Processing end: 2016-09-21 17:21:04.878014 diff --git a/script/charsets/iso-8859-16.py b/script/charsets/iso-8859-16.py new file mode 100644 index 0000000..2e6defd --- /dev/null +++ b/script/charsets/iso-8859-16.py @@ -0,0 +1,83 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +# ##### BEGIN LICENSE BLOCK ##### +# Version: MPL 1.1/GPL 2.0/LGPL 2.1 +# +# The contents of this file are subject to the Mozilla Public License Version +# 1.1 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License +# for the specific language governing rights and limitations under the +# License. +# +# The Original Code is Mozilla Universal charset detector code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 2001 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Jehan +# +# Alternatively, the contents of this file may be used under the terms of +# either the GNU General Public License Version 2 or later (the "GPL"), or +# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +# in which case the provisions of the GPL or the LGPL are applicable instead +# of those above. If you wish to allow use of your version of this file only +# under the terms of either the GPL or the LGPL, and not to allow others to +# use your version of this file under the terms of the MPL, indicate your +# decision by deleting the provisions above and replace them with the notice +# and other provisions required by the GPL or the LGPL. 
If you do not delete +# the provisions above, a recipient may use your version of this file under +# the terms of any one of the MPL, the GPL or the LGPL. +# +# ##### END LICENSE BLOCK ##### + +from codepoints import * + +# ISO-8859-16 is the full 8-bit range, IANA-defined, superset of ISO/CEI 8859-16. +# It is basically the same as ISO/CEI 8859-16, but with control characters. +# As far as I can see, `iconv` has no support for the ISO/CEI 8859-16 subset, +# so there is no need for us to support it anyway. + +name = 'ISO-8859-16' +aliases = ['ISO_8859-16:2001', 'ISO_8859-16', 'iso-ir-226', + 'csISO885916', 'latin10', 'l10'] + +language = \ +{ + # Languages with complete coverage. + # Some languages actually have several alphabets and only one of them is + # compatible with ISO-8859-1 (ex: Kurdish). + # Some don't have a ISO language code (like Leonese, for which I used + # a Glottolog code). + 'complete': [ 'sq', 'hr', 'hu', 'pl', 'ro', 'sr', 'sl', + 'fr', 'de', 'it', 'ga' ], + 'incomplete': [] +} + +# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF # +charmap = \ +[ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X + SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X + SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 8X + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 9X + SYM,LET,LET,LET,SYM,SYM,LET,SYM,LET,SYM,LET,SYM,LET,SYM,LET,LET, # AX + SYM,SYM,LET,LET,LET,SYM,SYM,SYM,LET,LET,LET,SYM,LET,LET,LET,LET, # BX + 
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # CX + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # DX + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # EX + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # FX +] diff --git a/script/langs/pl.py b/script/langs/pl.py new file mode 100644 index 0000000..54485a1 --- /dev/null +++ b/script/langs/pl.py @@ -0,0 +1,81 @@ +#!/bin/python3 +# -*- coding: utf-8 -*- + +# ##### BEGIN LICENSE BLOCK ##### +# Version: MPL 1.1/GPL 2.0/LGPL 2.1 +# +# The contents of this file are subject to the Mozilla Public License Version +# 1.1 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License +# for the specific language governing rights and limitations under the +# License. +# +# The Original Code is Mozilla Universal charset detector code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 2001 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Jehan +# +# Alternatively, the contents of this file may be used under the terms of +# either the GNU General Public License Version 2 or later (the "GPL"), or +# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +# in which case the provisions of the GPL or the LGPL are applicable instead +# of those above. 
If you wish to allow use of your version of this file only +# under the terms of either the GPL or the LGPL, and not to allow others to +# use your version of this file under the terms of the MPL, indicate your +# decision by deleting the provisions above and replace them with the notice +# and other provisions required by the GPL or the LGPL. If you do not delete +# the provisions above, a recipient may use your version of this file under +# the terms of any one of the MPL, the GPL or the LGPL. +# +# ##### END LICENSE BLOCK ##### + +import re + +## Mandatory Properties ## + +# The human name for the language, in English. +name = 'Polish' +# Use 2-letter ISO 639-1 if possible, 3-letter ISO code otherwise, +# or use another catalog as a last resort. +code = 'pl' +# ASCII characters are also used in Polish. +use_ascii = True +# The charsets we want to support and create data for. +charsets = ['ISO-8859-2', 'ISO-8859-13', 'ISO-8859-16', + 'Windows-1250', 'IBM852', 'MAC-CENTRALEUROPE'] + +## Optional Properties ## + +# Alphabet characters. +# If use_ascii=True, there is no need to add any ASCII characters. +# If case_mapping=True, there is no need to add several cases of a same +# character (provided Python algorithms know the right cases). +alphabet = 'ąćęłńóśźż' +# The starred page which was rewarded on the main page when I created +# the data. +start_pages = ['Krasnyj Krym'] +# give possibility to select another code for the Wikipedia URL. +wikipedia_code = code +# 'a' and 'A' will be considered the same character, and so on. +# This uses Python algorithm to determine upper/lower-case of a given +# character. +case_mapping = True + +# A function to clean content returned by the `wikipedia` python lib, +# in case some unwanted data has been overlooked. +# Note that we are already cleaning away the '=' from the title syntax +# of Wikipedia, as well as double spaces. 
But sometimes, Wikipedia in +# some language may return weird syntax or UI text which should be +# discarded. If you encounter one of these cases, use this function. +def clean_wikipedia_content(content): + # Do your garbage text cleaning here. + return content diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 6244812..6903b7d 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -20,6 +20,7 @@ set( LangModels/LangLithuanianModel.cpp LangModels/LangLatvianModel.cpp LangModels/LangMalteseModel.cpp + LangModels/LangPolishModel.cpp LangModels/LangPortugueseModel.cpp LangModels/LangRussianModel.cpp LangModels/LangSlovakModel.cpp diff --git a/src/LangModels/LangPolishModel.cpp b/src/LangModels/LangPolishModel.cpp new file mode 100644 index 0000000..e7379d1 --- /dev/null +++ b/src/LangModels/LangPolishModel.cpp @@ -0,0 +1,298 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is Mozilla Communicator client code. + * + * The Initial Developer of the Original Code is + * Netscape Communications Corporation. + * Portions created by the Initial Developer are Copyright (C) 1998 + * the Initial Developer. All Rights Reserved. 
+ * + * Contributor(s): + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. + * + * ***** END LICENSE BLOCK ***** */ + +#include "../nsSBCharSetProber.h" + +/********* Language model for: Polish *********/ + +/** + * Generated by BuildLangModel.py + * On: 2016-09-21 17:21:04.405363 + **/ + +/* Character Mapping Table: + * ILL: illegal character. + * CTR: control character specific to the charset. + * RET: carriage/return. + * SYM: symbol (punctuation) that does not belong to word. + * NUM: 0 - 9. + * + * Other characters are ordered by probabilities + * (0 is the most common character in the language). + * + * Orders are generic to a language. So the codepoint with order X in + * CHARSET1 maps to the same character as the codepoint with the same + * order X in CHARSET2 for the same language. + * As such, it is possible to get missing order. For instance the + * ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 + * even though they are both used for French. Same for the euro sign. 
+ */ +static const unsigned char Ibm852_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 20, 11, 14, 3, 26, 21, 22, 1, 18, 7, 15, 16, 5, 2, /* 4X */ + 13, 36, 4, 6, 10, 17, 31, 9, 33, 12, 8,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 20, 11, 14, 3, 26, 21, 22, 1, 18, 7, 15, 16, 5, 2, /* 6X */ + 13, 36, 4, 6, 10, 17, 31, 9, 33, 12, 8,SYM,SYM,SYM,SYM,CTR, /* 7X */ + 47, 39, 34, 54, 40, 78, 30, 47, 19, 58, 49, 49, 77, 32, 40, 30, /* 8X */ + 34, 79, 80, 55, 38, 74, 74, 28, 28, 38, 39, 76, 76, 19,SYM, 44, /* 9X */ + 35, 37, 24, 51, 25, 25, 45, 45, 23, 23,SYM, 32, 44, 56,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM,SYM, 35, 54, 46, 56,SYM,SYM,SYM,SYM, 27, 27,SYM, /* BX */ + SYM,SYM,SYM,SYM,SYM,SYM, 53, 53,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* CX */ + 70, 70, 69, 58, 69, 81, 37, 77, 46,SYM,SYM,SYM,SYM, 65, 82,SYM, /* DX */ + 24, 57, 55, 29, 29, 83, 41, 41, 84, 51, 85, 86, 60, 60, 65,SYM, /* EX */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 87, 50, 50,SYM,SYM, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Iso_8859_16_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 20, 11, 14, 3, 26, 21, 22, 1, 18, 7, 15, 16, 5, 2, /* 4X */ + 13, 36, 4, 6, 10, 17, 31, 9, 33, 12, 8,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 20, 11, 14, 3, 26, 21, 22, 1, 18, 7, 15, 16, 5, 2, /* 6X */ + 13, 36, 4, 6, 10, 17, 31, 9, 33, 12, 8,SYM,SYM,SYM,SYM,CTR, /* 7X */ + 
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM, 25, 25, 19,SYM,SYM, 41,SYM, 41,SYM, 62,SYM, 32,SYM, 32, 27, /* AX */ + SYM,SYM, 44, 19, 45,SYM,SYM,SYM, 45, 44, 62,SYM, 75, 75, 88, 27, /* BX */ + 61, 35, 54, 53, 40, 30, 89, 47, 43, 34, 64, 58, 90, 37, 77, 91, /* CX */ + 70, 29, 66, 24, 55, 49, 38, 28, 92, 68, 51, 93, 39, 23, 72, 57, /* DX */ + 61, 35, 54, 53, 40, 30, 94, 47, 43, 34, 64, 58, 95, 37, 77, 96, /* EX */ + 70, 29, 66, 24, 55, 49, 38, 28, 97, 68, 51, 98, 39, 23, 72, 99, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Iso_8859_2_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 20, 11, 14, 3, 26, 21, 22, 1, 18, 7, 15, 16, 5, 2, /* 4X */ + 13, 36, 4, 6, 10, 17, 31, 9, 33, 12, 8,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 20, 11, 14, 3, 26, 21, 22, 1, 18, 7, 15, 16, 5, 2, /* 6X */ + 13, 36, 4, 6, 10, 17, 31, 9, 33, 12, 8,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM, 25,SYM, 19,SYM, 74, 28,SYM,SYM, 41, 56, 76, 32,SYM, 45, 27, /* AX */ + SYM, 25,SYM, 19,SYM, 74, 28,SYM,SYM, 41, 56, 76, 32,SYM, 45, 27, /* BX */ + 100, 35, 54, 53, 40,101, 30, 47, 44, 34, 23, 58, 46, 37, 77, 69, /* CX */ + 70, 29,102, 24, 55, 49, 38,SYM, 50,103, 51,104, 39, 60, 65, 57, /* DX */ + 105, 35, 54, 53, 40,106, 30, 47, 44, 34, 23, 58, 46, 37, 77, 69, /* EX */ + 70, 29,107, 24, 55, 49, 38,SYM, 50,108, 51,109, 39, 60, 65,SYM, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned 
char Mac_Centraleurope_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 20, 11, 14, 3, 26, 21, 22, 1, 18, 7, 15, 16, 5, 2, /* 4X */ + 13, 36, 4, 6, 10, 17, 31, 9, 33, 12, 8,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 20, 11, 14, 3, 26, 21, 22, 1, 18, 7, 15, 16, 5, 2, /* 6X */ + 13, 36, 4, 6, 10, 17, 31, 9, 33, 12, 8,SYM,SYM,SYM,SYM,CTR, /* 7X */ + 40, 63, 63, 34, 25, 38, 39, 35, 25, 44, 40, 44, 30, 30, 34, 32, /* 8X */ + 32, 69, 37, 69,110,111, 71, 24, 71, 55, 38, 67, 51, 46, 46, 39, /* 9X */ + SYM,SYM, 23,SYM,SYM,SYM,SYM, 57,SYM,SYM,SYM, 23,SYM,SYM,112,113, /* AX */ + 114, 73,SYM,SYM, 73,115,SYM,SYM, 19,116,117, 74, 74,118,119,120, /* BX */ + 121, 29,SYM,SYM, 29,122,SYM,SYM,SYM,SYM,SYM,123, 49, 67, 49, 42, /* CX */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 42,124,125, 50,SYM,SYM, 50,126, /* DX */ + 127, 41,SYM,SYM, 41, 28, 28, 35, 76, 76, 37, 45, 45, 59, 24, 55, /* EX */ + 59,128, 51,129,130,131,132,133, 60, 60,134, 27, 19, 27,135,SYM, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Iso_8859_13_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 20, 11, 14, 3, 26, 21, 22, 1, 18, 7, 15, 16, 5, 2, /* 4X */ + 13, 36, 4, 6, 10, 17, 31, 9, 33, 12, 8,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 20, 11, 14, 3, 26, 21, 22, 1, 18, 7, 15, 16, 5, 2, /* 6X */ + 13, 36, 4, 6, 10, 17, 31, 9, 33, 12, 8,SYM,SYM,SYM,SYM,CTR, /* 7X */ + 
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 48,SYM,136,SYM,SYM,SYM,SYM,137, /* AX */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 48,SYM,138,SYM,SYM,SYM,SYM,139, /* BX */ + 25,140, 63, 30, 40, 52, 23,141, 44, 34, 32, 71,142,143, 73,144, /* CX */ + 41, 29,145, 24, 42, 67, 38,SYM,146, 19, 28, 59, 39, 27, 45, 57, /* DX */ + 25,147, 63, 30, 40, 52, 23,148, 44, 34, 32, 71,149,150, 73,151, /* EX */ + 41, 29,152, 24, 42, 67, 38,SYM,153, 19, 28, 59, 39, 27, 45,SYM, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Windows_1250_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 20, 11, 14, 3, 26, 21, 22, 1, 18, 7, 15, 16, 5, 2, /* 4X */ + 13, 36, 4, 6, 10, 17, 31, 9, 33, 12, 8,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 20, 11, 14, 3, 26, 21, 22, 1, 18, 7, 15, 16, 5, 2, /* 6X */ + 13, 36, 4, 6, 10, 17, 31, 9, 33, 12, 8,SYM,SYM,SYM,SYM,CTR, /* 7X */ + SYM,ILL,SYM,ILL,SYM,SYM,SYM,SYM,ILL,SYM, 41,SYM, 28, 76, 45, 32, /* 8X */ + ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,ILL,SYM, 41,SYM, 28, 76, 45, 32, /* 9X */ + SYM,SYM,SYM, 19,SYM, 25,SYM,SYM,SYM,SYM, 56,SYM,SYM,SYM,SYM, 27, /* AX */ + SYM,SYM,SYM, 19,SYM,SYM,SYM,SYM,SYM, 25, 56,SYM, 74,SYM, 74, 27, /* BX */ + 154, 35, 54, 53, 40,155, 30, 47, 44, 34, 23, 58, 46, 37, 77, 69, /* CX */ + 70, 29,156, 24, 55, 49, 38,SYM, 50,157, 51,158, 39, 60, 65, 57, /* DX */ + 159, 35, 54, 53, 40,160, 30, 47, 44, 34, 23, 58, 46, 37, 77, 69, /* EX */ + 70, 29,161, 24, 55, 49, 38,SYM, 50,162, 51,163, 39, 60, 65,SYM, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + + +/* Model Table: + * 
Total sequences: 1321 + * First 512 sequences: 0.9894531815946438 + * Next 512 sequences (512-1024): 0.010193795364991133 + * Rest: 0.0003530230403650733 + * Negative sequences: TODO + */ +static const PRUint8 PolishLangModel[] = +{ + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,0,0,3,3,3,3,3,3,3,2,0,0,2, + 3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,2,3,3,3,2,3,2,3,3,3,2,2,2,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,0,0,0,3,3,3,3,2,3,2,2,0,0,1, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,0,2,2,3,3,3,3,2,3,2,2,0,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,3,3,2,3,3,3,2,2,2,0,2,2,0,1,2,2,2, + 3,3,3,3,3,3,3,3,3,2,3,3,3,2,3,2,2,3,2,1,2,3,2,3,3,3,3,2,0,0,0,2,0,2,2,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,2,3,2,3,3,2,0,0,0,0,2,0,0,2,2,2, + 3,3,3,3,3,2,3,3,1,3,3,3,2,2,2,3,3,3,2,3,2,2,2,3,3,3,2,3,2,0,0,2,0,1,2,2,0, + 3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,2,3,3,3,3,3,3,2,3,2,3,1,2,0,0,0,2,0,0,2,2,2, + 3,3,3,3,3,3,3,3,3,2,2,3,3,3,2,3,2,3,0,3,2,2,2,3,3,3,1,0,2,0,0,0,0,0,1,0,0, + 3,3,3,3,3,3,3,3,3,3,3,2,3,2,2,3,2,3,2,3,2,2,3,3,3,3,2,1,0,0,0,2,0,0,2,2,0, + 3,3,3,3,2,3,2,3,3,2,3,2,3,1,2,3,2,3,3,2,1,2,3,2,3,3,0,0,0,0,0,2,0,0,2,0,2, + 3,2,2,2,3,3,3,3,3,3,3,3,0,3,3,3,3,2,3,3,3,3,2,0,0,0,3,3,3,3,3,2,2,0,0,0,0, + 3,3,3,3,3,3,3,2,2,2,2,3,3,3,2,3,2,3,1,3,2,2,3,2,3,2,2,0,0,0,0,2,0,0,2,2,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,2,2,2,3,3,2,2,1,0,0,2,2,0,2,2,1, + 3,3,3,3,2,3,3,3,2,3,3,3,2,2,3,3,3,3,2,0,3,3,2,3,2,3,3,2,0,0,0,2,0,1,2,2,0, + 3,3,3,3,2,3,3,2,1,2,2,3,3,3,2,2,3,3,2,2,3,2,1,3,3,2,2,1,1,0,0,1,0,0,2,2,0, + 3,3,2,3,3,3,3,3,3,3,3,3,2,3,3,3,3,2,3,3,3,3,2,0,1,0,3,3,2,3,2,2,2,2,2,2,2, + 3,3,3,3,2,3,3,2,2,3,2,3,0,2,3,2,3,3,2,2,2,2,1,3,3,3,1,1,3,1,0,1,0,0,0,2,0, + 3,0,3,3,1,3,2,3,2,2,3,2,3,2,2,0,2,3,0,2,2,3,1,3,3,3,0,2,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,3,3,3,2,2,1,2,3,2,2,3,2,3,2,3,2,1,2,3,2,2,2,0,0,0,0,0,0,0,2,2,0, + 3,3,3,3,3,3,2,2,2,3,2,1,2,2,3,3,2,3,2,3,2,2,3,2,3,2,2,1,0,0,0,2,0,0,2,2,2, + 
3,3,3,3,3,3,2,2,1,2,3,2,3,2,2,3,3,3,2,2,2,1,1,2,2,2,2,0,2,0,0,2,0,0,2,2,0, + 0,0,0,0,0,0,3,3,3,1,3,3,0,3,3,2,0,0,0,3,3,3,0,0,0,0,0,3,3,0,2,0,2,0,0,0,0, + 0,0,0,0,3,2,2,2,3,3,2,3,1,2,3,3,2,0,3,3,3,2,0,0,0,0,2,3,1,0,0,1,3,0,0,0,0, + 0,0,0,0,0,0,2,2,3,2,3,3,0,3,3,0,0,0,0,3,2,3,0,0,0,0,0,3,0,0,2,0,0,0,0,0,0, + 3,3,3,3,3,2,2,2,1,2,2,2,2,1,1,3,2,3,2,2,2,2,2,2,2,2,3,0,0,0,0,1,0,0,2,1,1, + 3,2,3,3,0,3,2,2,0,2,0,2,3,0,2,3,2,3,0,0,3,1,0,2,2,3,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,3,3,0,2,0,3,0,3,0,2,0,3,3,0,0,0,0,0,0,0,0,0,0,0,0,1,3,0,0,0,0,0,0, + 1,0,0,0,0,0,3,2,0,0,0,3,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 1,0,0,1,0,0,0,2,0,2,0,0,0,0,2,0,2,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,2,2,2,2,1,0,0,0,2,2,1,2,0,2,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,2,2,0, + 0,0,0,0,2,3,2,0,0,2,0,2,0,0,3,2,2,0,0,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0, + 2,3,2,2,0,1,0,1,0,1,1,0,1,2,1,2,1,2,1,0,2,0,2,0,0,0,2,0,0,0,0,2,0,2,0,0,0, + 2,1,2,2,2,2,2,0,1,0,2,2,1,1,2,2,2,1,1,0,1,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0, + 0,1,0,1,2,2,2,2,2,0,2,2,0,2,1,2,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0, + 1,2,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +}; + + +const SequenceModel Ibm852PolishModel = +{ + Ibm852_CharToOrderMap, + PolishLangModel, + 37, + (float)0.9894531815946438, + PR_TRUE, + "IBM852" +}; + +const SequenceModel Iso_8859_16PolishModel = +{ + Iso_8859_16_CharToOrderMap, + PolishLangModel, + 37, + (float)0.9894531815946438, + PR_TRUE, + "ISO-8859-16" +}; + +const SequenceModel Iso_8859_2PolishModel = +{ + Iso_8859_2_CharToOrderMap, + PolishLangModel, + 37, + (float)0.9894531815946438, + PR_TRUE, + "ISO-8859-2" +}; + +const SequenceModel Mac_CentraleuropePolishModel = +{ + Mac_Centraleurope_CharToOrderMap, + PolishLangModel, + 37, + (float)0.9894531815946438, + PR_TRUE, + "MAC-CENTRALEUROPE" +}; + +const SequenceModel Iso_8859_13PolishModel = +{ + Iso_8859_13_CharToOrderMap, + PolishLangModel, + 37, + (float)0.9894531815946438, + PR_TRUE, + "ISO-8859-13" +}; + +const 
SequenceModel Windows_1250PolishModel = +{ + Windows_1250_CharToOrderMap, + PolishLangModel, + 37, + (float)0.9894531815946438, + PR_TRUE, + "WINDOWS-1250" +}; \ No newline at end of file diff --git a/src/nsSBCSGroupProber.cpp b/src/nsSBCSGroupProber.cpp index b68134f..afb5948 100644 --- a/src/nsSBCSGroupProber.cpp +++ b/src/nsSBCSGroupProber.cpp @@ -136,6 +136,13 @@ nsSBCSGroupProber::nsSBCSGroupProber() mProbers[52] = new nsSingleByteCharSetProber(&Mac_CentraleuropeSlovakModel); mProbers[53] = new nsSingleByteCharSetProber(&Ibm852SlovakModel); + mProbers[54] = new nsSingleByteCharSetProber(&Windows_1250PolishModel); + mProbers[55] = new nsSingleByteCharSetProber(&Iso_8859_2PolishModel); + mProbers[56] = new nsSingleByteCharSetProber(&Iso_8859_13PolishModel); + mProbers[57] = new nsSingleByteCharSetProber(&Iso_8859_16PolishModel); + mProbers[58] = new nsSingleByteCharSetProber(&Mac_CentraleuropePolishModel); + mProbers[59] = new nsSingleByteCharSetProber(&Ibm852PolishModel); + Reset(); } diff --git a/src/nsSBCSGroupProber.h b/src/nsSBCSGroupProber.h index 1a1266f..75db30e 100644 --- a/src/nsSBCSGroupProber.h +++ b/src/nsSBCSGroupProber.h @@ -40,7 +40,7 @@ #define nsSBCSGroupProber_h__ -#define NUM_OF_SBCS_PROBERS 54 +#define NUM_OF_SBCS_PROBERS 60 class nsCharSetProber; class nsSBCSGroupProber: public nsCharSetProber { diff --git a/src/nsSBCharSetProber.h b/src/nsSBCharSetProber.h index 979b9a7..86e7c21 100644 --- a/src/nsSBCharSetProber.h +++ b/src/nsSBCharSetProber.h @@ -197,5 +197,12 @@ extern const SequenceModel Iso_8859_2SlovakModel; extern const SequenceModel Ibm852SlovakModel; extern const SequenceModel Mac_CentraleuropeSlovakModel; +extern const SequenceModel Windows_1250PolishModel; +extern const SequenceModel Iso_8859_2PolishModel; +extern const SequenceModel Iso_8859_13PolishModel; +extern const SequenceModel Iso_8859_16PolishModel; +extern const SequenceModel Ibm852PolishModel; +extern const SequenceModel Mac_CentraleuropePolishModel; + #endif /* 
nsSingleByteCharSetProber_h__ */ diff --git a/test/pl/ibm852.txt b/test/pl/ibm852.txt new file mode 100644 index 0000000..e420950 --- /dev/null +++ b/test/pl/ibm852.txt @@ -0,0 +1,3 @@ +Zofia (Sonka) Holszaäska herbu Hippocentaurus (ur. ok. 1405, zm. 21 wrze˜nia 1461 w Krakowie) +ksi©¾niczka litewska, kr¢lowa Polski, od 1422 roku czwarta i ostatnia ¾ona Wˆadysˆawa II +Jagieˆˆy. diff --git a/test/pl/iso-8859-13.txt b/test/pl/iso-8859-13.txt new file mode 100644 index 0000000..6bafbc1 --- /dev/null +++ b/test/pl/iso-8859-13.txt @@ -0,0 +1,3 @@ +Zofia (Sonka) Holszañska herbu Hippocentaurus (ur. ok. 1405, zm. 21 wrzeúnia 1461 w Krakowie) +ksiæýniczka litewska, królowa Polski, od 1422 roku czwarta i ostatnia ýona Wùadysùawa II +Jagieùùy. diff --git a/test/pl/iso-8859-16.txt b/test/pl/iso-8859-16.txt new file mode 100644 index 0000000..abe9607 --- /dev/null +++ b/test/pl/iso-8859-16.txt @@ -0,0 +1,3 @@ +Zofia (Sonka) Holszañska herbu Hippocentaurus (ur. ok. 1405, zm. 21 wrze÷nia 1461 w Krakowie) +ksiý¿niczka litewska, królowa Polski, od 1422 roku czwarta i ostatnia ¿ona W³adys³awa II +Jagie³³y. diff --git a/test/pl/iso-8859-2.txt b/test/pl/iso-8859-2.txt new file mode 100644 index 0000000..8ff7d6d --- /dev/null +++ b/test/pl/iso-8859-2.txt @@ -0,0 +1,3 @@ +Zofia (Sonka) Holszañska herbu Hippocentaurus (ur. ok. 1405, zm. 21 wrze¶nia 1461 w Krakowie) +ksiê¿niczka litewska, królowa Polski, od 1422 roku czwarta i ostatnia ¿ona W³adys³awa II +Jagie³³y. diff --git a/test/pl/mac-centraleurope.txt b/test/pl/mac-centraleurope.txt new file mode 100644 index 0000000..48c5901 --- /dev/null +++ b/test/pl/mac-centraleurope.txt @@ -0,0 +1,3 @@ +Zofia (Sonka) HolszaÄska herbu Hippocentaurus (ur. ok. 1405, zm. 21 wrzeænia 1461 w Krakowie) +ksi«ýniczka litewska, kr—lowa Polski, od 1422 roku czwarta i ostatnia ýona W¸adys¸awa II +Jagie¸¸y. 
diff --git a/test/pl/utf-8.txt b/test/pl/utf-8.txt new file mode 100644 index 0000000..8b7e938 --- /dev/null +++ b/test/pl/utf-8.txt @@ -0,0 +1,3 @@ +Zofia (Sonka) Holszańska herbu Hippocentaurus (ur. ok. 1405, zm. 21 września 1461 w Krakowie) +księżniczka litewska, królowa Polski, od 1422 roku czwarta i ostatnia żona Władysława II +Jagiełły. diff --git a/test/pl/windows-1250.txt b/test/pl/windows-1250.txt new file mode 100644 index 0000000..c739798 --- /dev/null +++ b/test/pl/windows-1250.txt @@ -0,0 +1,3 @@ +Zofia (Sonka) Holszañska herbu Hippocentaurus (ur. ok. 1405, zm. 21 wrzeœnia 1461 w Krakowie) +ksiê¿niczka litewska, królowa Polski, od 1422 roku czwarta i ostatnia ¿ona W³adys³awa II +Jagie³³y.