diff --git a/script/BuildLangModelLogs/LangCzechModel.log b/script/BuildLangModelLogs/LangCzechModel.log new file mode 100644 index 0000000..7d9c950 --- /dev/null +++ b/script/BuildLangModelLogs/LangCzechModel.log @@ -0,0 +1,161 @@ += Logs of language model for Czech (cs) = + +- Generated by BuildLangModel.py +- Started: 2016-09-21 03:20:56.824516 +- Maximum depth: 5 +- Max number of pages: 100 + +== Parsed pages == + +Sociální fobie (revision 13567590) +Adaptace (revision 13991192) +Agorafobie (revision 13013445) +Alkoholismus (revision 13822064) +Alprazolam (revision 14082425) +Antidepresivum (revision 14113423) +Asertivita (revision 14111958) +Atenolol (revision 12051880) +Automatické negativní myšlenky (revision 13567590) +Benzodiazepin (revision 13947546) +Beta-blokátory (revision 13428762) +Blud (revision 13888988) +Bohatství (revision 13556478) +Bupropion (revision 13686045) +Citaloparam (revision 13567590) +Clonazepan (revision 13567590) +Crohnova nemoc (revision 13745254) +Deprese (psychologie) (revision 13695735) +Diagnostický a statický manuál mentálních poruch (revision 13567590) +Diagnostický a statistický manuál mentálních poruch (revision 13714660) +Diagnóza (medicína) (revision 13052239) +Dichotomické myšlení (revision 13567590) +Digital object identifier (revision 14138049) +Dopamin (revision 13714274) +Dystymie (revision 13567267) +Důkaz kruhem (revision 13190761) +Elektivní mutismus (revision 9940891) +Emoce (revision 14110033) +Escitalopram (revision 12954987) +Evoluce (revision 13951488) +Expozice (psychologie) (revision 14119474) +Extraverze a introverze (revision 13872996) +Fluoxetin (revision 12955006) +Fluvoxamin (revision 12955006) +Gen (revision 13907182) +Generalizovaná úzkostná porucha (revision 14006709) +Halucinaci (revision 12188143) +Hněv (revision 14057864) +Inteligence (revision 14009781) +International Standard Serial Number (revision 12869806) +Interpersonální psychoterapie (revision 13567590) +Iracionalita (revision 4765977) 
+Ján Praško Pavlov (revision 14086840) +Klinické testování (revision 13530979) +Kognitivní omyl (revision 13107294) +Kognitivní psychologie (revision 11629465) +Kognitivní restrukturalizace (revision 13567360) +Kognitivně behaviorální terapie (revision 13980494) +Komorbidita (revision 11351714) +Lymská borelióza (revision 14068446) +Malé sebevědomí (revision 13567590) +Medical Subject Headings (revision 12239331) +Meditace (revision 13180783) +Mentální černý filtr (revision 13567590) +Mezinárodní klasifikace nemocí (revision 12531067) +Michael Liebowitz (revision 13567590) +Moclobemid (revision 13567590) +Moritova terapie (revision 11960292) +Musturbace (revision 13567590) +Nervozita (revision 13847097) +Noradrenalin (revision 14054165) +Obsedantně kompulzivní porucha (revision 13950365) +Panická ataka (revision 13253537) +Panická porucha (revision 13253537) +Paranoia (revision 14027052) +Paroxetin (revision 12955006) +Pohlavnost (revision 13564689) +Porucha (revision 11039108) +Pravděpodobnost (revision 13596041) +Predestinace (revision 12467403) +Profese (revision 13975485) +Propanolol (revision 12972658) +Psychiatr (revision 12767960) +Psychické trauma (revision 11227535) +Psychoaktivní droga (revision 13939232) +Psychodynamická léčba (revision 13567590) +Psychofarmaka (revision 9928215) +Psycholog (revision 12358728) +Psychoterapie (revision 13874178) +Puberta (revision 12540014) +RIMA (revision 10234728) +Remise (revision 9896748) +Richard Heimberg (revision 13567590) +Rámování myšlenek (revision 13567590) +Schizofrenie (revision 13977456) +Sebevražda (revision 14053884) +Selektivní abstrakce (revision 13567590) +Selektivní inhibitor zpětného vychytávání serotoninu (revision 12955027) +Serotonin (revision 13975104) +Sertralin (revision 12955006) +Skupinová terapie (revision 11964235) +Sociální chování (revision 13507313) +Sociální dovednost (revision 12226347) + +== End of Parsed pages == + +- Wikipedia parsing ended at: 2016-09-21 03:28:11.731386 + +47 
characters appeared 594800 times. + +First 41 characters: +[ 0] Char o: 8.323806321452588 % +[ 1] Char e: 8.040013449899126 % +[ 2] Char n: 6.895595158036315 % +[ 3] Char a: 6.263113651647613 % +[ 4] Char i: 5.650470746469401 % +[ 5] Char t: 5.40383322125084 % +[ 6] Char s: 4.588937457969065 % +[ 7] Char v: 3.8685272360457295 % +[ 8] Char p: 3.6914929388029587 % +[ 9] Char r: 3.6302958977807664 % +[10] Char l: 3.6017148621385338 % +[11] Char í: 3.5733019502353733 % +[12] Char k: 3.301950235373235 % +[13] Char u: 3.1782111634162744 % +[14] Char c: 3.1383658372562206 % +[15] Char d: 3.120208473436449 % +[16] Char m: 2.758406186953598 % +[17] Char h: 2.2747141896435776 % +[18] Char á: 2.156186953597848 % +[19] Char z: 2.0260591795561536 % +[20] Char y: 1.9894082044384667 % +[21] Char j: 1.8979488903833224 % +[22] Char b: 1.8189307330195021 % +[23] Char ě: 1.277236045729657 % +[24] Char é: 1.2291526563550772 % +[25] Char č: 0.9502353732347008 % +[26] Char ž: 0.9214862138533961 % +[27] Char ř: 0.8955951580363146 % +[28] Char ý: 0.7646267652992602 % +[29] Char š: 0.6605581708137189 % +[30] Char f: 0.6260928043039677 % +[31] Char ů: 0.5016812373907196 % +[32] Char g: 0.47041022192333554 % +[33] Char ú: 0.19502353732347008 % +[34] Char x: 0.13685272360457296 % +[35] Char ň: 0.05447209145931405 % +[36] Char w: 0.04488903833221251 % +[37] Char ó: 0.03429724277067922 % +[38] Char ť: 0.02269670477471419 % +[39] Char ď: 0.012104909213180902 % +[40] Char q: 0.007229320780094149 % + +The first 41 characters have an accumulated ratio of 0.9999613315400132. + +1025 sequences found. 
+ +First 512 (typical positive ratio): 0.9786035192432675 +Next 512 (512-1024): 1.6812373907195695e-06 +Rest: 2.0246480655940202e-06 + +- Processing end: 2016-09-21 03:28:12.235582 diff --git a/script/charsets/ibm852.py b/script/charsets/ibm852.py new file mode 100644 index 0000000..28ab185 --- /dev/null +++ b/script/charsets/ibm852.py @@ -0,0 +1,72 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +# ##### BEGIN LICENSE BLOCK ##### +# Version: MPL 1.1/GPL 2.0/LGPL 2.1 +# +# The contents of this file are subject to the Mozilla Public License Version +# 1.1 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License +# for the specific language governing rights and limitations under the +# License. +# +# The Original Code is Mozilla Universal charset detector code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 2001 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Jehan +# +# Alternatively, the contents of this file may be used under the terms of +# either the GNU General Public License Version 2 or later (the "GPL"), or +# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +# in which case the provisions of the GPL or the LGPL are applicable instead +# of those above. If you wish to allow use of your version of this file only +# under the terms of either the GPL or the LGPL, and not to allow others to +# use your version of this file under the terms of the MPL, indicate your +# decision by deleting the provisions above and replace them with the notice +# and other provisions required by the GPL or the LGPL. 
If you do not delete +# the provisions above, a recipient may use your version of this file under +# the terms of any one of the MPL, the GPL or the LGPL. +# +# ##### END LICENSE BLOCK ##### + +from codepoints import * + +name = 'IBM852' +aliases = ['CP852'] + +language = \ +{ + 'complete': [ 'bs', 'hr', 'cs', 'de', 'hu', 'pl', 'sr', 'sk', 'sl', + 'hsb', 'dsb', 'tk' ], + 'incomplete': [ 'ro' ] +} + +# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF # +charmap = \ +[ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X + SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X + SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 8X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,LET, # 9X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,LET,LET,LET,SYM,SYM, # AX + SYM,SYM,SYM,SYM,SYM,LET,LET,LET,LET,SYM,SYM,SYM,SYM,LET,LET,SYM, # BX + SYM,SYM,SYM,SYM,SYM,SYM,LET,LET,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # CX + LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,LET,LET,SYM, # DX + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM, # EX + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,LET,LET,LET,SYM,SYM, # FX +] diff --git a/script/charsets/mac-centraleurope.py b/script/charsets/mac-centraleurope.py new file mode 100644 index 0000000..bf201e6 --- /dev/null +++ b/script/charsets/mac-centraleurope.py @@ -0,0 +1,72 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +# ##### BEGIN LICENSE BLOCK ##### +# Version: MPL 1.1/GPL 2.0/LGPL 2.1 +# +# The contents of this file are subject to 
the Mozilla Public License Version +# 1.1 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License +# for the specific language governing rights and limitations under the +# License. +# +# The Original Code is Mozilla Universal charset detector code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 2001 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Jehan +# +# Alternatively, the contents of this file may be used under the terms of +# either the GNU General Public License Version 2 or later (the "GPL"), or +# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +# in which case the provisions of the GPL or the LGPL are applicable instead +# of those above. If you wish to allow use of your version of this file only +# under the terms of either the GPL or the LGPL, and not to allow others to +# use your version of this file under the terms of the MPL, indicate your +# decision by deleting the provisions above and replace them with the notice +# and other provisions required by the GPL or the LGPL. If you do not delete +# the provisions above, a recipient may use your version of this file under +# the terms of any one of the MPL, the GPL or the LGPL. 
+# +# ##### END LICENSE BLOCK ##### + +from codepoints import * + +name = 'MAC-CENTRALEUROPE' +aliases = [] + +language = \ +{ + 'complete': [ 'bs', 'hr', 'cs', 'de', 'hu', 'pl', 'sr', 'sk', 'sl', + 'hsb', 'dsb', 'tk' ], + 'incomplete': [ 'ro' ] +} + +# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF # +charmap = \ +[ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X + SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X + SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 8X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 9X + SYM,SYM,LET,SYM,SYM,SYM,SYM,LET,SYM,SYM,SYM,LET,SYM,SYM,LET,LET, # AX + LET,LET,SYM,SYM,LET,LET,SYM,SYM,LET,LET,LET,LET,LET,LET,LET,LET, # BX + LET,LET,SYM,SYM,LET,LET,SYM,SYM,SYM,SYM,SYM,LET,LET,LET,LET,LET, # CX + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,LET,LET,LET,LET,SYM,SYM,LET,LET, # DX + LET,LET,SYM,SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # EX + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM, # FX +] diff --git a/script/langs/cs.py b/script/langs/cs.py new file mode 100644 index 0000000..009ceec --- /dev/null +++ b/script/langs/cs.py @@ -0,0 +1,80 @@ +#!/bin/python3 +# -*- coding: utf-8 -*- + +# ##### BEGIN LICENSE BLOCK ##### +# Version: MPL 1.1/GPL 2.0/LGPL 2.1 +# +# The contents of this file are subject to the Mozilla Public License Version +# 1.1 (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License +# for the specific language governing rights and limitations under the +# License. +# +# The Original Code is Mozilla Universal charset detector code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 2001 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Jehan +# +# Alternatively, the contents of this file may be used under the terms of +# either the GNU General Public License Version 2 or later (the "GPL"), or +# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +# in which case the provisions of the GPL or the LGPL are applicable instead +# of those above. If you wish to allow use of your version of this file only +# under the terms of either the GPL or the LGPL, and not to allow others to +# use your version of this file under the terms of the MPL, indicate your +# decision by deleting the provisions above and replace them with the notice +# and other provisions required by the GPL or the LGPL. If you do not delete +# the provisions above, a recipient may use your version of this file under +# the terms of any one of the MPL, the GPL or the LGPL. +# +# ##### END LICENSE BLOCK ##### + +import re + +## Mandatory Properties ## + +# The human name for the language, in English. +name = 'Czech' +# Use 2-letter ISO 639-1 if possible, 3-letter ISO code otherwise, +# or use another catalog as a last resort. +code = 'cs' +# ASCII characters are also used in Czech. +use_ascii = True +# The charsets we want to support and create data for. +charsets = ['ISO-8859-2', 'Windows-1250', 'IBM852', 'MAC-CENTRALEUROPE'] + +## Optional Properties ## + +# Alphabet characters.
+# If use_ascii=True, there is no need to add any ASCII characters. +# If case_mapping=True, there is no need to add several cases of the same +# character (provided Python algorithms know the right cases). +alphabet = 'áčďéěíňóřšťúůýž' +# The starred page which was featured on the main page when I created +# the data. +start_pages = ['Sociální fobie'] +# Gives the possibility to select another code for the Wikipedia URL. +wikipedia_code = code +# 'a' and 'A' will be considered the same character, and so on. +# This uses Python's algorithm to determine the upper/lower case of a given +# character. +case_mapping = True + +# A function to clean content returned by the `wikipedia` python lib, +# in case some unwanted data has been overlooked. +# Note that we are already cleaning away the '=' from the title syntax +# of Wikipedia, as well as double spaces. But sometimes, Wikipedia in +# some languages may return weird syntax or UI text which should be +# discarded. If you encounter one of these cases, use this function. +def clean_wikipedia_content(content): + # Do your garbage text cleaning here.
+ return content diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index b5c4620..9493d17 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -9,7 +9,7 @@ set( JpCntx.cpp LangModels/LangArabicModel.cpp LangModels/LangBulgarianModel.cpp - LangModels/LangRussianModel.cpp + LangModels/LangCzechModel.cpp LangModels/LangEsperantoModel.cpp LangModels/LangFrenchModel.cpp LangModels/LangDanishModel.cpp @@ -21,6 +21,7 @@ set( LangModels/LangLatvianModel.cpp LangModels/LangMalteseModel.cpp LangModels/LangPortugueseModel.cpp + LangModels/LangRussianModel.cpp LangModels/LangSpanishModel.cpp LangModels/LangThaiModel.cpp LangModels/LangTurkishModel.cpp diff --git a/src/LangModels/LangCzechModel.cpp b/src/LangModels/LangCzechModel.cpp new file mode 100644 index 0000000..4e74ed2 --- /dev/null +++ b/src/LangModels/LangCzechModel.cpp @@ -0,0 +1,281 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is Mozilla Communicator client code. + * + * The Initial Developer of the Original Code is + * Netscape Communications Corporation. + * Portions created by the Initial Developer are Copyright (C) 1998 + * the Initial Developer. All Rights Reserved. 
+ * + * Contributor(s): + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. + * + * ***** END LICENSE BLOCK ***** */ + +#include "../nsSBCharSetProber.h" + +/********* Language model for: Czech *********/ + +/** + * Generated by BuildLangModel.py + * On: 2016-09-21 03:28:11.733089 + **/ + +/* Character Mapping Table: + * ILL: illegal character. + * CTR: control character specific to the charset. + * RET: carriage return (0x0D) or line feed (0x0A). + * SYM: symbol (punctuation) that does not belong to a word. + * NUM: 0 - 9. + * + * Other characters are ordered by probabilities + * (0 is the most common character in the language). + * + * Orders are generic to a language. So the codepoint with order X in + * CHARSET1 maps to the same character as the codepoint with the same + * order X in CHARSET2 for the same language. + * As such, it is possible for a charset to have missing orders. For instance the + * ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 + * even though they are both used for French. Same for the euro sign.
+ */ +static const unsigned char Windows_1250_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 3, 22, 14, 15, 1, 30, 32, 17, 4, 21, 12, 10, 16, 2, 0, /* 4X */ + 8, 40, 9, 6, 5, 13, 7, 36, 34, 20, 19,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 3, 22, 14, 15, 1, 30, 32, 17, 4, 21, 12, 10, 16, 2, 0, /* 6X */ + 8, 40, 9, 6, 5, 13, 7, 36, 34, 20, 19,SYM,SYM,SYM,SYM,CTR, /* 7X */ + SYM,ILL,SYM,ILL,SYM,SYM,SYM,SYM,ILL,SYM, 29,SYM, 46, 38, 26, 47, /* 8X */ + ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,ILL,SYM, 29,SYM, 46, 38, 26, 48, /* 9X */ + SYM,SYM,SYM, 49,SYM, 50,SYM,SYM,SYM,SYM, 51,SYM,SYM,SYM,SYM, 52, /* AX */ + SYM,SYM,SYM, 53,SYM,SYM,SYM,SYM,SYM, 54, 55,SYM, 45,SYM, 45, 56, /* BX */ + 57, 18, 58, 59, 42, 60, 61, 62, 25, 24, 63, 64, 23, 11, 65, 39, /* CX */ + 66, 67, 35, 37, 68, 69, 41,SYM, 27, 31, 33, 70, 43, 28, 71, 72, /* DX */ + 73, 18, 74, 75, 42, 76, 77, 78, 25, 24, 79, 80, 23, 11, 81, 39, /* EX */ + 82, 83, 35, 37, 84, 85, 41,SYM, 27, 31, 33, 86, 43, 28, 87,SYM, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Mac_Centraleurope_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 3, 22, 14, 15, 1, 30, 32, 17, 4, 21, 12, 10, 16, 2, 0, /* 4X */ + 8, 40, 9, 6, 5, 13, 7, 36, 34, 20, 19,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 3, 22, 14, 15, 1, 30, 32, 17, 4, 21, 12, 10, 16, 2, 0, /* 6X */ + 8, 40, 9, 6, 5, 13, 7, 36, 34, 20, 19,SYM,SYM,SYM,SYM,CTR, /* 7X */ + 42, 88, 89, 24, 90, 41, 
43, 18, 91, 25, 42, 25, 92, 93, 24, 94, /* 8X */ + 95, 39, 11, 39, 44, 44, 96, 37, 97, 98, 41, 99, 33, 23, 23, 43, /* 9X */ + SYM,SYM,100,SYM,SYM,SYM,SYM,101,SYM,SYM,SYM,102,SYM,SYM,103,104, /* AX */ + 105,106,SYM,SYM,107,108,SYM,SYM,109,110,111, 45, 45,112,113,114, /* BX */ + 115,116,SYM,SYM,117, 35,SYM,SYM,SYM,SYM,SYM, 35,118,119,120,121, /* CX */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,122,123,124, 27,SYM,SYM, 27,125, /* DX */ + 126, 29,SYM,SYM, 29, 46, 46, 18, 38, 38, 11, 26, 26,127, 37,128, /* EX */ + 129, 31, 33, 31,130,131,132,133, 28, 28,134,135,136,137,138,SYM, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Ibm852_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 3, 22, 14, 15, 1, 30, 32, 17, 4, 21, 12, 10, 16, 2, 0, /* 4X */ + 8, 40, 9, 6, 5, 13, 7, 36, 34, 20, 19,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 3, 22, 14, 15, 1, 30, 32, 17, 4, 21, 12, 10, 16, 2, 0, /* 6X */ + 8, 40, 9, 6, 5, 13, 7, 36, 34, 20, 19,SYM,SYM,SYM,SYM,CTR, /* 7X */ + 139, 43, 24,140, 42, 31,141,142,143,144,145,146,147,148, 42,149, /* 8X */ + 24,150,151,152, 41, 45, 45, 46, 46, 41, 43, 38, 38,153,SYM, 25, /* 9X */ + 18, 11, 37, 33,154,155, 26, 26,156,157,SYM,158, 25,159,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM,SYM, 18,160, 23,161,SYM,SYM,SYM,SYM,162,163,SYM, /* BX */ + SYM,SYM,SYM,SYM,SYM,SYM,164,165,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* CX */ + 166,167, 39,168, 39, 35, 11,169, 23,SYM,SYM,SYM,SYM,170, 31,SYM, /* DX */ + 37,171,172,173,174, 35, 29, 29,175, 33,176,177, 28, 28,178,SYM, /* EX */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,179, 27, 27,SYM,SYM, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char 
Iso_8859_2_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 3, 22, 14, 15, 1, 30, 32, 17, 4, 21, 12, 10, 16, 2, 0, /* 4X */ + 8, 40, 9, 6, 5, 13, 7, 36, 34, 20, 19,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 3, 22, 14, 15, 1, 30, 32, 17, 4, 21, 12, 10, 16, 2, 0, /* 6X */ + 8, 40, 9, 6, 5, 13, 7, 36, 34, 20, 19,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM,180,SYM,181,SYM, 45, 46,SYM,SYM, 29,182, 38,183,SYM, 26,184, /* AX */ + SYM,185,SYM,186,SYM, 45, 46,SYM,SYM, 29,187, 38,188,SYM, 26,189, /* BX */ + 190, 18,191,192, 42,193,194,195, 25, 24,196,197, 23, 11,198, 39, /* CX */ + 199,200, 35, 37,201,202, 41,SYM, 27, 31, 33,203, 43, 28,204,205, /* DX */ + 206, 18,207,208, 42,209,210,211, 25, 24,212,213, 23, 11,214, 39, /* EX */ + 215,216, 35, 37,217,218, 41,SYM, 27, 31, 33,219, 43, 28,220,SYM, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + + +/* Model Table: + * Total sequences: 1025 + * First 512 sequences: 0.9786035192432675 + * Next 512 sequences (512-1024): 0.02139445610866691 + * Rest: 2.0246480655940202e-06 + * Negative sequences: TODO + */ +static const PRUint8 CzechLangModel[] = +{ + 2,2,3,2,3,3,3,3,3,3,3,2,3,3,3,3,3,3,2,3, + 2,3,3,0,0,3,3,3,0,2,3,0,3,0,3,2,2,0,2,0,0, + 3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,3,3,3,2,3, + 2,3,3,0,0,3,3,3,0,3,3,2,3,2,3,2,2,2,2,2,2, + 3,3,3,3,3,3,3,2,0,2,3,3,3,3,3,3,2,3,3,3, + 3,2,2,3,3,2,2,0,3,2,3,3,3,0,2,0,0,2,0,0,2, + 3,3,3,2,2,3,3,3,3,3,3,0,3,3,3,3,3,3,0,3, + 3,3,3,0,0,3,3,3,0,3,3,0,3,0,3,2,2,0,2,2,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,2,3,3, + 0,2,3,0,2,3,3,2,0,3,3,0,3,0,2,2,2,2,2,0,2, + 
3,3,3,3,3,2,2,3,2,3,3,3,3,3,2,2,2,3,3,3, + 3,2,2,3,3,2,0,3,3,3,0,3,2,0,0,2,2,2,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,3,2, + 3,2,3,0,2,2,0,0,2,0,2,2,2,2,0,2,2,0,2,0,0, + 3,3,3,3,3,2,2,0,2,3,3,3,3,3,2,3,0,2,3,3, + 3,2,2,3,3,2,2,2,3,3,0,3,0,0,0,2,0,2,0,0,0, + 3,3,3,3,3,3,3,0,2,3,3,3,2,3,2,2,2,2,3,0, + 3,2,2,3,2,2,0,3,2,2,2,3,2,0,2,2,0,0,0,0,0, + 3,3,3,3,3,3,3,3,3,2,2,2,3,3,2,3,3,2,3,3, + 3,0,3,0,3,3,2,0,3,2,2,3,3,0,0,2,2,2,2,2,2, + 3,3,3,3,3,3,2,2,2,2,2,3,3,3,2,2,2,2,3,3, + 3,0,2,0,3,2,2,0,3,3,2,3,2,0,0,2,0,2,0,0,0, + 0,2,3,0,2,3,3,3,3,3,3,2,3,0,3,2,3,3,0,3, + 0,3,2,0,0,3,3,2,0,2,0,0,2,0,0,0,0,0,2,0,0, + 3,3,3,3,3,3,2,3,0,3,3,0,2,3,3,3,2,2,3,2, + 3,2,3,0,3,2,2,2,3,0,2,3,2,0,0,0,0,2,0,0,0, + 2,2,3,3,3,3,3,3,3,3,3,0,3,2,3,3,3,3,3,3, + 2,3,3,0,0,3,3,2,0,3,2,0,2,0,2,2,2,0,2,2,0, + 3,3,3,3,3,3,2,2,2,2,2,3,3,2,2,3,2,3,2,2, + 2,0,2,0,2,0,0,0,0,0,2,2,0,0,0,2,0,0,0,0,2, + 3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,2,3,2,3,2, + 3,0,2,3,3,2,2,2,2,2,2,3,2,0,2,2,2,0,0,0,0, + 3,3,3,3,3,2,2,0,3,3,3,3,2,3,2,2,2,2,3,2, + 3,2,3,3,3,2,3,2,2,2,2,3,2,0,0,2,0,2,0,0,0, + 3,3,3,3,3,3,2,3,2,3,3,2,2,3,2,2,2,0,3,0, + 3,2,2,0,2,2,2,2,3,0,2,2,0,0,0,0,2,2,2,0,0, + 0,0,3,0,0,3,3,3,2,3,3,0,3,0,3,3,3,3,0,3, + 0,2,2,0,0,2,3,2,0,3,2,0,0,0,0,2,0,0,0,2,0, + 3,3,3,3,3,3,2,3,3,2,3,3,3,3,2,3,3,3,3,2, + 3,2,2,0,2,2,0,2,2,2,2,2,0,2,0,2,0,2,0,0,0, + 2,2,3,2,2,3,3,3,3,2,3,2,3,2,3,2,3,3,0,3, + 0,2,3,0,0,2,3,2,0,3,2,0,2,2,2,0,0,0,2,0,0, + 2,3,3,3,3,2,3,2,2,2,2,3,2,2,2,2,3,2,2,2, + 0,2,2,0,0,2,0,0,0,3,2,2,0,2,0,2,0,2,0,2,0, + 3,3,3,3,3,3,3,3,0,3,3,3,2,3,2,2,2,2,3,2, + 3,3,2,3,2,2,0,2,3,2,0,2,0,0,0,2,0,2,0,0,0, + 0,0,3,2,0,3,3,2,3,3,3,0,3,0,3,3,2,3,0,2, + 0,3,0,0,0,2,3,3,0,3,0,0,0,0,0,2,0,0,2,2,0, + 2,0,3,0,0,3,2,2,2,2,2,0,3,0,0,2,3,3,0,3, + 0,0,0,0,0,3,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,3,2,0,0,2,3,3,3,3,2,0,0,0,3,0, + 0,0,3,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0, + 3,3,3,2,3,2,2,0,0,2,2,3,2,2,2,3,2,0,3,0, + 0,0,2,0,0,0,0,0,0,3,0,2,0,0,0,2,0,0,0,2,0, + 2,3,2,3,3,0,2,0,0,0,0,3,2,2,0,0,0,0,2,2, + 
0,0,2,0,0,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0, + 0,0,2,0,0,3,3,3,2,3,2,0,2,2,3,2,3,2,0,3, + 0,2,3,0,0,2,2,2,0,3,2,0,0,0,0,0,0,0,0,0,0, + 2,3,3,3,3,3,2,2,2,0,3,3,3,3,0,0,0,0,2,0, + 0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,2,0,0, + 3,3,2,3,3,2,2,0,2,3,3,2,0,3,0,2,2,0,2,2, + 3,0,0,0,2,0,0,0,2,0,2,2,0,2,0,0,0,2,0,0,0, + 0,0,2,2,0,0,3,3,0,2,2,0,2,0,2,2,3,2,0,3, + 0,2,2,0,0,2,3,2,0,0,0,0,0,0,0,2,0,0,0,0,0, + 3,3,3,3,3,2,2,2,2,3,3,0,0,2,2,2,2,2,2,0, + 2,0,0,0,2,0,0,0,0,0,2,2,2,0,0,0,0,0,0,0,0, + 0,0,2,0,0,2,3,2,2,2,2,0,2,0,2,2,2,2,0,3, + 0,2,2,0,0,3,2,2,0,0,2,0,0,0,0,0,0,0,0,0,0, + 2,2,2,2,3,3,0,0,3,2,2,2,2,2,2,2,2,0,2,0, + 2,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,2,0,0,0,0, + 2,0,0,2,0,0,2,0,0,0,0,0,2,3,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0, + 2,2,2,2,3,0,2,0,0,0,2,0,2,2,2,0,0,2,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0, + 0,0,2,0,0,0,2,0,0,0,2,0,0,0,0,2,2,0,0,3, + 0,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0, + 2,0,0,2,0,0,0,0,0,0,0,0,2,2,0,0,0,0,2,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 2,0,0,2,0,2,0,0,0,0,0,0,0,2,0,0,2,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +}; + + +const SequenceModel Windows_1250CzechModel = +{ + Windows_1250_CharToOrderMap, + CzechLangModel, + 41, + (float)0.9786035192432675, + PR_TRUE, + "WINDOWS-1250" +}; + +const SequenceModel Mac_CentraleuropeCzechModel = +{ + Mac_Centraleurope_CharToOrderMap, + CzechLangModel, + 41, + (float)0.9786035192432675, + PR_TRUE, + "MAC-CENTRALEUROPE" +}; + +const SequenceModel Ibm852CzechModel = +{ + Ibm852_CharToOrderMap, + CzechLangModel, + 41, + (float)0.9786035192432675, + PR_TRUE, + "IBM852" +}; + +const SequenceModel Iso_8859_2CzechModel = +{ + Iso_8859_2_CharToOrderMap, + CzechLangModel, + 41, + (float)0.9786035192432675, + PR_TRUE, + "ISO-8859-2" +}; \ No newline at end of file diff --git a/src/nsSBCSGroupProber.cpp b/src/nsSBCSGroupProber.cpp index ecfacab..ec3fe3c 100644 --- 
a/src/nsSBCSGroupProber.cpp +++ b/src/nsSBCSGroupProber.cpp @@ -126,6 +126,11 @@ nsSBCSGroupProber::nsSBCSGroupProber() mProbers[45] = new nsSingleByteCharSetProber(&Iso_8859_3MalteseModel); + mProbers[46] = new nsSingleByteCharSetProber(&Windows_1250CzechModel); + mProbers[47] = new nsSingleByteCharSetProber(&Iso_8859_2CzechModel); + mProbers[48] = new nsSingleByteCharSetProber(&Mac_CentraleuropeCzechModel); + mProbers[49] = new nsSingleByteCharSetProber(&Ibm852CzechModel); + Reset(); } diff --git a/src/nsSBCSGroupProber.h b/src/nsSBCSGroupProber.h index 5053a49..d61225f 100644 --- a/src/nsSBCSGroupProber.h +++ b/src/nsSBCSGroupProber.h @@ -40,7 +40,7 @@ #define nsSBCSGroupProber_h__ -#define NUM_OF_SBCS_PROBERS 46 +#define NUM_OF_SBCS_PROBERS 50 class nsCharSetProber; class nsSBCSGroupProber: public nsCharSetProber { diff --git a/src/nsSBCharSetProber.h b/src/nsSBCharSetProber.h index a48591d..c3e8432 100644 --- a/src/nsSBCharSetProber.h +++ b/src/nsSBCharSetProber.h @@ -187,5 +187,10 @@ extern const SequenceModel Windows_1252PortugueseModel; extern const SequenceModel Iso_8859_3MalteseModel; +extern const SequenceModel Windows_1250CzechModel; +extern const SequenceModel Iso_8859_2CzechModel; +extern const SequenceModel Ibm852CzechModel; +extern const SequenceModel Mac_CentraleuropeCzechModel; + #endif /* nsSingleByteCharSetProber_h__ */