diff --git a/README.md b/README.md index f83a717..c20b575 100644 --- a/README.md +++ b/README.md @@ -26,6 +26,13 @@ Techniques used by universalchardet are described at http://www.mozilla.org/proj * EUC-TW * GB18030 * HZ-GB-2312 + * Croatian: + * ISO-8859-2 + * ISO-8859-13 + * ISO-8859-16 + * Windows-1250 + * IBM852 + * MAC-CENTRALEUROPE * Czech * Windows-1250 * ISO-8859-2 diff --git a/script/BuildLangModelLogs/LangCroatianModel.log b/script/BuildLangModelLogs/LangCroatianModel.log new file mode 100644 index 0000000..a79f123 --- /dev/null +++ b/script/BuildLangModelLogs/LangCroatianModel.log @@ -0,0 +1,157 @@ += Logs of language model for Croatian (hr) = + +- Generated by BuildLangModel.py +- Started: 2016-09-25 23:41:35.999066 +- Maximum depth: 5 +- Max number of pages: 100 + +== Parsed pages == + +Fizika čvrstog stanja (revision 4776646) +Agregatno stanje (revision 4663090) +Alnico (revision 3915185) +Aluminij (revision 4772363) +Amorfna tvar (revision 4659679) +Antimon (revision 4420072) +Antoine Henri Becquerel (revision 4634966) +Apsolutna nula (revision 4706683) +Arsen (revision 4540773) +Arthur Holly Compton (revision 4736068) +Atom (revision 4778162) +Atomska jezgra (revision 4540956) +Bell Labs (revision 4769518) +Bor (element) (revision 4602837) +Brian Josephson (revision 4403761) +Cink (revision 4537854) +Coulombov zakon (revision 4710338) +Dijamant (revision 4625335) +Dimenzija (revision 4669110) +Dinastija Han (revision 4541686) +Dislokacija (revision 4668021) +EV (revision 4538157) +Eksponencijalna funkcija (revision 4160157) +Električna struja (revision 4280621) +Električna vodljivost (revision 4460160) +Električni izolator (revision 4649046) +Električni luk (revision 4646980) +Električni naboj (revision 4727496) +Električni otpor (revision 4593314) +Električni vodič (revision 4333008) +Električno polje (revision 4705679) +Elektrolit (revision 4486319) +Elektromagnetsko zračenje (revision 4537368) +Elektron (revision 4630705) +Elektronika (revision 
4090016) +Elektronska konfiguracija (revision 4420620) +Elektronski mikroskop (revision 4413214) +Elektrotehnika (revision 4596912) +Energetika (revision 4586277) +Energija (revision 4719089) +Fermi-Diracova statistika (revision 3934172) +Feromagnetizam (revision 4760511) +Fizika (revision 4769955) +Fizika kondenzirane tvari (revision 4769955) +Fizikalna veličina (revision 4621676) +Fosfor (revision 4602427) +Fotodioda (revision 3939069) +Fotoelektrični učinak (revision 4704417) +Foton (revision 4537522) +Fotonaponski sustavi (revision 4418887) +Francuski jezik (revision 4771366) +Galij (revision 4537855) +Genitiv (revision 4625199) +Germanij (revision 4537856) +Helij (revision 4747001) +Henri (revision 3922500) +Indij (revision 4537867) +Integrirani krug (revision 4447159) +Ion (revision 4549144) +Ioniziranje (revision 4566703) +Izolator (revision 4649046) +John Bardeen (revision 4403736) +Kadmij (revision 3921860) +Kelvin (revision 4624351) +Keramika (revision 4599177) +Kinetička energija (revision 4719090) +Klasična mehanika (revision 4637127) +Kompas (revision 4702880) +Kondenzacija (revision 4477825) +Kondenzirana tvar (revision 4776646) +Konstrukcija (revision 4680450) +Kovalentna veza (revision 4641419) +Kristal (revision 4720329) +Kristalna rešetka (revision 4479184) +Kristalografija (revision 4105956) +Krutine (revision 4625162) +Kubični kristalni sustav (revision 4344344) +Kubični metar (revision 4616551) +Kvantna mehanika (revision 4541215) +Latinski jezik (revision 4760544) +Luminiscencija (revision 4708222) +Magnet (revision 4603344) +Magnetizam (revision 4760040) +Magnetska permeabilnost (revision 4675996) +Magnetska vodljivost (revision 4736934) +Magnetski moment (revision 4410235) +Magnetsko polje (revision 4678057) +Materijal (revision 4669230) +Mehanika (revision 4698699) +Metal (revision 4671710) +Metan (revision 4422418) +Metar (revision 4655527) +Mjerna veličina (revision 4621676) +Molekula (revision 4539232) +Molekule (revision 4539232) +Napon 
(revision 4585417) +Niskotemperaturna fizika (revision 4657522) +Njemački jezik (revision 4731246) +Optika (revision 4768098) + +== End of Parsed pages == + +- Wikipedia parsing ended at: 2016-09-25 23:50:27.589690 + +49 characters appeared 500582 times. + +First 31 characters: +[ 0] Char a: 10.808019465342342 % +[ 1] Char i: 10.18554402675286 % +[ 2] Char e: 9.571259054460608 % +[ 3] Char o: 8.468143081453189 % +[ 4] Char n: 6.952906816465634 % +[ 5] Char t: 5.369549843981606 % +[ 6] Char r: 5.331993559496746 % +[ 7] Char j: 5.102860270644969 % +[ 8] Char s: 4.717109284792501 % +[ 9] Char k: 4.013927788054705 % +[10] Char l: 3.854713113935379 % +[11] Char u: 3.786792173909569 % +[12] Char m: 3.730058212240951 % +[13] Char v: 3.0989927724129114 % +[14] Char p: 2.67308852495695 % +[15] Char d: 2.6135578186990345 % +[16] Char z: 1.8931963194841206 % +[17] Char g: 1.5665765049482403 % +[18] Char č: 1.161048539500022 % +[19] Char b: 1.1440683044935693 % +[20] Char c: 1.007627122029957 % +[21] Char h: 0.8006680224219008 % +[22] Char f: 0.5159993767254915 % +[23] Char š: 0.422907735395999 % +[24] Char ž: 0.3611795869607777 % +[25] Char ć: 0.34959307366225717 % +[26] Char đ: 0.2195444502598975 % +[27] Char y: 0.11306838839590717 % +[28] Char w: 0.07291512679241363 % +[29] Char x: 0.04534721584076135 % +[30] Char q: 0.02477116636235422 % + +The first 31 characters have an accumulated ratio of 0.9997702674087363. + +712 sequences found. 
+ +First 512 (typical positive ratio): 0.9989731099787131 +Next 512 (512-1024): 1.9976747066414694e-06 +Rest: 3.7513395167998453e-17 + +- Processing end: 2016-09-25 23:50:27.987029 diff --git a/script/langs/hr.py b/script/langs/hr.py new file mode 100644 index 0000000..3b0dea5 --- /dev/null +++ b/script/langs/hr.py @@ -0,0 +1,59 @@ +#!/bin/python3 +# -*- coding: utf-8 -*- + +# ##### BEGIN LICENSE BLOCK ##### +# Version: MPL 1.1/GPL 2.0/LGPL 2.1 +# +# The contents of this file are subject to the Mozilla Public License Version +# 1.1 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License +# for the specific language governing rights and limitations under the +# License. +# +# The Original Code is Mozilla Universal charset detector code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 2001 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Jehan +# +# Alternatively, the contents of this file may be used under the terms of +# either the GNU General Public License Version 2 or later (the "GPL"), or +# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +# in which case the provisions of the GPL or the LGPL are applicable instead +# of those above. If you wish to allow use of your version of this file only +# under the terms of either the GPL or the LGPL, and not to allow others to +# use your version of this file under the terms of the MPL, indicate your +# decision by deleting the provisions above and replace them with the notice +# and other provisions required by the GPL or the LGPL. 
If you do not delete +# the provisions above, a recipient may use your version of this file under +# the terms of any one of the MPL, the GPL or the LGPL. +# +# ##### END LICENSE BLOCK ##### + +import re + +## Mandatory Properties ## + +name = 'Croatian' +code = 'hr' +use_ascii = True +charsets = ['ISO-8859-2', 'ISO-8859-13', 'ISO-8859-16', + 'Windows-1250', 'IBM852', 'MAC-CENTRALEUROPE'] + +## Optional Properties ## + +# Alphabet characters. +alphabet = 'čćđšž' +# The featured (starred) page that was showcased on the main page when I +# created the data. +start_pages = ['Fizika čvrstog stanja'] +wikipedia_code = code +case_mapping = True diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index f859915..1fbed29 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -9,6 +9,7 @@ set( JpCntx.cpp LangModels/LangArabicModel.cpp LangModels/LangBulgarianModel.cpp + LangModels/LangCroatianModel.cpp LangModels/LangCzechModel.cpp LangModels/LangEsperantoModel.cpp LangModels/LangFinnishModel.cpp diff --git a/src/LangModels/LangCroatianModel.cpp b/src/LangModels/LangCroatianModel.cpp new file mode 100644 index 0000000..58f882e --- /dev/null +++ b/src/LangModels/LangCroatianModel.cpp @@ -0,0 +1,292 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is Mozilla Communicator client code.
+ * + * The Initial Developer of the Original Code is + * Netscape Communications Corporation. + * Portions created by the Initial Developer are Copyright (C) 1998 + * the Initial Developer. All Rights Reserved. + * + * Contributor(s): + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. + * + * ***** END LICENSE BLOCK ***** */ + +#include "../nsSBCharSetProber.h" + +/********* Language model for: Croatian *********/ + +/** + * Generated by BuildLangModel.py + * On: 2016-09-25 23:50:27.590137 + **/ + +/* Character Mapping Table: + * ILL: illegal character. + * CTR: control character specific to the charset. + * RET: carriage return or line feed. + * SYM: symbol (punctuation) that does not belong to a word. + * NUM: 0 - 9. + * + * Other characters are ordered by probabilities + * (0 is the most common character in the language). + * + * Orders are generic to a language. So the codepoint with order X in + * CHARSET1 maps to the same character as the codepoint with the same + * order X in CHARSET2 for the same language. + * As such, a given charset may be missing some orders. For instance the + * ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 + * even though they are both used for French.
Same for the euro sign. + */ +static const unsigned char Windows_1250_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 19, 20, 15, 2, 22, 17, 21, 1, 7, 9, 10, 12, 4, 3, /* 4X */ + 14, 30, 6, 8, 5, 11, 13, 28, 29, 27, 16,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 19, 20, 15, 2, 22, 17, 21, 1, 7, 9, 10, 12, 4, 3, /* 6X */ + 14, 30, 6, 8, 5, 11, 13, 28, 29, 27, 16,SYM,SYM,SYM,SYM,CTR, /* 7X */ + SYM,ILL,SYM,ILL,SYM,SYM,SYM,SYM,ILL,SYM, 23,SYM, 49, 50, 24, 51, /* 8X */ + ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,ILL,SYM, 23,SYM, 52, 53, 24, 54, /* 9X */ + SYM,SYM,SYM, 40,SYM, 55,SYM,SYM,SYM,SYM, 56,SYM,SYM,SYM,SYM, 57, /* AX */ + SYM,SYM,SYM, 40,SYM,SYM,SYM,SYM,SYM, 58, 59,SYM, 60,SYM, 61, 62, /* BX */ + 63, 41, 43, 64, 36, 65, 25, 39, 18, 31, 66, 47, 67, 68, 69, 70, /* CX */ + 26, 71, 72, 44, 73, 74, 32,SYM, 75, 76, 48, 77, 33, 78, 79, 80, /* DX */ + 81, 41, 43, 82, 36, 83, 25, 39, 18, 31, 84, 47, 85, 86, 87, 88, /* EX */ + 26, 89, 90, 44, 91, 92, 32,SYM, 93, 94, 48, 95, 33, 96, 97,SYM, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Iso_8859_2_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 19, 20, 15, 2, 22, 17, 21, 1, 7, 9, 10, 12, 4, 3, /* 4X */ + 14, 30, 6, 8, 5, 11, 13, 28, 29, 27, 16,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 19, 20, 15, 2, 22, 17, 21, 1, 7, 9, 10, 12, 4, 3, /* 6X */ + 14, 30, 6, 8, 5, 11, 13, 28, 29, 27, 16,SYM,SYM,SYM,SYM,CTR, /* 7X */ + 
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM, 98,SYM, 40,SYM, 99,100,SYM,SYM, 23,101,102,103,SYM, 24,104, /* AX */ + SYM,105,SYM, 40,SYM,106,107,SYM,SYM, 23,108,109,110,SYM, 24,111, /* BX */ + 112, 41, 43,113, 36,114, 25, 39, 18, 31,115, 47,116,117,118,119, /* CX */ + 26,120,121, 44,122,123, 32,SYM,124,125, 48,126, 33,127,128,129, /* DX */ + 130, 41, 43,131, 36,132, 25, 39, 18, 31,133, 47,134,135,136,137, /* EX */ + 26,138,139, 44,140,141, 32,SYM,142,143, 48,144, 33,145,146,SYM, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Iso_8859_16_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 19, 20, 15, 2, 22, 17, 21, 1, 7, 9, 10, 12, 4, 3, /* 4X */ + 14, 30, 6, 8, 5, 11, 13, 28, 29, 27, 16,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 19, 20, 15, 2, 22, 17, 21, 1, 7, 9, 10, 12, 4, 3, /* 6X */ + 14, 30, 6, 8, 5, 11, 13, 28, 29, 27, 16,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM,147,148, 40,SYM,SYM, 23,SYM, 23,SYM,149,SYM,150,SYM,151,152, /* AX */ + SYM,SYM, 18, 40, 24,SYM,SYM,SYM, 24, 18,153,SYM, 45, 45,154,155, /* BX */ + 46, 41, 43,156, 36, 25,157, 39, 35, 31, 42, 47,158,159,160,161, /* CX */ + 26,162,163, 44,164,165, 32,166,167,168, 48,169, 33,170,171,172, /* DX */ + 46, 41, 43,173, 36, 25,174, 39, 35, 31, 42, 47,175,176,177,178, /* EX */ + 26,179,180, 44,181,182, 32,183,184,185, 48,186, 33,187,188,189, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned 
char Mac_Centraleurope_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 19, 20, 15, 2, 22, 17, 21, 1, 7, 9, 10, 12, 4, 3, /* 4X */ + 14, 30, 6, 8, 5, 11, 13, 28, 29, 27, 16,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 19, 20, 15, 2, 22, 17, 21, 1, 7, 9, 10, 12, 4, 3, /* 6X */ + 14, 30, 6, 8, 5, 11, 13, 28, 29, 27, 16,SYM,SYM,SYM,SYM,CTR, /* 7X */ + 36,190,191, 31,192, 32, 33, 41,193, 18, 36, 18, 25, 25, 31,194, /* 8X */ + 195,196,197,198,199,200,201, 44,202,203, 32, 37, 48,204,205, 33, /* 9X */ + SYM,SYM,206,SYM,SYM,SYM,SYM,207,SYM,SYM,SYM,208,SYM,SYM,209,210, /* AX */ + 211,212,SYM,SYM,213,214,SYM,SYM, 40,215,216,217,218,219,220,221, /* BX */ + 222,223,SYM,SYM,224,225,SYM,SYM,SYM,SYM,SYM,226,227, 37,228, 38, /* CX */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 38,229,230,231,SYM,SYM,232,233, /* DX */ + 234, 23,SYM,SYM, 23,235,236, 41,237,238,239, 24, 24,240, 44,241, /* EX */ + 242,243, 48,244,245,246,247,248,249,249,249,249, 40,249,249,SYM, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Iso_8859_13_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 19, 20, 15, 2, 22, 17, 21, 1, 7, 9, 10, 12, 4, 3, /* 4X */ + 14, 30, 6, 8, 5, 11, 13, 28, 29, 27, 16,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 19, 20, 15, 2, 22, 17, 21, 1, 7, 9, 10, 12, 4, 3, /* 6X */ + 14, 30, 6, 8, 5, 11, 13, 28, 29, 27, 16,SYM,SYM,SYM,SYM,CTR, /* 7X */ + 
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 34,SYM,249,SYM,SYM,SYM,SYM,249, /* AX */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 34,SYM,249,SYM,SYM,SYM,SYM,249, /* BX */ + 249,249,249, 25, 36,249,249,249, 18, 31,249,249,249,249,249,249, /* CX */ + 23,249,249, 44, 38, 37, 32,SYM,249, 40,249,249, 33,249, 24,249, /* DX */ + 249,249,249, 25, 36,249,249,249, 18, 31,249,249,249,249,249,249, /* EX */ + 23,249,249, 44, 38, 37, 32,SYM,249, 40,249,249, 33,249, 24,SYM, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Ibm852_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 19, 20, 15, 2, 22, 17, 21, 1, 7, 9, 10, 12, 4, 3, /* 4X */ + 14, 30, 6, 8, 5, 11, 13, 28, 29, 27, 16,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 19, 20, 15, 2, 22, 17, 21, 1, 7, 9, 10, 12, 4, 3, /* 6X */ + 14, 30, 6, 8, 5, 11, 13, 28, 29, 27, 16,SYM,SYM,SYM,SYM,CTR, /* 7X */ + 39, 33, 31, 43, 36,249, 25, 39, 40, 47,249,249,249,249, 36, 25, /* 8X */ + 31,249,249,249, 32,249,249,249,249, 32, 33,249,249, 40,SYM, 18, /* 9X */ + 41,249, 44, 48,249,249, 24, 24,249,249,SYM,249, 18,249,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM,SYM, 41, 43,249,249,SYM,SYM,SYM,SYM,249,249,SYM, /* BX */ + SYM,SYM,SYM,SYM,SYM,SYM,249,249,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* CX */ + 26, 26,249, 47,249,249,249,249,249,SYM,SYM,SYM,SYM,249,249,SYM, /* DX */ + 44,249,249,249,249,249, 23, 23,249, 48,249,249,249,249,249,SYM, /* EX */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,249,249,249,SYM,SYM, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + + +/* Model Table: + * Total 
sequences: 712 + * First 512 sequences: 0.9989731099787131 + * Next 512 sequences (512-1024): 0.0010268900212868262 + * Rest: 3.7513395167998453e-17 + * Negative sequences: TODO + */ +static const PRUint8 CroatianLangModel[] = +{ + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,0,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,0,2,2,2,0,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,0,0,3,3,2,0,0,0,0,3,2,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,2,0,0, + 3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,2,3,0,3,3,2,0,2,3,0,0,0,0,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,3,3,3,3,0,0,0,0,3,2,0,2, + 3,3,3,3,3,3,3,3,3,0,3,3,3,3,2,2,0,3,3,0,3,2,0,3,0,2,0,2,3,0,0, + 3,3,3,3,3,3,0,3,3,3,3,3,3,3,2,3,3,3,0,3,3,3,3,2,2,0,0,3,0,2,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,2,2,0, + 3,3,3,3,3,2,3,3,3,3,3,3,3,2,3,3,0,3,2,3,3,2,3,0,0,0,0,2,3,0,0, + 3,3,3,3,3,0,3,3,3,3,3,3,2,0,2,3,0,0,2,0,3,0,0,3,0,0,0,2,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,0,2,2,2,2,3,3,0,2,0,3,0,2,0,0,0, + 3,3,3,3,3,3,3,3,3,0,3,3,3,3,0,3,3,3,0,3,2,2,3,0,3,0,0,2,3,2,2, + 3,3,3,3,3,0,3,3,2,0,3,3,3,3,0,3,0,3,0,3,0,3,0,0,0,0,0,2,2,0,0, + 3,3,3,3,3,2,3,2,2,0,3,3,3,3,2,3,3,2,0,0,0,3,2,0,0,0,0,3,2,0,0, + 3,3,3,3,3,0,2,3,0,3,3,3,0,3,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,2,3,3,3,0,3,3,2,2,0,3,3,0,0,2,3,0,3,0,0,0,0,2,0,0,2, + 3,3,3,3,2,3,3,3,3,3,3,3,3,3,0,3,0,2,0,0,0,3,0,0,0,0,0,2,0,0,3, + 3,3,3,3,3,3,3,0,3,2,3,3,2,3,0,2,3,2,0,3,3,2,2,0,0,0,0,3,3,2,0, + 3,3,3,3,3,3,3,0,3,2,3,3,2,0,2,2,0,2,0,0,0,0,3,0,0,0,0,0,0,0,0, + 3,3,3,2,3,3,2,0,0,3,3,3,2,3,3,0,0,0,2,0,2,0,0,0,0,3,0,0,0,0,0, + 3,3,3,3,3,0,0,2,0,0,2,3,0,0,0,3,0,0,0,3,0,0,0,0,0,0,2,0,0,0,0, + 3,3,3,3,3,0,0,0,0,2,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0, + 3,3,3,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 
3,2,3,3,3,2,2,2,3,0,3,3,0,0,0,2,2,2,0,0,2,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,2,3,2,2,2,0,0,3,0,0,0,0,0,0,0,2,2,3,2,0,0,0,0,2,2,0,0, + 2,3,2,0,0,0,2,0,0,0,0,2,0,2,3,0,0,2,0,0,0,0,2,0,0,0,0,0,3,0,0, + 0,3,2,0,0,0,2,0,0,0,0,3,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2, +}; + + +const SequenceModel Windows_1250CroatianModel = +{ + Windows_1250_CharToOrderMap, + CroatianLangModel, + 31, + (float)0.9989731099787131, + PR_TRUE, + "WINDOWS-1250" +}; + +const SequenceModel Iso_8859_2CroatianModel = +{ + Iso_8859_2_CharToOrderMap, + CroatianLangModel, + 31, + (float)0.9989731099787131, + PR_TRUE, + "ISO-8859-2" +}; + +const SequenceModel Iso_8859_16CroatianModel = +{ + Iso_8859_16_CharToOrderMap, + CroatianLangModel, + 31, + (float)0.9989731099787131, + PR_TRUE, + "ISO-8859-16" +}; + +const SequenceModel Mac_CentraleuropeCroatianModel = +{ + Mac_Centraleurope_CharToOrderMap, + CroatianLangModel, + 31, + (float)0.9989731099787131, + PR_TRUE, + "MAC-CENTRALEUROPE" +}; + +const SequenceModel Iso_8859_13CroatianModel = +{ + Iso_8859_13_CharToOrderMap, + CroatianLangModel, + 31, + (float)0.9989731099787131, + PR_TRUE, + "ISO-8859-13" +}; + +const SequenceModel Ibm852CroatianModel = +{ + Ibm852_CharToOrderMap, + CroatianLangModel, + 31, + (float)0.9989731099787131, + PR_TRUE, + "IBM852" +}; diff --git a/src/nsSBCSGroupProber.cpp b/src/nsSBCSGroupProber.cpp index 9d7cbde..95c98e2 100644 --- a/src/nsSBCSGroupProber.cpp +++ b/src/nsSBCSGroupProber.cpp @@ -156,6 +156,13 @@ nsSBCSGroupProber::nsSBCSGroupProber() mProbers[68] = new nsSingleByteCharSetProber(&Iso_8859_15ItalianModel); mProbers[69] = new nsSingleByteCharSetProber(&Windows_1252ItalianModel); + mProbers[70] = new nsSingleByteCharSetProber(&Windows_1250CroatianModel); + mProbers[71] = new nsSingleByteCharSetProber(&Iso_8859_2CroatianModel); + mProbers[72] = new nsSingleByteCharSetProber(&Iso_8859_13CroatianModel); + mProbers[73] = new nsSingleByteCharSetProber(&Iso_8859_16CroatianModel); + mProbers[74] = new 
nsSingleByteCharSetProber(&Mac_CentraleuropeCroatianModel); + mProbers[75] = new nsSingleByteCharSetProber(&Ibm852CroatianModel); + Reset(); } diff --git a/src/nsSBCSGroupProber.h b/src/nsSBCSGroupProber.h index 68b6bff..6617f9e 100644 --- a/src/nsSBCSGroupProber.h +++ b/src/nsSBCSGroupProber.h @@ -40,7 +40,7 @@ #define nsSBCSGroupProber_h__ -#define NUM_OF_SBCS_PROBERS 70 +#define NUM_OF_SBCS_PROBERS 76 class nsCharSetProber; class nsSBCSGroupProber: public nsCharSetProber { diff --git a/src/nsSBCharSetProber.h b/src/nsSBCharSetProber.h index 7472054..5092c8d 100644 --- a/src/nsSBCharSetProber.h +++ b/src/nsSBCharSetProber.h @@ -217,5 +217,12 @@ extern const SequenceModel Iso_8859_9ItalianModel; extern const SequenceModel Iso_8859_15ItalianModel; extern const SequenceModel Windows_1252ItalianModel; +extern const SequenceModel Windows_1250CroatianModel; +extern const SequenceModel Iso_8859_2CroatianModel; +extern const SequenceModel Iso_8859_13CroatianModel; +extern const SequenceModel Iso_8859_16CroatianModel; +extern const SequenceModel Ibm852CroatianModel; +extern const SequenceModel Mac_CentraleuropeCroatianModel; + #endif /* nsSingleByteCharSetProber_h__ */ diff --git a/test/hr/ibm852.txt b/test/hr/ibm852.txt new file mode 100644 index 0000000..98f5138 --- /dev/null +++ b/test/hr/ibm852.txt @@ -0,0 +1,4 @@ +Brekinja (lat. Sorbus torminalis) je bjelogorina vrsta drvea iz porodice +Rosaceae. +Prirodno je rasprostranjena u zapadnoj, srednjoj i junoj Europi, sjevernoj +Africi, Krimu, Maloj Aziji, Kavkazu i Transkavkaziji. diff --git a/test/hr/iso-8859-13.txt b/test/hr/iso-8859-13.txt new file mode 100644 index 0000000..124b1c0 --- /dev/null +++ b/test/hr/iso-8859-13.txt @@ -0,0 +1,4 @@ +Brekinja (lat. Sorbus torminalis) je bjelogorina vrsta drvea iz porodice +Rosaceae. +Prirodno je rasprostranjena u zapadnoj, srednjoj i junoj Europi, sjevernoj +Africi, Krimu, Maloj Aziji, Kavkazu i Transkavkaziji. 
diff --git a/test/hr/iso-8859-16.txt b/test/hr/iso-8859-16.txt new file mode 100644 index 0000000..a10a108 --- /dev/null +++ b/test/hr/iso-8859-16.txt @@ -0,0 +1,4 @@ +Brekinja (lat. Sorbus torminalis) je bjelogorina vrsta drvea iz porodice +Rosaceae. +Prirodno je rasprostranjena u zapadnoj, srednjoj i junoj Europi, sjevernoj +Africi, Krimu, Maloj Aziji, Kavkazu i Transkavkaziji. diff --git a/test/hr/iso-8859-2.txt b/test/hr/iso-8859-2.txt new file mode 100644 index 0000000..7c33320 --- /dev/null +++ b/test/hr/iso-8859-2.txt @@ -0,0 +1,4 @@ +Brekinja (lat. Sorbus torminalis) je bjelogorina vrsta drvea iz porodice +Rosaceae. +Prirodno je rasprostranjena u zapadnoj, srednjoj i junoj Europi, sjevernoj +Africi, Krimu, Maloj Aziji, Kavkazu i Transkavkaziji. diff --git a/test/hr/mac-centraleurope.txt b/test/hr/mac-centraleurope.txt new file mode 100644 index 0000000..27c41e0 --- /dev/null +++ b/test/hr/mac-centraleurope.txt @@ -0,0 +1,4 @@ +Brekinja (lat. Sorbus torminalis) je bjelogorina vrsta drvea iz porodice +Rosaceae. +Prirodno je rasprostranjena u zapadnoj, srednjoj i junoj Europi, sjevernoj +Africi, Krimu, Maloj Aziji, Kavkazu i Transkavkaziji. diff --git a/test/hr/utf-8.txt b/test/hr/utf-8.txt new file mode 100644 index 0000000..92cc1dd --- /dev/null +++ b/test/hr/utf-8.txt @@ -0,0 +1,4 @@ +Brekinja (lat. Sorbus torminalis) je bjelogorična vrsta drveća iz porodice +Rosaceae. +Prirodno je rasprostranjena u zapadnoj, srednjoj i južnoj Europi, sjevernoj +Africi, Krimu, Maloj Aziji, Kavkazu i Transkavkaziji. diff --git a/test/hr/windows-1250.txt b/test/hr/windows-1250.txt new file mode 100644 index 0000000..60d8c98 --- /dev/null +++ b/test/hr/windows-1250.txt @@ -0,0 +1,4 @@ +Brekinja (lat. Sorbus torminalis) je bjelogorina vrsta drvea iz porodice +Rosaceae. +Prirodno je rasprostranjena u zapadnoj, srednjoj i junoj Europi, sjevernoj +Africi, Krimu, Maloj Aziji, Kavkazu i Transkavkaziji.