diff --git a/script/BuildLangModelLogs/LangFinnishModel.log b/script/BuildLangModelLogs/LangFinnishModel.log new file mode 100644 index 0000000..e99e9aa --- /dev/null +++ b/script/BuildLangModelLogs/LangFinnishModel.log @@ -0,0 +1,156 @@ += Logs of language model for Finnish (fi) = + +- Generated by BuildLangModel.py +- Started: 2016-09-21 18:12:24.181917 +- Maximum depth: 5 +- Max number of pages: 100 + +== Parsed pages == + +Yhdistynyt kuningaskunta (revision 15843357) +1. toukokuuta (revision 15910178) +1700-luku (revision 15493702) +1707 (revision 15106709) +1800-luku (revision 15708929) +2014 (revision 15891601) +409 (revision 12809782) +5. marraskuuta (revision 15421719) +927 (revision 12785964) +Aasia (revision 15948161) +Abhasia (revision 15730328) +Adolf Hitler (revision 15951829) +Afrikka (revision 15934209) +Agatha Christie (revision 15760740) +Aikavyöhyke (revision 15800313) +Ajoneuvon kansallisuustunnus (revision 15897445) +Akrotiri ja Dhekelia (revision 14625383) +Alamaat (revision 15913741) +Alan Turing (revision 15904871) +Alankomaat (revision 15936643) +Albania (revision 15767604) +Alec Guinness (revision 15363805) +Alexander Fleming (revision 15023225) +Alfred Hitchcock (revision 15892843) +Alfred Tennyson (revision 15856114) +Allen Jones (revision 12871703) +Andorra (revision 15913862) +Andrew Lloyd Webber (revision 14978349) +Anglit (revision 15902350) +Anguilla (revision 15854041) +Anne Brontë (revision 14287992) +Anthony Eden (revision 14391831) +Antigua ja Barbuda (revision 15196967) +Arabian Lawrence (revision 15736417) +Argentiina (revision 15676474) +Armenia (revision 15634470) +Arthur Conan Doyle (revision 15402837) +Arts and Crafts (revision 15806930) +Aurinko (revision 15934252) +Australia (revision 15934255) +Avara luonto (revision 15815943) +Azerbaidžan (revision 15946891) +BBC (revision 15866026) +BKT (revision 15656549) +Bahama (revision 15516869) +Bangladesh (revision 15883994) +Bank of England (revision 14481173) +Barbados 
(revision 15839821) +Barbara Hepworth (revision 15106880) +Bath (revision 15869900) +Beatrix Potter (revision 15057380) +Belfast (revision 15715934) +Belgia (revision 15932391) +Belize (revision 15665086) +Ben Nevis (revision 15610196) +Bengalin kieli (revision 15551820) +Benjamin Britten (revision 15081615) +Bermuda (revision 15632621) +Bertrand Russell (revision 14631969) +Bhutan (revision 15377394) +Big Ben (revision 14897401) +Big Brother (revision 14641391) +Birmingham (revision 15855259) +Black Sabbath (revision 15839917) +Bosnia ja Hertsegovina (revision 15934266) +Botswana (revision 15524955) +Bristol (revision 15891889) +Bristolin kanaali (revision 15849713) +Bristolin kansainvälinen lentoasema (revision 14452870) +Britannia (provinssi) (revision 14557442) +Britannian avoin golfturnaus (revision 14293265) +Britannian kuninkaallinen perhe (revision 15522149) +Britannian talous (revision 15470242) +Britannian väestö (revision 15661241) +Brittein saaret (revision 15805422) +Brittiläinen Antarktiksen alue (revision 15836227) +Brittiläinen Intia (revision 15593126) +Brittiläinen Intian valtameren alue (revision 14272903) +Brittiläinen imperiumi (revision 15906600) +Brittiläinen kansainyhteisö (revision 15894379) +Brittiläinen keittiö (revision 13393533) +Brittiläinen kulttuuri (revision 15951407) +Brittiläiset Neitsytsaaret (revision 15910520) +Brittiläiset merentakaiset alueet (revision 15836213) +Brunei (revision 15580824) +Bruttokansantuote (revision 15656549) +Bulgaria (revision 15944101) +Burma (revision 15627218) +Cambridge (revision 14641664) +Cambridgen yliopisto (revision 15493340) +Canterburyn tarinoita (revision 15232140) +Cardiff (revision 15840398) +Caymansaaret (revision 15914575) +Channel 4 (revision 15882475) +Charles Babbage (revision 15203616) +Charles Chaplin (revision 15674652) +Charles Darwin (revision 15894085) +Charles Dickens (revision 15699592) +Charles Dickensin joulutarina (revision 15116247) + +== End of Parsed pages == + +- 
Wikipedia parsing ended at: 2016-09-21 18:15:05.189221 + +61 characters appeared 940364 times. + +First 30 characters: +[ 0] Char a: 12.508773198463574 % +[ 1] Char i: 10.969475649854738 % +[ 2] Char n: 8.815841525196626 % +[ 3] Char t: 8.80169806585535 % +[ 4] Char e: 7.8206949649284745 % +[ 5] Char s: 7.595782058862313 % +[ 6] Char l: 5.963541777439374 % +[ 7] Char o: 5.439808414613916 % +[ 8] Char u: 5.0102938861972595 % +[ 9] Char k: 4.589712068943515 % +[10] Char r: 3.1231523112326713 % +[11] Char ä: 3.041800834570443 % +[12] Char m: 3.0392486313810396 % +[13] Char v: 2.156292669647073 % +[14] Char h: 1.996141919512019 % +[15] Char j: 1.9248929138078446 % +[16] Char p: 1.6324529650220552 % +[17] Char y: 1.6323466232224966 % +[18] Char d: 1.1981530556252684 % +[19] Char b: 0.6835650875618378 % +[20] Char g: 0.5793501239945382 % +[21] Char c: 0.5056552569005194 % +[22] Char ö: 0.38931732818355447 % +[23] Char f: 0.215023118707224 % +[24] Char w: 0.2106631049253268 % +[25] Char z: 0.06593191572625068 % +[26] Char x: 0.024458613898447838 % +[27] Char š: 0.010421496356729947 % +[28] Char ž: 0.007869293167326695 % +[29] Char q: 0.007762951367768225 % + +The first 30 characters have an accumulated ratio of 0.9996012182516557. + +919 sequences found. + +First 512 (typical positive ratio): 0.9985378147555799 +Next 512 (512-1024): 1.0634179955846884e-06 +Rest: 3.881443777498106e-17 + +- Processing end: 2016-09-21 18:15:05.307164 diff --git a/script/langs/fi.py b/script/langs/fi.py new file mode 100644 index 0000000..b9a5518 --- /dev/null +++ b/script/langs/fi.py @@ -0,0 +1,60 @@ +#!/bin/python3 +# -*- coding: utf-8 -*- + +# ##### BEGIN LICENSE BLOCK ##### +# Version: MPL 1.1/GPL 2.0/LGPL 2.1 +# +# The contents of this file are subject to the Mozilla Public License Version +# 1.1 (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License +# for the specific language governing rights and limitations under the +# License. +# +# The Original Code is Mozilla Universal charset detector code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 2001 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Jehan +# +# Alternatively, the contents of this file may be used under the terms of +# either the GNU General Public License Version 2 or later (the "GPL"), or +# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +# in which case the provisions of the GPL or the LGPL are applicable instead +# of those above. If you wish to allow use of your version of this file only +# under the terms of either the GPL or the LGPL, and not to allow others to +# use your version of this file under the terms of the MPL, indicate your +# decision by deleting the provisions above and replace them with the notice +# and other provisions required by the GPL or the LGPL. If you do not delete +# the provisions above, a recipient may use your version of this file under +# the terms of any one of the MPL, the GPL or the LGPL. +# +# ##### END LICENSE BLOCK ##### + +import re + +## Mandatory Properties ## + +name = 'Finnish' +code = 'fi' +use_ascii = True +charsets = ['ISO-8859-1', 'ISO-8859-4', 'ISO-8859-9', + 'ISO-8859-13', 'ISO-8859-15', 'WINDOWS-1252'] + +## Optional Properties ## + +# Alphabet characters. +# 'å' (Swedish o), 'š' and 'ž' are rare enough that I don't want to include them +# here. +alphabet = 'äö' +# Some random high quality page found on the Finnish home page. 
+start_pages = ['Yhdistynyt kuningaskunta'] +wikipedia_code = code +case_mapping = True diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 6903b7d..c01126c 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -11,6 +11,7 @@ set( LangModels/LangBulgarianModel.cpp LangModels/LangCzechModel.cpp LangModels/LangEsperantoModel.cpp + LangModels/LangFinnishModel.cpp LangModels/LangFrenchModel.cpp LangModels/LangDanishModel.cpp LangModels/LangGermanModel.cpp diff --git a/src/LangModels/LangFinnishModel.cpp b/src/LangModels/LangFinnishModel.cpp new file mode 100644 index 0000000..ee91e14 --- /dev/null +++ b/src/LangModels/LangFinnishModel.cpp @@ -0,0 +1,291 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is Mozilla Communicator client code. + * + * The Initial Developer of the Original Code is + * Netscape Communications Corporation. + * Portions created by the Initial Developer are Copyright (C) 1998 + * the Initial Developer. All Rights Reserved. + * + * Contributor(s): + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. 
If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. + * + * ***** END LICENSE BLOCK ***** */ + +#include "../nsSBCharSetProber.h" + +/********* Language model for: Finnish *********/ + +/** + * Generated by BuildLangModel.py + * On: 2016-09-21 18:15:05.189948 + **/ + +/* Character Mapping Table: + * ILL: illegal character. + * CTR: control character specific to the charset. + * RET: carriage/return. + * SYM: symbol (punctuation) that does not belong to word. + * NUM: 0 - 9. + * + * Other characters are ordered by probabilities + * (0 is the most common character in the language). + * + * Orders are generic to a language. So the codepoint with order X in + * CHARSET1 maps to the same character as the codepoint with the same + * order X in CHARSET2 for the same language. + * As such, it is possible to get missing order. For instance the + * ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 + * even though they are both used for French. Same for the euro sign. 
+ */ +static const unsigned char Iso_8859_15_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 19, 21, 18, 4, 23, 20, 14, 1, 15, 9, 6, 12, 2, 7, /* 4X */ + 16, 29, 10, 5, 3, 8, 13, 24, 26, 17, 25,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 19, 21, 18, 4, 23, 20, 14, 1, 15, 9, 6, 12, 2, 7, /* 6X */ + 16, 29, 10, 5, 3, 8, 13, 24, 26, 17, 25,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM, 27,SYM, 27,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM, 28, 61,SYM,SYM, 28,SYM,SYM,SYM, 62, 63, 64,SYM, /* BX */ + 49, 35, 65, 46, 11, 56, 39, 37, 40, 30, 51, 31, 66, 36, 67, 57, /* CX */ + 68, 58, 52, 33, 34, 59, 22,SYM, 69, 70, 38, 71, 32, 72, 73, 55, /* DX */ + 49, 35, 74, 46, 11, 56, 39, 37, 40, 30, 51, 31, 75, 36, 76, 57, /* EX */ + 77, 58, 52, 33, 34, 59, 22,SYM, 78, 79, 38, 80, 32, 81, 82, 83, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Windows_1252_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 19, 21, 18, 4, 23, 20, 14, 1, 15, 9, 6, 12, 2, 7, /* 4X */ + 16, 29, 10, 5, 3, 8, 13, 24, 26, 17, 25,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 19, 21, 18, 4, 23, 20, 14, 1, 15, 9, 6, 12, 2, 7, /* 6X */ + 16, 29, 10, 5, 3, 8, 13, 24, 26, 17, 25,SYM,SYM,SYM,SYM,CTR, /* 7X */ + SYM,ILL,SYM, 
84,SYM,SYM,SYM,SYM,SYM,SYM, 27,SYM, 85,ILL, 28,ILL, /* 8X */ + ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 27,SYM, 86,ILL, 28, 87, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM,SYM, 88,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ + 49, 35, 89, 46, 11, 56, 39, 37, 40, 30, 51, 31, 90, 36, 91, 57, /* CX */ + 92, 58, 52, 33, 34, 59, 22,SYM, 93, 94, 38, 95, 32, 96, 97, 55, /* DX */ + 49, 35, 98, 46, 11, 56, 39, 37, 40, 30, 51, 31, 99, 36,100, 57, /* EX */ + 101, 58, 52, 33, 34, 59, 22,SYM,102,103, 38,104, 32,105,106,107, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Iso_8859_4_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 19, 21, 18, 4, 23, 20, 14, 1, 15, 9, 6, 12, 2, 7, /* 4X */ + 16, 29, 10, 5, 3, 8, 13, 24, 26, 17, 25,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 19, 21, 18, 4, 23, 20, 14, 1, 15, 9, 6, 12, 2, 7, /* 6X */ + 16, 29, 10, 5, 3, 8, 13, 24, 26, 17, 25,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM,108,109, 47,SYM,110,111,SYM,SYM, 27,112,113,114,SYM, 28,SYM, /* AX */ + SYM,115,SYM, 47,SYM,116,117,SYM,SYM, 27,118,119,120, 45, 28, 45, /* BX */ + 53, 35,121, 46, 11, 56, 39,122, 43, 30,123, 31,124, 36,125,126, /* CX */ + 127, 54,128,129, 34, 59, 22,SYM,130,131, 38,132, 32,133,134, 55, /* DX */ + 53, 35,135, 46, 11, 56, 39,136, 43, 30,137, 31,138, 36,139,140, /* EX */ + 141, 54,142,143, 34, 59, 22,SYM,144,145, 38,146, 32,147,148,SYM, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char 
Iso_8859_13_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 19, 21, 18, 4, 23, 20, 14, 1, 15, 9, 6, 12, 2, 7, /* 4X */ + 16, 29, 10, 5, 3, 8, 13, 24, 26, 17, 25,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 19, 21, 18, 4, 23, 20, 14, 1, 15, 9, 6, 12, 2, 7, /* 6X */ + 16, 29, 10, 5, 3, 8, 13, 24, 26, 17, 25,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,149,SYM, 47,SYM,SYM,SYM,SYM, 39, /* AX */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,150,SYM, 47,SYM,SYM,SYM,SYM, 39, /* BX */ + 151,152, 53, 41, 11, 56,153,154, 43, 30,155,156,157,158,159,160, /* CX */ + 27,161, 54, 33,162, 59, 22,SYM,163,164,165,166, 32, 60, 28, 55, /* DX */ + 167,168, 53, 41, 11, 56,169,170, 43, 30,171,172,173,174,175,176, /* EX */ + 27,177, 54, 33,178, 59, 22,SYM,179,180,181,182, 32, 60, 28,SYM, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Iso_8859_9_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 19, 21, 18, 4, 23, 20, 14, 1, 15, 9, 6, 12, 2, 7, /* 4X */ + 16, 29, 10, 5, 3, 8, 13, 24, 26, 17, 25,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 19, 21, 18, 4, 23, 20, 14, 1, 15, 9, 6, 12, 2, 7, /* 6X */ + 16, 29, 10, 5, 3, 8, 13, 24, 26, 17, 25,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, 
/* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM,SYM,183,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ + 49, 35,184, 46, 11, 56, 39, 37, 40, 30, 51, 31,185, 36,186, 57, /* CX */ + 50, 58, 52, 33, 34, 59, 22,SYM,187,188, 38,189, 32, 48, 42, 55, /* DX */ + 49, 35,190, 46, 11, 56, 39, 37, 40, 30, 51, 31,191, 36,192, 57, /* EX */ + 50, 58, 52, 33, 34, 59, 22,SYM,193,194, 38,195, 32, 44, 42,196, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Iso_8859_1_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 19, 21, 18, 4, 23, 20, 14, 1, 15, 9, 6, 12, 2, 7, /* 4X */ + 16, 29, 10, 5, 3, 8, 13, 24, 26, 17, 25,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 19, 21, 18, 4, 23, 20, 14, 1, 15, 9, 6, 12, 2, 7, /* 6X */ + 16, 29, 10, 5, 3, 8, 13, 24, 26, 17, 25,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM,SYM,197,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ + 49, 35,198, 46, 11, 56, 39, 37, 40, 30, 51, 31,199, 36,200, 57, /* CX */ + 201, 58, 52, 33, 34, 59, 22,SYM,202,203, 38,204, 32,205,206, 55, /* DX */ + 49, 35,207, 46, 11, 56, 39, 37, 40, 30, 51, 31,208, 36,209, 57, /* EX */ + 210, 58, 52, 33, 34, 59, 22,SYM,211,212, 38,213, 32,214,215,216, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + + +/* Model Table: + * Total sequences: 919 + * First 512 sequences: 0.9985378147555799 + 
* Next 512 sequences (512-1024): 0.0014621852444200612 + * Rest: 3.881443777498106e-17 + * Negative sequences: TODO + */ +static const PRUint8 FinnishLangModel[] = +{ + 3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,2,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,2,0,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,0,0,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,3,3,3,3,2,3,3,0,3,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,3,2,0,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,2,0,0,0,2, + 3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,2,0,0, + 3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,0,3,3,3,3,2,0,2, + 3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,0,3,3,2,3,2,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,3,0,2,3,2,3,2,2,0,2,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,2, + 3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,0,0,2,2,0,0,0,0,0,0,0, + 3,3,2,2,3,3,2,3,3,2,3,3,3,2,2,2,3,3,2,3,3,3,3,2,2,2,2,0,0,0, + 3,3,2,2,3,2,2,3,3,3,2,3,0,2,2,2,2,3,2,2,0,0,2,0,0,0,0,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,2,0,3,3,2,2,0,0,0,0,2, + 3,3,3,2,3,2,2,3,3,2,2,3,2,0,2,0,2,3,0,2,0,0,3,2,0,0,0,0,0,0, + 3,3,2,3,3,3,3,3,3,2,3,3,3,2,3,2,3,3,2,0,2,2,3,2,3,0,0,2,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,3,3,2,3,2,2,2,0,0, + 3,3,3,3,3,3,3,3,3,0,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,0,0,3,2, + 3,3,3,3,3,3,3,3,3,3,3,2,2,0,3,2,0,3,3,3,2,3,2,0,2,2,0,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,2,3,2,3,2,3,3,3,3,3,3,3,2,3,2,0,0,0,0, + 3,3,2,3,3,3,3,3,3,3,3,0,2,0,3,0,2,3,3,2,2,3,0,0,0,2,0,0,0,2, + 2,3,3,3,2,3,3,2,0,3,3,3,3,3,3,3,3,3,3,2,0,0,3,2,0,0,0,0,0,0, + 3,3,2,3,3,3,3,3,3,2,3,2,0,2,0,2,2,3,0,2,2,2,0,3,0,2,0,0,0,0, + 3,3,3,2,3,3,2,3,2,2,3,0,2,0,3,0,0,2,2,2,2,2,0,2,2,0,0,0,0,0, + 3,3,3,2,3,2,2,3,2,2,2,2,2,2,2,0,2,3,2,2,2,0,0,2,2,3,0,0,0,0, + 3,3,0,2,2,2,3,2,0,0,0,0,2,2,3,0,2,0,0,2,0,2,0,3,2,0,2,0,0,0, + 3,3,2,2,3,0,0,2,2,2,2,0,2,2,0,2,2,2,0,0,0,0,0,0,0,0,0,0,0,0, + 3,2,0,0,2,0,0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 
2,2,0,0,0,2,0,0,3,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +}; + + +const SequenceModel Iso_8859_15FinnishModel = +{ + Iso_8859_15_CharToOrderMap, + FinnishLangModel, + 30, + (float)0.9985378147555799, + PR_TRUE, + "ISO-8859-15" +}; + +const SequenceModel Windows_1252FinnishModel = +{ + Windows_1252_CharToOrderMap, + FinnishLangModel, + 30, + (float)0.9985378147555799, + PR_TRUE, + "WINDOWS-1252" +}; + +const SequenceModel Iso_8859_4FinnishModel = +{ + Iso_8859_4_CharToOrderMap, + FinnishLangModel, + 30, + (float)0.9985378147555799, + PR_TRUE, + "ISO-8859-4" +}; + +const SequenceModel Iso_8859_13FinnishModel = +{ + Iso_8859_13_CharToOrderMap, + FinnishLangModel, + 30, + (float)0.9985378147555799, + PR_TRUE, + "ISO-8859-13" +}; + +const SequenceModel Iso_8859_9FinnishModel = +{ + Iso_8859_9_CharToOrderMap, + FinnishLangModel, + 30, + (float)0.9985378147555799, + PR_TRUE, + "ISO-8859-9" +}; + +const SequenceModel Iso_8859_1FinnishModel = +{ + Iso_8859_1_CharToOrderMap, + FinnishLangModel, + 30, + (float)0.9985378147555799, + PR_TRUE, + "ISO-8859-1" +}; diff --git a/src/nsSBCSGroupProber.cpp b/src/nsSBCSGroupProber.cpp index afb5948..e26540c 100644 --- a/src/nsSBCSGroupProber.cpp +++ b/src/nsSBCSGroupProber.cpp @@ -143,6 +143,13 @@ nsSBCSGroupProber::nsSBCSGroupProber() mProbers[58] = new nsSingleByteCharSetProber(&Mac_CentraleuropePolishModel); mProbers[59] = new nsSingleByteCharSetProber(&Ibm852PolishModel); + mProbers[60] = new nsSingleByteCharSetProber(&Iso_8859_1FinnishModel); + mProbers[61] = new nsSingleByteCharSetProber(&Iso_8859_4FinnishModel); + mProbers[62] = new nsSingleByteCharSetProber(&Iso_8859_9FinnishModel); + mProbers[63] = new nsSingleByteCharSetProber(&Iso_8859_13FinnishModel); + mProbers[64] = new nsSingleByteCharSetProber(&Iso_8859_15FinnishModel); + mProbers[65] = new nsSingleByteCharSetProber(&Windows_1252FinnishModel); + Reset(); } diff --git a/src/nsSBCSGroupProber.h b/src/nsSBCSGroupProber.h index 75db30e..aeeac65 100644 --- 
a/src/nsSBCSGroupProber.h +++ b/src/nsSBCSGroupProber.h @@ -40,7 +40,7 @@ #define nsSBCSGroupProber_h__ -#define NUM_OF_SBCS_PROBERS 60 +#define NUM_OF_SBCS_PROBERS 66 class nsCharSetProber; class nsSBCSGroupProber: public nsCharSetProber { diff --git a/src/nsSBCharSetProber.h b/src/nsSBCharSetProber.h index 86e7c21..2905c05 100644 --- a/src/nsSBCharSetProber.h +++ b/src/nsSBCharSetProber.h @@ -204,5 +204,12 @@ extern const SequenceModel Iso_8859_16PolishModel; extern const SequenceModel Ibm852PolishModel; extern const SequenceModel Mac_CentraleuropePolishModel; +extern const SequenceModel Iso_8859_1FinnishModel; +extern const SequenceModel Iso_8859_4FinnishModel; +extern const SequenceModel Iso_8859_9FinnishModel; +extern const SequenceModel Iso_8859_13FinnishModel; +extern const SequenceModel Iso_8859_15FinnishModel; +extern const SequenceModel Windows_1252FinnishModel; + #endif /* nsSingleByteCharSetProber_h__ */ diff --git a/test/fi/iso-8859-1.txt b/test/fi/iso-8859-1.txt new file mode 100644 index 0000000..3d584ff --- /dev/null +++ b/test/fi/iso-8859-1.txt @@ -0,0 +1,8 @@ +Termi science fiction on amerikkalaisen tieteislehtien toimittajan Hugo +Gernsbackin keksimä. Suomessa termin tieteiskirjallisuus loi tohtori Eino +Kauppinen 1950-luvun alkupuolella. +Tieteiskirjallisuudelle on laadittu erilaisia määritelmiä. Tieteiskirjallisuuden +rajat eivät ole yksiselitteisen selkeät. Tieteiskirjallisuus lähenee monia +kirjallisuudenlajeja, erityisesti kauhu- ja fantasiakirjallisuutta. Näillä +kolmella lajilla onkin yhteiset juuret 1800-lukua edeltävässä ei-realistisessa +kirjallisuudessa. diff --git a/test/fi/utf-8.txt b/test/fi/utf-8.txt new file mode 100644 index 0000000..ca98172 --- /dev/null +++ b/test/fi/utf-8.txt @@ -0,0 +1,8 @@ +Termi science fiction on amerikkalaisen tieteislehtien toimittajan Hugo +Gernsbackin keksimä. Suomessa termin tieteiskirjallisuus loi tohtori Eino +Kauppinen 1950-luvun alkupuolella. 
+Tieteiskirjallisuudelle on laadittu erilaisia määritelmiä. Tieteiskirjallisuuden +rajat eivät ole yksiselitteisen selkeät. Tieteiskirjallisuus lähenee monia +kirjallisuudenlajeja, erityisesti kauhu- ja fantasiakirjallisuutta. Näillä +kolmella lajilla onkin yhteiset juuret 1800-lukua edeltävässä ei-realistisessa +kirjallisuudessa.