mirror of
https://gitlab.freedesktop.org/uchardet/uchardet.git
synced 2025-12-06 08:46:40 +08:00
LangModels: add Finnish support.
I built models for ISO-8859-1, ISO-8859-4, ISO-8859-9, ISO-8859-13, ISO-8859-15 and WINDOWS-1252, which all contain the Finnish letters. Nevertheless, most texts in these encodings end up byte-identical (the Finnish glyphs share the same codepoints), so I keep tests only for ISO-8859-1 and UTF-8. Models for the other encodings may still be useful when processing texts that contain some symbols, etc.
This commit is contained in:
parent
ac4aa94b73
commit
6bbe7da1ac
156
script/BuildLangModelLogs/LangFinnishModel.log
Normal file
156
script/BuildLangModelLogs/LangFinnishModel.log
Normal file
@ -0,0 +1,156 @@
|
||||
= Logs of language model for Finnish (fi) =
|
||||
|
||||
- Generated by BuildLangModel.py
|
||||
- Started: 2016-09-21 18:12:24.181917
|
||||
- Maximum depth: 5
|
||||
- Max number of pages: 100
|
||||
|
||||
== Parsed pages ==
|
||||
|
||||
Yhdistynyt kuningaskunta (revision 15843357)
|
||||
1. toukokuuta (revision 15910178)
|
||||
1700-luku (revision 15493702)
|
||||
1707 (revision 15106709)
|
||||
1800-luku (revision 15708929)
|
||||
2014 (revision 15891601)
|
||||
409 (revision 12809782)
|
||||
5. marraskuuta (revision 15421719)
|
||||
927 (revision 12785964)
|
||||
Aasia (revision 15948161)
|
||||
Abhasia (revision 15730328)
|
||||
Adolf Hitler (revision 15951829)
|
||||
Afrikka (revision 15934209)
|
||||
Agatha Christie (revision 15760740)
|
||||
Aikavyöhyke (revision 15800313)
|
||||
Ajoneuvon kansallisuustunnus (revision 15897445)
|
||||
Akrotiri ja Dhekelia (revision 14625383)
|
||||
Alamaat (revision 15913741)
|
||||
Alan Turing (revision 15904871)
|
||||
Alankomaat (revision 15936643)
|
||||
Albania (revision 15767604)
|
||||
Alec Guinness (revision 15363805)
|
||||
Alexander Fleming (revision 15023225)
|
||||
Alfred Hitchcock (revision 15892843)
|
||||
Alfred Tennyson (revision 15856114)
|
||||
Allen Jones (revision 12871703)
|
||||
Andorra (revision 15913862)
|
||||
Andrew Lloyd Webber (revision 14978349)
|
||||
Anglit (revision 15902350)
|
||||
Anguilla (revision 15854041)
|
||||
Anne Brontë (revision 14287992)
|
||||
Anthony Eden (revision 14391831)
|
||||
Antigua ja Barbuda (revision 15196967)
|
||||
Arabian Lawrence (revision 15736417)
|
||||
Argentiina (revision 15676474)
|
||||
Armenia (revision 15634470)
|
||||
Arthur Conan Doyle (revision 15402837)
|
||||
Arts and Crafts (revision 15806930)
|
||||
Aurinko (revision 15934252)
|
||||
Australia (revision 15934255)
|
||||
Avara luonto (revision 15815943)
|
||||
Azerbaidžan (revision 15946891)
|
||||
BBC (revision 15866026)
|
||||
BKT (revision 15656549)
|
||||
Bahama (revision 15516869)
|
||||
Bangladesh (revision 15883994)
|
||||
Bank of England (revision 14481173)
|
||||
Barbados (revision 15839821)
|
||||
Barbara Hepworth (revision 15106880)
|
||||
Bath (revision 15869900)
|
||||
Beatrix Potter (revision 15057380)
|
||||
Belfast (revision 15715934)
|
||||
Belgia (revision 15932391)
|
||||
Belize (revision 15665086)
|
||||
Ben Nevis (revision 15610196)
|
||||
Bengalin kieli (revision 15551820)
|
||||
Benjamin Britten (revision 15081615)
|
||||
Bermuda (revision 15632621)
|
||||
Bertrand Russell (revision 14631969)
|
||||
Bhutan (revision 15377394)
|
||||
Big Ben (revision 14897401)
|
||||
Big Brother (revision 14641391)
|
||||
Birmingham (revision 15855259)
|
||||
Black Sabbath (revision 15839917)
|
||||
Bosnia ja Hertsegovina (revision 15934266)
|
||||
Botswana (revision 15524955)
|
||||
Bristol (revision 15891889)
|
||||
Bristolin kanaali (revision 15849713)
|
||||
Bristolin kansainvälinen lentoasema (revision 14452870)
|
||||
Britannia (provinssi) (revision 14557442)
|
||||
Britannian avoin golfturnaus (revision 14293265)
|
||||
Britannian kuninkaallinen perhe (revision 15522149)
|
||||
Britannian talous (revision 15470242)
|
||||
Britannian väestö (revision 15661241)
|
||||
Brittein saaret (revision 15805422)
|
||||
Brittiläinen Antarktiksen alue (revision 15836227)
|
||||
Brittiläinen Intia (revision 15593126)
|
||||
Brittiläinen Intian valtameren alue (revision 14272903)
|
||||
Brittiläinen imperiumi (revision 15906600)
|
||||
Brittiläinen kansainyhteisö (revision 15894379)
|
||||
Brittiläinen keittiö (revision 13393533)
|
||||
Brittiläinen kulttuuri (revision 15951407)
|
||||
Brittiläiset Neitsytsaaret (revision 15910520)
|
||||
Brittiläiset merentakaiset alueet (revision 15836213)
|
||||
Brunei (revision 15580824)
|
||||
Bruttokansantuote (revision 15656549)
|
||||
Bulgaria (revision 15944101)
|
||||
Burma (revision 15627218)
|
||||
Cambridge (revision 14641664)
|
||||
Cambridgen yliopisto (revision 15493340)
|
||||
Canterburyn tarinoita (revision 15232140)
|
||||
Cardiff (revision 15840398)
|
||||
Caymansaaret (revision 15914575)
|
||||
Channel 4 (revision 15882475)
|
||||
Charles Babbage (revision 15203616)
|
||||
Charles Chaplin (revision 15674652)
|
||||
Charles Darwin (revision 15894085)
|
||||
Charles Dickens (revision 15699592)
|
||||
Charles Dickensin joulutarina (revision 15116247)
|
||||
|
||||
== End of Parsed pages ==
|
||||
|
||||
- Wikipedia parsing ended at: 2016-09-21 18:15:05.189221
|
||||
|
||||
61 characters appeared 940364 times.
|
||||
|
||||
First 30 characters:
|
||||
[ 0] Char a: 12.508773198463574 %
|
||||
[ 1] Char i: 10.969475649854738 %
|
||||
[ 2] Char n: 8.815841525196626 %
|
||||
[ 3] Char t: 8.80169806585535 %
|
||||
[ 4] Char e: 7.8206949649284745 %
|
||||
[ 5] Char s: 7.595782058862313 %
|
||||
[ 6] Char l: 5.963541777439374 %
|
||||
[ 7] Char o: 5.439808414613916 %
|
||||
[ 8] Char u: 5.0102938861972595 %
|
||||
[ 9] Char k: 4.589712068943515 %
|
||||
[10] Char r: 3.1231523112326713 %
|
||||
[11] Char ä: 3.041800834570443 %
|
||||
[12] Char m: 3.0392486313810396 %
|
||||
[13] Char v: 2.156292669647073 %
|
||||
[14] Char h: 1.996141919512019 %
|
||||
[15] Char j: 1.9248929138078446 %
|
||||
[16] Char p: 1.6324529650220552 %
|
||||
[17] Char y: 1.6323466232224966 %
|
||||
[18] Char d: 1.1981530556252684 %
|
||||
[19] Char b: 0.6835650875618378 %
|
||||
[20] Char g: 0.5793501239945382 %
|
||||
[21] Char c: 0.5056552569005194 %
|
||||
[22] Char ö: 0.38931732818355447 %
|
||||
[23] Char f: 0.215023118707224 %
|
||||
[24] Char w: 0.2106631049253268 %
|
||||
[25] Char z: 0.06593191572625068 %
|
||||
[26] Char x: 0.024458613898447838 %
|
||||
[27] Char š: 0.010421496356729947 %
|
||||
[28] Char ž: 0.007869293167326695 %
|
||||
[29] Char q: 0.007762951367768225 %
|
||||
|
||||
The first 30 characters have an accumulated ratio of 0.9996012182516557.
|
||||
|
||||
919 sequences found.
|
||||
|
||||
First 512 (typical positive ratio): 0.9985378147555799
|
||||
Next 512 (512-1024): 1.0634179955846884e-06
|
||||
Rest: 3.881443777498106e-17
|
||||
|
||||
- Processing end: 2016-09-21 18:15:05.307164
|
||||
60
script/langs/fi.py
Normal file
60
script/langs/fi.py
Normal file
@ -0,0 +1,60 @@
|
||||
#!/bin/python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# ##### BEGIN LICENSE BLOCK #####
|
||||
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
#
|
||||
# The contents of this file are subject to the Mozilla Public License Version
|
||||
# 1.1 (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
# http://www.mozilla.org/MPL/
|
||||
#
|
||||
# Software distributed under the License is distributed on an "AS IS" basis,
|
||||
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
# for the specific language governing rights and limitations under the
|
||||
# License.
|
||||
#
|
||||
# The Original Code is Mozilla Universal charset detector code.
|
||||
#
|
||||
# The Initial Developer of the Original Code is
|
||||
# Netscape Communications Corporation.
|
||||
# Portions created by the Initial Developer are Copyright (C) 2001
|
||||
# the Initial Developer. All Rights Reserved.
|
||||
#
|
||||
# Contributor(s):
|
||||
# Jehan <jehan@girinstud.io>
|
||||
#
|
||||
# Alternatively, the contents of this file may be used under the terms of
|
||||
# either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
# in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
# of those above. If you wish to allow use of your version of this file only
|
||||
# under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
# use your version of this file under the terms of the MPL, indicate your
|
||||
# decision by deleting the provisions above and replace them with the notice
|
||||
# and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
# the provisions above, a recipient may use your version of this file under
|
||||
# the terms of any one of the MPL, the GPL or the LGPL.
|
||||
#
|
||||
# ##### END LICENSE BLOCK #####
|
||||
|
||||
import re
|
||||
|
||||
## Mandatory Properties ##
|
||||
|
||||
# Human-readable language name.
name = 'Finnish'
|
||||
# Language code; also reused below as the Wikipedia code.
code = 'fi'
|
||||
# ASCII letters count as letters of the language (a-z receive frequency
# orders in the generated tables).
use_ascii = True
|
||||
# Single-byte charsets for which a model is generated (one CharToOrderMap and
# one SequenceModel per entry in LangFinnishModel.cpp).
charsets = ['ISO-8859-1', 'ISO-8859-4', 'ISO-8859-9',
|
||||
'ISO-8859-13', 'ISO-8859-15', 'WINDOWS-1252']
|
||||
|
||||
## Optional Properties ##
|
||||
|
||||
# Alphabet characters (non-ASCII letters, in addition to the ASCII ones).
|
||||
# 'å' (the "Swedish o"), 'š' and 'ž' are rare enough in Finnish text that they
|
||||
# are deliberately left out here.
|
||||
alphabet = 'äö'
|
||||
# Page where crawling starts: a high-quality article picked from the Finnish
# Wikipedia home page.
|
||||
start_pages = ['Yhdistynyt kuningaskunta']
|
||||
# Wikipedia edition to crawl; same as the language code.
wikipedia_code = code
|
||||
# Upper- and lowercase letters are folded together (the generated tables give
# both cases of a letter the same order).
case_mapping = True
|
||||
@ -11,6 +11,7 @@ set(
|
||||
LangModels/LangBulgarianModel.cpp
|
||||
LangModels/LangCzechModel.cpp
|
||||
LangModels/LangEsperantoModel.cpp
|
||||
LangModels/LangFinnishModel.cpp
|
||||
LangModels/LangFrenchModel.cpp
|
||||
LangModels/LangDanishModel.cpp
|
||||
LangModels/LangGermanModel.cpp
|
||||
|
||||
291
src/LangModels/LangFinnishModel.cpp
Normal file
291
src/LangModels/LangFinnishModel.cpp
Normal file
@ -0,0 +1,291 @@
|
||||
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
|
||||
/* ***** BEGIN LICENSE BLOCK *****
|
||||
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
*
|
||||
* The contents of this file are subject to the Mozilla Public License Version
|
||||
* 1.1 (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
* http://www.mozilla.org/MPL/
|
||||
*
|
||||
* Software distributed under the License is distributed on an "AS IS" basis,
|
||||
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
* for the specific language governing rights and limitations under the
|
||||
* License.
|
||||
*
|
||||
* The Original Code is Mozilla Communicator client code.
|
||||
*
|
||||
* The Initial Developer of the Original Code is
|
||||
* Netscape Communications Corporation.
|
||||
* Portions created by the Initial Developer are Copyright (C) 1998
|
||||
* the Initial Developer. All Rights Reserved.
|
||||
*
|
||||
* Contributor(s):
|
||||
*
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
* in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
* of those above. If you wish to allow use of your version of this file only
|
||||
* under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
* use your version of this file under the terms of the MPL, indicate your
|
||||
* decision by deleting the provisions above and replace them with the notice
|
||||
* and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
* the provisions above, a recipient may use your version of this file under
|
||||
* the terms of any one of the MPL, the GPL or the LGPL.
|
||||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
#include "../nsSBCharSetProber.h"
|
||||
|
||||
/********* Language model for: Finnish *********/
|
||||
|
||||
/**
|
||||
* Generated by BuildLangModel.py
|
||||
* On: 2016-09-21 18:15:05.189948
|
||||
**/
|
||||
|
||||
/* Character Mapping Table:
|
||||
* ILL: illegal character.
|
||||
* CTR: control character specific to the charset.
|
||||
* RET: carriage return or line feed.
|
||||
* SYM: symbol (punctuation) that does not belong to word.
|
||||
* NUM: 0 - 9.
|
||||
*
|
||||
* Other characters are ordered by probabilities
|
||||
* (0 is the most common character in the language).
|
||||
*
|
||||
* Orders are generic to a language. So the codepoint with order X in
|
||||
* CHARSET1 maps to the same character as the codepoint with the same
|
||||
* order X in CHARSET2 for the same language.
|
||||
* As such, some orders may be missing from a given charset. For instance the
|
||||
* ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1
|
||||
* even though they are both used for French. Same for the euro sign.
|
||||
*/
|
||||
/* Byte-to-order map for ISO-8859-15 (Finnish model).
 * 256 entries, laid out as 16 rows (high nibble) of 16 columns (low nibble).
 * Each entry is either a class marker (CTR, RET, SYM, NUM) or the frequency
 * order of the letter encoded by that byte. Upper- and lowercase letters share
 * one order (rows 4X/6X and 5X/7X carry the same orders).
 * Order 11 (a-diaeresis in the build log) sits at 0xC4/0xE4 and order 22
 * (o-diaeresis) at 0xD6/0xF6; orders 27 and 28 (s-caron, z-caron) sit at
 * 0xA6/0xA8 and 0xB4/0xB8. */
static const unsigned char Iso_8859_15_CharToOrderMap[] =
|
||||
{
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
|
||||
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
|
||||
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
|
||||
SYM, 0, 19, 21, 18, 4, 23, 20, 14, 1, 15, 9, 6, 12, 2, 7, /* 4X */
|
||||
16, 29, 10, 5, 3, 8, 13, 24, 26, 17, 25,SYM,SYM,SYM,SYM,SYM, /* 5X */
|
||||
SYM, 0, 19, 21, 18, 4, 23, 20, 14, 1, 15, 9, 6, 12, 2, 7, /* 6X */
|
||||
16, 29, 10, 5, 3, 8, 13, 24, 26, 17, 25,SYM,SYM,SYM,SYM,CTR, /* 7X */
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */
|
||||
SYM,SYM,SYM,SYM,SYM,SYM, 27,SYM, 27,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */
|
||||
SYM,SYM,SYM,SYM, 28, 61,SYM,SYM, 28,SYM,SYM,SYM, 62, 63, 64,SYM, /* BX */
|
||||
49, 35, 65, 46, 11, 56, 39, 37, 40, 30, 51, 31, 66, 36, 67, 57, /* CX */
|
||||
68, 58, 52, 33, 34, 59, 22,SYM, 69, 70, 38, 71, 32, 72, 73, 55, /* DX */
|
||||
49, 35, 74, 46, 11, 56, 39, 37, 40, 30, 51, 31, 75, 36, 76, 57, /* EX */
|
||||
77, 58, 52, 33, 34, 59, 22,SYM, 78, 79, 38, 80, 32, 81, 82, 83, /* FX */
|
||||
};
|
||||
/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
|
||||
|
||||
/* Byte-to-order map for WINDOWS-1252 (Finnish model).
 * 256 entries, laid out as 16 rows (high nibble) of 16 columns (low nibble).
 * Each entry is either a class marker (CTR, RET, SYM, NUM, ILL) or the
 * frequency order of the letter encoded by that byte. Unlike the ISO-8859
 * tables in this file, rows 8X/9X hold symbols, letters and ILL (illegal)
 * slots instead of control characters.
 * Order 11 (a-diaeresis in the build log) sits at 0xC4/0xE4 and order 22
 * (o-diaeresis) at 0xD6/0xF6; orders 27 and 28 (s-caron, z-caron) sit at
 * 0x8A/0x9A and 0x8E/0x9E. */
static const unsigned char Windows_1252_CharToOrderMap[] =
|
||||
{
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
|
||||
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
|
||||
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
|
||||
SYM, 0, 19, 21, 18, 4, 23, 20, 14, 1, 15, 9, 6, 12, 2, 7, /* 4X */
|
||||
16, 29, 10, 5, 3, 8, 13, 24, 26, 17, 25,SYM,SYM,SYM,SYM,SYM, /* 5X */
|
||||
SYM, 0, 19, 21, 18, 4, 23, 20, 14, 1, 15, 9, 6, 12, 2, 7, /* 6X */
|
||||
16, 29, 10, 5, 3, 8, 13, 24, 26, 17, 25,SYM,SYM,SYM,SYM,CTR, /* 7X */
|
||||
SYM,ILL,SYM, 84,SYM,SYM,SYM,SYM,SYM,SYM, 27,SYM, 85,ILL, 28,ILL, /* 8X */
|
||||
ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 27,SYM, 86,ILL, 28, 87, /* 9X */
|
||||
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */
|
||||
SYM,SYM,SYM,SYM,SYM, 88,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */
|
||||
49, 35, 89, 46, 11, 56, 39, 37, 40, 30, 51, 31, 90, 36, 91, 57, /* CX */
|
||||
92, 58, 52, 33, 34, 59, 22,SYM, 93, 94, 38, 95, 32, 96, 97, 55, /* DX */
|
||||
49, 35, 98, 46, 11, 56, 39, 37, 40, 30, 51, 31, 99, 36,100, 57, /* EX */
|
||||
101, 58, 52, 33, 34, 59, 22,SYM,102,103, 38,104, 32,105,106,107, /* FX */
|
||||
};
|
||||
/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
|
||||
|
||||
/* Byte-to-order map for ISO-8859-4 (Finnish model).
 * 256 entries, laid out as 16 rows (high nibble) of 16 columns (low nibble).
 * Each entry is either a class marker (CTR, RET, SYM, NUM) or the frequency
 * order of the letter encoded by that byte. Upper- and lowercase letters share
 * one order (rows 4X/6X and 5X/7X carry the same orders).
 * Order 11 (a-diaeresis in the build log) sits at 0xC4/0xE4 and order 22
 * (o-diaeresis) at 0xD6/0xF6; orders 27 and 28 (s-caron, z-caron) sit at
 * 0xA9/0xB9 and 0xAE/0xBE. */
static const unsigned char Iso_8859_4_CharToOrderMap[] =
|
||||
{
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
|
||||
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
|
||||
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
|
||||
SYM, 0, 19, 21, 18, 4, 23, 20, 14, 1, 15, 9, 6, 12, 2, 7, /* 4X */
|
||||
16, 29, 10, 5, 3, 8, 13, 24, 26, 17, 25,SYM,SYM,SYM,SYM,SYM, /* 5X */
|
||||
SYM, 0, 19, 21, 18, 4, 23, 20, 14, 1, 15, 9, 6, 12, 2, 7, /* 6X */
|
||||
16, 29, 10, 5, 3, 8, 13, 24, 26, 17, 25,SYM,SYM,SYM,SYM,CTR, /* 7X */
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */
|
||||
SYM,108,109, 47,SYM,110,111,SYM,SYM, 27,112,113,114,SYM, 28,SYM, /* AX */
|
||||
SYM,115,SYM, 47,SYM,116,117,SYM,SYM, 27,118,119,120, 45, 28, 45, /* BX */
|
||||
53, 35,121, 46, 11, 56, 39,122, 43, 30,123, 31,124, 36,125,126, /* CX */
|
||||
127, 54,128,129, 34, 59, 22,SYM,130,131, 38,132, 32,133,134, 55, /* DX */
|
||||
53, 35,135, 46, 11, 56, 39,136, 43, 30,137, 31,138, 36,139,140, /* EX */
|
||||
141, 54,142,143, 34, 59, 22,SYM,144,145, 38,146, 32,147,148,SYM, /* FX */
|
||||
};
|
||||
/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
|
||||
|
||||
/* Byte-to-order map for ISO-8859-13 (Finnish model).
 * 256 entries, laid out as 16 rows (high nibble) of 16 columns (low nibble).
 * Each entry is either a class marker (CTR, RET, SYM, NUM) or the frequency
 * order of the letter encoded by that byte. Upper- and lowercase letters share
 * one order (rows 4X/6X and 5X/7X carry the same orders).
 * Order 11 (a-diaeresis in the build log) sits at 0xC4/0xE4 and order 22
 * (o-diaeresis) at 0xD6/0xF6; orders 27 and 28 (s-caron, z-caron) sit at
 * 0xD0/0xF0 and 0xDE/0xFE. */
static const unsigned char Iso_8859_13_CharToOrderMap[] =
|
||||
{
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
|
||||
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
|
||||
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
|
||||
SYM, 0, 19, 21, 18, 4, 23, 20, 14, 1, 15, 9, 6, 12, 2, 7, /* 4X */
|
||||
16, 29, 10, 5, 3, 8, 13, 24, 26, 17, 25,SYM,SYM,SYM,SYM,SYM, /* 5X */
|
||||
SYM, 0, 19, 21, 18, 4, 23, 20, 14, 1, 15, 9, 6, 12, 2, 7, /* 6X */
|
||||
16, 29, 10, 5, 3, 8, 13, 24, 26, 17, 25,SYM,SYM,SYM,SYM,CTR, /* 7X */
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */
|
||||
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,149,SYM, 47,SYM,SYM,SYM,SYM, 39, /* AX */
|
||||
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,150,SYM, 47,SYM,SYM,SYM,SYM, 39, /* BX */
|
||||
151,152, 53, 41, 11, 56,153,154, 43, 30,155,156,157,158,159,160, /* CX */
|
||||
27,161, 54, 33,162, 59, 22,SYM,163,164,165,166, 32, 60, 28, 55, /* DX */
|
||||
167,168, 53, 41, 11, 56,169,170, 43, 30,171,172,173,174,175,176, /* EX */
|
||||
27,177, 54, 33,178, 59, 22,SYM,179,180,181,182, 32, 60, 28,SYM, /* FX */
|
||||
};
|
||||
/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
|
||||
|
||||
/* Byte-to-order map for ISO-8859-9 (Finnish model).
 * 256 entries, laid out as 16 rows (high nibble) of 16 columns (low nibble).
 * Each entry is either a class marker (CTR, RET, SYM, NUM) or the frequency
 * order of the letter encoded by that byte. Upper- and lowercase letters share
 * one order (rows 4X/6X and 5X/7X carry the same orders).
 * Order 11 (a-diaeresis in the build log) sits at 0xC4/0xE4 and order 22
 * (o-diaeresis) at 0xD6/0xF6; orders 27 and 28 (s-caron, z-caron) do not
 * occur in this table. */
static const unsigned char Iso_8859_9_CharToOrderMap[] =
|
||||
{
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
|
||||
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
|
||||
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
|
||||
SYM, 0, 19, 21, 18, 4, 23, 20, 14, 1, 15, 9, 6, 12, 2, 7, /* 4X */
|
||||
16, 29, 10, 5, 3, 8, 13, 24, 26, 17, 25,SYM,SYM,SYM,SYM,SYM, /* 5X */
|
||||
SYM, 0, 19, 21, 18, 4, 23, 20, 14, 1, 15, 9, 6, 12, 2, 7, /* 6X */
|
||||
16, 29, 10, 5, 3, 8, 13, 24, 26, 17, 25,SYM,SYM,SYM,SYM,CTR, /* 7X */
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */
|
||||
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */
|
||||
SYM,SYM,SYM,SYM,SYM,183,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */
|
||||
49, 35,184, 46, 11, 56, 39, 37, 40, 30, 51, 31,185, 36,186, 57, /* CX */
|
||||
50, 58, 52, 33, 34, 59, 22,SYM,187,188, 38,189, 32, 48, 42, 55, /* DX */
|
||||
49, 35,190, 46, 11, 56, 39, 37, 40, 30, 51, 31,191, 36,192, 57, /* EX */
|
||||
50, 58, 52, 33, 34, 59, 22,SYM,193,194, 38,195, 32, 44, 42,196, /* FX */
|
||||
};
|
||||
/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
|
||||
|
||||
/* Byte-to-order map for ISO-8859-1 (Finnish model).
 * 256 entries, laid out as 16 rows (high nibble) of 16 columns (low nibble).
 * Each entry is either a class marker (CTR, RET, SYM, NUM) or the frequency
 * order of the letter encoded by that byte. Upper- and lowercase letters share
 * one order (rows 4X/6X and 5X/7X carry the same orders).
 * Order 11 (a-diaeresis in the build log) sits at 0xC4/0xE4 and order 22
 * (o-diaeresis) at 0xD6/0xF6; orders 27 and 28 (s-caron, z-caron) do not
 * occur in this table. */
static const unsigned char Iso_8859_1_CharToOrderMap[] =
|
||||
{
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
|
||||
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
|
||||
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
|
||||
SYM, 0, 19, 21, 18, 4, 23, 20, 14, 1, 15, 9, 6, 12, 2, 7, /* 4X */
|
||||
16, 29, 10, 5, 3, 8, 13, 24, 26, 17, 25,SYM,SYM,SYM,SYM,SYM, /* 5X */
|
||||
SYM, 0, 19, 21, 18, 4, 23, 20, 14, 1, 15, 9, 6, 12, 2, 7, /* 6X */
|
||||
16, 29, 10, 5, 3, 8, 13, 24, 26, 17, 25,SYM,SYM,SYM,SYM,CTR, /* 7X */
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */
|
||||
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */
|
||||
SYM,SYM,SYM,SYM,SYM,197,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */
|
||||
49, 35,198, 46, 11, 56, 39, 37, 40, 30, 51, 31,199, 36,200, 57, /* CX */
|
||||
201, 58, 52, 33, 34, 59, 22,SYM,202,203, 38,204, 32,205,206, 55, /* DX */
|
||||
49, 35,207, 46, 11, 56, 39, 37, 40, 30, 51, 31,208, 36,209, 57, /* EX */
|
||||
210, 58, 52, 33, 34, 59, 22,SYM,211,212, 38,213, 32,214,215,216, /* FX */
|
||||
};
|
||||
/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
|
||||
|
||||
|
||||
/* Model Table:
|
||||
* Total sequences: 919
|
||||
* First 512 sequences: 0.9985378147555799
|
||||
* Next 512 sequences (512-1024): 0.0014621852444200612
|
||||
* Rest: 3.881443777498106e-17
|
||||
* Negative sequences: TODO
|
||||
*/
|
||||
/* Sequence table shared by every Finnish SequenceModel in this file.
 * 30 rows of 30 entries (900 values), matching the count of 30 passed in each
 * SequenceModel below; only the values 0, 2 and 3 occur here.
 * NOTE(review): presumably indexed by the orders of two consecutive
 * characters (both below 30), with larger values marking more frequent
 * pairs -- confirm against the prober that consumes SequenceModel. */
static const PRUint8 FinnishLangModel[] =
|
||||
{
|
||||
3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,2,2,2,
|
||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,2,0,2,
|
||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,0,0,2,
|
||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,3,3,3,3,2,3,3,0,3,0,0,
|
||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,3,2,0,2,
|
||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,2,0,0,0,2,
|
||||
3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,2,0,0,
|
||||
3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,0,3,3,3,3,2,0,2,
|
||||
3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,0,3,3,2,3,2,0,0,
|
||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,3,0,2,3,2,3,2,2,0,2,0,0,0,
|
||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,2,
|
||||
3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,0,0,2,2,0,0,0,0,0,0,0,
|
||||
3,3,2,2,3,3,2,3,3,2,3,3,3,2,2,2,3,3,2,3,3,3,3,2,2,2,2,0,0,0,
|
||||
3,3,2,2,3,2,2,3,3,3,2,3,0,2,2,2,2,3,2,2,0,0,2,0,0,0,0,0,0,0,
|
||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,2,0,3,3,2,2,0,0,0,0,2,
|
||||
3,3,3,2,3,2,2,3,3,2,2,3,2,0,2,0,2,3,0,2,0,0,3,2,0,0,0,0,0,0,
|
||||
3,3,2,3,3,3,3,3,3,2,3,3,3,2,3,2,3,3,2,0,2,2,3,2,3,0,0,2,0,0,
|
||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,3,3,2,3,2,2,2,0,0,
|
||||
3,3,3,3,3,3,3,3,3,0,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,0,0,3,2,
|
||||
3,3,3,3,3,3,3,3,3,3,3,2,2,0,3,2,0,3,3,3,2,3,2,0,2,2,0,0,0,0,
|
||||
3,3,3,3,3,3,3,3,3,3,3,2,3,2,3,2,3,3,3,3,3,3,3,2,3,2,0,0,0,0,
|
||||
3,3,2,3,3,3,3,3,3,3,3,0,2,0,3,0,2,3,3,2,2,3,0,0,0,2,0,0,0,2,
|
||||
2,3,3,3,2,3,3,2,0,3,3,3,3,3,3,3,3,3,3,2,0,0,3,2,0,0,0,0,0,0,
|
||||
3,3,2,3,3,3,3,3,3,2,3,2,0,2,0,2,2,3,0,2,2,2,0,3,0,2,0,0,0,0,
|
||||
3,3,3,2,3,3,2,3,2,2,3,0,2,0,3,0,0,2,2,2,2,2,0,2,2,0,0,0,0,0,
|
||||
3,3,3,2,3,2,2,3,2,2,2,2,2,2,2,0,2,3,2,2,2,0,0,2,2,3,0,0,0,0,
|
||||
3,3,0,2,2,2,3,2,0,0,0,0,2,2,3,0,2,0,0,2,0,2,0,3,2,0,2,0,0,0,
|
||||
3,3,2,2,3,0,0,2,2,2,2,0,2,2,0,2,2,2,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,2,0,0,2,0,0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
2,2,0,0,0,2,0,0,3,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
};
|
||||
|
||||
|
||||
/* Finnish model for ISO-8859-15: pairs the charset's byte-to-order map with
 * the shared FinnishLangModel sequence table.
 * 30 matches the 30x30 size of FinnishLangModel; the float is the "typical
 * positive ratio" reported for the first 512 sequences in the build log.
 * NOTE(review): PR_TRUE is presumably the keep-English-letters flag of
 * SequenceModel -- confirm against nsSBCharSetProber.h. */
const SequenceModel Iso_8859_15FinnishModel =
|
||||
{
|
||||
Iso_8859_15_CharToOrderMap,
|
||||
FinnishLangModel,
|
||||
30,
|
||||
(float)0.9985378147555799,
|
||||
PR_TRUE,
|
||||
"ISO-8859-15"
|
||||
};
|
||||
|
||||
/* Finnish model for WINDOWS-1252: pairs the charset's byte-to-order map with
 * the shared FinnishLangModel sequence table.
 * 30 matches the 30x30 size of FinnishLangModel; the float is the "typical
 * positive ratio" reported for the first 512 sequences in the build log.
 * NOTE(review): PR_TRUE is presumably the keep-English-letters flag of
 * SequenceModel -- confirm against nsSBCharSetProber.h. */
const SequenceModel Windows_1252FinnishModel =
|
||||
{
|
||||
Windows_1252_CharToOrderMap,
|
||||
FinnishLangModel,
|
||||
30,
|
||||
(float)0.9985378147555799,
|
||||
PR_TRUE,
|
||||
"WINDOWS-1252"
|
||||
};
|
||||
|
||||
/* Finnish model for ISO-8859-4: pairs the charset's byte-to-order map with
 * the shared FinnishLangModel sequence table.
 * 30 matches the 30x30 size of FinnishLangModel; the float is the "typical
 * positive ratio" reported for the first 512 sequences in the build log.
 * NOTE(review): PR_TRUE is presumably the keep-English-letters flag of
 * SequenceModel -- confirm against nsSBCharSetProber.h. */
const SequenceModel Iso_8859_4FinnishModel =
|
||||
{
|
||||
Iso_8859_4_CharToOrderMap,
|
||||
FinnishLangModel,
|
||||
30,
|
||||
(float)0.9985378147555799,
|
||||
PR_TRUE,
|
||||
"ISO-8859-4"
|
||||
};
|
||||
|
||||
/* Finnish model for ISO-8859-13: pairs the charset's byte-to-order map with
 * the shared FinnishLangModel sequence table.
 * 30 matches the 30x30 size of FinnishLangModel; the float is the "typical
 * positive ratio" reported for the first 512 sequences in the build log.
 * NOTE(review): PR_TRUE is presumably the keep-English-letters flag of
 * SequenceModel -- confirm against nsSBCharSetProber.h. */
const SequenceModel Iso_8859_13FinnishModel =
|
||||
{
|
||||
Iso_8859_13_CharToOrderMap,
|
||||
FinnishLangModel,
|
||||
30,
|
||||
(float)0.9985378147555799,
|
||||
PR_TRUE,
|
||||
"ISO-8859-13"
|
||||
};
|
||||
|
||||
/* Finnish model for ISO-8859-9: pairs the charset's byte-to-order map with
 * the shared FinnishLangModel sequence table.
 * 30 matches the 30x30 size of FinnishLangModel; the float is the "typical
 * positive ratio" reported for the first 512 sequences in the build log.
 * NOTE(review): PR_TRUE is presumably the keep-English-letters flag of
 * SequenceModel -- confirm against nsSBCharSetProber.h. */
const SequenceModel Iso_8859_9FinnishModel =
|
||||
{
|
||||
Iso_8859_9_CharToOrderMap,
|
||||
FinnishLangModel,
|
||||
30,
|
||||
(float)0.9985378147555799,
|
||||
PR_TRUE,
|
||||
"ISO-8859-9"
|
||||
};
|
||||
|
||||
/* Finnish model for ISO-8859-1: pairs the charset's byte-to-order map with
 * the shared FinnishLangModel sequence table.
 * 30 matches the 30x30 size of FinnishLangModel; the float is the "typical
 * positive ratio" reported for the first 512 sequences in the build log.
 * NOTE(review): PR_TRUE is presumably the keep-English-letters flag of
 * SequenceModel -- confirm against nsSBCharSetProber.h. */
const SequenceModel Iso_8859_1FinnishModel =
|
||||
{
|
||||
Iso_8859_1_CharToOrderMap,
|
||||
FinnishLangModel,
|
||||
30,
|
||||
(float)0.9985378147555799,
|
||||
PR_TRUE,
|
||||
"ISO-8859-1"
|
||||
};
|
||||
@ -143,6 +143,13 @@ nsSBCSGroupProber::nsSBCSGroupProber()
|
||||
mProbers[58] = new nsSingleByteCharSetProber(&Mac_CentraleuropePolishModel);
|
||||
mProbers[59] = new nsSingleByteCharSetProber(&Ibm852PolishModel);
|
||||
|
||||
mProbers[60] = new nsSingleByteCharSetProber(&Iso_8859_1FinnishModel);
|
||||
mProbers[61] = new nsSingleByteCharSetProber(&Iso_8859_4FinnishModel);
|
||||
mProbers[62] = new nsSingleByteCharSetProber(&Iso_8859_9FinnishModel);
|
||||
mProbers[63] = new nsSingleByteCharSetProber(&Iso_8859_13FinnishModel);
|
||||
mProbers[64] = new nsSingleByteCharSetProber(&Iso_8859_15FinnishModel);
|
||||
mProbers[65] = new nsSingleByteCharSetProber(&Windows_1252FinnishModel);
|
||||
|
||||
Reset();
|
||||
}
|
||||
|
||||
|
||||
@ -40,7 +40,7 @@
|
||||
#define nsSBCSGroupProber_h__
|
||||
|
||||
|
||||
#define NUM_OF_SBCS_PROBERS 60
|
||||
#define NUM_OF_SBCS_PROBERS 66
|
||||
|
||||
class nsCharSetProber;
|
||||
class nsSBCSGroupProber: public nsCharSetProber {
|
||||
|
||||
@ -204,5 +204,12 @@ extern const SequenceModel Iso_8859_16PolishModel;
|
||||
extern const SequenceModel Ibm852PolishModel;
|
||||
extern const SequenceModel Mac_CentraleuropePolishModel;
|
||||
|
||||
extern const SequenceModel Iso_8859_1FinnishModel;
|
||||
extern const SequenceModel Iso_8859_4FinnishModel;
|
||||
extern const SequenceModel Iso_8859_9FinnishModel;
|
||||
extern const SequenceModel Iso_8859_13FinnishModel;
|
||||
extern const SequenceModel Iso_8859_15FinnishModel;
|
||||
extern const SequenceModel Windows_1252FinnishModel;
|
||||
|
||||
#endif /* nsSingleByteCharSetProber_h__ */
|
||||
|
||||
|
||||
8
test/fi/iso-8859-1.txt
Normal file
8
test/fi/iso-8859-1.txt
Normal file
@ -0,0 +1,8 @@
|
||||
Termi science fiction on amerikkalaisen tieteislehtien toimittajan Hugo
|
||||
Gernsbackin keksimä. Suomessa termin tieteiskirjallisuus loi tohtori Eino
|
||||
Kauppinen 1950-luvun alkupuolella.
|
||||
Tieteiskirjallisuudelle on laadittu erilaisia määritelmiä. Tieteiskirjallisuuden
|
||||
rajat eivät ole yksiselitteisen selkeät. Tieteiskirjallisuus lähenee monia
|
||||
kirjallisuudenlajeja, erityisesti kauhu- ja fantasiakirjallisuutta. Näillä
|
||||
kolmella lajilla onkin yhteiset juuret 1800-lukua edeltävässä ei-realistisessa
|
||||
kirjallisuudessa.
|
||||
8
test/fi/utf-8.txt
Normal file
8
test/fi/utf-8.txt
Normal file
@ -0,0 +1,8 @@
|
||||
Termi science fiction on amerikkalaisen tieteislehtien toimittajan Hugo
|
||||
Gernsbackin keksimä. Suomessa termin tieteiskirjallisuus loi tohtori Eino
|
||||
Kauppinen 1950-luvun alkupuolella.
|
||||
Tieteiskirjallisuudelle on laadittu erilaisia määritelmiä. Tieteiskirjallisuuden
|
||||
rajat eivät ole yksiselitteisen selkeät. Tieteiskirjallisuus lähenee monia
|
||||
kirjallisuudenlajeja, erityisesti kauhu- ja fantasiakirjallisuutta. Näillä
|
||||
kolmella lajilla onkin yhteiset juuret 1800-lukua edeltävässä ei-realistisessa
|
||||
kirjallisuudessa.
|
||||
Loading…
x
Reference in New Issue
Block a user