mirror of
https://gitlab.freedesktop.org/uchardet/uchardet.git
synced 2025-12-07 17:26:41 +08:00
LangModels: add Finnish support.
I built models for ISO-8859-1, ISO-8859-4, ISO-8859-9, ISO-8859-13, ISO-8859-15 and WINDOWS-1252, all of which contain the Finnish letters. Nevertheless, most texts in these encodings end up byte-identical (the Finnish glyphs have the same codepoints), so I keep tests only for ISO-8859-1 and UTF-8. Models for the other encodings may still be useful when processing texts that contain some symbols, etc.
This commit is contained in:
parent
ac4aa94b73
commit
6bbe7da1ac
156
script/BuildLangModelLogs/LangFinnishModel.log
Normal file
156
script/BuildLangModelLogs/LangFinnishModel.log
Normal file
@ -0,0 +1,156 @@
|
|||||||
|
= Logs of language model for Finnish (fi) =
|
||||||
|
|
||||||
|
- Generated by BuildLangModel.py
|
||||||
|
- Started: 2016-09-21 18:12:24.181917
|
||||||
|
- Maximum depth: 5
|
||||||
|
- Max number of pages: 100
|
||||||
|
|
||||||
|
== Parsed pages ==
|
||||||
|
|
||||||
|
Yhdistynyt kuningaskunta (revision 15843357)
|
||||||
|
1. toukokuuta (revision 15910178)
|
||||||
|
1700-luku (revision 15493702)
|
||||||
|
1707 (revision 15106709)
|
||||||
|
1800-luku (revision 15708929)
|
||||||
|
2014 (revision 15891601)
|
||||||
|
409 (revision 12809782)
|
||||||
|
5. marraskuuta (revision 15421719)
|
||||||
|
927 (revision 12785964)
|
||||||
|
Aasia (revision 15948161)
|
||||||
|
Abhasia (revision 15730328)
|
||||||
|
Adolf Hitler (revision 15951829)
|
||||||
|
Afrikka (revision 15934209)
|
||||||
|
Agatha Christie (revision 15760740)
|
||||||
|
Aikavyöhyke (revision 15800313)
|
||||||
|
Ajoneuvon kansallisuustunnus (revision 15897445)
|
||||||
|
Akrotiri ja Dhekelia (revision 14625383)
|
||||||
|
Alamaat (revision 15913741)
|
||||||
|
Alan Turing (revision 15904871)
|
||||||
|
Alankomaat (revision 15936643)
|
||||||
|
Albania (revision 15767604)
|
||||||
|
Alec Guinness (revision 15363805)
|
||||||
|
Alexander Fleming (revision 15023225)
|
||||||
|
Alfred Hitchcock (revision 15892843)
|
||||||
|
Alfred Tennyson (revision 15856114)
|
||||||
|
Allen Jones (revision 12871703)
|
||||||
|
Andorra (revision 15913862)
|
||||||
|
Andrew Lloyd Webber (revision 14978349)
|
||||||
|
Anglit (revision 15902350)
|
||||||
|
Anguilla (revision 15854041)
|
||||||
|
Anne Brontë (revision 14287992)
|
||||||
|
Anthony Eden (revision 14391831)
|
||||||
|
Antigua ja Barbuda (revision 15196967)
|
||||||
|
Arabian Lawrence (revision 15736417)
|
||||||
|
Argentiina (revision 15676474)
|
||||||
|
Armenia (revision 15634470)
|
||||||
|
Arthur Conan Doyle (revision 15402837)
|
||||||
|
Arts and Crafts (revision 15806930)
|
||||||
|
Aurinko (revision 15934252)
|
||||||
|
Australia (revision 15934255)
|
||||||
|
Avara luonto (revision 15815943)
|
||||||
|
Azerbaidžan (revision 15946891)
|
||||||
|
BBC (revision 15866026)
|
||||||
|
BKT (revision 15656549)
|
||||||
|
Bahama (revision 15516869)
|
||||||
|
Bangladesh (revision 15883994)
|
||||||
|
Bank of England (revision 14481173)
|
||||||
|
Barbados (revision 15839821)
|
||||||
|
Barbara Hepworth (revision 15106880)
|
||||||
|
Bath (revision 15869900)
|
||||||
|
Beatrix Potter (revision 15057380)
|
||||||
|
Belfast (revision 15715934)
|
||||||
|
Belgia (revision 15932391)
|
||||||
|
Belize (revision 15665086)
|
||||||
|
Ben Nevis (revision 15610196)
|
||||||
|
Bengalin kieli (revision 15551820)
|
||||||
|
Benjamin Britten (revision 15081615)
|
||||||
|
Bermuda (revision 15632621)
|
||||||
|
Bertrand Russell (revision 14631969)
|
||||||
|
Bhutan (revision 15377394)
|
||||||
|
Big Ben (revision 14897401)
|
||||||
|
Big Brother (revision 14641391)
|
||||||
|
Birmingham (revision 15855259)
|
||||||
|
Black Sabbath (revision 15839917)
|
||||||
|
Bosnia ja Hertsegovina (revision 15934266)
|
||||||
|
Botswana (revision 15524955)
|
||||||
|
Bristol (revision 15891889)
|
||||||
|
Bristolin kanaali (revision 15849713)
|
||||||
|
Bristolin kansainvälinen lentoasema (revision 14452870)
|
||||||
|
Britannia (provinssi) (revision 14557442)
|
||||||
|
Britannian avoin golfturnaus (revision 14293265)
|
||||||
|
Britannian kuninkaallinen perhe (revision 15522149)
|
||||||
|
Britannian talous (revision 15470242)
|
||||||
|
Britannian väestö (revision 15661241)
|
||||||
|
Brittein saaret (revision 15805422)
|
||||||
|
Brittiläinen Antarktiksen alue (revision 15836227)
|
||||||
|
Brittiläinen Intia (revision 15593126)
|
||||||
|
Brittiläinen Intian valtameren alue (revision 14272903)
|
||||||
|
Brittiläinen imperiumi (revision 15906600)
|
||||||
|
Brittiläinen kansainyhteisö (revision 15894379)
|
||||||
|
Brittiläinen keittiö (revision 13393533)
|
||||||
|
Brittiläinen kulttuuri (revision 15951407)
|
||||||
|
Brittiläiset Neitsytsaaret (revision 15910520)
|
||||||
|
Brittiläiset merentakaiset alueet (revision 15836213)
|
||||||
|
Brunei (revision 15580824)
|
||||||
|
Bruttokansantuote (revision 15656549)
|
||||||
|
Bulgaria (revision 15944101)
|
||||||
|
Burma (revision 15627218)
|
||||||
|
Cambridge (revision 14641664)
|
||||||
|
Cambridgen yliopisto (revision 15493340)
|
||||||
|
Canterburyn tarinoita (revision 15232140)
|
||||||
|
Cardiff (revision 15840398)
|
||||||
|
Caymansaaret (revision 15914575)
|
||||||
|
Channel 4 (revision 15882475)
|
||||||
|
Charles Babbage (revision 15203616)
|
||||||
|
Charles Chaplin (revision 15674652)
|
||||||
|
Charles Darwin (revision 15894085)
|
||||||
|
Charles Dickens (revision 15699592)
|
||||||
|
Charles Dickensin joulutarina (revision 15116247)
|
||||||
|
|
||||||
|
== End of Parsed pages ==
|
||||||
|
|
||||||
|
- Wikipedia parsing ended at: 2016-09-21 18:15:05.189221
|
||||||
|
|
||||||
|
61 characters appeared 940364 times.
|
||||||
|
|
||||||
|
First 30 characters:
|
||||||
|
[ 0] Char a: 12.508773198463574 %
|
||||||
|
[ 1] Char i: 10.969475649854738 %
|
||||||
|
[ 2] Char n: 8.815841525196626 %
|
||||||
|
[ 3] Char t: 8.80169806585535 %
|
||||||
|
[ 4] Char e: 7.8206949649284745 %
|
||||||
|
[ 5] Char s: 7.595782058862313 %
|
||||||
|
[ 6] Char l: 5.963541777439374 %
|
||||||
|
[ 7] Char o: 5.439808414613916 %
|
||||||
|
[ 8] Char u: 5.0102938861972595 %
|
||||||
|
[ 9] Char k: 4.589712068943515 %
|
||||||
|
[10] Char r: 3.1231523112326713 %
|
||||||
|
[11] Char ä: 3.041800834570443 %
|
||||||
|
[12] Char m: 3.0392486313810396 %
|
||||||
|
[13] Char v: 2.156292669647073 %
|
||||||
|
[14] Char h: 1.996141919512019 %
|
||||||
|
[15] Char j: 1.9248929138078446 %
|
||||||
|
[16] Char p: 1.6324529650220552 %
|
||||||
|
[17] Char y: 1.6323466232224966 %
|
||||||
|
[18] Char d: 1.1981530556252684 %
|
||||||
|
[19] Char b: 0.6835650875618378 %
|
||||||
|
[20] Char g: 0.5793501239945382 %
|
||||||
|
[21] Char c: 0.5056552569005194 %
|
||||||
|
[22] Char ö: 0.38931732818355447 %
|
||||||
|
[23] Char f: 0.215023118707224 %
|
||||||
|
[24] Char w: 0.2106631049253268 %
|
||||||
|
[25] Char z: 0.06593191572625068 %
|
||||||
|
[26] Char x: 0.024458613898447838 %
|
||||||
|
[27] Char š: 0.010421496356729947 %
|
||||||
|
[28] Char ž: 0.007869293167326695 %
|
||||||
|
[29] Char q: 0.007762951367768225 %
|
||||||
|
|
||||||
|
The first 30 characters have an accumulated ratio of 0.9996012182516557.
|
||||||
|
|
||||||
|
919 sequences found.
|
||||||
|
|
||||||
|
First 512 (typical positive ratio): 0.9985378147555799
|
||||||
|
Next 512 (512-1024): 1.0634179955846884e-06
|
||||||
|
Rest: 3.881443777498106e-17
|
||||||
|
|
||||||
|
- Processing end: 2016-09-21 18:15:05.307164
|
||||||
60
script/langs/fi.py
Normal file
60
script/langs/fi.py
Normal file
@ -0,0 +1,60 @@
|
|||||||
|
#!/bin/python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
# ##### BEGIN LICENSE BLOCK #####
|
||||||
|
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||||
|
#
|
||||||
|
# The contents of this file are subject to the Mozilla Public License Version
|
||||||
|
# 1.1 (the "License"); you may not use this file except in compliance with
|
||||||
|
# the License. You may obtain a copy of the License at
|
||||||
|
# http://www.mozilla.org/MPL/
|
||||||
|
#
|
||||||
|
# Software distributed under the License is distributed on an "AS IS" basis,
|
||||||
|
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||||
|
# for the specific language governing rights and limitations under the
|
||||||
|
# License.
|
||||||
|
#
|
||||||
|
# The Original Code is Mozilla Universal charset detector code.
|
||||||
|
#
|
||||||
|
# The Initial Developer of the Original Code is
|
||||||
|
# Netscape Communications Corporation.
|
||||||
|
# Portions created by the Initial Developer are Copyright (C) 2001
|
||||||
|
# the Initial Developer. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Contributor(s):
|
||||||
|
# Jehan <jehan@girinstud.io>
|
||||||
|
#
|
||||||
|
# Alternatively, the contents of this file may be used under the terms of
|
||||||
|
# either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||||
|
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||||
|
# in which case the provisions of the GPL or the LGPL are applicable instead
|
||||||
|
# of those above. If you wish to allow use of your version of this file only
|
||||||
|
# under the terms of either the GPL or the LGPL, and not to allow others to
|
||||||
|
# use your version of this file under the terms of the MPL, indicate your
|
||||||
|
# decision by deleting the provisions above and replace them with the notice
|
||||||
|
# and other provisions required by the GPL or the LGPL. If you do not delete
|
||||||
|
# the provisions above, a recipient may use your version of this file under
|
||||||
|
# the terms of any one of the MPL, the GPL or the LGPL.
|
||||||
|
#
|
||||||
|
# ##### END LICENSE BLOCK #####
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
## Mandatory Properties ##
|
||||||
|
|
||||||
|
name = 'Finnish'
|
||||||
|
code = 'fi'
|
||||||
|
use_ascii = True
|
||||||
|
charsets = ['ISO-8859-1', 'ISO-8859-4', 'ISO-8859-9',
|
||||||
|
'ISO-8859-13', 'ISO-8859-15', 'WINDOWS-1252']
|
||||||
|
|
||||||
|
## Optional Properties ##
|
||||||
|
|
||||||
|
# Alphabet characters.
|
||||||
|
# 'å' (Swedish o), 'š' and 'ž' are rare enough that I don't want to include them
|
||||||
|
# here.
|
||||||
|
alphabet = 'äö'
|
||||||
|
# An arbitrary high-quality article picked from the Finnish Wikipedia home page.
|
||||||
|
start_pages = ['Yhdistynyt kuningaskunta']
|
||||||
|
wikipedia_code = code
|
||||||
|
case_mapping = True
|
||||||
@ -11,6 +11,7 @@ set(
|
|||||||
LangModels/LangBulgarianModel.cpp
|
LangModels/LangBulgarianModel.cpp
|
||||||
LangModels/LangCzechModel.cpp
|
LangModels/LangCzechModel.cpp
|
||||||
LangModels/LangEsperantoModel.cpp
|
LangModels/LangEsperantoModel.cpp
|
||||||
|
LangModels/LangFinnishModel.cpp
|
||||||
LangModels/LangFrenchModel.cpp
|
LangModels/LangFrenchModel.cpp
|
||||||
LangModels/LangDanishModel.cpp
|
LangModels/LangDanishModel.cpp
|
||||||
LangModels/LangGermanModel.cpp
|
LangModels/LangGermanModel.cpp
|
||||||
|
|||||||
291
src/LangModels/LangFinnishModel.cpp
Normal file
291
src/LangModels/LangFinnishModel.cpp
Normal file
@ -0,0 +1,291 @@
|
|||||||
|
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
|
||||||
|
/* ***** BEGIN LICENSE BLOCK *****
|
||||||
|
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||||
|
*
|
||||||
|
* The contents of this file are subject to the Mozilla Public License Version
|
||||||
|
* 1.1 (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
* http://www.mozilla.org/MPL/
|
||||||
|
*
|
||||||
|
* Software distributed under the License is distributed on an "AS IS" basis,
|
||||||
|
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||||
|
* for the specific language governing rights and limitations under the
|
||||||
|
* License.
|
||||||
|
*
|
||||||
|
* The Original Code is Mozilla Communicator client code.
|
||||||
|
*
|
||||||
|
* The Initial Developer of the Original Code is
|
||||||
|
* Netscape Communications Corporation.
|
||||||
|
* Portions created by the Initial Developer are Copyright (C) 1998
|
||||||
|
* the Initial Developer. All Rights Reserved.
|
||||||
|
*
|
||||||
|
* Contributor(s):
|
||||||
|
*
|
||||||
|
* Alternatively, the contents of this file may be used under the terms of
|
||||||
|
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||||
|
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||||
|
* in which case the provisions of the GPL or the LGPL are applicable instead
|
||||||
|
* of those above. If you wish to allow use of your version of this file only
|
||||||
|
* under the terms of either the GPL or the LGPL, and not to allow others to
|
||||||
|
* use your version of this file under the terms of the MPL, indicate your
|
||||||
|
* decision by deleting the provisions above and replace them with the notice
|
||||||
|
* and other provisions required by the GPL or the LGPL. If you do not delete
|
||||||
|
* the provisions above, a recipient may use your version of this file under
|
||||||
|
* the terms of any one of the MPL, the GPL or the LGPL.
|
||||||
|
*
|
||||||
|
* ***** END LICENSE BLOCK ***** */
|
||||||
|
|
||||||
|
#include "../nsSBCharSetProber.h"
|
||||||
|
|
||||||
|
/********* Language model for: Finnish *********/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Generated by BuildLangModel.py
|
||||||
|
* On: 2016-09-21 18:15:05.189948
|
||||||
|
**/
|
||||||
|
|
||||||
|
/* Character Mapping Table:
|
||||||
|
* ILL: illegal character.
|
||||||
|
* CTR: control character specific to the charset.
|
||||||
|
* RET: carriage return or line feed (0x0D, 0x0A).
|
||||||
|
* SYM: symbol (punctuation) that does not belong to word.
|
||||||
|
* NUM: 0 - 9.
|
||||||
|
*
|
||||||
|
* Other characters are ordered by probabilities
|
||||||
|
* (0 is the most common character in the language).
|
||||||
|
*
|
||||||
|
* Orders are generic to a language. So the codepoint with order X in
|
||||||
|
* CHARSET1 maps to the same character as the codepoint with the same
|
||||||
|
* order X in CHARSET2 for the same language.
|
||||||
|
* As such, some orders may be missing from a given table. For instance, the
|
||||||
|
* ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1
|
||||||
|
* even though they are both used for French. Same for the euro sign.
|
||||||
|
*/
|
||||||
|
static const unsigned char Iso_8859_15_CharToOrderMap[] =
|
||||||
|
{
|
||||||
|
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
|
||||||
|
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
|
||||||
|
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
|
||||||
|
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
|
||||||
|
SYM, 0, 19, 21, 18, 4, 23, 20, 14, 1, 15, 9, 6, 12, 2, 7, /* 4X */
|
||||||
|
16, 29, 10, 5, 3, 8, 13, 24, 26, 17, 25,SYM,SYM,SYM,SYM,SYM, /* 5X */
|
||||||
|
SYM, 0, 19, 21, 18, 4, 23, 20, 14, 1, 15, 9, 6, 12, 2, 7, /* 6X */
|
||||||
|
16, 29, 10, 5, 3, 8, 13, 24, 26, 17, 25,SYM,SYM,SYM,SYM,CTR, /* 7X */
|
||||||
|
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */
|
||||||
|
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */
|
||||||
|
SYM,SYM,SYM,SYM,SYM,SYM, 27,SYM, 27,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */
|
||||||
|
SYM,SYM,SYM,SYM, 28, 61,SYM,SYM, 28,SYM,SYM,SYM, 62, 63, 64,SYM, /* BX */
|
||||||
|
49, 35, 65, 46, 11, 56, 39, 37, 40, 30, 51, 31, 66, 36, 67, 57, /* CX */
|
||||||
|
68, 58, 52, 33, 34, 59, 22,SYM, 69, 70, 38, 71, 32, 72, 73, 55, /* DX */
|
||||||
|
49, 35, 74, 46, 11, 56, 39, 37, 40, 30, 51, 31, 75, 36, 76, 57, /* EX */
|
||||||
|
77, 58, 52, 33, 34, 59, 22,SYM, 78, 79, 38, 80, 32, 81, 82, 83, /* FX */
|
||||||
|
};
|
||||||
|
/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
|
||||||
|
|
||||||
|
static const unsigned char Windows_1252_CharToOrderMap[] =
|
||||||
|
{
|
||||||
|
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
|
||||||
|
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
|
||||||
|
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
|
||||||
|
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
|
||||||
|
SYM, 0, 19, 21, 18, 4, 23, 20, 14, 1, 15, 9, 6, 12, 2, 7, /* 4X */
|
||||||
|
16, 29, 10, 5, 3, 8, 13, 24, 26, 17, 25,SYM,SYM,SYM,SYM,SYM, /* 5X */
|
||||||
|
SYM, 0, 19, 21, 18, 4, 23, 20, 14, 1, 15, 9, 6, 12, 2, 7, /* 6X */
|
||||||
|
16, 29, 10, 5, 3, 8, 13, 24, 26, 17, 25,SYM,SYM,SYM,SYM,CTR, /* 7X */
|
||||||
|
SYM,ILL,SYM, 84,SYM,SYM,SYM,SYM,SYM,SYM, 27,SYM, 85,ILL, 28,ILL, /* 8X */
|
||||||
|
ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 27,SYM, 86,ILL, 28, 87, /* 9X */
|
||||||
|
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */
|
||||||
|
SYM,SYM,SYM,SYM,SYM, 88,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */
|
||||||
|
49, 35, 89, 46, 11, 56, 39, 37, 40, 30, 51, 31, 90, 36, 91, 57, /* CX */
|
||||||
|
92, 58, 52, 33, 34, 59, 22,SYM, 93, 94, 38, 95, 32, 96, 97, 55, /* DX */
|
||||||
|
49, 35, 98, 46, 11, 56, 39, 37, 40, 30, 51, 31, 99, 36,100, 57, /* EX */
|
||||||
|
101, 58, 52, 33, 34, 59, 22,SYM,102,103, 38,104, 32,105,106,107, /* FX */
|
||||||
|
};
|
||||||
|
/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
|
||||||
|
|
||||||
|
static const unsigned char Iso_8859_4_CharToOrderMap[] =
|
||||||
|
{
|
||||||
|
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
|
||||||
|
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
|
||||||
|
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
|
||||||
|
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
|
||||||
|
SYM, 0, 19, 21, 18, 4, 23, 20, 14, 1, 15, 9, 6, 12, 2, 7, /* 4X */
|
||||||
|
16, 29, 10, 5, 3, 8, 13, 24, 26, 17, 25,SYM,SYM,SYM,SYM,SYM, /* 5X */
|
||||||
|
SYM, 0, 19, 21, 18, 4, 23, 20, 14, 1, 15, 9, 6, 12, 2, 7, /* 6X */
|
||||||
|
16, 29, 10, 5, 3, 8, 13, 24, 26, 17, 25,SYM,SYM,SYM,SYM,CTR, /* 7X */
|
||||||
|
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */
|
||||||
|
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */
|
||||||
|
SYM,108,109, 47,SYM,110,111,SYM,SYM, 27,112,113,114,SYM, 28,SYM, /* AX */
|
||||||
|
SYM,115,SYM, 47,SYM,116,117,SYM,SYM, 27,118,119,120, 45, 28, 45, /* BX */
|
||||||
|
53, 35,121, 46, 11, 56, 39,122, 43, 30,123, 31,124, 36,125,126, /* CX */
|
||||||
|
127, 54,128,129, 34, 59, 22,SYM,130,131, 38,132, 32,133,134, 55, /* DX */
|
||||||
|
53, 35,135, 46, 11, 56, 39,136, 43, 30,137, 31,138, 36,139,140, /* EX */
|
||||||
|
141, 54,142,143, 34, 59, 22,SYM,144,145, 38,146, 32,147,148,SYM, /* FX */
|
||||||
|
};
|
||||||
|
/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
|
||||||
|
|
||||||
|
static const unsigned char Iso_8859_13_CharToOrderMap[] =
|
||||||
|
{
|
||||||
|
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
|
||||||
|
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
|
||||||
|
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
|
||||||
|
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
|
||||||
|
SYM, 0, 19, 21, 18, 4, 23, 20, 14, 1, 15, 9, 6, 12, 2, 7, /* 4X */
|
||||||
|
16, 29, 10, 5, 3, 8, 13, 24, 26, 17, 25,SYM,SYM,SYM,SYM,SYM, /* 5X */
|
||||||
|
SYM, 0, 19, 21, 18, 4, 23, 20, 14, 1, 15, 9, 6, 12, 2, 7, /* 6X */
|
||||||
|
16, 29, 10, 5, 3, 8, 13, 24, 26, 17, 25,SYM,SYM,SYM,SYM,CTR, /* 7X */
|
||||||
|
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */
|
||||||
|
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */
|
||||||
|
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,149,SYM, 47,SYM,SYM,SYM,SYM, 39, /* AX */
|
||||||
|
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,150,SYM, 47,SYM,SYM,SYM,SYM, 39, /* BX */
|
||||||
|
151,152, 53, 41, 11, 56,153,154, 43, 30,155,156,157,158,159,160, /* CX */
|
||||||
|
27,161, 54, 33,162, 59, 22,SYM,163,164,165,166, 32, 60, 28, 55, /* DX */
|
||||||
|
167,168, 53, 41, 11, 56,169,170, 43, 30,171,172,173,174,175,176, /* EX */
|
||||||
|
27,177, 54, 33,178, 59, 22,SYM,179,180,181,182, 32, 60, 28,SYM, /* FX */
|
||||||
|
};
|
||||||
|
/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
|
||||||
|
|
||||||
|
static const unsigned char Iso_8859_9_CharToOrderMap[] =
|
||||||
|
{
|
||||||
|
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
|
||||||
|
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
|
||||||
|
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
|
||||||
|
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
|
||||||
|
SYM, 0, 19, 21, 18, 4, 23, 20, 14, 1, 15, 9, 6, 12, 2, 7, /* 4X */
|
||||||
|
16, 29, 10, 5, 3, 8, 13, 24, 26, 17, 25,SYM,SYM,SYM,SYM,SYM, /* 5X */
|
||||||
|
SYM, 0, 19, 21, 18, 4, 23, 20, 14, 1, 15, 9, 6, 12, 2, 7, /* 6X */
|
||||||
|
16, 29, 10, 5, 3, 8, 13, 24, 26, 17, 25,SYM,SYM,SYM,SYM,CTR, /* 7X */
|
||||||
|
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */
|
||||||
|
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */
|
||||||
|
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */
|
||||||
|
SYM,SYM,SYM,SYM,SYM,183,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */
|
||||||
|
49, 35,184, 46, 11, 56, 39, 37, 40, 30, 51, 31,185, 36,186, 57, /* CX */
|
||||||
|
50, 58, 52, 33, 34, 59, 22,SYM,187,188, 38,189, 32, 48, 42, 55, /* DX */
|
||||||
|
49, 35,190, 46, 11, 56, 39, 37, 40, 30, 51, 31,191, 36,192, 57, /* EX */
|
||||||
|
50, 58, 52, 33, 34, 59, 22,SYM,193,194, 38,195, 32, 44, 42,196, /* FX */
|
||||||
|
};
|
||||||
|
/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
|
||||||
|
|
||||||
|
static const unsigned char Iso_8859_1_CharToOrderMap[] =
|
||||||
|
{
|
||||||
|
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
|
||||||
|
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
|
||||||
|
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
|
||||||
|
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
|
||||||
|
SYM, 0, 19, 21, 18, 4, 23, 20, 14, 1, 15, 9, 6, 12, 2, 7, /* 4X */
|
||||||
|
16, 29, 10, 5, 3, 8, 13, 24, 26, 17, 25,SYM,SYM,SYM,SYM,SYM, /* 5X */
|
||||||
|
SYM, 0, 19, 21, 18, 4, 23, 20, 14, 1, 15, 9, 6, 12, 2, 7, /* 6X */
|
||||||
|
16, 29, 10, 5, 3, 8, 13, 24, 26, 17, 25,SYM,SYM,SYM,SYM,CTR, /* 7X */
|
||||||
|
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */
|
||||||
|
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */
|
||||||
|
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */
|
||||||
|
SYM,SYM,SYM,SYM,SYM,197,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */
|
||||||
|
49, 35,198, 46, 11, 56, 39, 37, 40, 30, 51, 31,199, 36,200, 57, /* CX */
|
||||||
|
201, 58, 52, 33, 34, 59, 22,SYM,202,203, 38,204, 32,205,206, 55, /* DX */
|
||||||
|
49, 35,207, 46, 11, 56, 39, 37, 40, 30, 51, 31,208, 36,209, 57, /* EX */
|
||||||
|
210, 58, 52, 33, 34, 59, 22,SYM,211,212, 38,213, 32,214,215,216, /* FX */
|
||||||
|
};
|
||||||
|
/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
|
||||||
|
|
||||||
|
|
||||||
|
/* Model Table:
|
||||||
|
* Total sequences: 919
|
||||||
|
* First 512 sequences: 0.9985378147555799
|
||||||
|
* Next 512 sequences (512-1024): 0.0014621852444200612
|
||||||
|
* Rest: 3.881443777498106e-17
|
||||||
|
* Negative sequences: TODO
|
||||||
|
*/
|
||||||
|
static const PRUint8 FinnishLangModel[] =
|
||||||
|
{
|
||||||
|
3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,2,2,2,
|
||||||
|
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,2,0,2,
|
||||||
|
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,0,0,2,
|
||||||
|
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,3,3,3,3,2,3,3,0,3,0,0,
|
||||||
|
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,3,2,0,2,
|
||||||
|
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,2,0,0,0,2,
|
||||||
|
3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,2,0,0,
|
||||||
|
3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,0,3,3,3,3,2,0,2,
|
||||||
|
3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,0,3,3,2,3,2,0,0,
|
||||||
|
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,3,0,2,3,2,3,2,2,0,2,0,0,0,
|
||||||
|
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,2,
|
||||||
|
3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,0,0,2,2,0,0,0,0,0,0,0,
|
||||||
|
3,3,2,2,3,3,2,3,3,2,3,3,3,2,2,2,3,3,2,3,3,3,3,2,2,2,2,0,0,0,
|
||||||
|
3,3,2,2,3,2,2,3,3,3,2,3,0,2,2,2,2,3,2,2,0,0,2,0,0,0,0,0,0,0,
|
||||||
|
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,2,0,3,3,2,2,0,0,0,0,2,
|
||||||
|
3,3,3,2,3,2,2,3,3,2,2,3,2,0,2,0,2,3,0,2,0,0,3,2,0,0,0,0,0,0,
|
||||||
|
3,3,2,3,3,3,3,3,3,2,3,3,3,2,3,2,3,3,2,0,2,2,3,2,3,0,0,2,0,0,
|
||||||
|
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,3,3,2,3,2,2,2,0,0,
|
||||||
|
3,3,3,3,3,3,3,3,3,0,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,0,0,3,2,
|
||||||
|
3,3,3,3,3,3,3,3,3,3,3,2,2,0,3,2,0,3,3,3,2,3,2,0,2,2,0,0,0,0,
|
||||||
|
3,3,3,3,3,3,3,3,3,3,3,2,3,2,3,2,3,3,3,3,3,3,3,2,3,2,0,0,0,0,
|
||||||
|
3,3,2,3,3,3,3,3,3,3,3,0,2,0,3,0,2,3,3,2,2,3,0,0,0,2,0,0,0,2,
|
||||||
|
2,3,3,3,2,3,3,2,0,3,3,3,3,3,3,3,3,3,3,2,0,0,3,2,0,0,0,0,0,0,
|
||||||
|
3,3,2,3,3,3,3,3,3,2,3,2,0,2,0,2,2,3,0,2,2,2,0,3,0,2,0,0,0,0,
|
||||||
|
3,3,3,2,3,3,2,3,2,2,3,0,2,0,3,0,0,2,2,2,2,2,0,2,2,0,0,0,0,0,
|
||||||
|
3,3,3,2,3,2,2,3,2,2,2,2,2,2,2,0,2,3,2,2,2,0,0,2,2,3,0,0,0,0,
|
||||||
|
3,3,0,2,2,2,3,2,0,0,0,0,2,2,3,0,2,0,0,2,0,2,0,3,2,0,2,0,0,0,
|
||||||
|
3,3,2,2,3,0,0,2,2,2,2,0,2,2,0,2,2,2,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||||
|
3,2,0,0,2,0,0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||||
|
2,2,0,0,0,2,0,0,3,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
const SequenceModel Iso_8859_15FinnishModel =
|
||||||
|
{
|
||||||
|
Iso_8859_15_CharToOrderMap,
|
||||||
|
FinnishLangModel,
|
||||||
|
30,
|
||||||
|
(float)0.9985378147555799,
|
||||||
|
PR_TRUE,
|
||||||
|
"ISO-8859-15"
|
||||||
|
};
|
||||||
|
|
||||||
|
const SequenceModel Windows_1252FinnishModel =
|
||||||
|
{
|
||||||
|
Windows_1252_CharToOrderMap,
|
||||||
|
FinnishLangModel,
|
||||||
|
30,
|
||||||
|
(float)0.9985378147555799,
|
||||||
|
PR_TRUE,
|
||||||
|
"WINDOWS-1252"
|
||||||
|
};
|
||||||
|
|
||||||
|
const SequenceModel Iso_8859_4FinnishModel =
|
||||||
|
{
|
||||||
|
Iso_8859_4_CharToOrderMap,
|
||||||
|
FinnishLangModel,
|
||||||
|
30,
|
||||||
|
(float)0.9985378147555799,
|
||||||
|
PR_TRUE,
|
||||||
|
"ISO-8859-4"
|
||||||
|
};
|
||||||
|
|
||||||
|
const SequenceModel Iso_8859_13FinnishModel =
|
||||||
|
{
|
||||||
|
Iso_8859_13_CharToOrderMap,
|
||||||
|
FinnishLangModel,
|
||||||
|
30,
|
||||||
|
(float)0.9985378147555799,
|
||||||
|
PR_TRUE,
|
||||||
|
"ISO-8859-13"
|
||||||
|
};
|
||||||
|
|
||||||
|
const SequenceModel Iso_8859_9FinnishModel =
|
||||||
|
{
|
||||||
|
Iso_8859_9_CharToOrderMap,
|
||||||
|
FinnishLangModel,
|
||||||
|
30,
|
||||||
|
(float)0.9985378147555799,
|
||||||
|
PR_TRUE,
|
||||||
|
"ISO-8859-9"
|
||||||
|
};
|
||||||
|
|
||||||
|
const SequenceModel Iso_8859_1FinnishModel =
|
||||||
|
{
|
||||||
|
Iso_8859_1_CharToOrderMap,
|
||||||
|
FinnishLangModel,
|
||||||
|
30,
|
||||||
|
(float)0.9985378147555799,
|
||||||
|
PR_TRUE,
|
||||||
|
"ISO-8859-1"
|
||||||
|
};
|
||||||
@ -143,6 +143,13 @@ nsSBCSGroupProber::nsSBCSGroupProber()
|
|||||||
mProbers[58] = new nsSingleByteCharSetProber(&Mac_CentraleuropePolishModel);
|
mProbers[58] = new nsSingleByteCharSetProber(&Mac_CentraleuropePolishModel);
|
||||||
mProbers[59] = new nsSingleByteCharSetProber(&Ibm852PolishModel);
|
mProbers[59] = new nsSingleByteCharSetProber(&Ibm852PolishModel);
|
||||||
|
|
||||||
|
mProbers[60] = new nsSingleByteCharSetProber(&Iso_8859_1FinnishModel);
|
||||||
|
mProbers[61] = new nsSingleByteCharSetProber(&Iso_8859_4FinnishModel);
|
||||||
|
mProbers[62] = new nsSingleByteCharSetProber(&Iso_8859_9FinnishModel);
|
||||||
|
mProbers[63] = new nsSingleByteCharSetProber(&Iso_8859_13FinnishModel);
|
||||||
|
mProbers[64] = new nsSingleByteCharSetProber(&Iso_8859_15FinnishModel);
|
||||||
|
mProbers[65] = new nsSingleByteCharSetProber(&Windows_1252FinnishModel);
|
||||||
|
|
||||||
Reset();
|
Reset();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -40,7 +40,7 @@
|
|||||||
#define nsSBCSGroupProber_h__
|
#define nsSBCSGroupProber_h__
|
||||||
|
|
||||||
|
|
||||||
#define NUM_OF_SBCS_PROBERS 60
|
#define NUM_OF_SBCS_PROBERS 66
|
||||||
|
|
||||||
class nsCharSetProber;
|
class nsCharSetProber;
|
||||||
class nsSBCSGroupProber: public nsCharSetProber {
|
class nsSBCSGroupProber: public nsCharSetProber {
|
||||||
|
|||||||
@ -204,5 +204,12 @@ extern const SequenceModel Iso_8859_16PolishModel;
|
|||||||
extern const SequenceModel Ibm852PolishModel;
|
extern const SequenceModel Ibm852PolishModel;
|
||||||
extern const SequenceModel Mac_CentraleuropePolishModel;
|
extern const SequenceModel Mac_CentraleuropePolishModel;
|
||||||
|
|
||||||
|
extern const SequenceModel Iso_8859_1FinnishModel;
|
||||||
|
extern const SequenceModel Iso_8859_4FinnishModel;
|
||||||
|
extern const SequenceModel Iso_8859_9FinnishModel;
|
||||||
|
extern const SequenceModel Iso_8859_13FinnishModel;
|
||||||
|
extern const SequenceModel Iso_8859_15FinnishModel;
|
||||||
|
extern const SequenceModel Windows_1252FinnishModel;
|
||||||
|
|
||||||
#endif /* nsSingleByteCharSetProber_h__ */
|
#endif /* nsSingleByteCharSetProber_h__ */
|
||||||
|
|
||||||
|
|||||||
8
test/fi/iso-8859-1.txt
Normal file
8
test/fi/iso-8859-1.txt
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
Termi science fiction on amerikkalaisen tieteislehtien toimittajan Hugo
|
||||||
|
Gernsbackin keksimä. Suomessa termin tieteiskirjallisuus loi tohtori Eino
|
||||||
|
Kauppinen 1950-luvun alkupuolella.
|
||||||
|
Tieteiskirjallisuudelle on laadittu erilaisia määritelmiä. Tieteiskirjallisuuden
|
||||||
|
rajat eivät ole yksiselitteisen selkeät. Tieteiskirjallisuus lähenee monia
|
||||||
|
kirjallisuudenlajeja, erityisesti kauhu- ja fantasiakirjallisuutta. Näillä
|
||||||
|
kolmella lajilla onkin yhteiset juuret 1800-lukua edeltävässä ei-realistisessa
|
||||||
|
kirjallisuudessa.
|
||||||
8
test/fi/utf-8.txt
Normal file
8
test/fi/utf-8.txt
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
Termi science fiction on amerikkalaisen tieteislehtien toimittajan Hugo
|
||||||
|
Gernsbackin keksimä. Suomessa termin tieteiskirjallisuus loi tohtori Eino
|
||||||
|
Kauppinen 1950-luvun alkupuolella.
|
||||||
|
Tieteiskirjallisuudelle on laadittu erilaisia määritelmiä. Tieteiskirjallisuuden
|
||||||
|
rajat eivät ole yksiselitteisen selkeät. Tieteiskirjallisuus lähenee monia
|
||||||
|
kirjallisuudenlajeja, erityisesti kauhu- ja fantasiakirjallisuutta. Näillä
|
||||||
|
kolmella lajilla onkin yhteiset juuret 1800-lukua edeltävässä ei-realistisessa
|
||||||
|
kirjallisuudessa.
|
||||||
Loading…
x
Reference in New Issue
Block a user