From ea2f4dd40f85a97349c830f495262e1ac828249f Mon Sep 17 00:00:00 2001 From: Jehan Date: Tue, 20 Sep 2016 23:28:02 +0200 Subject: [PATCH] LangModels: new support for Latvian / ISO-8859-13. Test text extracted from: https://lv.wikipedia.org/wiki/Vinsents_van_Gogs --- .../BuildLangModelLogs/LangLatvianModel.log | 162 ++++++++++++++++++ script/langs/lv.py | 69 ++++++++ src/CMakeLists.txt | 1 + src/LangModels/LangLatvianModel.cpp | 145 ++++++++++++++++ src/nsSBCSGroupProber.cpp | 2 + src/nsSBCSGroupProber.h | 2 +- src/nsSBCharSetProber.h | 2 + test/lv/iso-8859-13.txt | 6 + test/lv/utf-8.txt | 6 + 9 files changed, 394 insertions(+), 1 deletion(-) create mode 100644 script/BuildLangModelLogs/LangLatvianModel.log create mode 100644 script/langs/lv.py create mode 100644 src/LangModels/LangLatvianModel.cpp create mode 100644 test/lv/iso-8859-13.txt create mode 100644 test/lv/utf-8.txt diff --git a/script/BuildLangModelLogs/LangLatvianModel.log b/script/BuildLangModelLogs/LangLatvianModel.log new file mode 100644 index 0000000..956f1a6 --- /dev/null +++ b/script/BuildLangModelLogs/LangLatvianModel.log @@ -0,0 +1,162 @@ += Logs of language model for Latvian (lv) = + +- Generated by BuildLangModel.py +- Started: 2016-09-20 23:16:39.184579 +- Maximum depth: 5 +- Max number of pages: 100 + +== Parsed pages == + +Zigfrīds Anna Meierovics (revision 2546984) +1. Saeima (revision 2511127) +1. Saeimas deputāti (revision 2303859) +1. Saeimas frakcijas (revision 2429725) +1. Saeimas vēlēšanas (revision 2464758) +1887. gads (revision 2583253) +1919. gada Parīzes miera konference (revision 2482078) +1920 (revision 2401222) +1921 (revision 2473337) +1922 (revision 2486819) +1923 (revision 2544643) +1924 (revision 2539361) +1925 (revision 2486795) +22. augusts (revision 2583254) +31. jūlijs (revision 2559648) +5. februāris (revision 2581966) +ASV (revision 2549746) +Agrārā reforma Latvijā (revision 2473423) +Agudas Izrael (Latvija) (revision 2311143) +Aigars Kalvītis (revision 2545858) +Alberts Kviesis (revision 2546934) +Aleksandrs Bočagovs (revision 2329526) +Aleksandrs Dauge (revision 2546805) +Aleksandrs Jaunbērzs (revision 2462254) +Aleksandrs Kerenskis (revision 2461214) +Aleksandrs Millerāns (revision 2309419) +Aleksandrs Neibergs (revision 2491897) +Alfrēds Birznieks (revision 2567317) +Alfrēds Jēkabs Bērziņš (revision 2564068) +Alfrēds Riekstiņš (politiķis) (revision 2586148) +Andrejs Bērziņš (revision 2564283) +Andrejs Kurcijs (revision 2564338) +Andrejs Petrevics (revision 2460269) +Andrejs Sīmanis (revision 2547079) +Andrejs Veckalns (revision 2564224) +Andrievs Niedra (revision 2546988) +Andris Bērziņš (politiķis, 1951) (revision 2218488) +Andris Šķēle (revision 2457423) +Angļu valoda (revision 2447598) +Ansis Buševics (revision 2578312) +Ansis Rudevics (revision 2414854) +Antante (revision 2581862) +Antons Dzenis (revision 2564295) +Antons Laizāns (revision 2467408) +Antons Rubins (1885) (revision 2465396) +Antons Velkme (revision 2564425) +Ants Pīps (revision 2564383) +Apollo (portāls) (revision 2371202) +Apolonija Laurinoviča (revision 2466232) +Aprīļa pučs (revision 2150686) +Apvienotā Karaliste (revision 2566258) +Aristīds Briāns (revision 2536819) +Arons Nuroks (revision 2337085) +Arturs Alberings (revision 2442531) +Arturs Ozols (inženieris) (revision 2491399) +Artūrs Balfūrs (revision 2309461) +Artūrs Vīgants (revision 2461471) +Artūrs Žers (revision 2564230) +Arveds Bergs (revision 2564118) +Arveds Švābe (revision 2586288) +Arvīds Kalniņš (revision 2545254) +Aspazija (revision 2574081) +Augusts Briedis (revision 2546879) +Augusts Kalniņš (revision 2436647) +Augusts Kirhenšteins (revision 2547109) +Austroungārija (revision 2524307) +Autoritatīvā vadība (revision 2385793) +Balfūra nota (revision 2538973) +Baltijas Antante (revision 2541901) +Baltijas pārkrievošana (revision 2570657) +Bermontiāde (revision 2499160) +Bernards Kublinskis (revision 2441386) +Bezpartijiskais nacionālais centrs (revision 2438819) +Beļģija (revision 2579008) +Brestļitovskas miera līgums (revision 2569020) +Brizules muiža (revision 2584564) +Bruno Kalniņš (revision 2566572) +Brīvības piemineklis (revision 2578595) +Bulduru konference (revision 2193449) +Ceire-Cion (revision 2311779) +Celmiņa 1. Ministru kabinets (revision 2112830) +Delfi (portāls) (revision 2544918) +Demokrātiskais Centrs (revision 2113060) +Demokrātu savienība (revision 2179593) +Diena (laikraksts) (revision 2548854) +Donats Bicāns (revision 2479349) +Dubulti (Jūrmala) (revision 2456811) +Durbe (revision 2381790) +Dāvids Komisārs (revision 2574685) +Džovanni Džoliti (revision 2538055) +Ebreju bloks (revision 2311643) +Ebreju nacionāldemokrātu partija (revision 2312288) +Eduards Grantskalns (revision 2565167) +Eduards Jaunzems (revision 2452579) +Eduards Laimiņš (revision 2449521) +Eduards Radziņš (revision 2564393) + +== End of Parsed pages == + +- Wikipedia parsing ended at: 2016-09-20 23:23:02.592168 + +48 characters appeared 354730 times. + +First 39 characters: +[ 0] Char a: 11.905674738533532 % +[ 1] Char i: 9.398133791898063 % +[ 2] Char s: 8.224565162236066 % +[ 3] Char e: 6.367378005807234 % +[ 4] Char r: 5.854311730048205 % +[ 5] Char t: 5.831477461731457 % +[ 6] Char u: 4.939813379189807 % +[ 7] Char n: 4.463958503650664 % +[ 8] Char ā: 3.950046514250275 % +[ 9] Char l: 3.8031742452005752 % +[10] Char o: 3.6298029487215633 % +[11] Char k: 3.5249344571927943 % +[12] Char m: 3.2740394102556873 % +[13] Char d: 3.17790995968765 % +[14] Char v: 3.0048205677557576 % +[15] Char p: 2.8272207030699406 % +[16] Char j: 2.8167902348264877 % +[17] Char b: 2.0280213119837622 % +[18] Char ī: 1.885659515687988 % +[19] Char g: 1.6147492459053363 % +[20] Char z: 1.5344064499760381 % +[21] Char ē: 1.4594198404420264 % +[22] Char c: 1.2231838299551772 % +[23] Char š: 0.8877174188819666 % +[24] Char ņ: 0.4659882163899304 % +[25] Char f: 0.42031967975643447 % +[26] Char ļ: 0.34702449750514475 % +[27] Char ū: 0.3016378654187692 % +[28] Char h: 0.20071603754968567 % +[29] Char ž: 0.1877484283821498 % +[30] Char ķ: 0.1420798917486539 % +[31] Char ģ: 0.12685704620415528 % +[32] Char č: 0.08287993685338144 % +[33] Char w: 0.03241902291883968 % +[34] Char y: 0.02734474107067347 % +[35] Char x: 0.015786654638739323 % +[36] Char ö: 0.005074281848166211 % +[37] Char é: 0.003946663659684831 % +[38] Char q: 0.0031009500183237955 % + +The first 39 characters have an accumulated ratio of 0.9999013334085078. + +956 sequences found. + +First 512 (typical positive ratio): 0.9904728616367904 +Next 512 (512-1024): 0.001877484283821498 +Rest: -4.683753385137379e-17 + +- Processing end: 2016-09-20 23:23:02.695068 diff --git a/script/langs/lv.py b/script/langs/lv.py new file mode 100644 index 0000000..85d1b39 --- /dev/null +++ b/script/langs/lv.py @@ -0,0 +1,69 @@ +#!/bin/python3 +# -*- coding: utf-8 -*- + +# ##### BEGIN LICENSE BLOCK ##### +# Version: MPL 1.1/GPL 2.0/LGPL 2.1 +# +# The contents of this file are subject to the Mozilla Public License Version +# 1.1 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License +# for the specific language governing rights and limitations under the +# License. +# +# The Original Code is Mozilla Universal charset detector code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 2001 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Jehan +# +# Alternatively, the contents of this file may be used under the terms of +# either the GNU General Public License Version 2 or later (the "GPL"), or +# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +# in which case the provisions of the GPL or the LGPL are applicable instead +# of those above. If you wish to allow use of your version of this file only +# under the terms of either the GPL or the LGPL, and not to allow others to +# use your version of this file under the terms of the MPL, indicate your +# decision by deleting the provisions above and replace them with the notice +# and other provisions required by the GPL or the LGPL. If you do not delete +# the provisions above, a recipient may use your version of this file under +# the terms of any one of the MPL, the GPL or the LGPL. +# +# ##### END LICENSE BLOCK ##### + +import re + +## Mandatory Properties ## + +# The human name for the language, in English. +name = 'Latvian' +# Use 2-letter ISO 639-1 if possible, 3-letter ISO code otherwise, +# or use another catalog as a last resort. +code = 'lv' +# ASCII characters are also used. +use_ascii = True +# The charsets we want to support and create data for. +charsets = ['ISO-8859-13'] + +## Optional Properties ## + +# Alphabet characters. +# If use_ascii=True, there is no need to add any ASCII characters. +# If case_mapping=True, there is no need to add several cases of a same +# character (provided Python algorithms know the right cases). +alphabet = 'āčēģīķļņšūž' +# The start page. Just taking a starred page. +start_pages = ['Zigfrīds Anna Meierovics'] +# give possibility to select another code for the Wikipedia URL. +wikipedia_code = code +# 'a' and 'A' will be considered the same character, and so on. +# This uses Python algorithm to determine upper/lower-case of a given +# character. +case_mapping = True diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 7d0fbcc..9d9e8bc 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -18,6 +18,7 @@ set( LangModels/LangHungarianModel.cpp LangModels/LangHebrewModel.cpp LangModels/LangLithuanianModel.cpp + LangModels/LangLatvianModel.cpp LangModels/LangSpanishModel.cpp LangModels/LangThaiModel.cpp LangModels/LangTurkishModel.cpp diff --git a/src/LangModels/LangLatvianModel.cpp b/src/LangModels/LangLatvianModel.cpp new file mode 100644 index 0000000..fd60884 --- /dev/null +++ b/src/LangModels/LangLatvianModel.cpp @@ -0,0 +1,145 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is Mozilla Communicator client code. + * + * The Initial Developer of the Original Code is + * Netscape Communications Corporation. + * Portions created by the Initial Developer are Copyright (C) 1998 + * the Initial Developer. All Rights Reserved. + * + * Contributor(s): + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. + * + * ***** END LICENSE BLOCK ***** */ + +#include "../nsSBCharSetProber.h" + +/********* Language model for: Latvian *********/ + +/** + * Generated by BuildLangModel.py + * On: 2016-09-20 23:23:02.592930 + **/ + +/* Character Mapping Table: + * ILL: illegal character. + * CTR: control character specific to the charset. + * RET: carriage/return. + * SYM: symbol (punctuation) that does not belong to word. + * NUM: 0 - 9. + * + * Other characters are ordered by probabilities + * (0 is the most common character in the language). + * + * Orders are generic to a language. So the codepoint with order X in + * CHARSET1 maps to the same character as the codepoint with the same + * order X in CHARSET2 for the same language. + * As such, it is possible to get missing order. For instance the + * ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 + * even though they are both used for French. Same for the euro sign. + */ +static const unsigned char Iso_8859_13_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 17, 22, 13, 3, 25, 19, 28, 1, 16, 11, 9, 12, 7, 10, /* 4X */ + 15, 38, 4, 2, 5, 6, 14, 33, 35, 34, 20,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 17, 22, 13, 3, 25, 19, 28, 1, 16, 11, 9, 12, 7, 10, /* 6X */ + 15, 38, 4, 2, 5, 6, 14, 33, 35, 34, 20,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 48,SYM, 49,SYM,SYM,SYM,SYM, 47, /* AX */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 50,SYM, 51,SYM,SYM,SYM,SYM, 47, /* BX */ + 52, 53, 8, 54, 40, 46, 55, 21, 32, 37, 56, 43, 31, 30, 18, 26, /* CX */ + 23, 57, 24, 44, 45, 58, 36,SYM, 59, 41, 60, 27, 39, 61, 29, 42, /* DX */ + 62, 63, 8, 64, 40, 46, 65, 21, 32, 37, 66, 43, 31, 30, 18, 26, /* EX */ + 23, 67, 24, 44, 45, 68, 36,SYM, 69, 41, 70, 27, 39, 71, 29,SYM, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + + +/* Model Table: + * Total sequences: 956 + * First 512 sequences: 0.9904728616367904 + * Next 512 sequences (512-1024): 0.009527138363209666 + * Rest: -4.683753385137379e-17 + * Negative sequences: TODO + */ +static const PRUint8 LatvianLangModel[] = +{ + 2,3,3,3,3,3,3,3,2,3,2,3,3,3,3,3,3,3,2,3,3,2,3,3,3,3,3,0,3,3,2,2,3,2,2,2,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,2,3,3,3,3,3,2,3,3,3,2,3,0,0,2,0,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,2,2,3,3,0,2,2,2,3,2,2,0,0,0,2,2,0,2,2,2, + 3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,2,3,3,2,3,3,3,3,3,0,3,3,2,3,2,2,2,2,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,2,3,3,2,3,2,2,2,2,0,2,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,2,3,3,0,0,2,0,2,2,0,0,0,0, + 3,3,3,2,3,3,2,3,3,3,2,3,3,3,3,3,3,3,2,3,3,2,3,3,3,2,3,0,2,2,2,2,2,0,2,0,0,2,0, + 3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,2,2,3,3,3,2,3,3,3,0,3,0,2,2,2,0,0,3,0,2,0,0,0,2, + 2,2,3,2,3,3,2,3,0,3,0,3,3,3,3,3,3,3,0,2,3,0,3,3,3,3,3,0,0,2,0,2,2,0,0,0,0,0,0, + 3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,2,3,3,2,0,2,2,0,2,2,0,2,0, + 3,2,3,2,3,3,3,3,2,3,2,3,3,3,3,3,3,3,0,3,3,2,3,3,3,3,3,0,2,3,2,3,2,2,2,2,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,2,3,2,3,2,2,2,2,2,2,2,3,3,2,0,3,2,2,0,0,0,0,0,2,0,2,0,0, + 3,3,3,3,0,3,3,3,3,2,3,3,2,2,2,3,3,3,3,2,0,3,2,2,0,2,0,3,0,0,0,2,0,0,2,2,0,2,0, + 3,3,3,3,3,2,3,3,3,2,3,2,3,2,3,2,2,2,3,2,3,3,2,2,2,0,0,2,0,3,0,0,0,2,2,0,0,2,0, + 3,3,3,3,2,2,3,2,3,2,3,2,2,2,2,3,3,2,3,2,2,3,2,0,2,0,0,2,0,0,0,0,0,0,2,0,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,2,3,3,3,2,3,2,3,2,2,2,2,2,0,0,2,0,0,0,0,0,0,0, + 3,3,3,3,2,0,3,3,3,2,3,2,2,2,2,2,0,0,2,2,0,3,2,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,0,3,2,3,3,3,2,2,2,2,2,2,2,3,0,0,3,2,2,0,0,2,3,2,0,0,0,2,0,2,0,2,0,0, + 0,0,3,0,3,3,0,3,0,3,0,3,3,3,3,3,3,3,0,3,3,0,3,3,3,2,2,0,0,2,2,0,2,0,0,0,0,0,0, + 3,3,3,3,3,3,3,2,3,3,3,2,2,2,2,0,0,0,0,2,2,2,0,3,0,2,3,3,2,2,0,0,0,0,2,0,0,2,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,2,3,3,2,3,0,0,2,2,0,0,0,0,2,2,0,0,0,0, + 2,0,3,0,3,3,0,3,0,3,0,3,3,3,3,2,3,2,0,3,3,0,3,3,2,2,3,0,0,2,2,3,0,0,0,0,0,0,0, + 3,3,3,3,2,2,3,2,3,2,3,3,2,2,2,2,0,2,3,0,2,3,2,2,0,0,0,2,3,0,0,2,0,0,2,0,0,0,0, + 3,3,3,3,2,3,3,3,3,3,3,2,2,2,3,2,2,2,3,2,2,2,0,0,2,0,2,2,0,0,3,0,0,0,0,0,0,0,0, + 3,3,2,3,0,0,3,2,3,0,3,0,2,2,2,2,2,2,0,2,0,3,2,3,0,0,0,2,0,0,3,2,0,0,0,0,0,0,0, + 3,3,3,3,3,2,3,2,2,3,3,2,2,0,0,0,0,0,2,2,0,2,0,0,0,2,0,2,0,0,0,0,0,0,0,0,0,0,0, + 3,3,2,2,2,0,3,2,3,2,3,2,2,0,2,2,2,0,2,2,0,2,0,2,2,0,2,2,0,0,2,3,0,0,0,0,0,0,0, + 0,2,3,0,3,3,0,3,0,3,2,3,2,3,3,3,2,0,0,2,3,0,3,2,0,2,0,0,2,2,0,0,0,0,0,0,0,0,0, + 3,3,2,3,2,2,2,3,2,2,3,2,2,2,0,0,2,0,2,0,0,2,0,0,0,0,0,2,2,0,0,0,0,2,2,0,2,0,0, + 3,3,2,3,2,0,3,2,3,2,3,2,2,0,2,0,0,0,2,0,2,2,0,0,2,0,0,2,0,0,2,2,0,0,0,0,0,0,0, + 3,3,2,3,0,2,3,0,2,0,2,0,0,0,0,0,0,0,3,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 2,3,0,3,0,0,2,0,0,0,2,0,0,0,0,0,0,0,2,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,2,2,3,0,0,3,2,2,0,2,2,2,0,0,2,0,0,2,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0, + 2,2,2,2,0,2,0,0,0,2,2,0,2,0,0,2,2,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,3,0,0,0,0,0, + 2,0,2,2,2,0,0,2,0,2,2,0,2,2,0,0,0,0,0,2,0,0,2,0,0,2,0,0,0,0,0,0,0,2,0,0,0,0,0, + 2,2,0,0,0,0,2,0,0,0,0,0,0,0,2,2,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0, + 0,0,2,0,0,2,0,2,0,2,0,0,2,2,0,0,0,0,0,0,0,0,0,0,0,2,0,0,2,0,0,0,0,0,0,0,0,0,0, + 0,0,2,0,2,2,0,2,0,0,2,0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +}; + + +const SequenceModel Iso_8859_13LatvianModel = +{ + Iso_8859_13_CharToOrderMap, + LatvianLangModel, + 39, + (float)0.9904728616367904, + PR_TRUE, + "ISO-8859-13" +}; diff --git a/src/nsSBCSGroupProber.cpp b/src/nsSBCSGroupProber.cpp index 871c46d..0a13c1a 100644 --- a/src/nsSBCSGroupProber.cpp +++ b/src/nsSBCSGroupProber.cpp @@ -113,6 +113,8 @@ nsSBCSGroupProber::nsSBCSGroupProber() mProbers[35] = new nsSingleByteCharSetProber(&Iso_8859_13LithuanianModel); + mProbers[36] = new nsSingleByteCharSetProber(&Iso_8859_13LatvianModel); + Reset(); } diff --git a/src/nsSBCSGroupProber.h b/src/nsSBCSGroupProber.h index 19be64a..95081c3 100644 --- a/src/nsSBCSGroupProber.h +++ b/src/nsSBCSGroupProber.h @@ -40,7 +40,7 @@ #define nsSBCSGroupProber_h__ -#define NUM_OF_SBCS_PROBERS 36 +#define NUM_OF_SBCS_PROBERS 37 class nsCharSetProber; class nsSBCSGroupProber: public nsCharSetProber { diff --git a/src/nsSBCharSetProber.h b/src/nsSBCharSetProber.h index 94bf697..4369a17 100644 --- a/src/nsSBCharSetProber.h +++ b/src/nsSBCharSetProber.h @@ -174,5 +174,7 @@ extern const SequenceModel Windows_1252DanishModel; extern const SequenceModel Iso_8859_13LithuanianModel; +extern const SequenceModel Iso_8859_13LatvianModel; + #endif /* nsSingleByteCharSetProber_h__ */ diff --git a/test/lv/iso-8859-13.txt b/test/lv/iso-8859-13.txt new file mode 100644 index 0000000..bd4691d --- /dev/null +++ b/test/lv/iso-8859-13.txt @@ -0,0 +1,6 @@ +Vinsents Villems van Gogs (nderlandieu: Vincent Willem van Gogh, dzimis 1853. +gada 30. mart, miris 1890. gada 29. jlij) bija nderlandieu gleznotjs, +postimpresionisma prstvis. Kopum van Gogs radja vairk nek 2000 darbu, to +skait 900 gleznu un 1100 zmjumu un skiu. Savus slavenkos darbus vi radja +pdjo divu dzves gadu laik. Tiek uzskatts, ka van Gogs btiski ir ietekmjis +20. gadsimta mkslu, tostarp ekspresionismu un fovismu. diff --git a/test/lv/utf-8.txt b/test/lv/utf-8.txt new file mode 100644 index 0000000..4a4d3c9 --- /dev/null +++ b/test/lv/utf-8.txt @@ -0,0 +1,6 @@ +Vinsents Villems van Gogs (nīderlandiešu: Vincent Willem van Gogh, dzimis 1853. +gada 30. martā, miris 1890. gada 29. jūlijā) bija nīderlandiešu gleznotājs, +postimpresionisma pārstāvis. Kopumā van Gogs radīja vairāk nekā 2000 darbu, to +skaitā 900 gleznu un 1100 zīmējumu un skiču. Savus slavenākos darbus viņš radīja +pēdējo divu dzīves gadu laikā. Tiek uzskatīts, ka van Gogs būtiski ir ietekmējis +20. gadsimta mākslu, tostarp ekspresionismu un fovismu.