From 7cb3dd9dddac4fd886498db9ec91a20f837da33f Mon Sep 17 00:00:00 2001 From: Jehan Date: Tue, 20 Sep 2016 23:09:24 +0200 Subject: [PATCH] LangModels: add support for Lithuanian / ISO-8859-13. Test text extracted from https://lt.wikipedia.org/wiki/Vincent_van_Gogh. --- .../LangLithuanianModel.log | 162 ++++++++++++++++++ script/charsets/iso-8859-13.py | 72 ++++++++ script/langs/lt.py | 70 ++++++++ src/CMakeLists.txt | 1 + src/LangModels/LangLithuanianModel.cpp | 144 ++++++++++++++++ src/nsSBCSGroupProber.cpp | 2 + src/nsSBCSGroupProber.h | 2 +- src/nsSBCharSetProber.h | 2 + test/lt/iso-8859-13.txt | 3 + test/lt/utf-8.txt | 3 + 10 files changed, 460 insertions(+), 1 deletion(-) create mode 100644 script/BuildLangModelLogs/LangLithuanianModel.log create mode 100644 script/charsets/iso-8859-13.py create mode 100644 script/langs/lt.py create mode 100644 src/LangModels/LangLithuanianModel.cpp create mode 100644 test/lt/iso-8859-13.txt create mode 100644 test/lt/utf-8.txt diff --git a/script/BuildLangModelLogs/LangLithuanianModel.log b/script/BuildLangModelLogs/LangLithuanianModel.log new file mode 100644 index 0000000..7e04157 --- /dev/null +++ b/script/BuildLangModelLogs/LangLithuanianModel.log @@ -0,0 +1,162 @@ += Logs of language model for Lithuanian (lt) = + +- Generated by BuildLangModel.py +- Started: 2016-09-20 22:53:23.311784 +- Maximum depth: 5 +- Max number of pages: 100 + +== Parsed pages == + +Karūna (laivas) (revision 5080379) +1650 (revision 4990868) +1654 (revision 4991037) +1664 (revision 4991048) +1665 (revision 4991050) +1668 (revision 4991052) +1669 (revision 4991053) +1672 (revision 4991056) +1676 (revision 4991060) +1718 (revision 4990914) +1909 (revision 4990667) +1928 (revision 4990262) +1932 (revision 4990613) +1956 (revision 4990635) +1980 (revision 4990655) +Baltijos jūra (revision 5052833) +Burinis laivas (revision 4657401) +Flagmanas (laivas) (revision 5005271) +Grimzlė (revision 4487052) +Kalmaras (Švedija) (revision 4978519) +Karo laivas 
(revision 4726931) +Karolis XI (revision 4944621) +Karolis XII (revision 4915230) +Kilis (revision 4325533) +Koordinačių sistema (revision 5033980) +Laivo vėliava (revision 4986001) +Liepos 1 d. (revision 4910200) +Nyderlandai (revision 5080140) +Rugpjūčio 10 (revision 4910281) +Varytuvas (revision 4620792) +Vaza (laivas) (revision 5079282) +XVIII a. (revision 4896219) +XVII a. (revision 4768242) +Švedija (revision 5057665) +Švedų kalba (revision 4687559) +1590 (revision 4990983) +1596 (revision 4990989) +1608 (revision 4991000) +1610 (revision 4991002) +1623 m. (revision 4991015) +1634 m. (revision 4991026) +1643 m. (revision 4990870) +1644 m. (revision 4990872) +1645 m. (revision 4990873) +1646 m. (revision 4990874) +1647 m. (revision 4913295) +1648 m. (revision 4990875) +1649 m. (revision 4990876) +1651 m. (revision 4991035) +1652 m. (revision 4991072) +1653 m. (revision 4991036) +1654 m. (revision 4991037) +1655 m. (revision 4991038) +1662 m. (revision 4991046) +1668 m. (revision 4991052) +1677 m. 
(revision 4991061) +1702 (revision 4990595) +1704 (revision 4990863) +1722 (revision 4990918) +1723 (revision 4990919) +1737 (revision 4990931) +2 tūkstantmetis (revision 4296407) +ATR (revision 5078529) +Abiejų Tautų Respublika (revision 5078529) +Adomas Freitagas (revision 4362991) +Anglų kalba (revision 4911240) +Armėnų kalendorius (revision 4817534) +Bahajų kalendorius (revision 4706296) +Bajorai (revision 5006456) +Berberų kalendorius (revision 4926904) +Birželio 21 (revision 4910142) +Bizantijos kalendorius (revision 4927623) +Budistų kalendorius (revision 4705734) +Dešimtmetis (revision 4296419) +Dominikonai (revision 4921895) +Dominikonų ordinas (revision 4921895) +Džohoro sultonatas (revision 4934526) +Džu Ihai (revision 4991072) +Džu Joulang (revision 4991072) +Emanuelis Vladislovas Tiškevičius Logoiskis (revision 4939239) +Filosofas (revision 5078172) +Gegužės 26 (revision 4910130) +Grafas (titulas) (revision 5008057) +Grigaliaus kalendorius (revision 5000317) +Hebrajų kalendorius (revision 4728592) +Imperatorius Go-Komijas (revision 4907057) +Inocentas X (revision 4905150) +Iraniečių kalendorius (revision 4964854) +Isaac Titsingh (revision 4990745) +Japonija (revision 5035249) +Japonijos imperatorius (revision 4720428) +Japonų kalendorius (revision 4956765) +John Churchill (revision 4903704) +Jonas Kazimieras Vaza (revision 5037754) +Jurgis Kasakauskis (revision 5047829) +Jurgis Kazimieras Ancuta (revision 5059404) +Jurgis Mikalojus Tiškevičius (revision 4939554) + +== End of Parsed pages == + +- Wikipedia parsing ended at: 2016-09-20 22:57:33.076907 + +53 characters appeared 353013 times. 
+ +First 38 characters: +[ 0] Char i: 13.033797622183885 % +[ 1] Char a: 11.1684272250597 % +[ 2] Char s: 8.587502443252799 % +[ 3] Char o: 7.01957151719625 % +[ 4] Char e: 5.52642537243671 % +[ 5] Char r: 5.469770235090492 % +[ 6] Char n: 5.143153368289553 % +[ 7] Char t: 5.1063275290145125 % +[ 8] Char u: 4.270947528844546 % +[ 9] Char k: 3.9621770303076653 % +[10] Char l: 3.905521892961449 % +[11] Char m: 3.360216196004113 % +[12] Char d: 3.037565188817409 % +[13] Char v: 2.727378311846872 % +[14] Char j: 2.447501933356562 % +[15] Char p: 2.3293759719897 % +[16] Char g: 1.942987935288502 % +[17] Char ė: 1.56594799624943 % +[18] Char b: 1.5075932047828267 % +[19] Char y: 1.223750966678281 % +[20] Char ų: 1.1818261650420805 % +[21] Char š: 0.9631373348856841 % +[22] Char ž: 0.8172503562191761 % +[23] Char c: 0.5960120448821998 % +[24] Char č: 0.48015228900918666 % +[25] Char f: 0.42831283833739836 % +[26] Char h: 0.42519680578335645 % +[27] Char z: 0.40111837241121434 % +[28] Char ū: 0.3685416684371397 % +[29] Char ą: 0.352678229980199 % +[30] Char į: 0.29007430321262956 % +[31] Char ę: 0.1481531841603567 % +[32] Char x: 0.08753218719990481 % +[33] Char w: 0.05920461852679646 % +[34] Char ō: 0.018129643950789347 % +[35] Char ö: 0.00878154628866359 % +[36] Char é: 0.007648443541739256 % +[37] Char q: 0.0073651678550081725 % + +The first 38 characters have an accumulated ratio of 0.9997705466937479. + +976 sequences found. 
+ +First 512 (typical positive ratio): 0.9930868640383149 +Next 512 (512-1024): 0.0069131359616851065 +Rest: -2.688821387764051e-17 + +- Processing end: 2016-09-20 22:57:33.185223 diff --git a/script/charsets/iso-8859-13.py b/script/charsets/iso-8859-13.py new file mode 100644 index 0000000..6d19a0a --- /dev/null +++ b/script/charsets/iso-8859-13.py @@ -0,0 +1,72 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +# ##### BEGIN LICENSE BLOCK ##### +# Version: MPL 1.1/GPL 2.0/LGPL 2.1 +# +# The contents of this file are subject to the Mozilla Public License Version +# 1.1 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License +# for the specific language governing rights and limitations under the +# License. +# +# The Original Code is Mozilla Universal charset detector code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 2001 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Jehan +# +# Alternatively, the contents of this file may be used under the terms of +# either the GNU General Public License Version 2 or later (the "GPL"), or +# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +# in which case the provisions of the GPL or the LGPL are applicable instead +# of those above. If you wish to allow use of your version of this file only +# under the terms of either the GPL or the LGPL, and not to allow others to +# use your version of this file under the terms of the MPL, indicate your +# decision by deleting the provisions above and replace them with the notice +# and other provisions required by the GPL or the LGPL.
If you do not delete +# the provisions above, a recipient may use your version of this file under +# the terms of any one of the MPL, the GPL or the LGPL. +# +# ##### END LICENSE BLOCK ##### + +from codepoints import * + +name = 'ISO-8859-13' +aliases = ['csISO885913'] + +language = \ +{ + # Designed to cover Baltic languages. + 'complete': [ 'lv', 'lt' ], + 'incomplete': [] +} + +# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF # +charmap = \ +[ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X + SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X + SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 8X + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 9X + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,LET,SYM,LET,SYM,SYM,SYM,SYM,LET, # AX + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,LET,SYM,LET,SYM,SYM,SYM,SYM,LET, # BX + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # CX + LET,LET,LET,LET,LET,LET,LET,SYM,LET,LET,LET,LET,LET,LET,LET,LET, # DX + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # EX + LET,LET,LET,LET,LET,LET,LET,SYM,LET,LET,LET,LET,LET,LET,LET,SYM, # FX +] diff --git a/script/langs/lt.py b/script/langs/lt.py new file mode 100644 index 0000000..2f030c8 --- /dev/null +++ b/script/langs/lt.py @@ -0,0 +1,70 @@ +#!/bin/python3 +# -*- coding: utf-8 -*- + +# ##### BEGIN LICENSE BLOCK ##### +# Version: MPL 1.1/GPL 2.0/LGPL 2.1 +# +# The contents of this file are subject to the Mozilla Public License Version +# 1.1 (the "License"); you may not use 
this file except in compliance with +# the License. You may obtain a copy of the License at +# http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License +# for the specific language governing rights and limitations under the +# License. +# +# The Original Code is Mozilla Universal charset detector code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 2001 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Jehan +# +# Alternatively, the contents of this file may be used under the terms of +# either the GNU General Public License Version 2 or later (the "GPL"), or +# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +# in which case the provisions of the GPL or the LGPL are applicable instead +# of those above. If you wish to allow use of your version of this file only +# under the terms of either the GPL or the LGPL, and not to allow others to +# use your version of this file under the terms of the MPL, indicate your +# decision by deleting the provisions above and replace them with the notice +# and other provisions required by the GPL or the LGPL. If you do not delete +# the provisions above, a recipient may use your version of this file under +# the terms of any one of the MPL, the GPL or the LGPL. +# +# ##### END LICENSE BLOCK ##### + +import re + +## Mandatory Properties ## + +# The human name for the language, in English. +name = 'Lithuanian' +# Use 2-letter ISO 639-1 if possible, 3-letter ISO code otherwise, +# or use another catalog as a last resort. +code = 'lt' +# ASCII characters are also used. +use_ascii = True +# The charsets we want to support and create data for. +charsets = ['ISO-8859-13'] + +## Optional Properties ## + +# Alphabet characters. 
+# If use_ascii=True, there is no need to add any ASCII characters. +# If case_mapping=True, there is no need to add several cases of the same +# character (provided Python algorithms know the right cases). +alphabet = 'ąčęėįšųūž' +# The start page. Simply the page that was on the front page the day +# I created the data. +start_pages = ['Karūna (laivas)'] +# Makes it possible to select another code for the Wikipedia URL. +wikipedia_code = code +# 'a' and 'A' will be considered the same character, and so on. +# This uses Python's algorithm to determine the upper/lower case of a given +# character. +case_mapping = True diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 4b38fba..7d0fbcc 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -17,6 +17,7 @@ set( LangModels/LangGreekModel.cpp LangModels/LangHungarianModel.cpp LangModels/LangHebrewModel.cpp + LangModels/LangLithuanianModel.cpp LangModels/LangSpanishModel.cpp LangModels/LangThaiModel.cpp LangModels/LangTurkishModel.cpp diff --git a/src/LangModels/LangLithuanianModel.cpp b/src/LangModels/LangLithuanianModel.cpp new file mode 100644 index 0000000..d77d741 --- /dev/null +++ b/src/LangModels/LangLithuanianModel.cpp @@ -0,0 +1,144 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is Mozilla Communicator client code.
+ * + * The Initial Developer of the Original Code is + * Netscape Communications Corporation. + * Portions created by the Initial Developer are Copyright (C) 1998 + * the Initial Developer. All Rights Reserved. + * + * Contributor(s): + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. + * + * ***** END LICENSE BLOCK ***** */ + +#include "../nsSBCharSetProber.h" + +/********* Language model for: Lithuanian *********/ + +/** + * Generated by BuildLangModel.py + * On: 2016-09-20 22:57:33.077635 + **/ + +/* Character Mapping Table: + * ILL: illegal character. + * CTR: control character specific to the charset. + * RET: carriage return or line feed. + * SYM: symbol (punctuation) that does not belong to a word. + * NUM: 0 - 9. + * + * Other characters are ordered by probabilities + * (0 is the most common character in the language). + * + * Orders are generic to a language. So the codepoint with order X in + * CHARSET1 maps to the same character as the codepoint with the same + * order X in CHARSET2 for the same language. + * As such, some orders may be missing in a given charset. For instance the + * ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 + * even though they are both used for French.
Same for the euro sign. + */ +static const unsigned char Iso_8859_13_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 1, 18, 23, 12, 4, 25, 16, 26, 0, 14, 9, 10, 11, 6, 3, /* 4X */ + 15, 37, 5, 2, 7, 8, 13, 33, 32, 19, 27,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 1, 18, 23, 12, 4, 25, 16, 26, 0, 14, 9, 10, 11, 6, 3, /* 6X */ + 15, 37, 5, 2, 7, 8, 13, 33, 32, 19, 27,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 45,SYM, 53,SYM,SYM,SYM,SYM, 54, /* AX */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 45,SYM, 55,SYM,SYM,SYM,SYM, 56, /* BX */ + 29, 30, 39, 46, 38, 41, 31, 47, 24, 36, 57, 17, 58, 59, 44, 50, /* CX */ + 21, 48, 51, 42, 34, 60, 35,SYM, 20, 40, 52, 28, 43, 61, 22, 49, /* DX */ + 29, 30, 39, 46, 38, 41, 31, 47, 24, 36, 62, 17, 63, 64, 44, 50, /* EX */ + 21, 48, 51, 42, 34, 65, 35,SYM, 20, 40, 52, 28, 43, 66, 22,SYM, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + + +/* Model Table: + * Total sequences: 976 + * First 512 sequences: 0.9930868640383149 + * Next 512 sequences (512-1024): 0.0069131359616851065 + * Rest: -2.688821387764051e-17 + * Negative sequences: TODO + */ +static const PRUint8 LithuanianLangModel[] = +{ + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,3,3,3,3,3,3,3,3,3,3,0,2,3,2,2,2,0,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,0,3,3,3,3,3,3,3,0,0,0,0,2,2,0,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,2,0,3,3,2,3,2,3,3,2,3,0,2,2,2,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,0,3,3,3,2,3,3,3,0,0,0,0,2,3,0,0,0,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,0,3,3,3,3,3,2,3,0,0,2,0,2,3,0,0,0,2, + 
3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,2,2,2,3,3,3,3,2,2,2,2,2,2, + 3,3,3,3,3,3,3,3,3,3,2,2,3,3,2,2,3,3,3,3,3,2,3,3,3,3,3,3,2,3,3,3,2,2,2,0,2,0, + 3,3,3,3,3,3,3,3,3,3,3,3,0,3,2,2,2,3,3,3,3,2,0,2,0,2,3,2,3,3,3,3,0,2,2,2,2,0, + 3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,0,3,2,0,3,3,3,3,3,2,3,0,0,0,0,0,2,0,0,0,0, + 3,3,3,3,3,3,3,3,3,2,3,3,3,3,2,2,2,3,2,3,3,3,0,3,2,2,3,2,3,3,2,3,0,2,2,0,2,0, + 3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,3,3,2,3,3,3,3,0,2,0,2,2,0, + 3,3,3,3,3,2,2,3,3,2,2,3,2,2,2,3,2,3,3,3,3,2,3,2,0,2,0,2,3,3,0,3,0,2,2,2,2,0, + 3,3,3,3,3,3,2,2,3,3,2,3,2,3,2,2,2,3,2,3,3,2,3,2,0,2,2,2,2,3,2,3,0,2,2,2,2,2, + 3,3,3,3,3,2,2,2,3,2,3,0,2,0,2,2,0,3,0,3,3,2,0,2,0,0,0,3,2,3,0,3,0,0,0,0,0,0, + 3,3,2,3,3,2,2,2,3,2,0,0,0,0,0,2,2,3,0,2,3,0,0,0,0,0,0,0,3,3,3,3,0,0,2,2,0,0, + 3,3,3,3,3,3,2,3,3,3,3,2,2,3,3,2,2,3,0,3,2,3,2,2,2,2,3,0,2,2,2,2,0,0,2,0,2,0, + 3,3,3,3,3,3,3,3,3,2,3,3,3,3,2,3,2,3,2,3,3,2,2,2,0,0,3,3,3,3,2,2,0,2,2,2,0,0, + 2,0,3,0,0,3,3,3,2,3,3,3,3,3,3,0,3,0,2,0,0,2,2,2,2,0,0,2,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,3,2,3,3,2,3,0,3,2,2,2,0,3,2,2,3,2,2,2,0,0,2,2,3,3,2,3,0,2,2,2,0,0, + 2,3,3,2,2,3,3,3,2,3,3,3,3,3,3,3,3,0,3,2,0,2,2,2,3,2,0,3,2,0,0,0,0,0,2,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,2,2,3,0,3,2,3,2,3,2,2,2,2,3,2,0,0,2,2,2,2,0,0,2,0,0,0, + 3,3,3,3,3,2,3,3,3,2,2,3,3,3,2,2,2,3,2,3,2,2,0,0,0,2,0,0,2,2,2,2,0,0,2,0,0,0, + 3,3,2,3,3,2,0,2,3,3,3,2,2,2,0,0,2,2,2,2,0,0,0,2,0,2,3,2,3,2,0,0,0,0,0,0,2,2, + 3,3,0,2,3,0,0,0,2,2,0,0,2,0,0,2,0,2,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0,2,0,0,0, + 3,3,2,3,3,3,0,2,3,2,3,2,0,0,2,0,2,2,2,2,2,0,0,2,0,2,0,0,2,2,0,0,0,0,0,2,0,0, + 3,3,2,3,3,3,3,3,3,2,2,3,2,0,2,0,0,0,2,2,2,0,0,0,0,2,0,0,2,2,0,0,0,2,2,0,0,0, + 3,3,2,3,3,2,2,2,3,2,3,3,3,2,0,2,2,2,2,3,3,0,0,2,0,0,2,2,2,2,0,2,0,2,2,0,2,0, + 2,0,3,0,0,3,3,3,0,3,2,3,3,2,0,2,3,0,2,0,0,2,2,0,3,0,0,3,0,0,0,0,0,0,0,0,0,0, + 0,0,3,0,0,2,0,0,0,2,2,2,0,2,3,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 
0,2,3,0,0,3,0,3,0,3,3,2,2,3,2,3,3,2,0,0,0,2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,2,0,0,2,2,0,2,2,0,0,0,0,3,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0, + 3,3,2,2,3,2,2,0,2,0,2,0,0,0,0,0,2,0,0,2,0,0,0,0,0,0,2,0,0,0,0,0,0,2,0,0,0,0, + 2,0,2,0,2,0,2,0,0,2,0,2,2,0,0,0,2,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,2,0,0,2,2,2,0,2,2,2,2,0,0,0,2,0,0,0,0,0,0,2,0,2,0,2,0,0,0,0,0,0,0,0,0,0, + 0,0,2,0,0,2,2,0,0,0,0,2,2,0,0,2,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,2,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +}; + + +const SequenceModel Iso_8859_13LithuanianModel = +{ + Iso_8859_13_CharToOrderMap, + LithuanianLangModel, + 38, + (float)0.9930868640383149, + PR_TRUE, + "ISO-8859-13" +}; \ No newline at end of file diff --git a/src/nsSBCSGroupProber.cpp b/src/nsSBCSGroupProber.cpp index b1a60cc..871c46d 100644 --- a/src/nsSBCSGroupProber.cpp +++ b/src/nsSBCSGroupProber.cpp @@ -111,6 +111,8 @@ nsSBCSGroupProber::nsSBCSGroupProber() mProbers[33] = new nsSingleByteCharSetProber(&Iso_8859_1DanishModel); mProbers[34] = new nsSingleByteCharSetProber(&Windows_1252DanishModel); + mProbers[35] = new nsSingleByteCharSetProber(&Iso_8859_13LithuanianModel); + Reset(); } diff --git a/src/nsSBCSGroupProber.h b/src/nsSBCSGroupProber.h index c1ea4a1..19be64a 100644 --- a/src/nsSBCSGroupProber.h +++ b/src/nsSBCSGroupProber.h @@ -40,7 +40,7 @@ #define nsSBCSGroupProber_h__ -#define NUM_OF_SBCS_PROBERS 35 +#define NUM_OF_SBCS_PROBERS 36 class nsCharSetProber; class nsSBCSGroupProber: public nsCharSetProber { diff --git a/src/nsSBCharSetProber.h b/src/nsSBCharSetProber.h index 211846e..94bf697 100644 --- a/src/nsSBCharSetProber.h +++ b/src/nsSBCharSetProber.h @@ -172,5 +172,7 @@ extern const SequenceModel Iso_8859_15DanishModel; extern const SequenceModel Iso_8859_1DanishModel; extern const SequenceModel Windows_1252DanishModel; +extern const SequenceModel 
Iso_8859_13LithuanianModel; + #endif /* nsSingleByteCharSetProber_h__ */ diff --git a/test/lt/iso-8859-13.txt b/test/lt/iso-8859-13.txt new file mode 100644 index 0000000..1e3b02e --- /dev/null +++ b/test/lt/iso-8859-13.txt @@ -0,0 +1,3 @@ +Vincentas van Gogas (ol. Vincent van Gogh, 1853 m. kovo 30 d. Grot Zunderte, +Nyderlandai - 1890 m. liepos 29 d. Overe prie Uazos, Pranczija) - oland +tapytojas ir grafikas, postimpresionistas. diff --git a/test/lt/utf-8.txt b/test/lt/utf-8.txt new file mode 100644 index 0000000..de425f9 --- /dev/null +++ b/test/lt/utf-8.txt @@ -0,0 +1,3 @@ +Vincentas van Gogas (ol. Vincent van Gogh, 1853 m. kovo 30 d. Grot Zunderte, +Nyderlandai – 1890 m. liepos 29 d. Overe prie Uazos, Prancūzija) – olandų +tapytojas ir grafikas, postimpresionistas.