diff --git a/README.md b/README.md index 288e0b3..28c085a 100644 --- a/README.md +++ b/README.md @@ -18,6 +18,7 @@ uchardet started as a C language binding of the original C++ implementation of t * ISO-8859-6 * WINDOWS-1256 * Bulgarian + * UTF-8 * ISO-8859-5 * WINDOWS-1251 * Chinese diff --git a/script/BuildLangModelLogs/LangBulgarianModel.log b/script/BuildLangModelLogs/LangBulgarianModel.log new file mode 100644 index 0000000..452dda4 --- /dev/null +++ b/script/BuildLangModelLogs/LangBulgarianModel.log @@ -0,0 +1,263 @@ += Logs of language model for Bulgarian (bg) = + +- Generated by BuildLangModel.py +- Started: 2022-12-17 18:13:39.705509 +- Maximum depth: 4 +- Max number of pages: 200 + +== Parsed pages == + +Амурски_леопард (revision 11479353) +Пектусан (revision 11051736) +Тъкан (revision 11413541) +Растителноядно животно (revision 9401552) +Козмин (залив) (revision 10801896) +Око (revision 11307426) +Руска кухня (revision 8912349) +Обединена система за таксономична информация (revision 10952587) +Лисица (revision 11570875) +Сихоте Алин (revision 10913633) +Шриланкски леопард (revision 11478652) +Фазан (revision 11554738) +Северна Корея (revision 11596651) +Протисти (revision 11599945) +Калдера (revision 10605482) +Месо (revision 11396435) +Мезозойска ера (revision 11406482) +Тамилски (revision 11536357) +Птици (revision 11599947) +Паразитизъм (revision 10905879) +Череп (revision 11382448) +Домати (revision 11568692) +Гъби (revision 11575731) +Връх (revision 11560584) +Хабаровски край (revision 11326255) +Слъзна жлеза (revision 9848117) +Клетка (биология) (revision 11599652) +Чанбайшан (revision 11436397) +Усури (revision 11485897) +Нормативен контрол (revision 11218813) +Phasianus (revision 11554738) +Перм (период) (revision 10376629) +Въздух (revision 11586473) +Растения (revision 11599967) +Лов (revision 11549760) +Култ към личността (revision 11309525) +Биология (revision 11597684) +Азиатска късоноктеста видра (revision 11530864) +Ротатории (revision 10164408) +Торонто (revision 11500811) +Епител (revision 11544065) +Животни (revision 11599450) +Animal Diversity Web (revision 11280365) +Главоноги (revision 11321675) +Новозеландски морски лъв (revision 11531150) +Общомедия (revision 11583644) +Яйцеклетка (revision 11574210) +Риба (revision 11602135) +Ялуцзян (revision 11616897) +Водорасли (revision 11589165) +Тигрова генета (revision 11532904) +Карбон (revision 11440434) +Енотовидно куче (revision 11530902) +Пинин (revision 10953442) +Морска видра (revision 11022765) +Коткови (revision 11296822) +Сметана (revision 10602821) +Просо (revision 10908234) +Корейски полуостров (revision 11532552) +Уикивидове (revision 9824200) +Източна Азия (revision 10984512) +Злато (revision 11601280) +Лист (revision 11417909) +Уикиданни (revision 10288984) +Персийски леопард (revision 10731068) +Vormela (revision 11531190) +Африкански леопард (revision 10671790) +Далечен изток (revision 10098481) +Индийски леопард (revision 10949302) +Червен списък на световнозастрашените видове (revision 10923987) +Елда (revision 11398540) +Латински език (revision 11610275) +Николай Пржевалски (revision 11378214) +Корейски език (revision 11585784) +Цитоплазма (revision 10815311) +Силур (revision 10913196) +Дърво (revision 11599411) +Амур (revision 11232524) +Оцет (revision 10974969) +Индийски солонгой (revision 11530605) +Креда (revision 11194691) +BBC News (revision 11556539) +Ендодерма (revision 10159731) +Система на Маккюн-Райшауер (revision 10199499) +Вол (revision 11486361) +Тумънцзян (revision 11405669) +Тайга (revision 11596057) +Паренхим (revision 9238563) +Бикин (река) (revision 10416126) +Национален център за биотехнологична информация на САЩ (revision 10901368) +Кокошоподобни (revision 11377806) +Телевизор (revision 11587645) +Влажност (revision 11587428) +Анатолийски леопард (revision 10986842) +Синайски леопард (revision 10737955) +Акомодация (revision 9073034) +Бульон (revision 9265335) +Мляко (revision 11599803) +Хранителна верига (revision 9990974) +Китайски език (revision 11315056) +Мъжки (revision 11120791) +Камбрий (revision 10117802) +Зигота (revision 10544543) +Листо (revision 11417909) +Кромид лук (revision 10698110) +Хрян (revision 11494398) +Ектодерма (revision 10806725) +Храст (revision 11500525) +Геология (revision 11598573) +Дългоопашат скункс (revision 11531277) +Лигавица (revision 10894252) +Горчица (revision 8753833) +Подковонос на Мехели (revision 10377709) +Бозайници (revision 11597688) +Кванмьонсон-1 (revision 11507924) +Азиатска палмова цивета (revision 11531312) +Хранителни вещества (revision 11590475) +Дмитрий Орлов (revision 10880810) +Въглероден диоксид (revision 10769242) +Ракообразни (revision 11349934) +Испански език (revision 11599556) +Уикиречник (revision 9194836) +Уретра (revision 11600909) +ISO 639 (revision 10477132) +Биологична система (revision 10872761) +Палеозой (revision 10972967) +Розетка (revision 11250355) +Ихтиозаври (revision 11141622) +Хабаровск (revision 11427125) +Хавайски тюлен монах (revision 11531012) +Кодкод (revision 11480480) +Южна Европа (revision 10119488) +Вода (revision 11606762) +URL (revision 11283400) +Ивичест зурлест скункс (revision 11476684) +Храносмилателна система (revision 11298271) +Триас (revision 10657489) +ООН (revision 11599875) +Alexa Internet (revision 11547819) +Псориазис (revision 11607604) +Партеногенеза (revision 11201489) +Картоф (revision 11611083) +Коприва (revision 11416720) +Воден плъх (revision 11351201) +Прилепи (revision 11566273) +Odobenidae (revision 11032101) +Гондвана (revision 11074999) +Домашна муха (revision 11484479) +Трахея (revision 11408131) +Безполово размножаване (revision 10972108) +Карибски регион (revision 10503045) +Географска координатна система (revision 10929840) +Entoprocta (revision 10346607) +Бадем (revision 11339812) +Удил (revision 10422385) +Южноафриканска морска котка (revision 11476346) +Библиотечно дело (revision 11477309) +Организъм (revision 11079762) +Животно (revision 11599450) +Донг Фанг Хонг I (revision 11537199) +Палеоген (revision 9895031) +Триптофан (revision 11566722) +Боливия (revision 11584461) +Суспензия (revision 11306702) +Chlorophyceae (revision 11097610) +Тетраподоморфи (revision 10796558) +Wayback Machine (revision 11423066) +Mustelidae (revision 10988654) +Епителна тъкан (revision 11544065) +Чернолапа котка (revision 11545586) +Уралски федерален окръг (revision 11412555) +Северна Африка (revision 11617946) +Корейски архипелаг (revision 11436736) +Златна палмова цивета (revision 11530618) +Макроелемент (revision 11151625) +Международен съюз за защита на природата (revision 11546091) +Пролетен горицвет (revision 11560104) +Име (revision 11387941) +Neophoca (revision 11552636) +Алвеола (revision 10429710) +Лападови (revision 9926969) + +== End of Parsed pages == + +- Wikipedia parsing ended at: 2022-12-17 18:16:58.793948 + +59 characters appeared 866927 times. + +Most Frequent characters: +[ 0] Char а: 11.195290952986813 % +[ 1] Char и: 9.90394808328729 % +[ 2] Char о: 8.887830232533997 % +[ 3] Char е: 8.05834862681633 % +[ 4] Char т: 7.773895610587743 % +[ 5] Char н: 7.376976377480457 % +[ 6] Char р: 5.300561638984598 % +[ 7] Char с: 4.85496472021289 % +[ 8] Char в: 4.23022930419747 % +[ 9] Char л: 3.41978044287466 % +[10] Char к: 3.3481481139703804 % +[11] Char д: 2.8882477994110234 % +[12] Char п: 2.700227354783044 % +[13] Char з: 2.255207185841484 % +[14] Char м: 2.1408953695063135 % +[15] Char я: 1.6356625182973883 % +[16] Char ъ: 1.4382987264210252 % +[17] Char г: 1.3491332026802718 % +[18] Char ч: 1.2814227726209935 % +[19] Char у: 1.267234726799373 % +[20] Char б: 1.132852016375081 % +[21] Char ж: 0.7340871838113243 % +[22] Char ц: 0.6595711057563094 % +[23] Char х: 0.5456053393192275 % +[24] Char й: 0.5091547500539261 % +[25] Char a: 0.437522421149647 % +[26] Char ф: 0.37927068830478233 % +[27] Char щ: 0.3754641394258109 % +[28] Char i: 0.342589399107422 % +[29] Char e: 0.3205575555957999 % +[30] Char o: 0.3129444578378571 % +[31] Char ш: 0.27326406952373156 % +[32] Char r: 0.25757647414372836 % +[33] Char n: 0.24073537910343085 % +[34] Char s: 0.236006030496224 % +[35] Char t: 0.23069993205887002 % +[36] Char c: 0.2030159402118056 % +[37] Char l: 0.19990149112901087 % +[38] Char m: 0.16322020193165054 % +[39] Char u: 0.1605671527129735 % +[40] Char ю: 0.1558378041057667 % +[41] Char p: 0.12861521212282004 % +[42] Char d: 0.12065606446678902 % +[43] Char h: 0.11258156684472856 % +[44] Char b: 0.07832262693398637 % +[45] Char y: 0.07059417921001422 % +[46] Char g: 0.07047882924398478 % +[47] Char k: 0.053637734203687275 % +[48] Char f: 0.052368884577363495 % +[49] Char v: 0.04060318804236112 % +[50] Char w: 0.024108142900151914 % +[51] Char x: 0.022493243375739824 % +[52] Char ь: 0.01799459470059186 % + +The first 53 characters have an accumulated ratio of 0.9996920155907014. +The first 5 characters have an accumulated ratio of 0.4581931350621217. +All characters whose order is over 29 have an accumulated ratio of 0.03226223199877268. + +1236 sequences found. + +First 720 (typical positive ratio): 0.9950164618425456 +Next 201 (921-720): 0.003986830525963603 +Rest: 0.0009967076314908452 + +- Processing end: 2022-12-17 18:16:58.922580 diff --git a/script/charsets/iso-8859-5.py b/script/charsets/iso-8859-5.py new file mode 100644 index 0000000..953a437 --- /dev/null +++ b/script/charsets/iso-8859-5.py @@ -0,0 +1,72 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +# ##### BEGIN LICENSE BLOCK ##### +# Version: MPL 1.1/GPL 2.0/LGPL 2.1 +# +# The contents of this file are subject to the Mozilla Public License Version +# 1.1 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License +# for the specific language governing rights and limitations under the +# License. +# +# The Original Code is Mozilla Universal charset detector code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 2001 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Jehan +# +# Alternatively, the contents of this file may be used under the terms of +# either the GNU General Public License Version 2 or later (the "GPL"), or +# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +# in which case the provisions of the GPL or the LGPL are applicable instead +# of those above. If you wish to allow use of your version of this file only +# under the terms of either the GPL or the LGPL, and not to allow others to +# use your version of this file under the terms of the MPL, indicate your +# decision by deleting the provisions above and replace them with the notice +# and other provisions required by the GPL or the LGPL. If you do not delete +# the provisions above, a recipient may use your version of this file under +# the terms of any one of the MPL, the GPL or the LGPL. +# +# ##### END LICENSE BLOCK ##### + +from codepoints import * + +name = 'ISO-8859-5' +aliases = ['ISO_8859-5:1988', 'ISO_8859-5', 'iso-ir-144', + 'cyrillic', 'csISOLatinCyrillic'] + +language = \ +{ + 'complete': [ 'bg', 'be', 'ru', 'sr', 'mk' ], + 'incomplete': [ 'uk' ] +} + +# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF # +charmap = \ +[ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X + SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X + SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 8X + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 9X + SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,LET,LET, # AX + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # BX + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # CX + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # DX + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # EX + SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,LET,LET, # FX +] diff --git a/script/charsets/windows-1251.py b/script/charsets/windows-1251.py new file mode 100644 index 0000000..8ab389f --- /dev/null +++ b/script/charsets/windows-1251.py @@ -0,0 +1,75 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +# ##### BEGIN LICENSE BLOCK ##### +# Version: MPL 1.1/GPL 2.0/LGPL 2.1 +# +# The contents of this file are subject to the Mozilla Public License Version +# 1.1 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License +# for the specific language governing rights and limitations under the +# License. +# +# The Original Code is Mozilla Universal charset detector code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 2001 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Jehan +# +# Alternatively, the contents of this file may be used under the terms of +# either the GNU General Public License Version 2 or later (the "GPL"), or +# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +# in which case the provisions of the GPL or the LGPL are applicable instead +# of those above. If you wish to allow use of your version of this file only +# under the terms of either the GPL or the LGPL, and not to allow others to +# use your version of this file under the terms of the MPL, indicate your +# decision by deleting the provisions above and replace them with the notice +# and other provisions required by the GPL or the LGPL. If you do not delete +# the provisions above, a recipient may use your version of this file under +# the terms of any one of the MPL, the GPL or the LGPL. +# +# ##### END LICENSE BLOCK ##### + +from codepoints import * + +name = 'WINDOWS-1251' +aliases = ['CP-1251', 'cswindows1251'] + +language = \ +{ + # Windows-1251 is a popular 8-bit character encoding, designed to cover + # languages that use the Cyrillic script such as Russian, Bulgarian, Serbian + # Cyrillic and other languages. It is the most widely used for encoding the + # Bulgarian, Serbian and Macedonian languages. + 'complete': [ 'ru', 'uk', 'be', 'bg', 'sr', 'mk' ], + 'incomplete': [] +} + +# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF # +charmap = \ +[ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X + SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X + SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X + LET,LET,SYM,LET,SYM,SYM,SYM,SYM,SYM,SYM,LET,SYM,LET,LET,LET,LET, # 8X + LET,SYM,SYM,SYM,SYM,SYM,SYM,SYM,ILL,SYM,LET,SYM,LET,LET,LET,LET, # 9X + SYM,LET,LET,LET,SYM,LET,SYM,SYM,LET,SYM,LET,SYM,SYM,SYM,SYM,LET, # AX + SYM,SYM,LET,LET,LET,SYM,SYM,SYM,LET,SYM,LET,SYM,LET,LET,LET,LET, # BX + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # CX + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # DX + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # EX + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # FX +] diff --git a/script/langs/bg.py b/script/langs/bg.py new file mode 100644 index 0000000..bce517e --- /dev/null +++ b/script/langs/bg.py @@ -0,0 +1,58 @@ +#!/bin/python3 +# -*- coding: utf-8 -*- + +# ##### BEGIN LICENSE BLOCK ##### +# Version: MPL 1.1/GPL 2.0/LGPL 2.1 +# +# The contents of this file are subject to the Mozilla Public License Version +# 1.1 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License +# for the specific language governing rights and limitations under the +# License. +# +# The Original Code is Mozilla Universal charset detector code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 2001 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Jehan +# +# Alternatively, the contents of this file may be used under the terms of +# either the GNU General Public License Version 2 or later (the "GPL"), or +# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +# in which case the provisions of the GPL or the LGPL are applicable instead +# of those above. If you wish to allow use of your version of this file only +# under the terms of either the GPL or the LGPL, and not to allow others to +# use your version of this file under the terms of the MPL, indicate your +# decision by deleting the provisions above and replace them with the notice +# and other provisions required by the GPL or the LGPL. If you do not delete +# the provisions above, a recipient may use your version of this file under +# the terms of any one of the MPL, the GPL or the LGPL. +# +# ##### END LICENSE BLOCK ##### + +import re + +## Mandatory Properties ## + +name = 'Bulgarian' +code = 'bg' +use_ascii = False +charsets = [ 'WINDOWS-1251', 'ISO-8859-5' ] + +## Optional Properties ## + +# Alphabet characters. +alphabet = 'абвгдежзийклмнопрстуфхцчшщъьюя' +# A starred page which was rewarded on the main page when I created +# the data. +start_pages = ['Амурски_леопард'] +wikipedia_code = code +case_mapping = True diff --git a/src/LangModels/LangBulgarianModel.cpp b/src/LangModels/LangBulgarianModel.cpp index 1120054..32bba1c 100644 --- a/src/LangModels/LangBulgarianModel.cpp +++ b/src/LangModels/LangBulgarianModel.cpp @@ -36,214 +36,244 @@ * ***** END LICENSE BLOCK ***** */ #include "../nsSBCharSetProber.h" -/**************************************************************** -CTR: Control characters that usually does not exist in any text -RET: Carriage/Return -SYM: symbol (punctuation) that does not belong to word -NUM: 0 - 9 +#include "../nsLanguageDetector.h" -*****************************************************************/ +/********* Language model for: Bulgarian *********/ -//Character Mapping Table: -//this talbe is modified base on win1251BulgarianCharToOrderMap, so -//only number <64 is sure valid +/** + * Generated by BuildLangModel.py + * On: 2022-12-17 18:16:58.794613 + **/ -static const unsigned char Latin5_BulgarianCharToOrderMap[] = +/* Character Mapping Table: + * ILL: illegal character. + * CTR: control character specific to the charset. + * RET: carriage/return. + * SYM: symbol (punctuation) that does not belong to word. + * NUM: 0 - 9. + * + * Other characters are ordered by probabilities + * (0 is the most common character in the language). + * + * Orders are generic to a language. So the codepoint with order X in + * CHARSET1 maps to the same character as the codepoint with the same + * order X in CHARSET2 for the same language. + * As such, it is possible to get missing order. For instance the + * ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 + * even though they are both used for French. Same for the euro sign. + */ +static const unsigned char Windows_1251_CharToOrderMap[] = { -CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, //00 -CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, //10 -SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, //20 -NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, //30 -SYM, 77, 90, 99,100, 72,109,107,101, 79,185, 81,102, 76, 94, 82, //40 -110,186,108, 91, 74,119, 84, 96,111,187,115,SYM,SYM,SYM,SYM,SYM, //50 -SYM, 65, 69, 70, 66, 63, 68,112,103, 92,194,104, 95, 86, 87, 71, //60 -116,195, 85, 93, 97,113,196,197,198,199,200,SYM,SYM,SYM,SYM,SYM, //70 -194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209, //80 -210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225, //90 - 81,226,227,228,229,230,105,231,232,233,234,235,236, 45,237,238, //a0 - 31, 32, 35, 43, 37, 44, 55, 47, 40, 59, 33, 46, 38, 36, 41, 30, //b0 - 39, 28, 34, 51, 48, 49, 53, 50, 54, 57, 61,239, 67,240, 60, 56, //c0 - 1, 18, 9, 20, 11, 3, 23, 15, 2, 26, 12, 10, 14, 6, 4, 13, //d0 - 7, 8, 5, 19, 29, 25, 22, 21, 27, 24, 17, 75, 52,241, 42, 16, //e0 - 62,242,243,244, 58,245, 98,246,247,248,249,250,251, 91,NUM,SYM, //f0 + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 25, 44, 36, 42, 29, 48, 46, 43, 28, 54, 47, 37, 38, 33, 30, /* 4X */ + 41, 55, 32, 34, 35, 39, 49, 50, 51, 45, 53,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 25, 44, 36, 42, 29, 48, 46, 43, 28, 54, 47, 37, 38, 33, 30, /* 6X */ + 41, 55, 32, 34, 35, 39, 49, 50, 51, 45, 53,SYM,SYM,SYM,SYM,CTR, /* 7X */ + 59, 60,SYM, 61,SYM,SYM,SYM,SYM,SYM,SYM, 62,SYM, 63, 64, 65, 66, /* 8X */ + 67,SYM,SYM,SYM,SYM,SYM,SYM,SYM,ILL,SYM, 68,SYM, 69, 70, 71, 72, /* 9X */ + SYM, 73, 74, 75,SYM, 76,SYM,SYM, 77,SYM, 78,SYM,SYM,SYM,SYM, 79, /* AX */ + SYM,SYM, 57, 57, 80,SYM,SYM,SYM, 81,SYM, 82,SYM, 83, 84, 85, 86, /* BX */ + 0, 20, 8, 17, 11, 3, 21, 13, 1, 24, 10, 9, 14, 5, 2, 12, /* CX */ + 6, 7, 4, 19, 26, 23, 22, 18, 31, 27, 16, 56, 52, 58, 40, 15, /* DX */ + 0, 20, 8, 17, 11, 3, 21, 13, 1, 24, 10, 9, 14, 5, 2, 12, /* EX */ + 6, 7, 4, 19, 26, 23, 22, 18, 31, 27, 16, 56, 52, 58, 40, 15, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Iso_8859_5_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 25, 44, 36, 42, 29, 48, 46, 43, 28, 54, 47, 37, 38, 33, 30, /* 4X */ + 41, 55, 32, 34, 35, 39, 49, 50, 51, 45, 53,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 25, 44, 36, 42, 29, 48, 46, 43, 28, 54, 47, 37, 38, 33, 30, /* 6X */ + 41, 55, 32, 34, 35, 39, 49, 50, 51, 45, 53,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM, 87, 88, 89, 90, 91, 57, 92, 93, 94, 95, 96, 97,SYM, 98, 99, /* AX */ + 0, 20, 8, 17, 11, 3, 21, 13, 1, 24, 10, 9, 14, 5, 2, 12, /* BX */ + 6, 7, 4, 19, 26, 23, 22, 18, 31, 27, 16, 56, 52, 58, 40, 15, /* CX */ + 0, 20, 8, 17, 11, 3, 21, 13, 1, 24, 10, 9, 14, 5, 2, 12, /* DX */ + 6, 7, 4, 19, 26, 23, 22, 18, 31, 27, 16, 56, 52, 58, 40, 15, /* EX */ + SYM,100,101,102,103,104, 57,105,106,107,108,109,110,SYM,111,112, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const int Unicode_Char_size = 106; +static const unsigned int Unicode_CharOrder[] = +{ + 65, 25, 66, 44, 67, 36, 68, 42, 69, 29, 70, 48, 71, 46, 72, 43, + 73, 28, 75, 47, 76, 37, 77, 38, 78, 33, 79, 30, 80, 41, 82, 32, + 83, 34, 84, 35, 85, 39, 86, 49, 87, 50, 88, 51, 89, 45, 97, 25, + 98, 44, 99, 36, 100, 42, 101, 29, 102, 48, 103, 46, 104, 43, 105, 28, + 107, 47, 108, 37, 109, 38, 110, 33, 111, 30, 112, 41, 114, 32, 115, 34, + 116, 35, 117, 39, 118, 49, 119, 50, 120, 51, 121, 45, 1040, 0,1041, 20, + 1042, 8, 1043, 17, 1044, 11, 1045, 3, 1046, 21, 1047, 13, 1048, 1,1049, 24, + 1050, 10, 1051, 9, 1052, 14, 1053, 5, 1054, 2, 1055, 12, 1056, 6,1057, 7, + 1058, 4, 1059, 19, 1060, 26, 1061, 23, 1062, 22, 1063, 18, 1064, 31,1065, 27, + 1066, 16, 1068, 52, 1070, 40, 1071, 15, 1072, 0, 1073, 20, 1074, 8,1075, 17, + 1076, 11, 1077, 3, 1078, 21, 1079, 13, 1080, 1, 1081, 24, 1082, 10,1083, 9, + 1084, 14, 1085, 5, 1086, 2, 1087, 12, 1088, 6, 1089, 7, 1090, 4,1091, 19, + 1092, 26, 1093, 23, 1094, 22, 1095, 18, 1096, 31, 1097, 27, 1098, 16,1100, 52, + 1102, 40, 1103, 15, }; -static const unsigned char win1251BulgarianCharToOrderMap[] = + +/* Model Table: + * Total considered sequences: 1236 / 2809 + * - Positive sequences: first 720 (0.9950164618425456) + * - Probable sequences: next 201 (921-720) (0.003986830525963603) + * - Neutral sequences: last 1888 (0.0009967076314908452) + * - Negative sequences: 1573 (off-ratio) + * Negative sequences: TODO + */ +static const PRUint8 BulgarianLangModel[] = { -CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, //00 -CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, //10 -SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, //20 -NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, //30 -SYM, 77, 90, 99,100, 72,109,107,101, 79,185, 81,102, 76, 94, 82, //40 -110,186,108, 91, 74,119, 84, 96,111,187,115,SYM,SYM,SYM,SYM,SYM, //50 -SYM, 65, 69, 70, 66, 63, 68,112,103, 92,194,104, 95, 86, 87, 71, //60 -116,195, 85, 93, 97,113,196,197,198,199,200,SYM,SYM,SYM,SYM,SYM, //70 -206,207,208,209,210,211,212,213,120,214,215,216,217,218,219,220, //80 -221, 78, 64, 83,121, 98,117,105,ILL,223,224,225,226,227,228,229, //90 - 88,230,231,232,233,122, 89,106,234,235,236,237,238, 45,239,240, //a0 - 73, 80,118,114,241,242,243,244,245, 62, 58,246,247,248,249,250, //b0 - 31, 32, 35, 43, 37, 44, 55, 47, 40, 59, 33, 46, 38, 36, 41, 30, //c0 - 39, 28, 34, 51, 48, 49, 53, 50, 54, 57, 61,251, 67,NUM, 60, 56, //d0 - 1, 18, 9, 20, 11, 3, 23, 15, 2, 26, 12, 10, 14, 6, 4, 13, //e0 - 7, 8, 5, 19, 29, 25, 22, 21, 27, 24, 17, 75, 52,SYM, 42, 16, //f0 + 1,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,3,3,3,3,3,0, + 3,3,0,0,0,3,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,3,3,3,3,3,3,3,3,1, + 3,3,0,0,0,3,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,3,3,3,3,3,3,3,3,0, + 3,3,0,0,0,3,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,3,3,3,3,3,3,3,3,0, + 3,3,1,0,0,3,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,1,3,3,3,3,3,3,3,0,1,2,0,1, + 2,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,2, + 3,3,3,3,3,3,2,3,3,3,3,3,2,3,2,3,3,3,3,3,2,2,3,3,1,0, + 3,1,0,0,0,3,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,3, + 3,3,3,3,3,3,1,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,0, + 3,0,0,1,0,3,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2, + 3,3,3,3,3,3,3,3,3,3,3,1,3,1,3,3,3,2,3,3,3,0,2,3,0,0, + 3,0,0,0,0,3,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1, + 3,3,3,3,3,3,3,3,1,3,3,3,3,3,3,3,3,3,3,3,0,1,3,1,0,0, + 1,0,0,0,0,2,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1, + 3,3,3,3,3,3,1,3,3,1,3,3,2,3,3,3,3,3,3,3,3,3,3,2,0,0, + 3,0,0,0,0,2,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,3, + 3,3,3,3,3,3,3,3,3,3,2,1,1,3,3,1,3,2,2,3,0,0,3,1,0,0, + 1,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,2, + 3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,2,2,3,2,3,2,3,0,0, + 1,0,0,0,0,3,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1, + 3,3,3,3,3,3,3,3,3,3,3,1,1,0,1,3,3,0,3,3,1,0,2,2,0,0, + 0,0,0,0,0,2,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,1, + 3,3,3,3,3,3,3,3,3,3,3,3,3,1,3,3,3,3,3,3,3,1,3,3,0,0, + 2,0,0,0,0,3,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1, + 3,3,3,3,2,3,3,3,3,3,3,1,3,1,1,3,3,2,2,3,3,1,2,1,0,0, + 2,0,0,0,0,2,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2, + 1,1,1,2,3,3,3,3,3,3,3,3,3,3,3,2,0,3,3,1,3,1,2,3,3,1, + 0,3,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,1,3,3,3,3,3,3,3,3,3,3,3,3,3,0,0,3,3,0,3,3,3,3,3,0, + 2,3,0,0,0,2,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,2,3,3,1,3,3,1,2,0,0,3,1,3,0,1,3,1,0,0,0,1,0, + 0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1, + 3,3,3,3,3,3,3,0,3,3,3,0,0,0,0,0,2,0,0,3,0,2,0,3,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,3,3,1,3,3,3,3,2,0, + 1,3,0,0,0,3,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,2,3,3,3,3,3,2,2,2,1,3,3,3,1,2,3,0,0,2,3,0,0, + 0,3,0,0,0,2,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,1, + 3,3,3,3,3,3,1,0,3,3,3,3,0,0,1,0,3,0,0,3,3,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,2,3,1,1,1,0,3,0,3,0,1,2,0,3,3,0,0,2,0,0,0,1,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,3,3,1,3,3,2,0,0,0,1,1,3,0,1,3,0,0,0,2,0,0, + 0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1, + 1,0,3,3,3,3,2,3,3,3,3,3,1,2,2,0,2,2,3,0,3,0,3,1,0,0, + 1,0,0,0,0,2,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1, + 0,0,3,3,2,0,3,3,3,3,3,3,3,3,0,3,3,1,2,3,3,1,2,2,2,2,0, + 3,3,3,3,3,2,3,3,0,3,1,0,0,0,0,3,2,1,0,3,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1, + 3,3,3,3,3,3,2,0,1,0,0,0,0,0,0,3,1,0,0,3,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3, + 0,0,3,3,3,0,3,3,3,3,3,3,3,3,0,3,3,1,2,1,3,2,3,3,1,2,0, + 0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,3, + 0,0,3,2,3,0,3,3,3,3,3,3,3,3,0,3,3,1,2,2,3,2,1,3,2,3,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3, + 0,0,3,2,3,0,3,3,3,3,3,3,3,3,0,3,3,2,3,1,3,2,3,2,2,2,0, + 3,3,3,3,0,3,2,0,3,3,3,0,1,0,1,0,2,0,2,2,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3, + 0,0,3,3,3,0,2,3,3,3,3,3,3,3,0,3,3,2,2,3,3,2,2,3,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3, + 0,0,3,3,3,0,1,3,3,3,3,1,1,3,0,1,3,2,2,3,3,2,2,2,0,2,0, + 0,1,1,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,3, + 0,0,3,3,3,0,1,2,3,3,3,1,2,3,0,3,1,3,3,2,0,1,1,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3, + 0,0,3,3,3,0,3,1,3,2,2,1,1,3,0,0,1,3,1,3,0,1,1,1,1,0,0, + 0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,3, + 0,0,3,3,3,0,3,3,2,3,2,3,3,3,0,1,2,3,2,2,1,3,1,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3, + 0,0,3,3,3,0,1,1,2,2,3,3,2,3,0,2,3,1,2,3,2,1,0,2,1,0,0, + 0,0,0,0,0,0,0,1,1,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,3, + 0,0,3,3,3,0,1,2,2,1,1,1,3,3,0,3,0,1,3,3,2,0,0,0,0,1,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3, + 0,0,3,3,1,0,3,3,3,3,3,3,3,1,0,2,2,0,2,1,2,2,2,2,0,2,0, + 1,2,0,0,3,3,3,3,2,3,2,3,2,3,2,0,0,3,3,0,2,3,3,1,2,0, + 1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3, + 0,0,3,3,3,0,3,1,2,3,0,3,2,3,0,2,2,3,1,2,1,1,1,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3, + 0,0,3,3,3,0,2,1,2,2,1,2,1,3,0,0,2,1,1,2,1,1,2,1,1,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3, + 0,0,3,3,3,0,2,2,1,2,2,2,1,2,0,1,1,0,1,3,1,1,1,0,1,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3, + 0,0,3,3,3,0,3,3,1,0,3,3,1,2,0,1,1,1,3,2,0,0,1,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2, + 0,0,2,1,3,0,2,3,3,3,3,3,1,1,0,2,2,1,3,1,2,0,0,0,0,3,0, + 0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3, + 0,0,3,3,3,0,3,3,1,1,1,2,1,3,0,1,2,2,0,2,2,1,1,0,1,1,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3, + 0,0,3,2,2,0,2,1,2,1,1,1,3,1,0,0,0,1,0,1,3,0,1,0,1,0,0, + 0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2, + 0,0,3,3,3,0,3,0,0,1,1,2,0,3,0,0,1,0,0,0,0,0,2,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3, + 0,0,3,3,3,0,0,0,0,0,0,1,0,2,0,0,1,1,0,1,0,1,0,1,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3, + 0,0,2,3,2,0,0,1,1,0,0,1,0,0,0,1,0,1,0,0,0,0,1,0,2,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2, + 0,0,2,1,2,0,0,0,0,2,1,1,1,2,0,1,0,0,0,1,0,0,1,2,0,2,0, + 0,1,3,0,0,1,0,1,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, }; -//Model Table: -//total sequences: 100% -//first 512 sequences: 96.9392% -//first 1024 sequences:3.0618% -//rest sequences: 0.2992% -//negative sequences: 0.0020% -static const PRUint8 BulgarianLangModel[] = -{ -0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,3,3,3,3,3, -3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,2,2,3,2,2,1,2,2, -3,1,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,3,3,3,3,3,3,3,0,3,0,1, -0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1, -3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,3,3,3,3,3,3,3,3,0,3,1,0, -0,1,0,0,0,0,0,0,0,0,1,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1, -3,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,1,3,2,3,3,3,3,3,3,3,3,0,3,0,0, -0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -3,2,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,1,3,2,3,3,3,3,3,3,3,3,0,3,0,0, -0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -3,3,3,3,3,3,3,3,3,3,3,2,3,2,2,1,3,3,3,3,2,2,2,1,1,2,0,1,0,1,0,0, -0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,1, -3,3,3,3,3,3,3,2,3,2,2,3,3,1,1,2,3,3,2,3,3,3,3,2,1,2,0,2,0,3,0,0, -0,0,0,0,0,0,0,1,0,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,1, -3,3,3,3,3,3,3,1,3,3,3,3,3,2,3,2,3,3,3,3,3,2,3,3,1,3,0,3,0,2,0,0, -0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1, -3,3,3,3,3,3,3,3,1,3,3,2,3,3,3,1,3,3,2,3,2,2,2,0,0,2,0,2,0,2,0,0, -0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,1, -3,3,3,3,3,3,3,3,3,0,3,3,3,2,2,3,3,3,1,2,2,3,2,1,1,2,0,2,0,0,0,0, -1,0,0,0,0,0,0,0,0,0,2,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1, -3,3,3,3,3,3,3,2,3,3,1,2,3,2,2,2,3,3,3,3,3,2,2,3,1,2,0,2,1,2,0,0, -0,0,0,0,0,0,0,0,0,0,3,0,0,1,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,1, -3,3,3,3,3,1,3,3,3,3,3,2,3,3,3,2,3,3,2,3,2,2,2,3,1,2,0,1,0,1,0,0, -0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1, -3,3,3,3,3,3,3,3,3,3,3,1,1,1,2,2,1,3,1,3,2,2,3,0,0,1,0,1,0,1,0,0, -0,0,0,1,0,0,0,0,1,0,2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1, -3,3,3,3,3,2,2,3,2,2,3,1,2,1,1,1,2,3,1,3,1,2,2,0,1,1,1,1,0,1,0,0, -0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1, -3,3,3,3,3,1,3,2,2,3,3,1,2,3,1,1,3,3,3,3,1,2,2,1,1,1,0,2,0,2,0,1, -0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1, -3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,2,2,3,3,3,2,2,1,1,2,0,2,0,1,0,0, -0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1, -3,0,1,2,1,3,3,2,3,3,3,3,3,2,3,2,1,0,3,1,2,1,2,1,2,3,2,1,0,1,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -1,1,1,2,3,3,3,3,3,3,3,3,3,3,3,3,0,0,3,1,3,3,2,3,3,2,2,2,0,1,0,0, -0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -2,3,3,3,3,0,3,3,3,3,3,2,1,1,2,1,3,3,0,3,1,1,1,1,3,2,0,1,0,0,0,0, -0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1, -3,3,2,2,2,3,3,3,3,3,3,3,3,3,3,3,1,1,3,1,3,3,2,3,2,2,2,3,0,2,0,0, -0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -3,3,3,3,3,2,3,3,2,2,3,2,1,1,1,1,1,3,1,3,1,1,0,0,0,1,0,0,0,1,0,0, -0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0, -3,3,3,3,3,2,3,2,0,3,2,0,3,0,2,0,0,2,1,3,1,0,0,1,0,0,0,1,0,0,0,0, -0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1, -3,3,3,3,2,1,1,1,1,2,1,1,2,1,1,1,2,2,1,2,1,1,1,0,1,1,0,1,0,1,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1, -3,3,3,3,2,1,3,1,1,2,1,3,2,1,1,0,1,2,3,2,1,1,1,0,0,0,0,0,0,0,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -2,3,3,3,3,2,2,1,0,1,0,0,1,0,0,0,2,1,0,3,0,0,1,0,0,0,0,0,0,0,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1, -3,3,3,2,3,2,3,3,1,3,2,1,1,1,2,1,1,2,1,3,0,1,0,0,0,1,0,0,0,0,0,0, -0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -3,1,1,2,2,3,3,2,3,2,2,2,3,1,2,2,1,1,2,1,1,2,2,0,1,1,0,1,0,2,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -3,3,3,3,2,1,3,1,0,2,2,1,3,2,1,0,0,2,0,2,0,1,0,0,0,0,0,0,0,1,0,0, -0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1, -3,3,3,3,3,3,1,2,0,2,3,1,2,3,2,0,1,3,1,2,1,1,1,0,0,1,0,0,2,2,2,3, -2,2,2,2,1,2,1,1,2,2,1,1,2,0,1,1,1,0,0,1,1,0,0,1,1,0,0,0,1,1,0,1, -3,3,3,3,3,2,1,2,2,1,2,0,2,0,1,0,1,2,1,2,1,1,0,0,0,1,0,1,0,0,0,0, -0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,1, -3,3,2,3,3,1,1,3,1,0,3,2,1,0,0,0,1,2,0,2,0,1,0,0,0,1,0,1,2,1,2,2, -1,1,1,1,1,1,1,2,2,2,1,1,1,1,1,1,1,0,1,2,1,1,1,0,0,0,0,0,1,1,0,0, -3,1,0,1,0,2,3,2,2,2,3,2,2,2,2,2,1,0,2,1,2,1,1,1,0,1,2,1,2,2,2,1, -1,1,2,2,2,2,1,2,1,1,0,1,2,1,2,2,2,1,1,1,0,1,1,1,1,2,0,1,0,0,0,0, -2,3,2,3,3,0,0,2,1,0,2,1,0,0,0,0,2,3,0,2,0,0,0,0,0,1,0,0,2,0,1,2, -2,1,2,1,2,2,1,1,1,2,1,1,1,0,1,2,2,1,1,1,1,1,0,1,1,1,0,0,1,2,0,0, -3,3,2,2,3,0,2,3,1,1,2,0,0,0,1,0,0,2,0,2,0,0,0,1,0,1,0,1,2,0,2,2, -1,1,1,1,2,1,0,1,2,2,2,1,1,1,1,1,1,1,0,1,1,1,0,0,0,0,0,0,1,1,0,0, -2,3,2,3,3,0,0,3,0,1,1,0,1,0,0,0,2,2,1,2,0,0,0,0,0,0,0,0,2,0,1,2, -2,2,1,1,1,1,1,2,2,2,1,0,2,0,1,0,1,0,0,1,0,1,0,0,1,0,0,0,0,1,0,0, -3,3,3,3,2,2,2,2,2,0,2,1,1,1,1,2,1,2,1,1,0,2,0,1,0,1,0,0,2,0,1,2, -1,1,1,1,1,1,1,2,2,1,1,0,2,0,1,0,2,0,0,1,1,1,0,0,2,0,0,0,1,1,0,0, -2,3,3,3,3,1,0,0,0,0,0,0,0,0,0,0,2,0,0,1,1,0,0,0,0,0,0,1,2,0,1,2, -2,2,2,1,1,2,1,1,2,2,2,1,2,0,1,1,1,1,1,1,0,1,1,1,1,0,0,1,1,1,0,0, -2,3,3,3,3,0,2,2,0,2,1,0,0,0,1,1,1,2,0,2,0,0,0,3,0,0,0,0,2,0,2,2, -1,1,1,2,1,2,1,1,2,2,2,1,2,0,1,1,1,0,1,1,1,1,0,2,1,0,0,0,1,1,0,0, -2,3,3,3,3,0,2,1,0,0,2,0,0,0,0,0,1,2,0,2,0,0,0,0,0,0,0,0,2,0,1,2, -1,1,1,2,1,1,1,1,2,2,2,0,1,0,1,1,1,0,0,1,1,1,0,0,1,0,0,0,0,1,0,0, -3,3,2,2,3,0,1,0,1,0,0,0,0,0,0,0,1,1,0,3,0,0,0,0,0,0,0,0,1,0,2,2, -1,1,1,1,1,2,1,1,2,2,1,2,2,1,0,1,1,1,1,1,0,1,0,0,1,0,0,0,1,1,0,0, -3,1,0,1,0,2,2,2,2,3,2,1,1,1,2,3,0,0,1,0,2,1,1,0,1,1,1,1,2,1,1,1, -1,2,2,1,2,1,2,2,1,1,0,1,2,1,2,2,1,1,1,0,0,1,1,1,2,1,0,1,0,0,0,0, -2,1,0,1,0,3,1,2,2,2,2,1,2,2,1,1,1,0,2,1,2,2,1,1,2,1,1,0,2,1,1,1, -1,2,2,2,2,2,2,2,1,2,0,1,1,0,2,1,1,1,1,1,0,0,1,1,1,1,0,1,0,0,0,0, -2,1,1,1,1,2,2,2,2,1,2,2,2,1,2,2,1,1,2,1,2,3,2,2,1,1,1,1,0,1,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -2,2,2,3,2,0,1,2,0,1,2,1,1,0,1,0,1,2,1,2,0,0,0,1,1,0,0,0,1,0,0,2, -1,1,0,0,1,1,0,1,1,1,1,0,2,0,1,1,1,0,0,1,1,0,0,0,0,1,0,0,0,1,0,0, -2,0,0,0,0,1,2,2,2,2,2,2,2,1,2,1,1,1,1,1,1,1,0,1,1,1,1,1,2,1,1,1, -1,2,2,2,2,1,1,2,1,2,1,1,1,0,2,1,2,1,1,1,0,2,1,1,1,1,0,1,0,0,0,0, -3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0, -1,1,0,1,0,1,1,1,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -2,2,2,3,2,0,0,0,0,1,0,0,0,0,0,0,1,1,0,2,0,0,0,0,0,0,0,0,1,0,1,2, -1,1,1,1,1,1,0,0,2,2,2,2,2,0,1,1,0,1,1,1,1,1,0,0,1,0,0,0,1,1,0,1, -2,3,1,2,1,0,1,1,0,2,2,2,0,0,1,0,0,1,1,1,1,0,0,0,0,0,0,0,1,0,1,2, -1,1,1,1,2,1,1,1,1,1,1,1,1,0,1,1,0,1,0,1,0,1,0,0,1,0,0,0,0,1,0,0, -2,2,2,2,2,0,0,2,0,0,2,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,2,0,2,2, -1,1,1,1,1,0,0,1,2,1,1,0,1,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0, -1,2,2,2,2,0,0,2,0,1,1,0,0,0,1,0,0,2,0,2,0,0,0,0,0,0,0,0,0,0,1,1, -0,0,0,1,1,1,1,1,1,1,1,1,1,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0, -1,2,2,3,2,0,0,1,0,0,1,0,0,0,0,0,0,1,0,2,0,0,0,1,0,0,0,0,0,0,0,2, -1,1,0,0,1,0,0,0,1,1,0,0,1,0,1,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0, -2,1,2,2,2,1,2,1,2,2,1,1,2,1,1,1,0,1,1,1,1,2,0,1,0,1,1,1,1,0,1,1, -1,1,2,1,1,1,1,1,1,0,0,1,2,1,1,1,1,1,1,0,0,1,1,1,0,0,0,0,0,0,0,0, -1,0,0,1,3,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0, -0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -2,2,2,2,1,0,0,1,0,2,0,0,0,0,0,1,1,1,0,1,0,0,0,0,0,0,0,0,2,0,0,1, -0,2,0,1,0,0,1,1,2,0,1,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0, -1,2,2,2,2,0,1,1,0,2,1,0,1,1,1,0,0,1,0,2,0,1,0,0,0,0,0,0,0,0,0,1, -0,1,0,0,1,0,0,0,1,1,0,0,1,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0, -2,2,2,2,2,0,0,1,0,0,0,1,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1, -0,1,0,1,1,1,0,0,1,1,1,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0, -2,0,1,0,0,1,2,1,1,1,1,1,1,2,2,1,0,0,1,0,1,0,0,0,0,1,1,1,1,0,0,0, -1,1,2,1,1,1,1,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -2,2,1,2,1,0,0,1,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1, -0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -1,0,0,1,2,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0, -0,1,1,0,1,1,1,0,0,1,0,0,1,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0, -1,0,1,0,0,1,1,1,1,1,1,1,1,1,1,1,0,0,1,0,2,0,0,2,0,1,0,0,1,0,0,1, -1,1,0,0,1,1,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0, -0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0, -1,1,1,1,1,1,1,2,0,0,0,0,0,0,2,1,0,1,1,0,0,1,1,1,0,1,0,0,0,0,0,0, -2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -1,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,0,1,1,0,1,1,1,1,1,0,1,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1, -}; -const SequenceModel Latin5BulgarianModel = +const SequenceModel Windows_1251BulgarianModel = { - Latin5_BulgarianCharToOrderMap, + Windows_1251_CharToOrderMap, BulgarianLangModel, - 64, - (float)0.969392, + 53, + (float)0.9990032923685092, + PR_FALSE, + "WINDOWS-1251", + "bg" +}; + +const SequenceModel Iso_8859_5BulgarianModel = +{ + Iso_8859_5_CharToOrderMap, + BulgarianLangModel, + 53, + (float)0.9990032923685092, PR_FALSE, "ISO-8859-5", "bg" }; -const SequenceModel Win1251BulgarianModel = +const LanguageModel BulgarianModel = { - win1251BulgarianCharToOrderMap, + "bg", + Unicode_CharOrder, + 106, BulgarianLangModel, - 64, - (float)0.969392, - PR_FALSE, - "WINDOWS-1251", - "bg" + 53, + 5, + (float)0.4581931350621217, + 29, + (float)0.03226223199877268, }; diff --git a/src/nsLanguageDetector.h b/src/nsLanguageDetector.h index 5feb726..ff7cb4a 100644 --- a/src/nsLanguageDetector.h +++ b/src/nsLanguageDetector.h @@ -126,6 +126,7 @@ private: }; extern const LanguageModel ArabicModel; +extern const LanguageModel BulgarianModel; extern const LanguageModel CroatianModel; extern const LanguageModel CzechModel; extern const LanguageModel DanishModel; diff --git a/src/nsMBCSGroupProber.cpp b/src/nsMBCSGroupProber.cpp index 1006359..53f4c3a 100644 --- a/src/nsMBCSGroupProber.cpp +++ b/src/nsMBCSGroupProber.cpp @@ -93,6 +93,7 @@ nsMBCSGroupProber::nsMBCSGroupProber(PRUint32 aLanguageFilter) int j = 0; langDetectors[i][j++] = new nsLanguageDetector(&ArabicModel); + langDetectors[i][j++] = new nsLanguageDetector(&BulgarianModel); langDetectors[i][j++] = new nsLanguageDetector(&CroatianModel); langDetectors[i][j++] = new nsLanguageDetector(&CzechModel); langDetectors[i][j++] = new nsLanguageDetector(&DanishModel); diff --git a/src/nsMBCSGroupProber.h b/src/nsMBCSGroupProber.h index 1dea490..f36e820 100644 --- a/src/nsMBCSGroupProber.h +++ b/src/nsMBCSGroupProber.h @@ -49,7 +49,7 @@ #include "nsEUCTWProber.h" #define NUM_OF_PROBERS 8 -#define NUM_OF_LANGUAGES 31 +#define NUM_OF_LANGUAGES 32 class nsMBCSGroupProber: public nsCharSetProber { public: diff --git a/src/nsSBCSGroupProber.cpp b/src/nsSBCSGroupProber.cpp index 04b8c67..1b7da06 100644 --- a/src/nsSBCSGroupProber.cpp +++ b/src/nsSBCSGroupProber.cpp @@ -60,8 +60,8 @@ nsSBCSGroupProber::nsSBCSGroupProber() mProbers[n++] = new nsSingleByteCharSetProber(&Iso_8859_7GreekModel); mProbers[n++] = new nsSingleByteCharSetProber(&Windows_1253GreekModel); - mProbers[n++] = new nsSingleByteCharSetProber(&Latin5BulgarianModel); - mProbers[n++] = new nsSingleByteCharSetProber(&Win1251BulgarianModel); + mProbers[n++] = new nsSingleByteCharSetProber(&Iso_8859_5BulgarianModel); + mProbers[n++] = new nsSingleByteCharSetProber(&Windows_1251BulgarianModel); heb_prober_idx = n; mProbers[n++] = hebprober; diff --git a/src/nsSBCharSetProber.h b/src/nsSBCharSetProber.h index bccb9e1..d804b93 100644 --- a/src/nsSBCharSetProber.h +++ b/src/nsSBCharSetProber.h @@ -144,8 +144,8 @@ extern const SequenceModel Ibm855RussianModel; extern const SequenceModel Iso_8859_7GreekModel; extern const SequenceModel Windows_1253GreekModel; -extern const SequenceModel Latin5BulgarianModel; -extern const SequenceModel Win1251BulgarianModel; +extern const SequenceModel Iso_8859_5BulgarianModel; +extern const SequenceModel Windows_1251BulgarianModel; extern const SequenceModel Iso_8859_2HungarianModel; extern const SequenceModel Windows_1250HungarianModel; diff --git a/test/bg/iso-8859-5.txt b/test/bg/iso-8859-5.txt new file mode 100644 index 0000000..d1c7734 --- /dev/null +++ b/test/bg/iso-8859-5.txt @@ -0,0 +1,3 @@ + (Marmota) - (Sciuridae), 14 , (Spermophilus citellus). + + , . diff --git a/test/bg/utf-8.txt b/test/bg/utf-8.txt new file mode 100644 index 0000000..048b7ac --- /dev/null +++ b/test/bg/utf-8.txt @@ -0,0 +1,3 @@ +Мармотите (Marmota) са бозайници - род гризачи от семейство катерицови (Sciuridae), включващ 14 вида, включващи групата на лалугерите (Spermophilus citellus). + +За разлика от родствената катерица, мармотът и лалугерът водят наземен начин на живот.