diff --git a/script/BuildLangModelLogs/LangDanishModel.log b/script/BuildLangModelLogs/LangDanishModel.log new file mode 100644 index 0000000..cf183b3 --- /dev/null +++ b/script/BuildLangModelLogs/LangDanishModel.log @@ -0,0 +1,158 @@ += Logs of language model for Danish (da) = + +- Generated by BuildLangModel.py +- Started: 2016-02-19 17:53:58.564190 +- Maximum depth: 4 +- Max number of pages: 100 + +== Parsed pages == + +Forside (revision 2692411) +16. februar (revision 6877446) +17. februar (revision 8454583) +1878 (revision 8280505) +19. februar (revision 8206479) +1922 (revision 8455105) +1926 (revision 8425271) +1942 (revision 8443554) +1945 (revision 8448461) +1948 (revision 8454392) +1985 (revision 8409096) +2. verdenskrig (revision 8433181) +23. oktober (revision 6877825) +26. oktober (revision 7849938) +3C 273 (revision 8443798) +A-bus (revision 8427319) +Aktuelle begivenheder (revision 8440596) +B-52 Stratofortress (revision 8422571) +Borgerkrigen i Syrien (revision 8447763) +Boutros Boutros-Ghali (revision 8453935) +Brasilien (revision 8452750) +Cusco (region) (revision 7693764) +Danmark (revision 8451178) +Danmark i Eurovision Song Contest (revision 8453514) +Dansk (sprog) (revision 8455750) +Dansk Melodi Grand Prix 2016 (revision 8452164) +Dobbeltmordet pÃ¥ Peter Bangs Vej (revision 8334648) +Encyklopædi (revision 8446641) +Eritrea-sagen (revision 8452285) +Eurovision Song Contest 2014 (revision 8445804) +Eurovision Song Contest 2016 (revision 8453588) +Flygtningekrisen i Europa 2015 (revision 8452286) +Fonograf (revision 8177165) +Formel 1 (revision 8450846) +Formel 1 2016 (revision 8456463) +Frederik 6. 
(revision 8438503) +Første observation af gravitationsbølger (revision 8451269) +Grammofon (revision 8375093) +Guadalcanal (revision 7796248) +Harper Lee (revision 8456583) +Hartkorn (revision 8437552) +IC4 (revision 8446402) +IC4-sagen (revision 8434463) +Islamisk Stat (revision 8439228) +Jonathan Leunbach (revision 8452603) +Juliane Marie af Braunschweig-Wolfenbüttel (revision 8437957) +Kaliumklorid (revision 8452216) +Kejserriget Japan (revision 8044942) +Kevin Magnussen (revision 8455302) +København (revision 8427847) +LIGO (revision 8451266) +Latinamerika (revision 7692181) +Leonid Hurwicz (revision 8445727) +Lighthouse X (revision 8452940) +Linkoban (revision 8455879) +Machu Picchu (revision 8406907) +Matador (tv-serie) (revision 8454648) +Middelaldercentret (revision 8449194) +Nobelprisen (revision 8409809) +Nykøbing Falster (revision 8452825) +Nyligt afdøde (revision 8456580) +OvervÃ¥gning (revision 8455039) +Panorama (foto) (revision 8448393) +Peru (revision 8437485) +Peter Lauritsen (revision 8456097) +Professor (revision 8415451) +Renault F1 (revision 8450843) +S-bus (revision 8455589) +Salomonøerne (revision 8238961) +Slaget om Belgien (1940) (revision 8430013) +Slaget om Guadalcanal (revision 7762887) +Slaget om Henderson Field (revision 8445480) +Slaget om Iwo Jima (revision 8145239) +Soldiers of Love (Lighthouse X-sang) (revision 8452929) +Solen (revision 8276478) +Stillehavskrigen (revision 8430649) +Stockholm (revision 8358042) +Søslaget ved Guadalcanal (revision 7772812) +Thomas Edison (revision 8282441) +Togulykken ved Bad Aibling (revision 8455364) +Topografi (revision 6886168) +USA (revision 8448088) +United States Army (revision 8401635) +United States Marine Corps (revision 8401667) +Vestallierede (revision 6961443) +Wikimedia (revision 8263252) +Wikipedia (revision 8267051) +Zikavirus (revision 8454832) +1. februar (revision 8404985) +10. februar (revision 6877431) +11. februar (revision 6877433) +12. februar (revision 6877437) +13. 
februar (revision 6877438) +14. februar (revision 6877441) +1497 (revision 7369489) +15. februar (revision 7329463) +1560 (revision 7874693) +1568 (revision 7369703) +1620 (revision 7423903) +1688 (revision 7367090) +18. februar (revision 6877450) + +== End of Parsed pages == + +- Wikipedia parsing ended at: 2016-02-19 17:56:42.162636 + +53 characters appeared 1301488 times. + +First 30 characters: +[ 0] Char e: 15.272749345364689 % +[ 1] Char r: 8.48482659847805 % +[ 2] Char n: 7.695652975670924 % +[ 3] Char t: 6.977014002434137 % +[ 4] Char a: 6.780469739252302 % +[ 5] Char i: 6.164636170291236 % +[ 6] Char s: 6.0942551909814 % +[ 7] Char d: 5.953493232361728 % +[ 8] Char l: 5.076650725938311 % +[ 9] Char o: 4.883026197706011 % +[10] Char g: 4.012253666572415 % +[11] Char k: 3.232607599916403 % +[12] Char m: 3.0863135119186653 % +[13] Char f: 2.701600014752345 % +[14] Char v: 2.13970470722742 % +[15] Char b: 1.982423195603801 % +[16] Char u: 1.8339777239590376 % +[17] Char p: 1.5789619266562582 % +[18] Char h: 1.3433085821767086 % +[19] Char ø: 0.8730775850411222 % +[20] Char y: 0.859938777768216 % +[21] Char Ã¥: 0.7699648402443973 % +[22] Char æ: 0.7208671920140639 % +[23] Char j: 0.644108896893402 % +[24] Char c: 0.5698093259407694 % +[25] Char w: 0.11087309295206717 % +[26] Char z: 0.05309307500338075 % +[27] Char x: 0.032424424965885205 % +[28] Char é: 0.032193919575132464 % +[29] Char q: 0.012139950579644223 % + +The first 30 characters have an accumulated ratio of 0.9997241618823994. + +964 sequences found. 
+ +First 512 (typical positive ratio): 0.9968082796759031 +Next 512 (512-1024): 7.68351302509128e-07 +Rest: 3.903127820947816e-17 + +- Processing end: 2016-02-19 17:56:42.304278 diff --git a/script/langs/da.py b/script/langs/da.py new file mode 100644 index 0000000..df94208 --- /dev/null +++ b/script/langs/da.py @@ -0,0 +1,78 @@ +#!/bin/python3 +# -*- coding: utf-8 -*- + +# ##### BEGIN LICENSE BLOCK ##### +# Version: MPL 1.1/GPL 2.0/LGPL 2.1 +# +# The contents of this file are subject to the Mozilla Public License Version +# 1.1 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License +# for the specific language governing rights and limitations under the +# License. +# +# The Original Code is Mozilla Universal charset detector code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 2001 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Jehan +# +# Alternatively, the contents of this file may be used under the terms of +# either the GNU General Public License Version 2 or later (the "GPL"), or +# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +# in which case the provisions of the GPL or the LGPL are applicable instead +# of those above. If you wish to allow use of your version of this file only +# under the terms of either the GPL or the LGPL, and not to allow others to +# use your version of this file under the terms of the MPL, indicate your +# decision by deleting the provisions above and replace them with the notice +# and other provisions required by the GPL or the LGPL. 
If you do not delete +# the provisions above, a recipient may use your version of this file under +# the terms of any one of the MPL, the GPL or the LGPL. +# +# ##### END LICENSE BLOCK ##### + +import re + +## Mandatory Properties ## + +# The human name for the language, in English. +name = 'Danish' +# Use 2-letter ISO 639-1 if possible, 3-letter ISO code otherwise, +# or use another catalog as a last resort. +code = 'da' +# ASCII characters are also used in Danish. +use_ascii = True +# The charsets we want to support and create data for. +charsets = ['ISO-8859-15', 'ISO-8859-1', 'WINDOWS-1252'] + +## Optional Properties ## + +# Alphabet characters. +# If use_ascii=True, there is no need to add any ASCII characters. +# If case_mapping=True, there is no need to add several cases of the same +# character (provided Python algorithms know the right cases). +alphabet = 'æøå' +# The start page. Though optional, it is advised to choose one yourself. +start_pages = ['Forside'] +# Give the possibility to select another code for the Wikipedia URL. +wikipedia_code = code +# 'a' and 'A' will be considered the same character, and so on. +# This uses Python's algorithm to determine upper/lower-case of a given +# character. +case_mapping = True + +# A function to clean content returned by the `wikipedia` python lib, +# in case some unwanted data has been overlooked. 
+def clean_wikipedia_content(content): + # We get modify link in the text: "=== Articles connexesModifier ===" + cleaned = re.sub(r'(=+) *([^=]+) *\1', + r'\2', + content) + return cleaned diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 39f460c..7adbfcf 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -13,6 +13,7 @@ set( LangModels/LangRussianModel.cpp LangModels/LangEsperantoModel.cpp LangModels/LangFrenchModel.cpp + LangModels/LangDanishModel.cpp LangModels/LangGermanModel.cpp LangModels/LangGreekModel.cpp LangModels/LangHungarianModel.cpp diff --git a/src/LangModels/LangDanishModel.cpp b/src/LangModels/LangDanishModel.cpp new file mode 100644 index 0000000..46b6f28 --- /dev/null +++ b/src/LangModels/LangDanishModel.cpp @@ -0,0 +1,198 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is Mozilla Communicator client code. + * + * The Initial Developer of the Original Code is + * Netscape Communications Corporation. + * Portions created by the Initial Developer are Copyright (C) 1998 + * the Initial Developer. All Rights Reserved. 
+ * + * Contributor(s): + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. + * + * ***** END LICENSE BLOCK ***** */ + +#include "../nsSBCharSetProber.h" + +/********* Language model for: Danish *********/ + +/** + * Generated by BuildLangModel.py + * On: 2016-02-19 17:56:42.163975 + **/ + +/* Character Mapping Table: + * ILL: illegal character. + * CTR: control character specific to the charset. + * RET: carriage/return. + * SYM: symbol (punctuation) that does not belong to word. + * NUM: 0 - 9. + * + * Other characters are ordered by probabilities + * (0 is the most common character in the language). + * + * Orders are generic to a language. So the codepoint with order X in + * CHARSET1 maps to the same character as the codepoint with the same + * order X in CHARSET2 for the same language. + * As such, it is possible to get missing order. For instance the + * ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 + * even though they are both used for French. Same for the euro sign. 
+ */ +static const unsigned char Iso_8859_15_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 4, 15, 24, 7, 0, 13, 10, 18, 5, 23, 11, 8, 12, 2, 9, /* 4X */ + 17, 29, 1, 6, 3, 16, 14, 25, 27, 20, 26,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 4, 15, 24, 7, 0, 13, 10, 18, 5, 23, 11, 8, 12, 2, 9, /* 6X */ + 17, 29, 1, 6, 3, 16, 14, 25, 27, 20, 26,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM, 39,SYM, 39,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM, 53, 42,SYM,SYM, 54,SYM,SYM,SYM, 55, 56, 57,SYM, /* BX */ + 58, 33, 40, 35, 32, 21, 22, 38, 41, 28, 49, 45, 59, 34, 60, 50, /* CX */ + 43, 47, 51, 36, 52, 61, 30,SYM, 19, 62, 37, 44, 31, 46, 63, 48, /* DX */ + 64, 33, 40, 35, 32, 21, 22, 38, 41, 28, 49, 45, 65, 34, 66, 50, /* EX */ + 43, 47, 51, 36, 52, 67, 30,SYM, 19, 68, 37, 44, 31, 46, 69, 70, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Iso_8859_1_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 4, 15, 24, 7, 0, 13, 10, 18, 5, 23, 11, 8, 12, 2, 9, /* 4X */ + 17, 29, 1, 6, 3, 16, 14, 25, 27, 20, 26,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 4, 15, 24, 7, 0, 13, 10, 18, 5, 23, 11, 8, 12, 2, 9, /* 6X */ + 17, 29, 1, 6, 3, 16, 14, 25, 27, 20, 26,SYM,SYM,SYM,SYM,CTR, /* 7X */ + 
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM,SYM, 42,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ + 71, 33, 40, 35, 32, 21, 22, 38, 41, 28, 49, 45, 72, 34, 73, 50, /* CX */ + 43, 47, 51, 36, 52, 74, 30,SYM, 19, 75, 37, 44, 31, 46, 76, 48, /* DX */ + 77, 33, 40, 35, 32, 21, 22, 38, 41, 28, 49, 45, 78, 34, 79, 50, /* EX */ + 43, 47, 51, 36, 52, 80, 30,SYM, 19, 81, 37, 44, 31, 46, 82, 83, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Windows_1252_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 4, 15, 24, 7, 0, 13, 10, 18, 5, 23, 11, 8, 12, 2, 9, /* 4X */ + 17, 29, 1, 6, 3, 16, 14, 25, 27, 20, 26,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 4, 15, 24, 7, 0, 13, 10, 18, 5, 23, 11, 8, 12, 2, 9, /* 6X */ + 17, 29, 1, 6, 3, 16, 14, 25, 27, 20, 26,SYM,SYM,SYM,SYM,CTR, /* 7X */ + SYM,ILL,SYM, 84,SYM,SYM,SYM,SYM,SYM,SYM, 39,SYM, 85,ILL, 86,ILL, /* 8X */ + ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 39,SYM, 87,ILL, 88, 89, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM,SYM, 42,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ + 90, 33, 40, 35, 32, 21, 22, 38, 41, 28, 49, 45, 91, 34, 92, 50, /* CX */ + 43, 47, 51, 36, 52, 93, 30,SYM, 19, 94, 37, 44, 31, 46, 95, 48, /* DX */ + 96, 33, 40, 35, 32, 21, 22, 38, 41, 28, 49, 45, 97, 34, 98, 50, /* EX */ + 43, 47, 51, 36, 52, 99, 30,SYM, 19,100, 37, 44, 31, 46,101,102, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + + +/* Model Table: + * 
Total sequences: 964 + * First 512 sequences: 0.9968082796759031 + * Next 512 sequences (512-1024): 0.0031917203240968304 + * Rest: 3.903127820947816e-17 + * Negative sequences: TODO + */ +static const PRUint8 DanishLangModel[] = +{ + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,2,3,3,3,2,3,0,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,2,2,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,0,2,3,3,3,3,3,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,0,2,3,3,2,3,3,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,0,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,0,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,0,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,2,3,3,3,2,2,0,0, + 3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,2,3,3,3,3,3,3,2,2,2,2,3,2, + 3,3,3,3,3,3,3,2,3,3,2,3,3,2,3,2,3,2,3,3,3,3,3,2,2,2,2,2,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,2,2,3,0, + 3,3,3,3,3,3,3,2,3,3,3,2,2,3,3,3,3,2,3,3,3,3,3,3,2,2,2,2,2,0, + 3,3,3,3,3,3,3,3,3,3,2,2,2,2,2,2,3,2,2,2,2,3,3,3,2,2,0,0,2,0, + 3,3,3,3,3,3,3,2,3,3,2,2,2,2,2,3,3,2,2,3,3,3,3,3,2,2,0,0,2,0, + 3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,2,3,3,2,3,0,2,2,3,2,3,3,0,2, + 3,3,3,3,3,3,3,3,3,3,3,2,2,3,3,3,3,3,3,3,2,3,3,2,2,0,2,0,2,0, + 3,3,3,3,3,3,2,2,3,3,2,2,3,2,3,2,3,2,2,3,3,3,3,3,2,3,2,2,2,0, + 3,3,3,3,2,2,3,3,3,2,3,3,3,2,3,3,0,2,2,2,2,0,0,3,0,0,2,0,0,0, + 3,3,3,3,3,2,3,3,3,3,3,3,3,2,3,3,3,3,2,2,2,0,0,0,2,2,2,0,0,0, + 3,3,3,3,2,0,3,3,3,2,3,3,2,2,3,3,0,2,2,2,0,0,0,0,0,0,0,0,0,0, + 2,3,3,3,0,3,3,3,3,2,3,3,3,3,3,3,2,2,2,0,0,0,0,0,2,0,0,0,0,0, + 3,3,2,3,3,3,3,3,3,3,2,2,2,2,2,2,3,2,2,3,3,2,3,2,2,0,0,0,0,0, + 3,3,2,3,3,3,2,2,3,3,2,3,2,2,0,2,3,2,3,0,3,0,0,2,3,2,2,0,2,2, + 3,2,2,2,3,3,2,2,2,3,0,2,2,2,0,2,2,0,2,0,2,0,0,0,2,2,2,0,0,0, + 3,2,2,2,3,3,2,2,0,3,0,2,2,0,0,2,2,2,2,2,2,0,0,2,2,0,2,0,0,0, + 3,2,0,2,2,3,2,0,2,2,0,0,2,2,2,2,2,2,2,2,0,0,0,0,2,2,0,0,2,0, + 
2,3,2,2,2,0,2,2,2,2,2,2,2,0,2,2,0,2,0,0,0,0,0,0,2,0,0,0,0,0, + 0,0,0,0,3,2,2,2,2,2,0,0,0,0,2,2,3,0,2,0,0,0,0,0,0,0,0,0,0,2, +}; + + +const SequenceModel Iso_8859_15DanishModel = +{ + Iso_8859_15_CharToOrderMap, + DanishLangModel, + 30, + (float)0.9968082796759031, + PR_TRUE, + "ISO-8859-15" +}; + +const SequenceModel Iso_8859_1DanishModel = +{ + Iso_8859_1_CharToOrderMap, + DanishLangModel, + 30, + (float)0.9968082796759031, + PR_TRUE, + "ISO-8859-1" +}; + +const SequenceModel Windows_1252DanishModel = +{ + Windows_1252_CharToOrderMap, + DanishLangModel, + 30, + (float)0.9968082796759031, + PR_TRUE, + "WINDOWS-1252" +}; \ No newline at end of file diff --git a/src/nsSBCSGroupProber.cpp b/src/nsSBCSGroupProber.cpp index 42b3a03..b1a60cc 100644 --- a/src/nsSBCSGroupProber.cpp +++ b/src/nsSBCSGroupProber.cpp @@ -107,6 +107,10 @@ nsSBCSGroupProber::nsSBCSGroupProber() mProbers[30] = new nsSingleByteCharSetProber(&VisciiVietnameseModel); mProbers[31] = new nsSingleByteCharSetProber(&Windows_1258VietnameseModel); + mProbers[32] = new nsSingleByteCharSetProber(&Iso_8859_15DanishModel); + mProbers[33] = new nsSingleByteCharSetProber(&Iso_8859_1DanishModel); + mProbers[34] = new nsSingleByteCharSetProber(&Windows_1252DanishModel); + Reset(); } diff --git a/src/nsSBCSGroupProber.h b/src/nsSBCSGroupProber.h index c50ede6..c1ea4a1 100644 --- a/src/nsSBCSGroupProber.h +++ b/src/nsSBCSGroupProber.h @@ -40,7 +40,7 @@ #define nsSBCSGroupProber_h__ -#define NUM_OF_SBCS_PROBERS 32 +#define NUM_OF_SBCS_PROBERS 35 class nsCharSetProber; class nsSBCSGroupProber: public nsCharSetProber { diff --git a/src/nsSBCharSetProber.h b/src/nsSBCharSetProber.h index 28b26e9..fb40d3f 100644 --- a/src/nsSBCharSetProber.h +++ b/src/nsSBCharSetProber.h @@ -165,5 +165,9 @@ extern const SequenceModel Iso_8859_9TurkishModel; extern const SequenceModel VisciiVietnameseModel; extern const SequenceModel Windows_1258VietnameseModel; +extern const SequenceModel Iso_8859_15DanishModel; +extern const 
SequenceModel Iso_8859_1DanishModel; +extern const SequenceModel Windows_1252DanishModel; + #endif /* nsSingleByteCharSetProber_h__ */ diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index a36a739..0339263 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -37,6 +37,7 @@ foreach(dir ${dirs}) if ("${lang}:${charset}" STREQUAL "ja:utf-16le" OR "${lang}:${charset}" STREQUAL "ja:utf-16be" OR "${lang}:${charset}" STREQUAL "es:iso-8859-15" OR + "${lang}:${charset}" STREQUAL "da:iso-8859-1" OR "${lang}:${charset}" STREQUAL "he:iso-8859-8") message(STATUS "Skipping test ${lang}:${charset} (known broken)") else() diff --git a/test/da/iso-8859-1.txt b/test/da/iso-8859-1.txt new file mode 100644 index 0000000..f36d4ab --- /dev/null +++ b/test/da/iso-8859-1.txt @@ -0,0 +1,7 @@ +Dansk er et nord-germansk sprog af den østnordiske (kontinentale) gruppe, der +tales af ca. seks millioner mennesker. Det er stærkt påvirket af plattysk. Dansk +tales også i Sydslesvig (i Flensborg ca. 20 %) samt på Færøerne og Grønland [1]. +Dansk er tæt forbundet med norsk. Fra et sprogvidenskabeligt synspunkt kan den +fremherskende form af norsk, bokmål (og i endnu højere grad riksmål), betragtes +som dansk, i hvert fald hvad skriftsproget angår. Både dansk, norsk og svensk er +skandinaviske sprog og minder meget om hinanden. diff --git a/test/da/iso-8859-15.txt b/test/da/iso-8859-15.txt new file mode 100644 index 0000000..c400e0a --- /dev/null +++ b/test/da/iso-8859-15.txt @@ -0,0 +1,10 @@ +Eurosymbolet eller eurotegnet (¤) anvendes som valutasymbol for møntenheden +euro. Symbolsk kombinerer det et E eller et græsk epsilon med de to parallelle +streger, man ofte ser i valutasymboler. + +Det vides ikke med sikkerhed, hvem eurosymbolet blev designet af. Nogle medier +hævder, det blev skabt af tidligere designer ved EF Arthur Eisenmenger, mens +andre påstår, det blev skabt af en lille gruppe ledet af Alain Billiet. 
Muligvis +er ingen af disse forklaringer korrekte, da Den Paneuropæiske Union udsendte en +'1 euro'-medalje i 1972, hvorpå man kan se et symbol, der i høj grad ligner det +nuværende eurosymbol. diff --git a/test/da/utf-8.txt b/test/da/utf-8.txt new file mode 100644 index 0000000..e5e0274 --- /dev/null +++ b/test/da/utf-8.txt @@ -0,0 +1,10 @@ +Eurosymbolet eller eurotegnet (€) anvendes som valutasymbol for møntenheden +euro. Symbolsk kombinerer det et E eller et græsk epsilon med de to parallelle +streger, man ofte ser i valutasymboler. + +Det vides ikke med sikkerhed, hvem eurosymbolet blev designet af. Nogle medier +hævder, det blev skabt af tidligere designer ved EF Arthur Eisenmenger, mens +andre pÃ¥stÃ¥r, det blev skabt af en lille gruppe ledet af Alain Billiet. Muligvis +er ingen af disse forklaringer korrekte, da Den Paneuropæiske Union udsendte en +'1 euro'-medalje i 1972, hvorpÃ¥ man kan se et symbol, der i høj grad ligner det +nuværende eurosymbol. diff --git a/test/da/windows-1252.txt b/test/da/windows-1252.txt new file mode 100644 index 0000000..db8faf1 --- /dev/null +++ b/test/da/windows-1252.txt @@ -0,0 +1,10 @@ +Eurosymbolet eller eurotegnet (€) anvendes som valutasymbol for møntenheden +euro. Symbolsk kombinerer det et E eller et græsk epsilon med de to parallelle +streger, man ofte ser i valutasymboler. + +Det vides ikke med sikkerhed, hvem eurosymbolet blev designet af. Nogle medier +hævder, det blev skabt af tidligere designer ved EF Arthur Eisenmenger, mens +andre påstår, det blev skabt af en lille gruppe ledet af Alain Billiet. Muligvis +er ingen af disse forklaringer korrekte, da Den Paneuropæiske Union udsendte en +'1 euro'-medalje i 1972, hvorpå man kan se et symbol, der i høj grad ligner det +nuværende eurosymbol.