diff --git a/script/BuildLangModelLogs/LangMalteseModel.log b/script/BuildLangModelLogs/LangMalteseModel.log new file mode 100644 index 0000000..ad867b3 --- /dev/null +++ b/script/BuildLangModelLogs/LangMalteseModel.log @@ -0,0 +1,147 @@ += Logs of language model for Maltese (mt) = + +- Generated by BuildLangModel.py +- Started: 2016-09-21 02:05:23.411546 +- Maximum depth: 5 +- Max number of pages: 100 + +== Parsed pages == + +Unjoni Ewropea (revision 246298) +1951 (revision 229183) +1952 (revision 229184) +1957 (revision 229188) +1958 (revision 229189) +1973 (revision 223536) +1979 (revision 243876) +1981 (revision 205545) +1985 (revision 216368) +1986 (revision 231433) +1990 (revision 237666) +1992 (revision 244087) +1995 (revision 214650) +1 ta' Mejju (revision 245374) +2007 (revision 214851) +2013 (revision 245606) +Albanija (revision 243079) +Awstrija (revision 243627) +Awtonomija (revision 245824) +Ażores (revision 246298) +Bank Ċentrali Ewropew (revision 246298) +Belt kapitali (revision 237400) +Belġju (revision 244363) +Brussell (revision 243311) +Bulgarija (revision 243622) +Danimarka (revision 244419) +De facto (revision 215102) +Estonja (revision 243826) +European Free Trade Association (revision 246298) +Ewropa (revision 244177) +Ex Repubblika Jugoslava tal-Maċedonja (revision 246298) +Federazzjoni (revision 246226) +Finlandja (revision 245824) +Frankfurt (revision 243576) +Franza (revision 244461) +Greċja (revision 244423) +Groenlandja (revision 243829) +Indja (revision 244873) +Islanda (revision 243771) +Isle of Man (revision 246298) +Istitut tal-Unjoni Ewropea għall-Istudji dwar is-Sigurtà (revision 244412) +Italja (revision 246323) +Kilometru kwadru (revision 244871) +Komunitajiet Ewropej (revision 246298) +Komunità Ekonomika Ewropea (revision 246298) +Kroazja (revision 245711) +Kummissjoni Ewropea (revision 243311) +Kunsill Ewropew (revision 246298) +Kunsill tal-Ewropa (revision 243334) +Kunsill tal-Unjoni Ewropea (revision 243311) +Latvja 
(revision 245746) +Lista ta' pajjiżi skont id-daqs (revision 244419) +Lista ta' pajjiżi skont il-popolazzjoni (revision 246128) +Litwanja (revision 243114) +Liġijiet tal-Unjoni Ewropea (revision 246298) +Lussemburgu (revision 244239) +Lussemburgu (belt) (revision 243587) +Madejra (revision 243625) +Malta (revision 247210) +Montenegro (revision 243930) +Norveġja (revision 243829) +Olanda (revision 243989) +Organizzazzjoni Internazzjonali (revision 246724) +Pajjiżi l-Baxxi (revision 243989) +Pajjiżi membri tal-Unjoni Ewropea (revision 243625) +Pajjiżi ġirien li jdawru l-Unjoni Ewropea (revision 246298) +Parlament Ewropew (revision 243907) +Patt ta' Stabilità u Tkabbir (revision 246298) +Politika agrikola komuni (revision 244363) +Politika reġjonali tal-Unjoni Ewropea (revision 246298) +Polonja (revision 244530) +Portugall (revision 243625) +Relazzjonijiet ta' terzi pajjiżi ma l-UE (revision 246298) +Renju Unit (revision 247318) +Repubblika Federali tal-Ġermanja (revision 244859) +Repubblika tal-Irlanda (revision 243686) +Repubblika Ċeka (revision 246832) +Rumanija (revision 243623) +Segretarjat tal-Parlament Ewropew (revision 246298) +Serbja (revision 243728) +Slovakkja (revision 243831) +Slovenja (revision 244588) +Spanja (revision 246856) +Stati Uniti tal-Amerika (revision 243926) +Stati membri tal-Unjoni Ewropea (revision 243114) +Strasburgu (revision 243503) +Sui generis (revision 247150) +Suq komuni (revision 246298) +Svezja (revision 244871) + +== End of Parsed pages == + +- Wikipedia parsing ended at: 2016-09-21 02:07:45.508113 + +48 characters appeared 474337 times. 
+ +First 31 characters: +[ 0] Char a: 12.326257492036252 % +[ 1] Char i: 12.069899670487438 % +[ 2] Char t: 8.064941170518008 % +[ 3] Char l: 7.795301652622502 % +[ 4] Char e: 6.615971345267184 % +[ 5] Char n: 6.128132530247482 % +[ 6] Char r: 5.579577389071483 % +[ 7] Char u: 4.376424356522894 % +[ 8] Char o: 3.8337721915009797 % +[ 9] Char j: 3.7378488289971057 % +[10] Char m: 3.6084049947611088 % +[11] Char s: 3.3533120966738834 % +[12] Char k: 2.588033402412209 % +[13] Char d: 2.3173397816320462 % +[14] Char p: 2.0555006250830106 % +[15] Char b: 2.017131280081461 % +[16] Char f: 2.004692866042497 % +[17] Char ħ: 1.6372326004507345 % +[18] Char w: 1.4801712706366992 % +[19] Char g: 1.4763765002519307 % +[20] Char z: 1.3150987588992635 % +[21] Char ż: 0.9910675321554084 % +[22] Char h: 0.9750451683086075 % +[23] Char ġ: 0.7640137708000851 % +[24] Char ċ: 0.6723068198348432 % +[25] Char x: 0.5892435125237964 % +[26] Char v: 0.5668965313690478 % +[27] Char q: 0.5647883255997318 % +[28] Char c: 0.2759641352034524 % +[29] Char à: 0.10730767365817974 % +[30] Char y: 0.059029761540845424 % + +The first 31 characters have an accumulated ratio of 0.9994708403519017. + +870 sequences found. + +First 512 (typical positive ratio): 0.9959115850692665 +Next 512 (512-1024): 2.108205769315908e-06 +Rest: -4.423544863740858e-17 + +- Processing end: 2016-09-21 02:07:45.646198 diff --git a/script/langs/mt.py b/script/langs/mt.py new file mode 100644 index 0000000..8a28a82 --- /dev/null +++ b/script/langs/mt.py @@ -0,0 +1,80 @@ +#!/bin/python3 +# -*- coding: utf-8 -*- + +# ##### BEGIN LICENSE BLOCK ##### +# Version: MPL 1.1/GPL 2.0/LGPL 2.1 +# +# The contents of this file are subject to the Mozilla Public License Version +# 1.1 (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License +# for the specific language governing rights and limitations under the +# License. +# +# The Original Code is Mozilla Universal charset detector code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 2001 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Jehan +# +# Alternatively, the contents of this file may be used under the terms of +# either the GNU General Public License Version 2 or later (the "GPL"), or +# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +# in which case the provisions of the GPL or the LGPL are applicable instead +# of those above. If you wish to allow use of your version of this file only +# under the terms of either the GPL or the LGPL, and not to allow others to +# use your version of this file under the terms of the MPL, indicate your +# decision by deleting the provisions above and replace them with the notice +# and other provisions required by the GPL or the LGPL. If you do not delete +# the provisions above, a recipient may use your version of this file under +# the terms of any one of the MPL, the GPL or the LGPL. +# +# ##### END LICENSE BLOCK ##### + +import re + +## Mandatory Properties ## + +# The human name for the language, in English. +name = 'Maltese' +# Use 2-letter ISO 639-1 if possible, 3-letter ISO code otherwise, +# or use another catalog as a last resort. +code = 'mt' +# ASCII characters are also used in Maltese. +use_ascii = True +# The charsets we want to support and create data for. +charsets = ['ISO-8859-3'] + +## Optional Properties ## + +# Alphabet characters. +# If use_ascii=True, there is no need to add any ASCII characters. 
+# If case_mapping=True, there is no need to add several cases of a same +# character (provided Python algorithms know the right cases). +alphabet = 'ċġħż' +# The starred page which was rewarded on the main page when I created +# the data. +start_pages = ['Unjoni Ewropea'] +# give possibility to select another code for the Wikipedia URL. +wikipedia_code = code +# 'a' and 'A' will be considered the same character, and so on. +# This uses Python algorithm to determine upper/lower-case of a given +# character. +case_mapping = True + +# A function to clean content returned by the `wikipedia` python lib, +# in case some unwanted data has been overlooked. +# Note that we are already cleaning away the '=' from the title syntax +# of Wikipedia, as well as double spaces. But sometimes, Wikipedia in +# some language may return weird syntax or UI text which should be +# discarded. If you encounter one of these cases, use this function. +def clean_wikipedia_content(content): + # Do your garbage text cleaning here. + return content diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 5209ab5..b5c4620 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -19,6 +19,7 @@ set( LangModels/LangHebrewModel.cpp LangModels/LangLithuanianModel.cpp LangModels/LangLatvianModel.cpp + LangModels/LangMalteseModel.cpp LangModels/LangPortugueseModel.cpp LangModels/LangSpanishModel.cpp LangModels/LangThaiModel.cpp diff --git a/src/LangModels/LangMalteseModel.cpp b/src/LangModels/LangMalteseModel.cpp new file mode 100644 index 0000000..d4573ae --- /dev/null +++ b/src/LangModels/LangMalteseModel.cpp @@ -0,0 +1,137 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is Mozilla Communicator client code. + * + * The Initial Developer of the Original Code is + * Netscape Communications Corporation. + * Portions created by the Initial Developer are Copyright (C) 1998 + * the Initial Developer. All Rights Reserved. + * + * Contributor(s): + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. + * + * ***** END LICENSE BLOCK ***** */ + +#include "../nsSBCharSetProber.h" + +/********* Language model for: Maltese *********/ + +/** + * Generated by BuildLangModel.py + * On: 2016-09-21 02:07:45.509404 + **/ + +/* Character Mapping Table: + * ILL: illegal character. + * CTR: control character specific to the charset. + * RET: carriage/return. + * SYM: symbol (punctuation) that does not belong to word. + * NUM: 0 - 9. + * + * Other characters are ordered by probabilities + * (0 is the most common character in the language). 
+ * + * Orders are generic to a language. So the codepoint with order X in + * CHARSET1 maps to the same character as the codepoint with the same + * order X in CHARSET2 for the same language. + * As such, it is possible to get missing order. For instance the + * ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 + * even though they are both used for French. Same for the euro sign. + */ +static const unsigned char Iso_8859_3_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 15, 28, 13, 4, 16, 19, 22, 1, 9, 12, 3, 10, 5, 8, /* 4X */ + 14, 27, 6, 11, 2, 7, 26, 18, 25, 30, 20,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 15, 28, 13, 4, 16, 19, 22, 1, 9, 12, 3, 10, 5, 8, /* 6X */ + 14, 27, 6, 11, 2, 7, 26, 18, 25, 30, 20,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM, 17,SYM,SYM,SYM,ILL, 48,SYM,SYM, 49, 50, 51, 52,SYM,ILL, 21, /* AX */ + SYM, 17,SYM,SYM,SYM,SYM, 53,SYM,SYM, 54, 55, 56, 57,SYM,ILL, 21, /* BX */ + 29, 36, 47,ILL, 58, 24, 59, 40, 33, 31, 60, 39, 45, 35, 61, 62, /* CX */ + ILL, 37, 32, 34, 44, 23, 38,SYM, 63, 43, 42, 64, 46, 65, 66, 41, /* DX */ + 29, 36, 47,ILL, 67, 24, 68, 40, 33, 31, 69, 39, 45, 35, 70, 71, /* EX */ + ILL, 37, 32, 34, 44, 23, 38,SYM, 72, 43, 42, 73, 46, 74, 75,SYM, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + + +/* Model Table: + * Total sequences: 870 + * First 512 sequences: 0.9959115850692665 + * Next 512 sequences (512-1024): 0.004088414930733575 + * Rest: -4.423544863740858e-17 + * Negative sequences: TODO + */ +static const PRUint8 MalteseLangModel[] = +{ + 
3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,0,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,0,2,3,3,3,3,3,2,0,3,0,0,3,3,3,2,3,3, + 3,3,3,3,3,2,2,3,3,3,3,3,3,3,2,3,3,2,3,3,2,0,3,3,0,3,3,3,2,0,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3, + 3,3,3,3,3,3,2,3,3,3,2,3,3,3,2,3,3,3,3,3,3,3,3,3,3,2,3,3,3,0,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,3,3,2,3,3,3,3,3,2,3,0,3, + 3,3,3,3,3,3,3,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,2, + 3,3,2,2,3,3,3,3,3,3,3,3,3,3,2,3,3,3,2,3,0,3,3,2,2,2,2,2,0,0,0, + 3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,2,0,2,3,2,2,3,2,2,2,0,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,0,0,3,2,0,0,3,3,3,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,0,2,3,3,2,3,0,0,0,2,0,3,2,0,0,0,0,2, + 3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,2,2,3,3,2,2,0,3,0,0,2,2,0,2,2,2, + 3,3,2,3,3,2,3,3,3,3,2,3,2,2,3,0,0,0,2,3,0,0,3,0,2,0,2,0,2,0,0, + 3,3,3,3,3,3,3,3,3,3,0,3,3,3,0,3,2,3,3,3,0,3,2,0,0,2,0,3,3,0,2, + 3,3,3,3,3,3,3,3,3,3,0,3,2,2,0,2,3,0,0,2,0,0,2,0,0,0,0,2,2,0,2, + 3,3,3,3,3,2,3,3,3,3,3,3,2,3,0,3,2,3,2,0,0,2,3,2,0,2,0,3,0,0,0, + 3,3,3,3,3,3,3,2,3,2,2,3,3,3,2,2,2,2,3,2,0,2,2,3,2,3,2,2,0,0,2, + 3,3,2,3,3,3,3,3,3,2,2,2,2,3,2,2,0,3,3,3,2,3,3,0,0,0,3,0,2,2,3, + 3,3,2,2,3,2,2,3,2,3,2,0,0,0,2,0,0,0,2,2,3,0,0,0,0,0,2,2,0,0,0, + 3,3,2,3,3,2,0,3,3,3,3,0,0,3,0,2,2,0,2,3,0,3,0,0,0,0,3,0,0,0,0, + 3,3,3,2,3,2,3,3,3,0,3,2,2,2,2,2,0,0,2,0,2,0,2,0,0,0,0,2,0,0,2, + 3,3,2,2,3,3,3,3,3,3,2,0,0,3,0,2,0,2,2,3,2,2,0,3,0,0,2,0,0,2,0, + 3,3,2,2,3,0,2,2,0,3,0,0,2,0,2,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0, + 3,3,3,3,3,2,3,3,3,3,3,0,2,2,0,3,2,0,2,0,0,0,3,0,0,3,2,0,2,0,0, + 3,3,0,2,3,2,3,3,3,3,0,2,0,3,2,0,0,0,0,0,0,0,0,0,0,0,3,0,2,0,2, + 3,3,3,2,3,0,3,3,3,3,2,3,2,3,0,3,3,0,3,3,0,0,2,2,2,2,0,3,0,2,0, + 3,3,3,3,3,0,2,2,3,2,0,3,3,3,0,2,3,0,0,0,2,0,3,0,0,0,0,2,2,0,2, + 0,2,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 
2,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,0,0,2,0,0,0,0,0,0,0,2,0,2,0,2, +}; + + +const SequenceModel Iso_8859_3MalteseModel = +{ + Iso_8859_3_CharToOrderMap, + MalteseLangModel, + 31, + (float)0.9959115850692665, + PR_TRUE, + "ISO-8859-3" +}; \ No newline at end of file diff --git a/src/nsSBCSGroupProber.cpp b/src/nsSBCSGroupProber.cpp index e90902f..ecfacab 100644 --- a/src/nsSBCSGroupProber.cpp +++ b/src/nsSBCSGroupProber.cpp @@ -124,6 +124,8 @@ nsSBCSGroupProber::nsSBCSGroupProber() mProbers[43] = new nsSingleByteCharSetProber(&Iso_8859_15PortugueseModel); mProbers[44] = new nsSingleByteCharSetProber(&Windows_1252PortugueseModel); + mProbers[45] = new nsSingleByteCharSetProber(&Iso_8859_3MalteseModel); + Reset(); } diff --git a/src/nsSBCSGroupProber.h b/src/nsSBCSGroupProber.h index 48444de..5053a49 100644 --- a/src/nsSBCSGroupProber.h +++ b/src/nsSBCSGroupProber.h @@ -40,7 +40,7 @@ #define nsSBCSGroupProber_h__ -#define NUM_OF_SBCS_PROBERS 45 +#define NUM_OF_SBCS_PROBERS 46 class nsCharSetProber; class nsSBCSGroupProber: public nsCharSetProber { diff --git a/src/nsSBCharSetProber.h b/src/nsSBCharSetProber.h index 5b40db7..a48591d 100644 --- a/src/nsSBCharSetProber.h +++ b/src/nsSBCharSetProber.h @@ -185,5 +185,7 @@ extern const SequenceModel Iso_8859_9PortugueseModel; extern const SequenceModel Iso_8859_15PortugueseModel; extern const SequenceModel Windows_1252PortugueseModel; +extern const SequenceModel Iso_8859_3MalteseModel; + #endif /* nsSingleByteCharSetProber_h__ */ diff --git a/test/mt/iso-8859-3.txt b/test/mt/iso-8859-3.txt new file mode 100644 index 0000000..255269b --- /dev/null +++ b/test/mt/iso-8859-3.txt @@ -0,0 +1,4 @@ +Franza (Franåi¿:France), uffiåjalment ir-Repubblika Franåi¿a (Franåi¿: +République française), hi pajji¿ fl-Ewropa tal-Punent. Il-belt belt kapitali +tag±ha hi Pariõi. Hi membru tal-Unjoni Ewropea. Franza hi maqsuma f'22 régions +li huma suddivi¿i f' départements. 
diff --git a/test/mt/utf-8.txt b/test/mt/utf-8.txt new file mode 100644 index 0000000..079f387 --- /dev/null +++ b/test/mt/utf-8.txt @@ -0,0 +1,4 @@ +Franza (Franċiż:France), uffiċjalment ir-Repubblika Franċiża (Franċiż: +République française), hi pajjiż fl-Ewropa tal-Punent. Il-belt belt kapitali +tagħha hi Pariġi. Hi membru tal-Unjoni Ewropea. Franza hi maqsuma f'22 régions +li huma suddiviżi f' départements.